In [72]:
import kagglehub
import sklearn
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt

### Data Loading

In [32]:
# Load the scikit-learn breast-cancer bunch and assemble a single DataFrame
# with the class label in the first column, followed by the feature columns.
breast = load_breast_cancer()
features = pd.DataFrame(breast.data, columns=breast.feature_names)
labels = pd.Series(breast.target, name="target")
df_data = pd.concat([labels, features], axis=1)
df_data.head()

Unnamed: 0,target,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,0,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,0,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,0,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,0,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [33]:
# Print the column names, dtypes and non-null counts of df_data.
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   target                   569 non-null    int64  
 1   mean radius              569 non-null    float64
 2   mean texture             569 non-null    float64
 3   mean perimeter           569 non-null    float64
 4   mean area                569 non-null    float64
 5   mean smoothness          569 non-null    float64
 6   mean compactness         569 non-null    float64
 7   mean concavity           569 non-null    float64
 8   mean concave points      569 non-null    float64
 9   mean symmetry            569 non-null    float64
 10  mean fractal dimension   569 non-null    float64
 11  radius error             569 non-null    float64
 12  texture error            569 non-null    float64
 13  perimeter error          569 non-null    float64
 14  area error               5

In [41]:
# Visualisation: "mean perimeter" plotted against the class label.
# The label is cast to str so that the explicit discrete colour mapping
# (keys "1" -> blue, "0" -> red) can be applied to it.
target_labels = df_data["target"].astype(str)
fig = px.scatter(
    df_data,
    x="target",
    y="mean perimeter",
    color=target_labels,
    color_discrete_map={"1": "blue", "0": "red"},
)
fig.show()

In [44]:
# Scatter of the two candidate features, coloured by class label
# (label cast to str to match the string keys of the colour mapping).
target_labels = df_data["target"].astype(str)
fig = px.scatter(
    df_data,
    x="mean perimeter",
    y="mean compactness",
    color=target_labels,
    color_discrete_map={"1": "blue", "0": "red"},
)
fig.show()

### Fitting Logistic Regression

We want to fit the following model:

$$ \log\left(\frac{y_i}{1-y_i}\right) = \theta_0 + \theta_1 x_{1, i} + \theta_2 x_{2, i} $$

$$ \iff y_i = \frac{1}{1 + \exp(-\theta^T x_i)}, \qquad x_i = (1, x_{1, i}, x_{2, i})^T $$

In [66]:
# Hold out a test set and fit a two-feature logistic regression.
# A fixed random_state makes the split (and therefore the fitted parameters
# printed below) reproducible across kernel restarts; without it every run
# draws a different split.
feature_cols = ["mean perimeter", "mean compactness"]
X_train, X_test, Y_train, Y_test = train_test_split(
    df_data[feature_cols],
    df_data["target"],
    random_state=42,
)
model = LogisticRegression().fit(X_train, Y_train)
print(f"Intercept: {model.intercept_}")
print(f"Coefficients: {model.coef_}")

Intercept: [14.84077511]
Coefficients: [[-0.15276437 -1.60152868]]


In [None]:
# Visualisation of the results: training points plus the fitted separation boundary.
fig = go.Figure()
colors = Y_train.map({0: "red", 1: "blue"})
fig.add_trace(
    go.Scatter(
        x=X_train["mean perimeter"],
        y=X_train["mean compactness"],
        mode="markers",
        marker=dict(color=colors),
        name="Data",
    )
)

# Adding the separation boundary.
# The boundary is the line theta0 + theta1 * x1 + theta2 * x2 = 0, solved for x2.
theta0 = model.intercept_[0]  # scalar intercept (not the 1-element array)
theta1, theta2 = model.coef_[0]
x1_values = np.linspace(X_train["mean perimeter"].min(), X_train["mean perimeter"].max(), 100)


def func(x):
    """Return the x2 coordinate of the separation boundary at x1 = x."""
    return (-theta0 - theta1 * x) / theta2


x2_values = func(x1_values)

# Keep only the part of the line that lies within 0.1 of the observed
# "mean compactness" range, so the line does not stretch the y-axis.
x2_lower = X_train["mean compactness"].min() - 0.1
x2_upper = X_train["mean compactness"].max() + 0.1
in_view = (x2_values >= x2_lower) & (x2_values <= x2_upper)
arr_plot = np.column_stack((x1_values, x2_values))[in_view]

fig.add_trace(
    go.Scatter(
        x=arr_plot[:, 0],
        y=arr_plot[:, 1],
        mode="lines",
        name="Separation Boundary",
        line=dict(color="black", width=1, dash="dash"),
    )
)
fig.update_layout(
    title_text="Separation Boundary on Training Set",
    xaxis_title="Mean Perimeter",
    yaxis_title="Mean Compactness",
)
fig.show()

In [125]:
# Adding the separation boundary
# Recomputes the boundary polyline (x1, x2) from the current `model`, without plotting.
theta0 = model.intercept_  # array-valued; broadcasts against x1_values below
theta1, theta2 = model.coef_[0]

perimeter = X_train["mean perimeter"]
compactness = X_train["mean compactness"]
x1_values = np.linspace(perimeter.min(), perimeter.max(), 100)


def func(x):
    """Return x2 on the line theta0 + theta1 * x + theta2 * x2 = 0."""
    return (-theta0 - theta1 * x) / theta2


x2_values = func(x1_values)
arr_plot = np.column_stack((x1_values, x2_values))

# Drop points whose x2 lies more than 0.1 outside the observed compactness range.
lower, upper = compactness.min() - 0.1, compactness.max() + 0.1
arr_plot = arr_plot[(arr_plot[:, 1] >= lower) & (arr_plot[:, 1] <= upper)]

In [121]:
# Inspect the second row of arr_plot, i.e. one (x1, x2) point of the boundary.
arr_plot[1, :]

array([45.25171717,  4.06176494])

In [111]:
# NOTE(review): `test` is not assigned in any visible cell, so this raises
# NameError on a fresh kernel (Restart & Run All). Presumably a leftover debug
# array from an earlier session -- define it explicitly or delete this cell;
# TODO confirm.
test

array([[ 4.37900000e+01,  4.52517172e+01,  4.67134343e+01,
         4.81751515e+01,  4.96368687e+01,  5.10985859e+01,
         5.25603030e+01,  5.40220202e+01,  5.54837374e+01,
         5.69454545e+01,  5.84071717e+01,  5.98688889e+01,
         6.13306061e+01,  6.27923232e+01,  6.42540404e+01,
         6.57157576e+01,  6.71774747e+01,  6.86391919e+01,
         7.01009091e+01,  7.15626263e+01,  7.30243434e+01,
         7.44860606e+01,  7.59477778e+01,  7.74094949e+01,
         7.88712121e+01,  8.03329293e+01,  8.17946465e+01,
         8.32563636e+01,  8.47180808e+01,  8.61797980e+01,
         8.76415152e+01,  8.91032323e+01,  9.05649495e+01,
         9.20266667e+01,  9.34883838e+01,  9.49501010e+01,
         9.64118182e+01,  9.78735354e+01,  9.93352525e+01,
         1.00796970e+02,  1.02258687e+02,  1.03720404e+02,
         1.05182121e+02,  1.06643838e+02,  1.08105556e+02,
         1.09567273e+02,  1.11028990e+02,  1.12490707e+02,
         1.13952424e+02,  1.15414141e+02,  1.16875859e+0

In [76]:
# Fit the logistic regression on the full data set (no train/test split) and
# plot its decision boundary over all samples.
# NOTE: this rebinds `model`, replacing the estimator fitted on the training
# split in the earlier cell.
X = df_data[["mean perimeter", "mean compactness"]]
y = df_data["target"]

# Train the logistic regression model
model = LogisticRegression()
model.fit(X, y)

# Get the coefficients and intercept from the model
w1, w2 = model.coef_[0]  # coefficients for "mean perimeter" and "mean compactness"
b = model.intercept_[0]  # intercept


def decision_boundary(x1):
    """Return x2 on the line w1 * x1 + w2 * x2 + b = 0 for the given x1."""
    return - (w1 * x1 + b) / w2


# Generate x1 values for the decision boundary
x1_values = np.linspace(X["mean perimeter"].min(), X["mean perimeter"].max(), 100)

# Calculate the corresponding x2 values for the decision boundary
x2_values = decision_boundary(x1_values)

# Clip the line to within 0.1 of the observed "mean compactness" range.
# Unclipped, the line can run far outside the data and stretch the y-axis so
# much that the points collapse into a thin band.
x2_lower = X["mean compactness"].min() - 0.1
x2_upper = X["mean compactness"].max() + 0.1
in_view = (x2_values >= x2_lower) & (x2_values <= x2_upper)
boundary_x1 = x1_values[in_view]
boundary_x2 = x2_values[in_view]

# Assign colors manually: Red for target 0 and Blue for target 1
colors = df_data["target"].map({0: "red", 1: "blue"})

# Create the scatter plot
fig = go.Figure()

# Add scatter points (data points)
fig.add_trace(go.Scatter(x=df_data["mean perimeter"], y=df_data["mean compactness"],
                         mode="markers",
                         marker=dict(color=colors),
                         name="Data"))

# Add decision boundary line (as a line plot)
fig.add_trace(go.Scatter(x=boundary_x1, y=boundary_x2,
                         mode="lines", line=dict(color="black", width=2, dash="dash"),
                         name="Decision Boundary"))

# Update layout
fig.update_layout(title="Logistic Regression Decision Boundary",
                  xaxis_title="Mean Perimeter",
                  yaxis_title="Mean Compactness",
                  showlegend=True)

# Show the plot
fig.show()