In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import joblib

Load dataset

In [28]:
df = pd.read_csv("breast-cancer.csv")
df.sample(5)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
340,89813,B,14.42,16.54,94.15,641.2,0.09751,0.1139,0.08007,0.04223,...,16.67,21.51,111.4,862.1,0.1294,0.3371,0.3755,0.1414,0.3053,0.08764
388,903011,B,11.27,15.5,73.38,392.0,0.08365,0.1114,0.1007,0.02757,...,12.04,18.93,79.73,450.0,0.1102,0.2809,0.3021,0.08272,0.2157,0.1043
39,855138,M,13.48,20.82,88.4,559.2,0.1016,0.1255,0.1063,0.05439,...,15.53,26.02,107.3,740.4,0.161,0.4225,0.503,0.2258,0.2807,0.1071
546,922577,B,10.32,16.35,65.31,324.9,0.09434,0.04994,0.01012,0.005495,...,11.25,21.77,71.12,384.9,0.1285,0.08842,0.04384,0.02381,0.2681,0.07399
529,918465,B,12.07,13.44,77.83,445.2,0.11,0.09009,0.03781,0.02798,...,13.45,15.77,86.92,549.9,0.1521,0.1632,0.1622,0.07393,0.2781,0.08052


In [29]:
df['diagnosis'].value_counts()

diagnosis
B    357
M    212
Name: count, dtype: int64

 Split features & target

In [30]:
X = df.drop(columns=["diagnosis"])
X.head()

Unnamed: 0,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [31]:
Y = df["diagnosis"]
Y.head()

0    M
1    M
2    M
3    M
4    M
Name: diagnosis, dtype: object

Train-Test Split

In [32]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

Balance the dataset

In [33]:
smote = SMOTE(random_state=42)
x_train, y_train = smote.fit_resample(x_train, y_train)



In [34]:
model = LogisticRegression(max_iter=1000)
model.fit(x_train, y_train)

In [35]:
predictions = model.predict(x_test)
predictions

array(['M', 'M', 'M', 'B', 'B', 'M', 'M', 'M', 'B', 'B', 'B', 'M', 'B',
       'M', 'B', 'M', 'B', 'B', 'B', 'M', 'M', 'B', 'M', 'B', 'B', 'B',
       'B', 'B', 'B', 'M', 'B', 'B', 'B', 'M', 'B', 'B', 'M', 'B', 'M',
       'B', 'B', 'M', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'M', 'M',
       'B', 'B', 'B', 'B', 'B', 'M', 'B', 'B', 'B', 'M', 'M', 'B', 'B',
       'B', 'M', 'M', 'B', 'B', 'M', 'M', 'B', 'B', 'B', 'B', 'B', 'M',
       'B', 'B', 'M', 'B', 'B', 'M', 'M', 'M', 'M', 'M', 'B', 'B', 'B',
       'B', 'M', 'B', 'B', 'B', 'M', 'M', 'B', 'M', 'M', 'B', 'M', 'M',
       'B', 'B', 'B', 'M', 'B', 'B', 'M', 'B', 'B', 'M'], dtype=object)

In [36]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           B       0.96      0.96      0.96        71
           M       0.93      0.93      0.93        43

    accuracy                           0.95       114
   macro avg       0.94      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114



In [37]:
confusion_matrix(y_test, predictions)

array([[68,  3],
       [ 3, 40]], dtype=int64)

save the model

In [38]:
joblib.dump(model, 'Lr_breast_cancer_model.pkl')

['Lr_breast_cancer_model.pkl']