In [192]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import joblib

Load dataset

In [193]:
df = pd.read_csv("breast-cancer.csv")
df.sample(5)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
403,9047,B,12.94,16.17,83.18,507.6,0.09879,0.08836,0.03296,0.0239,...,13.86,23.02,89.69,580.9,0.1172,0.1958,0.181,0.08388,0.3297,0.07834
147,86973701,B,14.95,18.77,97.84,689.5,0.08138,0.1167,0.0905,0.03562,...,16.25,25.47,107.1,809.7,0.0997,0.2521,0.25,0.08405,0.2852,0.09218
391,903483,B,8.734,16.84,55.27,234.3,0.1039,0.07428,0.0,0.0,...,10.17,22.8,64.01,317.0,0.146,0.131,0.0,0.0,0.2445,0.08865
195,875878,B,12.91,16.33,82.53,516.4,0.07941,0.05366,0.03873,0.02377,...,13.88,22.0,90.81,600.6,0.1097,0.1506,0.1764,0.08235,0.3024,0.06949
491,91376702,B,17.85,13.23,114.6,992.1,0.07838,0.06217,0.04445,0.04178,...,19.82,18.42,127.1,1210.0,0.09862,0.09976,0.1048,0.08341,0.1783,0.05871


In [194]:
df['diagnosis'].value_counts()

diagnosis
B    357
M    212
Name: count, dtype: int64

Split features & target

In [195]:
X = df.drop(columns=["diagnosis"])
X.head()

Unnamed: 0,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [196]:
Y = df["diagnosis"]
Y.head()

0    M
1    M
2    M
3    M
4    M
Name: diagnosis, dtype: object

Train-Test Split

In [197]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [198]:
smote = SMOTE(random_state=42)
x_train, y_train = smote.fit_resample(x_train, y_train)



Train the model

In [199]:
model = KNeighborsClassifier(n_neighbors=5)
model.fit(x_train, y_train)

In [200]:
predictions = model.predict(x_test)
predictions

array(['M', 'M', 'B', 'M', 'B', 'M', 'M', 'M', 'B', 'B', 'B', 'M', 'M',
       'M', 'M', 'B', 'M', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'M',
       'B', 'B', 'B', 'M', 'B', 'B', 'B', 'M', 'B', 'M', 'M', 'B', 'M',
       'M', 'B', 'B', 'B', 'M', 'B', 'B', 'B', 'B', 'M', 'B', 'M', 'M',
       'M', 'B', 'B', 'M', 'B', 'B', 'M', 'B', 'M', 'M', 'M', 'B', 'B',
       'B', 'M', 'M', 'B', 'B', 'B', 'M', 'B', 'B', 'B', 'B', 'M', 'M',
       'B', 'B', 'M', 'B', 'M', 'M', 'M', 'M', 'M', 'B', 'B', 'B', 'B',
       'M', 'B', 'B', 'B', 'B', 'M', 'M', 'B', 'B', 'M', 'B', 'M', 'M',
       'B', 'B', 'B', 'M', 'M', 'B', 'M', 'B', 'B', 'M'], dtype=object)

In [201]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           B       0.83      0.76      0.79        71
           M       0.65      0.74      0.70        43

    accuracy                           0.75       114
   macro avg       0.74      0.75      0.74       114
weighted avg       0.76      0.75      0.76       114



In [202]:
accuracy_score(y_test, predictions)

0.7543859649122807

In [203]:
confusion_matrix = confusion_matrix(y_test, predictions)
print(confusion_matrix)

[[54 17]
 [11 32]]


save the model

In [204]:
joblib.dump(model, 'knn_breast_cancer_model.pkl')

['knn_breast_cancer_model.pkl']