In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv('data.csv')

In [4]:
# Drop useless columns
df = df.drop(columns=["id", "Unnamed: 32"])

# Encode target
df["diagnosis"] = df["diagnosis"].map({"B": 0, "M": 1})

In [5]:
from sklearn.model_selection import train_test_split

X = df.drop("diagnosis", axis=1)
y = df["diagnosis"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [6]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("knn", KNeighborsClassifier())
])


In [7]:
param_grid = {
    "knn__n_neighbors": [3, 5, 7, 9, 11, 15],
    "knn__weights": ["uniform", "distance"],
    "knn__metric": ["euclidean", "manhattan", "minkowski"]
}


In [8]:
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
    verbose=2
)

grid.fit(X_train, y_train)

print("Best Parameters:", grid.best_params_)
print("Best ROC AUC:", grid.best_score_)


Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best Parameters: {'knn__metric': 'manhattan', 'knn__n_neighbors': 9, 'knn__weights': 'distance'}
Best ROC AUC: 0.9912796697626419


In [9]:
best_knn = grid.best_estimator_

# Predictions
y_pred = best_knn.predict(X_test)
y_proba = best_knn.predict_proba(X_test)[:, 1]


In [10]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

print("Accuracy:", accuracy_score(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("ROC AUC Score:", roc_auc_score(y_test, y_proba))


Accuracy: 0.956140350877193

Confusion Matrix:
[[71  1]
 [ 4 38]]

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.99      0.97        72
           1       0.97      0.90      0.94        42

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114

ROC AUC Score: 0.9976851851851851


In [12]:
import pickle
pickle.dump(best_knn, open('breast_cancer_model.pkl','wb'))

In [13]:
model = pickle.load(open('breast_cancer_model.pkl','rb'))

In [14]:
test_input_benign = np.array([[
    12.5, 14.3, 80.5, 480.0, 0.085, 0.09, 0.04, 0.02, 0.16, 0.060,
    0.25, 1.2, 1.7, 20.0, 300.0, 0.095, 0.10, 0.05, 0.03, 0.19,
    0.065, 13.0, 16.0, 85.0, 510.0, 0.10, 0.12, 0.06, 0.03, 0.21
]]).reshape(1,30)

In [15]:
model.predict(test_input_benign)



array([0])

In [16]:
test_input_malignant = np.array([[
    18.7, 21.5, 120.4, 1080.0, 0.145, 0.22, 0.30, 0.15, 0.35, 0.090,
    0.45, 2.2, 3.5, 35.0, 1200.0, 0.155, 0.25, 0.32, 0.18, 0.42,
    0.110, 20.0, 28.0, 132.0, 1500.0, 0.17, 0.28, 0.35, 0.20, 0.48
]]).reshape(1,30)


In [17]:
model.predict(test_input_malignant)



array([0])