In [1]:
import warnings
# Silence FutureWarning-category warnings for the rest of the session so they
# do not clutter cell output. The filter is installed before the third-party
# imports below, so it also applies to warnings raised while they are imported.
warnings.filterwarnings("ignore", category=FutureWarning)

import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report


In [2]:
import os

# Load the breast-cancer CSV into `df`.
# Resolution order:
#   1. a local copy at `local_path`;
#   2. otherwise, an interactive upload when running inside Google Colab.
# Raises FileNotFoundError when neither source can provide a file.
local_path = 'dataset/breast_cancer.csv'

if os.path.exists(local_path):
    df = pd.read_csv(local_path)
else:
    # Only the Colab import is guarded: a missing `google.colab` module is the
    # one condition that really means "upload not available". Errors raised
    # later (e.g. a malformed CSV) propagate unchanged instead of being
    # disguised as a missing-file error.
    try:
        from google.colab import files
    except ImportError as exc:
        raise FileNotFoundError("Dataset not found locally and Colab upload not available. "
                                "Place 'breast_cancer.csv' in dataset/ or upload in Colab.") from exc

    print("Please upload 'breast_cancer.csv' when the file picker appears.")
    uploaded = files.upload()
    if not uploaded:
        # The picker was dismissed without choosing a file.
        raise FileNotFoundError("No file was uploaded. "
                                "Place 'breast_cancer.csv' in dataset/ or upload in Colab.")
    # The uploaded file may have any name; use the first one provided.
    uploaded_fname = next(iter(uploaded))
    df = pd.read_csv(uploaded_fname)

print("Loaded dataset with shape:", df.shape)

df.head()


Please upload 'breast_cancer.csv' when the file picker appears.


Saving Dataset.csv to Dataset.csv
Loaded dataset with shape: (569, 33)


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [3]:
# Quick structural overview of the raw frame: dimensions, column names,
# total count of missing cells, and the distinct diagnosis labels.
total_missing = df.isnull().sum().sum()
diagnosis_labels = df['diagnosis'].unique()

for summary_item in (df.shape, df.columns, total_missing, diagnosis_labels):
    print(summary_item)


(569, 33)
Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst', 'Unnamed: 32'],
      dtype='object')
569
['M' 'B']


In [4]:
# Build the modelling inputs: drop non-feature columns, encode the target as
# 0/1, and min-max scale every feature.

# `drop` returns a new frame, so the loaded data is never mutated in place;
# errors="ignore" keeps the cell re-runnable once the columns are already gone.
df = df.drop(columns=['id', 'Unnamed: 32'], errors='ignore')

# Encode the target: malignant -> 1, benign -> 0. The identity entries make the
# mapping idempotent, so re-running this cell on already-encoded labels does not
# turn every value into NaN.
df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0, 1: 1, 0: 0})
if df['diagnosis'].isna().any():
    # Unmapped labels would otherwise surface later as an obscure fit error.
    raise ValueError("Unexpected values in 'diagnosis'; expected only 'M' or 'B'.")

X = df.drop('diagnosis', axis=1)
y = df['diagnosis']

# NOTE: the scaler is fit on ALL rows, so X_scaled carries whole-dataset
# min/max statistics. Supervised evaluation should use a scaler fit on
# training rows only, to avoid leaking test-set information.
scaler = MinMaxScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

X_scaled.head()


Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,0.521037,0.022658,0.545989,0.363733,0.593753,0.792037,0.70314,0.731113,0.686364,0.605518,...,0.620776,0.141525,0.66831,0.450698,0.601136,0.619292,0.56861,0.912027,0.598462,0.418864
1,0.643144,0.272574,0.615783,0.501591,0.28988,0.181768,0.203608,0.348757,0.379798,0.141323,...,0.606901,0.303571,0.539818,0.435214,0.347553,0.154563,0.192971,0.639175,0.23359,0.222878
2,0.601496,0.39026,0.595743,0.449417,0.514309,0.431017,0.462512,0.635686,0.509596,0.211247,...,0.556386,0.360075,0.508442,0.374508,0.48359,0.385375,0.359744,0.835052,0.403706,0.213433
3,0.21009,0.360839,0.233501,0.102906,0.811321,0.811361,0.565604,0.522863,0.776263,1.0,...,0.24831,0.385928,0.241347,0.094008,0.915472,0.814012,0.548642,0.88488,1.0,0.773711
4,0.629893,0.156578,0.630986,0.48929,0.430351,0.347893,0.463918,0.51839,0.378283,0.186816,...,0.519744,0.123934,0.506948,0.341575,0.437364,0.172415,0.319489,0.558419,0.1575,0.142595


In [5]:
# Hold out 20% of the rows for evaluation, stratified on the diagnosis label.
# The split is taken on the RAW features and scaling is then fit on the
# training rows only. Splitting the frame that was scaled on the full dataset
# would leak test-set min/max values into the training features.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

split_scaler = MinMaxScaler().fit(X_train_raw)

# Keep DataFrames (same columns, original row index) as before. Test values may
# fall slightly outside [0, 1] because the bounds come from the training rows.
X_train = pd.DataFrame(
    split_scaler.transform(X_train_raw), columns=X.columns, index=X_train_raw.index
)
X_test = pd.DataFrame(
    split_scaler.transform(X_test_raw), columns=X.columns, index=X_test_raw.index
)

X_train.shape, X_test.shape


((455, 30), (114, 30))

In [6]:
# Unsupervised view of the data: partition all scaled samples into two
# clusters, then cross-tabulate cluster membership against the true diagnosis.
kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(X_scaled)

# `assign` returns a copy of the scaled features with the two extra columns
# appended; X_scaled itself is left unchanged.
cluster_df = X_scaled.assign(diagnosis=y.values, cluster=cluster_labels)

ct = pd.crosstab(index=cluster_df['diagnosis'], columns=cluster_df['cluster'])
ct


cluster,0,1
diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1
0,9,348
1,180,32


In [7]:
# Score the clustering as if it were a classifier: label every cluster with the
# most frequent diagnosis among its members, then measure how often that label
# matches the true diagnosis.
cluster_map = {
    cluster_id: cluster_df.loc[cluster_df['cluster'] == cluster_id, 'diagnosis'].mode()[0]
    for cluster_id in np.unique(cluster_labels)
}

pred_cluster = [cluster_map[label] for label in cluster_labels]

# Element-wise comparison gives a boolean Series; its mean is the hit rate.
is_match = cluster_df['diagnosis'] == pred_cluster
cluster_acc = np.mean(is_match)

cluster_acc


np.float64(0.9279437609841827)

In [8]:
# Supervised baseline: 5-nearest-neighbours classifier trained on the training
# split and applied to the held-out split. `fit` returns the estimator itself,
# so construction and training can be chained.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred = knn.predict(X_test)


In [9]:
# Evaluate the KNN predictions on the held-out split. The four scalar metrics
# share the (y_true, y_pred) call signature, so they are computed in one pass.
scalar_metrics = (accuracy_score, precision_score, recall_score, f1_score)
acc, prec, rec, f1 = (metric(y_test, y_pred) for metric in scalar_metrics)

# Rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred)

acc, prec, rec, f1, cm


(0.9649122807017544,
 1.0,
 0.9047619047619048,
 0.95,
 array([[72,  0],
        [ 4, 38]]))

In [10]:
# Per-class precision / recall / F1 table for the KNN predictions. The report
# is a preformatted string, so it is printed rather than displayed as a repr.
knn_report_text = classification_report(y_test, y_pred)
print(knn_report_text)


              precision    recall  f1-score   support

           0       0.95      1.00      0.97        72
           1       1.00      0.90      0.95        42

    accuracy                           0.96       114
   macro avg       0.97      0.95      0.96       114
weighted avg       0.97      0.96      0.96       114



In [11]:
# Assemble a plain-text summary of the KNN metrics and the K-Means majority-vote
# accuracy, save it next to the notebook, and show it as the cell output.
report_lines = [
    f"Accuracy: {acc:.4f}\n",
    f"Precision: {prec:.4f}\n",
    f"Recall: {rec:.4f}\n",
    f"F1-score: {f1:.4f}\n",
    f"Confusion Matrix:\n{cm}\n",
    f"K-Means Approx Accuracy: {cluster_acc:.4f}\n",
]
report = "".join(report_lines)

# "w" mode overwrites any report left by a previous run.
with open("evaluation_report.txt", "w") as report_file:
    report_file.write(report)

report


'Accuracy: 0.9649\nPrecision: 1.0000\nRecall: 0.9048\nF1-score: 0.9500\nConfusion Matrix:\n[[72  0]\n [ 4 38]]\nK-Means Approx Accuracy: 0.9279\n'