Train and Validate a Model for Breast Cancer Diagnosis Using k-Nearest Neighbors (k-NN)

The dataset used for this analysis is the Breast Cancer Wisconsin (Diagnostic) dataset, which can be found at the following link:
[Breast Cancer Wisconsin (Diagnostic) Data Set](http://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic)

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
import pandas as pd

from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV

In [None]:
# NOTE(review): disabled exploratory read. Without header=None, pandas would
# take the first data row as the header; the load that follows passes
# header=None and explicit column names instead. Appears safe to delete.
# data = pd.read_csv('dataSetE4/wdbc.data')
# print(data)

In [2]:
# The file is read with header=None, so column labels are supplied here.
# Layout: an ID column, the diagnosis label, then ten measurements, each
# present in a 'Mean', an 'SE' and a 'Worst' variant (32 columns in total).
measurements = [
    'Radius', 'Texture', 'Perimeter', 'Area', 'Smoothness',
    'Compactness', 'Concavity', 'Concave Points', 'Symmetry', 'Fractal Dimension',
]
columnNames = ['ID', 'Diagnosis'] + [
    f'{measurement} {variant}'
    for variant in ('Mean', 'SE', 'Worst')  # outer loop: all Means, then all SEs, then all Worsts
    for measurement in measurements
]

data = pd.read_csv('dataSetE4/wdbc.data', names=columnNames, header=None)

In [3]:
# Build the feature matrix and label vector WITHOUT mutating `data`, so this
# cell can be re-run safely. (The previous in-place drop raised KeyError on a
# second run, and a second in-place label mapping would have turned every
# label into NaN.)
X = data.drop(columns=['ID', 'Diagnosis'], errors='ignore')  # 'ID' may already be gone
y = data['Diagnosis']
if not pd.api.types.is_numeric_dtype(y):
    # Labels are still the raw 'M'/'B' strings; encode them once.
    y = y.map({'M': 1, 'B': 0})  # Malignant = 1, Benign = 0

# Stratified 70/30 split keeps the class proportions equal in both partitions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Fit the scaler on the training rows only, then apply the same transform to
# the test rows, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Alternative preprocessing path: standardize, then keep the principal
# components that together explain 95% of the variance.
#
# `data` is not mutated, so this cell can be re-run and can be executed before
# or after the other preprocessing cell. (The previous in-place drop raised
# KeyError once 'ID' had already been removed.)
X = data.drop(columns=['ID', 'Diagnosis'], errors='ignore')  # 'ID' may already be gone
y = data['Diagnosis']
if not pd.api.types.is_numeric_dtype(y):
    # Labels are still the raw 'M'/'B' strings; encode them once.
    y = y.map({'M': 1, 'B': 0})  # Malignant = 1, Benign = 0

# Split BEFORE fitting the scaler and PCA. Fitting them on the full dataset
# lets test-set statistics shape the transform (data leakage) and makes the
# reported test accuracy optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

scaler = StandardScaler()
pca = PCA(n_components=0.95, random_state=42)
X_train = pca.fit_transform(scaler.fit_transform(X_train_raw))
X_test = pca.transform(scaler.transform(X_test_raw))

# Whole-dataset projections using the train-fitted transforms. Kept under
# their original names for inspection/plotting; do not train or evaluate on them.
X_scaled = scaler.transform(X)
X_pca = pca.transform(X_scaled)

print("Explained variance by each principal component:\n", pca.explained_variance_ratio_)
print("Number of principal components selected:", pca.n_components_)

## Fit and Evaluate

In [4]:
# Search space for the k-NN hyper-parameters.
# 'minkowski' is intentionally not listed: with KNeighborsClassifier's default
# p=2 it is the same distance as 'euclidean', so it only repeated every
# euclidean candidate and added a third more fits without changing the winner.
paramGrid = {
    'n_neighbors': np.arange(1, 31),
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

# Silence NumPy "invalid value" floating-point warnings only while the models
# run, instead of changing the error state for the whole kernel with np.seterr.
with np.errstate(invalid='ignore'):
    # 5-fold cross-validated search on the training split, selecting by accuracy.
    gridSearch = GridSearchCV(KNeighborsClassifier(), paramGrid, cv=5, scoring='accuracy')
    gridSearch.fit(X_train, y_train)

    bestParams = gridSearch.best_params_
    print(f'{bestParams}')

    # GridSearchCV (refit=True by default) has already refit the best
    # configuration on the full training split, so reuse that estimator
    # rather than fitting an identical model a second time.
    bestKnnModel = gridSearch.best_estimator_

    # Evaluate once on the held-out test split.
    yPredBest = bestKnnModel.predict(X_test)

accuracyBest = accuracy_score(y_test, yPredBest)
# target_names follow the label encoding: 0 -> Benign, 1 -> Malignant.
classificationRepBest = classification_report(y_test, yPredBest, target_names=['Benign', 'Malignant'])

print('\nEvaluation of the Best k-NN Model')
print('\nClassification Report:\n', classificationRepBest)
print(f'\nAccuracy: {accuracyBest:.4f}')

{'metric': 'manhattan', 'n_neighbors': np.int64(5), 'weights': 'uniform'}

Evaluation of the Best k-NN Model

Classification Report:
               precision    recall  f1-score   support

      Benign       0.96      1.00      0.98       107
   Malignant       1.00      0.92      0.96        64

    accuracy                           0.97       171
   macro avg       0.98      0.96      0.97       171
weighted avg       0.97      0.97      0.97       171


Accuracy: 0.9708
