<a href="https://colab.research.google.com/github/Samboja651/Portfolio/blob/main/comp_460_ML_breast_cancer_wisconsin.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

COMP 460 Machine Learning

19-11-2024\
**Breast Cancer Detection - ML Model**\
[dataset link](https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data)\
[Link to guiding research paper](https://doi.org/10.3390/diagnostics13193113)\
[Link to our documentation](https://docs.google.com/document/d/1TjcojrOeiRrUME_3lfcs7oW_EX08knQVnIqGkXdo97c/edit?usp=sharing)\
Alternatively, all the files are uploaded to this Colab notebook; see the Files section.

Team \
Isaack Leshan       **IN13/00112/21** \
Granton Waribe      **IN13/00036/21** \
Ezekiah Nyagwaya    **IN13/00037/21**


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import cross_val_score

In [None]:
# Load dataset

In [None]:
# Location of the dataset CSV inside the Colab session's file area.
DATA_PATH = '/content/Breast Cancer Wisconsin.csv'

# Read the raw data into a DataFrame and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Preprocessing, step 1: inspect the raw frame's columns, dtypes and non-null
# counts. The `id` and `Unnamed: 32` columns are dropped in the next cell.
df.info()

In [None]:
# Working copy of the data without the `id` and `Unnamed: 32` columns;
# the raw frame `df` is left untouched.
df1 = df.drop(columns=['id', 'Unnamed: 32'])

In [None]:
# Sanity check: (rows, columns) remaining after dropping the two columns.
df1.shape

(569, 31)

In [None]:
# Confirm the remaining columns, their dtypes and non-null counts.
df1.info()

In [None]:
# target feature: class distribution and label encoding

In [None]:
# Count how many samples fall into each diagnosis class and show the
# proportions as a pie chart.
target_distribution = df1['diagnosis'].value_counts()
print(target_distribution)

plt.figure(figsize=(4, 4))
plt.pie(
    target_distribution,
    labels=target_distribution.index,
    autopct='%1.1f%%',
    startangle=90,
)
plt.title("Distribution of Target Feature")
plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.show()

In [None]:
# Encode the target labels as integers: 'B' -> 0, 'M' -> 1
# (presumably benign / malignant -- confirm against the dataset description).
#
# The labels are read from the untouched raw frame `df` rather than from
# df1['diagnosis'] itself. Mapping the column onto itself is not idempotent:
# on a second run the values are already 0/1, match no key, and all become NaN.
df1['diagnosis'] = df['diagnosis'].map({'B': 0, 'M': 1})

# Any label other than 'B'/'M' would silently become NaN; fail loudly instead.
assert df1['diagnosis'].notna().all(), "unexpected label in 'diagnosis' column"


In [None]:
# Class counts after encoding; the index should now hold 0 and 1 instead of
# the original 'B' / 'M' labels.
df1['diagnosis'].value_counts()

In [None]:
# correlation matrix

In [None]:
# Pairwise correlation between all numeric columns (features and encoded
# target), rendered as an annotated heatmap.
corr_matrix = df1.corr(numeric_only=True)

plt.figure(figsize=(17, 10))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    linewidths=0.5,
)
plt.title("Correlation Heatmap of Features", size=15)
plt.show()

In [None]:
# There is very high collinearity: some features are closely related to one
# another, and some are strongly associated with the target feature, diagnosis.

In [None]:
# Correlation of every feature with the encoded diagnosis target, sorted from
# most positive to most negative.
corr_vector = (
    df1.corr(numeric_only=True)['diagnosis']
    .drop('diagnosis')  # self-correlation is always 1.0 and is not an "other" feature
    .sort_values(ascending=False)
)

# kind='barh' draws horizontal bars: feature names run along the y-axis and
# the correlation value along the x-axis, so the axis labels must match that.
corr_vector.plot(kind='barh', figsize=(10, 6), color='skyblue')
plt.title('Correlation of diagnosis with other Features')
plt.xlabel('Correlation')
plt.ylabel('Features')
plt.show()

In [None]:
# feature/target separation, standardisation and PCA (no class re-balancing is performed)

In [None]:
# Split the frame into the target vector and the feature matrix.
y = df1['diagnosis']
X = df1.drop(columns='diagnosis')  # every column except the target

In [None]:
# Standardise the features with StandardScaler and wrap the result back into
# a DataFrame that keeps the original column names.
# NOTE(review): the scaler is fit on all rows, before the train/test split
# further down, so test-set statistics influence the scaling.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
X_scaled_df = pd.DataFrame(data=X_scaled, columns=X.columns)


In [None]:
# Preview only the first rows of the scaled features instead of rendering
# the whole frame.
X_scaled_df.head()

In [None]:
# Project the standardised features onto the first 10 principal components.
# NOTE(review): like the scaler, PCA is fit on all rows before the train/test
# split, so the projection has seen the test samples.
pca = PCA(n_components=10).fit(X_scaled_df)
x_pca = pca.transform(X_scaled_df)

In [None]:
# Running total of the variance explained as components are added one by one.
cumulative_variance = pca.explained_variance_ratio_.cumsum()
print("\nCumulative Explained Variance:")
print(cumulative_variance)

In [None]:
# Sanity check: (samples, components) of the PCA-transformed feature matrix.
x_pca.shape

In [None]:
# data splitting test data = 20%

In [None]:
# Hold out 20% of the samples as the test set.
# random_state pins the shuffle so the split, and every metric reported below,
# is the same after Restart & Run All.
# stratify=y keeps the class proportions the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    x_pca,
    y,
    test_size=0.20,
    random_state=42,
    stratify=y,
)

In [None]:
# model selection

In [None]:
# Hyperparameter tuning: mean 10-fold cross-validated accuracy for K = 1..49.
# accuracy_rate[j] holds the score for K = j + 1.
#
# Cross-validation runs on the training split only. Using the full x_pca / y
# here would let the held-out test rows take part in choosing K, making the
# final test-set report optimistic.
# NOTE(review): x_pca itself comes from a scaler and PCA fit on the full
# dataset, so some leakage remains; fitting them inside each fold (e.g. via a
# Pipeline) would remove it.
accuracy_rate = []

# Will take some time
for k in range(1, 50):
    knn = KNeighborsClassifier(n_neighbors=k)
    score = cross_val_score(knn, X_train, y_train, cv=10)
    accuracy_rate.append(score.mean())

In [None]:
# Plot the cross-validated accuracy against K to see where it peaks.
# The x values are derived from len(accuracy_rate) (K starts at 1) so the plot
# stays in sync if the search range in the tuning cell changes.
k_values = range(1, len(accuracy_rate) + 1)

plt.figure(figsize=(10, 6))
plt.plot(k_values, accuracy_rate, color='blue', linestyle='dashed', marker='o',
         markerfacecolor='red', markersize=10)
plt.title('Accuracy vs. K Value')
plt.xlabel('K')
plt.ylabel('Accuracy')
plt.show()  # also suppresses the Text(...) repr of the last call

In [None]:
# Pick the K with the highest cross-validated accuracy instead of a hard-coded
# value. accuracy_rate[j] is the score for K = j + 1, hence the + 1.
# np.argmax returns the first maximum, so ties go to the smallest K.
best_k = int(np.argmax(accuracy_rate)) + 1
print(f"Selected K = {best_k}")

# Fit the final model on the training split and predict the held-out test set.
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train, y_train)
pred = knn.predict(X_test)

In [None]:
# Classification report for the test-set predictions.
report = classification_report(y_test, pred)
print(report)
