# Predicting Breast Cancer diagnosis

### Data Loading

In [18]:
import pandas as pd
import numpy as np

# Ten base measurement names; the file carries each one three times,
# as a "Mean ...", a "... SE" and a "Worst ..." column, in that order.
base_measurements = [
    'Radius', 'Texture', 'Perimeter', 'Area', 'Smoothness',
    'Compactness', 'Concavity', 'Concave Points', 'Symmetry', 'Fractal Dimension',
]

# Full column layout: identifier, target label, then the three feature groups.
columns = (
    ['ID', 'Diagnosis']
    + [f'Mean {name}' for name in base_measurements]
    + [f'{name} SE' for name in base_measurements]
    + [f'Worst {name}' for name in base_measurements]
)

# header=None: the first row of the file is read as data, so the column
# names are supplied explicitly.
wdbc_data = pd.read_csv('data/wdbc.data', names=columns, header=None)
wdbc_data.head()


Unnamed: 0,ID,Diagnosis,Mean Radius,Mean Texture,Mean Perimeter,Mean Area,Mean Smoothness,Mean Compactness,Mean Concavity,Mean Concave Points,...,Worst Radius,Worst Texture,Worst Perimeter,Worst Area,Worst Smoothness,Worst Compactness,Worst Concavity,Worst Concave Points,Worst Symmetry,Worst Fractal Dimension
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


### Data Preprocessing

In [19]:
from sklearn.preprocessing import StandardScaler

# Drop the ID column as it doesn't contribute to the model.
# errors='ignore' keeps this cell idempotent: re-running it after ID has
# already been removed no longer raises KeyError.
wdbc_data = wdbc_data.drop(columns=['ID'], errors='ignore')

# The Diagnosis column keeps its raw 'B' / 'M' labels; the mapping below is left disabled.
# wdbc_data['Diagnosis'] = wdbc_data['Diagnosis'].map({'B': 'Benign', 'M': 'Malignant'})

# Split features (X) and target (y). Selecting by column name instead of
# position means X never accidentally includes ID or Diagnosis.
X = wdbc_data.drop(columns=['Diagnosis'])  # all columns except Diagnosis
y = wdbc_data['Diagnosis']  # target column

# Standardize the features.
# NOTE(review): the scaler is fitted on the full dataset, and the train/test
# split is performed afterwards on X_scaled, so test-set statistics influence
# the scaling. Consider fitting the scaler on the training portion only.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


### Train/Test Split

In [20]:
from sklearn.model_selection import train_test_split

# 80% training, 20% testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Handling missing values in y_train (if any): replace NaNs with the most
# frequent label. The result is reassigned instead of using inplace=True, so
# the Series returned by the split is never mutated in place and re-running
# the cell always starts from a fresh split.
y_train = y_train.fillna(y_train.mode()[0])


## Filling missing values in y_test (currently disabled)

In [21]:
# NOTE(review): every line in this cell is commented out, so it performs no work.
# If it is re-enabled: the labels are strings ('B' / 'M'), and np.isnan will
# presumably raise TypeError on them; pd.isna would be the likely replacement.
# TODO confirm before reviving.

# from sklearn.impute import SimpleImputer

# # Check if there are any missing values in y_test
# print("Missing values in y_test before imputation:", np.isnan(y_test).sum())

# # Filling missing values in y_test using SimpleImputer (most frequent value strategy)
# y_test_reshaped = y_test.to_numpy().reshape(-1, 1)
# imputer = SimpleImputer(strategy='most_frequent')
# y_test_imputed = imputer.fit_transform(y_test_reshaped).ravel()

# # Reassign imputed values back to y_test
# y_test = y_test_imputed

# print("Missing values in y_test after imputation:", np.isnan(y_test).sum())


## Train the Gaussian Naive Bayes (GNB) Classifier

In [22]:
from sklearn.naive_bayes import GaussianNB
# Build the Gaussian Naive Bayes classifier and fit it on the training split
# in one step; fit() returns the estimator itself.
gnb = GaussianNB().fit(X_train, y_train)
gnb  # show the fitted estimator as the cell output


### Make Predictions

In [23]:
# Predict a diagnosis label for every sample in the held-out test split.
y_pred = gnb.predict(X_test)
# Bare last expression: the notebook displays the predicted label array.
y_pred

array(['B', 'M', 'M', 'B', 'B', 'M', 'M', 'M', 'M', 'B', 'B', 'M', 'B',
       'M', 'B', 'M', 'B', 'B', 'B', 'M', 'B', 'B', 'M', 'B', 'B', 'B',
       'B', 'B', 'B', 'M', 'B', 'B', 'B', 'B', 'B', 'B', 'M', 'B', 'M',
       'B', 'B', 'M', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'M', 'M',
       'B', 'B', 'B', 'B', 'B', 'M', 'M', 'B', 'B', 'M', 'M', 'B', 'B',
       'B', 'M', 'M', 'B', 'B', 'M', 'M', 'B', 'M', 'B', 'B', 'B', 'B',
       'B', 'B', 'M', 'B', 'B', 'M', 'M', 'M', 'M', 'M', 'B', 'B', 'B',
       'B', 'B', 'B', 'B', 'B', 'M', 'M', 'B', 'M', 'M', 'B', 'M', 'M',
       'B', 'B', 'B', 'M', 'B', 'B', 'M', 'B', 'B', 'M'], dtype='<U1')

## Evaluate the model

In [24]:
from sklearn.metrics import accuracy_score, classification_report

# Overall fraction of test samples whose predicted label matches the true one.
print("Accuracy: ", accuracy_score(y_test, y_pred))

# Per-class precision / recall / F1. The report is printed on its own, with no
# label prefix on the same line, so its header row stays aligned with the
# columns beneath it.
print("\n Classification Report: \n")
print(classification_report(y_test, y_pred))

Accuracy:  0.9649122807017544

 Classification Report: 

classification_report:                precision    recall  f1-score   support

           B       0.96      0.99      0.97        71
           M       0.98      0.93      0.95        43

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114

