**Breast Cancer prediction using ML Approach on Wisconsin**

In [19]:
# Importing Libraries

import pandas as pd
import numpy as np

df = pd.read_csv("Breast_Cancer.csv")
df
df.info()

# df.info() is used to get all info about the data, we have diagnosis as TARGET and we are dropping ID

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

In [20]:
# Data Cleaning

df.drop(df.columns[[-1, 0]], axis = 1, inplace = True)
df
df['diagnosis'].value_counts()

Unnamed: 0_level_0,count
diagnosis,Unnamed: 1_level_1
B,357
M,212


In [21]:
# Feature Selection

from sklearn.model_selection import train_test_split

diag_map = {'M': 1, 'B': 0}
df['diagnosis'] = df['diagnosis'].map(diag_map) # We don't have to use OneHotEncoder here as there is only M and B two values so we just use map to change them into 0 and 1

X = df[['radius_mean', 'perimeter_mean', 'area_mean', 'concavity_mean', 'concave points_mean']] # These are the 5 features (Columns) that I am considering to use for prediction

y = df[['diagnosis']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [22]:
X

Unnamed: 0,radius_mean,perimeter_mean,area_mean,concavity_mean,concave points_mean
0,17.99,122.80,1001.0,0.30010,0.14710
1,20.57,132.90,1326.0,0.08690,0.07017
2,19.69,130.00,1203.0,0.19740,0.12790
3,11.42,77.58,386.1,0.24140,0.10520
4,20.29,135.10,1297.0,0.19800,0.10430
...,...,...,...,...,...
564,21.56,142.00,1479.0,0.24390,0.13890
565,20.13,131.20,1261.0,0.14400,0.09791
566,16.60,108.30,858.1,0.09251,0.05302
567,20.60,140.10,1265.0,0.35140,0.15200


In [23]:
y

Unnamed: 0,diagnosis
0,1
1,1
2,1
3,1
4,1
...,...
564,1
565,1
566,1
567,1


In [24]:
# Model - KNN

import warnings
from sklearn.neighbors import KNeighborsClassifier
from sklearn.exceptions import DataConversionWarning

warnings.filterwarnings(action='ignore', category=DataConversionWarning)

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

knn_y_pred = knn.predict(X_test)

from sklearn.metrics import accuracy_score
accuracy_score(knn_y_pred, y_test) # we get an accuracy of 92%

0.9210526315789473

In [25]:
# Few Complicated Stuff

new_df = df.drop(df.columns[[0]], axis = 1, inplace = True) # Dropping Diagnosis
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size = 0.2, random_state = 42) # Selecting all features not just 5, we get an accuracy of 95%

In [26]:
# Model - Logistic Regression

from sklearn.linear_model import LogisticRegression
warnings.filterwarnings(action='ignore', category=DataConversionWarning)
lr = LogisticRegression(random_state = 0)
lr.fit(X_train, y_train)
lr_y_pred = lr.predict(X_test)
accuracy_score(lr_y_pred, y_test) # 96% accuracy

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9649122807017544

In [27]:
# Model - Naive Bayes

from sklearn.naive_bayes import GaussianNB
warnings.filterwarnings(action='ignore', category=DataConversionWarning)
gnb = GaussianNB()
gnb.fit(X_train, y_train)
gnb_y_pred = gnb.predict(X_test)
accuracy_score(gnb_y_pred, y_test) # 97% accuracy

0.9736842105263158

In [28]:
# Model - K-Cross Validation (Comparing the models you trained)

from sklearn.model_selection import cross_val_score
accuracy_all =  []
cvs_all = []

In [29]:
# For KNN

warnings.filterwarnings(action='ignore', category=DataConversionWarning)
scores = cross_val_score(knn, X, y, cv = 10)
accuracy_all.append(accuracy_score(knn_y_pred, y_test))
cvs_all.append(np.mean(scores))
print("Accuracy: {0: .2%}".format(accuracy_score(knn_y_pred, y_test)))
print("Cross Validation Score: {0: .2%}(+/- {1:2%})".format(np.mean(scores),np.std(scores)*2))

Accuracy:  92.11%
Cross Validation Score:  88.23%(+/- 7.664790%)


In [30]:
# For Logistic Regression

warnings.filterwarnings(action='ignore', category=DataConversionWarning)
scores = cross_val_score(lr, X, y, cv = 10)
accuracy_all.append(accuracy_score(lr_y_pred, y_test))
cvs_all.append(np.mean(scores))
print("Accuracy: {0: .2%}".format(accuracy_score(lr_y_pred, y_test)))
print("Cross Validation Score: {0: .2%}(+/- {1:2%})".format(np.mean(scores),np.std(scores)*2))

Accuracy:  96.49%
Cross Validation Score:  89.46%(+/- 8.429738%)
