<a href="https://colab.research.google.com/github/Otobi1/Predictors-of-Breast-Cancer-Recurrence/blob/master/Predictors_of_Cancer_Recurrence.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelBinarizer

import warnings
warnings.filterwarnings('ignore')

In [2]:
!wget http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer/breast-cancer.data

--2021-04-22 18:16:44--  http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer/breast-cancer.data
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 18654 (18K) [application/x-httpd-php]
Saving to: ‘breast-cancer.data.2’


2021-04-22 18:16:44 (286 KB/s) - ‘breast-cancer.data.2’ saved [18654/18654]



In [3]:
df = pd.read_csv('breast-cancer.data', sep=',', names=['RecClass', 'Age', 'Menopause',
                                                       'TumorSize', 'InvNodes', 'NodeCaps',
                                                       'DegMalig', 'Breast', 'Quadrant', 'Radiation']) 

# Shuffle data 
df = df.sample(frac = 1).reset_index(drop = True) 

# Make a copy of the data
data = df.copy()

In [4]:
data.head()

Unnamed: 0,RecClass,Age,Menopause,TumorSize,InvNodes,NodeCaps,DegMalig,Breast,Quadrant,Radiation
0,no-recurrence-events,30-39,premeno,20-24,0-2,no,3,left,central,no
1,recurrence-events,60-69,ge40,50-54,0-2,no,3,right,left_up,no
2,no-recurrence-events,60-69,ge40,10-14,0-2,no,1,right,left_low,no
3,recurrence-events,30-39,premeno,40-44,0-2,no,1,left,left_up,no
4,no-recurrence-events,50-59,ge40,15-19,0-2,no,1,right,central,no


In [5]:
data.shape

(286, 10)

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286 entries, 0 to 285
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   RecClass   286 non-null    object
 1   Age        286 non-null    object
 2   Menopause  286 non-null    object
 3   TumorSize  286 non-null    object
 4   InvNodes   286 non-null    object
 5   NodeCaps   286 non-null    object
 6   DegMalig   286 non-null    int64 
 7   Breast     286 non-null    object
 8   Quadrant   286 non-null    object
 9   Radiation  286 non-null    object
dtypes: int64(1), object(9)
memory usage: 22.5+ KB


In [7]:
data.keys()

Index(['RecClass', 'Age', 'Menopause', 'TumorSize', 'InvNodes', 'NodeCaps',
       'DegMalig', 'Breast', 'Quadrant', 'Radiation'],
      dtype='object')

In [8]:
data['Age'].unique()

array(['30-39', '60-69', '50-59', '40-49', '20-29', '70-79'], dtype=object)

In [9]:
data['Breast'].unique()

array(['left', 'right'], dtype=object)

In [10]:
data['DegMalig'].unique()

array([3, 1, 2])

In [11]:
data['InvNodes'].unique()

array(['0-2', '6-8', '3-5', '24-26', '15-17', '9-11', '12-14'],
      dtype=object)

In [12]:
data['Menopause'].unique()

array(['premeno', 'ge40', 'lt40'], dtype=object)

In [13]:
data['NodeCaps'].unique()

array(['no', 'yes', '?'], dtype=object)

In [14]:
data['Quadrant'].unique()

array(['central', 'left_up', 'left_low', 'right_up', '?', 'right_low'],
      dtype=object)

In [15]:
data['Radiation'].unique()

array(['no', 'yes'], dtype=object)

In [16]:
data['TumorSize'].unique()

array(['20-24', '50-54', '10-14', '40-44', '15-19', '25-29', '35-39',
       '30-34', '5-9', '0-4', '45-49'], dtype=object)

In [17]:
data['RecClass'].unique()

array(['no-recurrence-events', 'recurrence-events'], dtype=object)

In [18]:
data.keys()

Index(['RecClass', 'Age', 'Menopause', 'TumorSize', 'InvNodes', 'NodeCaps',
       'DegMalig', 'Breast', 'Quadrant', 'Radiation'],
      dtype='object')

In [19]:
# Converting the Age group variable into dummy variables and dropping the first column of the AgeGroup category

# Original unique Age group categories '60-69', '40-49', '30-39', '50-59', '70-79', '20-29' (20 - 29 AgeGroup is dropped to prevent Multicollinearity)

Age = pd.get_dummies(data['Age'], drop_first=True)

data = data.drop('Age', axis=1)

Age = Age.add_prefix('AgeGroup ')

data = pd.concat([data, Age], axis=1)

In [20]:
# Converting the Menopause variable into dummy variables and dropping the first column of the Menopause category

# Original unique Menopause categories 'ge40', 'premeno', 'lt40' (ge40 gets dropped)

Menopause = pd.get_dummies(data['Menopause'], drop_first=True)

data = data.drop('Menopause', axis=1)

Menopause = Menopause.add_prefix('Menopause ')

data = pd.concat([data, Menopause], axis=1)

In [21]:
# Converting the TumorSize variables into dummies and droping the first column 

# Original unique TumorSize categories '30-34', '15-19', '25-29', '20-24', '0-4', '35-39', '10-14','40-44', '50-54', '45-49', '5-9' (0-4 dropped)

TumorSize = pd.get_dummies(data['TumorSize'], drop_first=True)

data = data.drop('TumorSize', axis=1)

TumorSize = TumorSize.add_prefix('TumorSize ')

data = pd.concat([data, TumorSize], axis=1)

In [22]:
# Converting the InvNodes variables into dummies and droping the first column 

# Original unique InvNodes categories '0-2', '12-14', '3-5', '15-17', '6-8', '9-11', '24-26' (0-2 dropped)

InvNodes = pd.get_dummies(data['InvNodes'], drop_first=True)

data = data.drop('InvNodes', axis=1)

InvNodes = InvNodes.add_prefix('InvNodes ')

data = pd.concat([data, InvNodes], axis=1)

In [23]:
# Converting the NodeCaps variables into dummies and droping the first column 

# Original unique NodeCaps categories 'no', 'yes', '? (freaking ? dropped, nice)

NodeCaps = pd.get_dummies(data['NodeCaps'], drop_first=True)

data = data.drop('NodeCaps', axis=1)

NodeCaps = NodeCaps.add_prefix('NodeCaps ')

data = pd.concat([data, NodeCaps], axis=1)

In [24]:
# Converting the Breast variables into dummies and droping the first column 

# Original unique Breast categories 'left', 'right', (left dropped)

Breast = pd.get_dummies(data['Breast'], drop_first=True)

data = data.drop('Breast', axis=1)

Breast = Breast.add_prefix('Breast ')

data = pd.concat([data, Breast], axis=1)

In [25]:
# Converting the Quadrant variables into dummies and droping the first column 

# Original unique Quadrant categories 'left_low', 'right_low', 'central', 'left_up', 'right_up', '?' (freaking ? dropped again, nice)

Quadrant = pd.get_dummies(data['Quadrant'], drop_first=True)

data = data.drop('Quadrant', axis=1)

Quadrant = Quadrant.add_prefix('Quadrant ')

data = pd.concat([data, Quadrant], axis=1)

In [26]:
# Converting the Radiation variables into dummies and droping the first column 

# Original unique Radiation categories 'no', 'yes' (no dropped)

Radiation = pd.get_dummies(data['Radiation'], drop_first=True)

data = data.drop('Radiation', axis=1)

Radiation = Radiation.add_prefix('Radiation ')

data = pd.concat([data, Radiation], axis=1)

In [27]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286 entries, 0 to 285
Data columns (total 34 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   RecClass            286 non-null    object
 1   DegMalig            286 non-null    int64 
 2   AgeGroup 30-39      286 non-null    uint8 
 3   AgeGroup 40-49      286 non-null    uint8 
 4   AgeGroup 50-59      286 non-null    uint8 
 5   AgeGroup 60-69      286 non-null    uint8 
 6   AgeGroup 70-79      286 non-null    uint8 
 7   Menopause lt40      286 non-null    uint8 
 8   Menopause premeno   286 non-null    uint8 
 9   TumorSize 10-14     286 non-null    uint8 
 10  TumorSize 15-19     286 non-null    uint8 
 11  TumorSize 20-24     286 non-null    uint8 
 12  TumorSize 25-29     286 non-null    uint8 
 13  TumorSize 30-34     286 non-null    uint8 
 14  TumorSize 35-39     286 non-null    uint8 
 15  TumorSize 40-44     286 non-null    uint8 
 16  TumorSize 45-49     286 no

In [28]:
data.shape

(286, 34)

In [31]:
X = data._get_numeric_data()
y = data['RecClass']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 1919)

In [32]:
# Base Model 

y.value_counts(normalize=True)

# A base model that predicts no recurrence events would be correct 70% of the time

no-recurrence-events    0.702797
recurrence-events       0.297203
Name: RecClass, dtype: float64

In [34]:
pipe = Pipeline(steps=[('lr', LogisticRegression())]) # Running a pipeline of logistic regression 

params = {'lr__penalty': ['l1'], 
          'lr__C': [1], 
          'lr__solver': ['liblinear']} # setting parameters

gs_lr = GridSearchCV(pipe, param_grid=params, cv=5,scoring='accuracy', n_jobs=-2)

gs_lr.fit(X_train, y_train)
gs_lr.best_estimator_

Pipeline(memory=None,
         steps=[('lr',
                 LogisticRegression(C=1, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='auto', n_jobs=None,
                                    penalty='l1', random_state=None,
                                    solver='liblinear', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [35]:
# Scoring the model 

gs_lr.score(X_train, y_train)

0.780373831775701

In [36]:
# Cross validation scoring 

cross_val_score(gs_lr.best_estimator_, X, y, cv=5).mean()

0.709739866908651

In [37]:
# Scoring the test data

gs_lr.score(X_test, y_test)

# Inference: this model performs worse than the base model

0.6388888888888888

In [38]:
pipe = Pipeline(steps=[('sc', StandardScaler()), ('knn', KNeighborsClassifier())])

params = {'knn__n_neighbors': [21], 'knn__p': [1]}

gs_knn = GridSearchCV(pipe, param_grid=params, cv=5, scoring='accuracy')

gs_knn.fit(X_train, y_train)
gs_knn.best_estimator_

Pipeline(memory=None,
         steps=[('sc',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('knn',
                 KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                      metric='minkowski', metric_params=None,
                                      n_jobs=None, n_neighbors=21, p=1,
                                      weights='uniform'))],
         verbose=False)

In [39]:
# Scoring the model 

gs_knn.score(X_train, y_train)

0.7476635514018691

In [40]:
# Cross validation scoring 

cross_val_score(gs_knn.best_estimator_, X, y, cv=5).mean()

0.7026618269812462

In [42]:
# Scoring the test data

gs_knn.score(X_test, y_test)

# Inference: a bit worse than the base model

0.6944444444444444

In [None]:
# Decision Tree Classifer