In [2]:
import numpy as np #linear algebra
import pandas as pd #data processing

import matplotlib.pyplot as plt #data visualization
import seaborn as sns #data visualization

import warnings
warnings.filterwarnings("ignore") #to ignore the warnings

#for model building
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier


In [12]:
# Loading the data
breast_cancer= pd.read_csv('C:\\Users\\abhip\\Desktop\\breast-cancer_csv.csv')

In [13]:
breast_cancer.head()

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,Class
0,40-49,premeno,15-19,0-2,yes,3,right,left_up,no,recurrence-events
1,50-59,ge40,15-19,0-2,no,1,right,central,no,no-recurrence-events
2,50-59,ge40,35-39,0-2,no,2,left,left_low,no,recurrence-events
3,40-49,premeno,35-39,0-2,yes,3,right,left_low,yes,no-recurrence-events
4,40-49,premeno,30-34,3-5,yes,2,left,right_up,no,recurrence-events


In [14]:
breast_cancer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286 entries, 0 to 285
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   age          286 non-null    object
 1   menopause    286 non-null    object
 2   tumor-size   286 non-null    object
 3   inv-nodes    286 non-null    object
 4   node-caps    278 non-null    object
 5   deg-malig    286 non-null    int64 
 6   breast       286 non-null    object
 7   breast-quad  285 non-null    object
 8   irradiat     286 non-null    object
 9   Class        286 non-null    object
dtypes: int64(1), object(9)
memory usage: 22.5+ KB


In [15]:
#Cleanning data set and removing empty cells and duplicate rows as well
breast_cancer.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
281    False
282    False
283    False
284    False
285    False
Length: 286, dtype: bool

In [16]:
breast_cancer.drop_duplicates(inplace = True)

In [17]:
breast_cancer['node-caps'] = breast_cancer['node-caps'].map({'yes': 1, 'no' : 0})
breast_cancer['irradiat'] = breast_cancer['irradiat'].map({'yes': 1, 'no' : 0})
breast_cancer['breast'] = breast_cancer['breast'].map({'right': 1, 'left' : 0})

# Define a dictionary mapping age range to integers
age_range_mapping = {'20-29': 1, '30-39': 2, '40-49': 3, '50-59': 4,'59+' : 5}
# Change age column into map age range to integers
breast_cancer['age'] = breast_cancer.age.map(age_range_mapping)
breast_cancer

# change menopause column into map age range to integers
menopause_mapping = {'premeno' : 1, 'ge40' : 2, 'lt40' : 3}
breast_cancer['menopause'] = breast_cancer.menopause.map(menopause_mapping) 


# Define a dictionary mapping tumor-size range to integers
tumor_range_mapping = {'15-19' :4, '35-39' : 8, '30-34' : 7, '25-29' : 6, '40-44' : 9, '10-14' : 3, '0-4' : 1,
       '20-24' : 5, '45-49' : 10, '50-54' : 11, '5-9': 2}
# Change age column into map age range to integers
breast_cancer['tumor-size'] = breast_cancer['tumor-size'].map(tumor_range_mapping)


# Define a dictionary mapping inv-nodes range to integers
inv_range_mapping = {'0-2':1, '3-5':2, '15-17':6, '6-8':3, '9-11':4, '24-26':7, '12-14':5}
breast_cancer['inv-nodes'] = breast_cancer['inv-nodes'].map(inv_range_mapping)

# Define a dictionary mapping breast-quad sections into integers
breast_range_mapping = {'left_up' : 1, 'central' : 2, 'left_low' : 3, 'right_up' : 4, 'right_low' : 5}
breast_cancer['breast-quad'] = breast_cancer['breast-quad'].map(breast_range_mapping)
breast_cancer

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,Class
0,3.0,1,4,1,1.0,3,1,1.0,0,recurrence-events
1,4.0,2,4,1,0.0,1,1,2.0,0,no-recurrence-events
2,4.0,2,8,1,0.0,2,0,3.0,0,recurrence-events
3,3.0,1,8,1,1.0,3,1,3.0,1,no-recurrence-events
4,3.0,1,7,2,1.0,2,0,4.0,0,recurrence-events
...,...,...,...,...,...,...,...,...,...,...
281,4.0,2,7,3,1.0,2,0,3.0,0,no-recurrence-events
282,4.0,1,6,2,1.0,2,0,3.0,1,no-recurrence-events
283,2.0,1,7,3,1.0,2,1,4.0,0,no-recurrence-events
284,4.0,1,4,1,0.0,2,1,3.0,0,no-recurrence-events


In [18]:
breast_cancer.dropna(inplace = True)

In [19]:
breast_cancer.info()  # Total rows are 263 with no duplicate rows and no empty cells.

<class 'pandas.core.frame.DataFrame'>
Int64Index: 207 entries, 0 to 285
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          207 non-null    float64
 1   menopause    207 non-null    int64  
 2   tumor-size   207 non-null    int64  
 3   inv-nodes    207 non-null    int64  
 4   node-caps    207 non-null    float64
 5   deg-malig    207 non-null    int64  
 6   breast       207 non-null    int64  
 7   breast-quad  207 non-null    float64
 8   irradiat     207 non-null    int64  
 9   Class        207 non-null    object 
dtypes: float64(3), int64(6), object(1)
memory usage: 17.8+ KB


In [20]:
# 1. get the data ready
# create x (Features matrix)
x = breast_cancer.drop('Class',axis = 1)
# create y (labels)
y = breast_cancer.Class

In [21]:
#2. choose the right model & hyperparameters

from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
# we will keep the default hyperparameters

clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [22]:
#3. Fit the Model to the training data

from sklearn.model_selection import train_test_split

x_train, x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2)
clf.fit(x_train,y_train);

#make a prediction
y_preds = clf.predict(x_test)
y_preds

array(['recurrence-events', 'recurrence-events', 'no-recurrence-events',
       'recurrence-events', 'recurrence-events', 'no-recurrence-events',
       'no-recurrence-events', 'recurrence-events',
       'no-recurrence-events', 'no-recurrence-events',
       'no-recurrence-events', 'recurrence-events',
       'no-recurrence-events', 'no-recurrence-events',
       'no-recurrence-events', 'no-recurrence-events',
       'recurrence-events', 'no-recurrence-events',
       'no-recurrence-events', 'recurrence-events',
       'no-recurrence-events', 'no-recurrence-events',
       'no-recurrence-events', 'no-recurrence-events',
       'no-recurrence-events', 'no-recurrence-events',
       'recurrence-events', 'no-recurrence-events',
       'no-recurrence-events', 'no-recurrence-events',
       'no-recurrence-events', 'no-recurrence-events',
       'no-recurrence-events', 'no-recurrence-events',
       'recurrence-events', 'recurrence-events', 'no-recurrence-events',
       'no-recurrence-even

In [23]:
y_test

270       recurrence-events
74        recurrence-events
14        recurrence-events
43        recurrence-events
233    no-recurrence-events
5      no-recurrence-events
188    no-recurrence-events
256    no-recurrence-events
220    no-recurrence-events
102    no-recurrence-events
52     no-recurrence-events
271       recurrence-events
40        recurrence-events
275    no-recurrence-events
281    no-recurrence-events
99        recurrence-events
258    no-recurrence-events
107       recurrence-events
157       recurrence-events
96        recurrence-events
25     no-recurrence-events
213    no-recurrence-events
255       recurrence-events
158    no-recurrence-events
148    no-recurrence-events
141    no-recurrence-events
210       recurrence-events
223    no-recurrence-events
9      no-recurrence-events
277    no-recurrence-events
170       recurrence-events
206    no-recurrence-events
163    no-recurrence-events
279    no-recurrence-events
160    no-recurrence-events
122    no-recurrence

In [24]:
#4. Evalute the model on the training data & test data

clf.score(x_train,y_train)

0.9696969696969697

In [25]:
clf.score(x_test,y_test)

0.7142857142857143

In [26]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
print(classification_report(y_test,y_preds))

                      precision    recall  f1-score   support

no-recurrence-events       0.77      0.82      0.79        28
   recurrence-events       0.58      0.50      0.54        14

            accuracy                           0.71        42
           macro avg       0.68      0.66      0.67        42
        weighted avg       0.71      0.71      0.71        42



In [27]:
confusion_matrix(y_test,y_preds)

array([[23,  5],
       [ 7,  7]], dtype=int64)

In [28]:
accuracy_score(y_test,y_preds)

0.7142857142857143

In [29]:
#5. Improve a model
#Try different amount of n_estimators

np.random.seed(42)
for i in range(10,100,10):
    print(f'Trying model with {i} estimatore...')
    clf = RandomForestClassifier(n_estimators = i).fit(x_train,y_train)
    print(f'model accuracy on test set : {clf.score(x_test,y_test)*100:2f}%')
    print('')

Trying model with 10 estimatore...
model accuracy on test set : 73.809524%

Trying model with 20 estimatore...
model accuracy on test set : 78.571429%

Trying model with 30 estimatore...
model accuracy on test set : 76.190476%

Trying model with 40 estimatore...
model accuracy on test set : 78.571429%

Trying model with 50 estimatore...
model accuracy on test set : 69.047619%

Trying model with 60 estimatore...
model accuracy on test set : 76.190476%

Trying model with 70 estimatore...
model accuracy on test set : 71.428571%

Trying model with 80 estimatore...
model accuracy on test set : 73.809524%

Trying model with 90 estimatore...
model accuracy on test set : 69.047619%

