© Ritwik Chandra Pandey


2nd MSc (Maths) - Specialisation in CS

##IMPORTING LIBRARIES

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import cross_val_score, train_test_split, KFold, StratifiedShuffleSplit, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix,roc_auc_score, recall_score,precision_score,accuracy_score

##IMPORTING THE DATASET

In [2]:
from sklearn.datasets import load_breast_cancer
df = pd.DataFrame(load_breast_cancer()['data'], columns=load_breast_cancer()['feature_names'])
df['y'] = load_breast_cancer()['target']

## BASIC INFORMATION ABOUT THE DATASET

### Number of Null values in each column of df

In [3]:
pd.set_option('display.max_rows', None)
df.isnull().sum()

mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
y                          0
dtype: int64

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

In [5]:
#Describing the label to be predicted in train.scv
df['y'].describe()

count    569.000000
mean       0.627417
std        0.483918
min        0.000000
25%        0.000000
50%        1.000000
75%        1.000000
max        1.000000
Name: y, dtype: float64

In [6]:
#Getting the frequency of each value in df['y']
M = df['y'].value_counts()[0]
B = df['y'].value_counts()[1]
Total = B+M
print('Benign values in the dataset : ' + str(B))
print('Malignant values in the dataset : ' + str(M))
print('Percent of Benign values in the dataset: ' + str((B/Total)*100) + " %")
print('Percent of Malignant values in the dataset: ' + str((M/Total)*100) + " %")

Benign values in the dataset : 357
Malignant values in the dataset : 212
Percent of Benign values in the dataset: 62.741652021089635 %
Percent of Malignant values in the dataset: 37.258347978910365 %


Malignant is represented using 0 and Benign is represented using 1 in the dataset.

In [7]:
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,y
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


## DATA PREPROCESSING

### Feature Scaling

In [8]:
target = df['y']
df_dropped = df.drop('y', axis = 1)
sc = StandardScaler()
df_dropped = sc.fit_transform(df_dropped)

## SPLITTING THE DATASET

In [9]:
X, y = df_dropped, target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=118)

## HANDLING DATA IMBALANCE USING SMOTE

In [10]:
print("Before OverSampling, counts of label '1' - Benign: {}".format(sum(y_train==1)))
print("Before OverSampling, counts of label '0' - Malignant: {} \n".format(sum(y_train==0)))
print('Before OverSampling, the shape of train_X: {}'.format(X_train.shape))
print('Before OverSampling, the shape of train_y: {} \n'.format(y_train.shape))

sm = SMOTE(random_state=2)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train.ravel())

print('After OverSampling, the shape of train_X_res: {}'.format(X_train_res.shape))
print('After OverSampling, the shape of train_y_res: {} \n'.format(y_train_res.shape))

print("After OverSampling, counts of label '1'  - Benign: {}".format(sum(y_train_res==1)))
print("After OverSampling, counts of label '0' - Malignant: {}".format(sum(y_train_res==0)))

Before OverSampling, counts of label '1' - Benign: 246
Before OverSampling, counts of label '0' - Malignant: 152 

Before OverSampling, the shape of train_X: (398, 30)
Before OverSampling, the shape of train_y: (398,) 

After OverSampling, the shape of train_X_res: (492, 30)
After OverSampling, the shape of train_y_res: (492,) 

After OverSampling, counts of label '1'  - Benign: 246
After OverSampling, counts of label '0' - Malignant: 246


## CLASSIFICATION USING DECISION TREE

#### Without OverSampling

In [39]:
clf = DecisionTreeClassifier()

#Tuning the hyperparameters of the Decision tree model
dt_tuned_parameters = {'criterion':['gini','entropy'],'max_depth':[10,20,50,100],'min_samples_leaf':[10, 20, 50]}

#Fitting the model
cv_grid = GridSearchCV(clf, param_grid = dt_tuned_parameters, scoring = 'roc_auc', verbose = 5) 
cv_grid.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 1/5] END criterion=gini, max_depth=10, min_samples_leaf=10;, score=0.966 total time=   0.0s
[CV 2/5] END criterion=gini, max_depth=10, min_samples_leaf=10;, score=0.958 total time=   0.0s
[CV 3/5] END criterion=gini, max_depth=10, min_samples_leaf=10;, score=0.980 total time=   0.0s
[CV 4/5] END criterion=gini, max_depth=10, min_samples_leaf=10;, score=0.944 total time=   0.0s
[CV 5/5] END criterion=gini, max_depth=10, min_samples_leaf=10;, score=0.951 total time=   0.0s
[CV 1/5] END criterion=gini, max_depth=10, min_samples_leaf=20;, score=0.933 total time=   0.0s
[CV 2/5] END criterion=gini, max_depth=10, min_samples_leaf=20;, score=0.945 total time=   0.0s
[CV 3/5] END criterion=gini, max_depth=10, min_samples_leaf=20;, score=0.976 total time=   0.0s
[CV 4/5] END criterion=gini, max_depth=10, min_samples_leaf=20;, score=0.956 total time=   0.0s
[CV 5/5] END criterion=gini, max_depth=10, min_samples_leaf=20;, score=0.9

GridSearchCV(estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [10, 20, 50, 100],
                         'min_samples_leaf': [10, 20, 50]},
             scoring='roc_auc', verbose=5)

In [40]:
cv_grid.cv_results_

{'mean_fit_time': array([0.00554209, 0.00501809, 0.00436139, 0.01637173, 0.01603966,
        0.01729293, 0.01212626, 0.00714569, 0.00426068, 0.0063693 ,
        0.0051589 , 0.00422792, 0.00786672, 0.00696802, 0.00657454,
        0.00640774, 0.00660801, 0.00537214, 0.01530175, 0.02157583,
        0.02635627, 0.02514181, 0.01650834, 0.00507154]),
 'std_fit_time': array([0.00025605, 0.000148  , 0.00016809, 0.00673807, 0.00077197,
        0.00628745, 0.00761601, 0.00296657, 0.00047773, 0.00069622,
        0.00022979, 0.00010768, 0.00133557, 0.00028472, 0.0018627 ,
        0.00030832, 0.00043539, 0.00017637, 0.01732225, 0.00214693,
        0.00814322, 0.00675629, 0.00790192, 0.00046063]),
 'mean_score_time': array([0.00178471, 0.00165973, 0.00169415, 0.00656853, 0.00469975,
        0.00457921, 0.00662012, 0.00270371, 0.00658174, 0.00195689,
        0.00163269, 0.00158682, 0.00181365, 0.00186367, 0.00191536,
        0.00182743, 0.00177984, 0.00173469, 0.00302873, 0.01416206,
        0.005155

In [41]:
best_parameters = cv_grid.best_estimator_.get_params()
best_parameters

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 100,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 20,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

#### With OverSampling

In [42]:
clf = DecisionTreeClassifier()

#Tuning the hyperparameters of the Decision tree model
dt_tuned_parameters = {'criterion':['gini','entropy'],'max_depth':[10,20,50,100],'min_samples_leaf':[10, 20, 50]}

#Fitting the model
cv_grid_oversampled = GridSearchCV(clf, param_grid = dt_tuned_parameters, scoring = 'roc_auc', verbose = 5) 
cv_grid_oversampled.fit(X_train_res, y_train_res)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 1/5] END criterion=gini, max_depth=10, min_samples_leaf=10;, score=0.952 total time=   0.0s
[CV 2/5] END criterion=gini, max_depth=10, min_samples_leaf=10;, score=0.963 total time=   0.0s
[CV 3/5] END criterion=gini, max_depth=10, min_samples_leaf=10;, score=0.994 total time=   0.0s
[CV 4/5] END criterion=gini, max_depth=10, min_samples_leaf=10;, score=0.984 total time=   0.0s
[CV 5/5] END criterion=gini, max_depth=10, min_samples_leaf=10;, score=0.966 total time=   0.0s
[CV 1/5] END criterion=gini, max_depth=10, min_samples_leaf=20;, score=0.953 total time=   0.0s
[CV 2/5] END criterion=gini, max_depth=10, min_samples_leaf=20;, score=0.971 total time=   0.0s
[CV 3/5] END criterion=gini, max_depth=10, min_samples_leaf=20;, score=0.993 total time=   0.0s
[CV 4/5] END criterion=gini, max_depth=10, min_samples_leaf=20;, score=0.962 total time=   0.0s
[CV 5/5] END criterion=gini, max_depth=10, min_samples_leaf=20;, score=0.9

GridSearchCV(estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [10, 20, 50, 100],
                         'min_samples_leaf': [10, 20, 50]},
             scoring='roc_auc', verbose=5)

In [43]:
cv_grid_oversampled.cv_results_

{'mean_fit_time': array([0.00660648, 0.00508876, 0.00393057, 0.00553107, 0.00557461,
        0.0040194 , 0.00567784, 0.00515876, 0.00391397, 0.00567999,
        0.00525708, 0.00413475, 0.00753341, 0.00673981, 0.00519757,
        0.00805235, 0.00829859, 0.00502181, 0.00761256, 0.00655208,
        0.00531836, 0.00770679, 0.00676827, 0.00522814]),
 'std_fit_time': array([2.43799813e-03, 2.46418279e-04, 1.43377409e-04, 1.80076970e-04,
        7.10815613e-04, 5.92951380e-04, 2.69150378e-04, 1.98671419e-04,
        1.35079189e-04, 2.96807771e-04, 4.28912609e-04, 3.22763728e-04,
        5.88626388e-04, 3.07218207e-04, 2.21484695e-04, 1.01811271e-03,
        1.43406314e-03, 4.30591531e-04, 5.23937326e-04, 3.21377371e-04,
        5.54884921e-04, 5.81522154e-04, 2.71869717e-04, 8.57370714e-05]),
 'mean_score_time': array([0.00146341, 0.00146527, 0.00127444, 0.00135245, 0.00183411,
        0.0014905 , 0.00136781, 0.00138226, 0.00135126, 0.00136008,
        0.00150051, 0.00163426, 0.00151939, 0.00

In [44]:
best_parameters = cv_grid_oversampled.best_estimator_.get_params()
best_parameters

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 50,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 20,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

In [45]:
pred_test_dt_tuned = cv_grid_oversampled.predict(X_test)
print("recall score on validation data: " + str(recall_score(y_test, pred_test_dt_tuned)))     
print("precision score on validation data: " + str(precision_score(y_test, pred_test_dt_tuned)))  
print("roc auc score on validation data: " + str(roc_auc_score(y_test, pred_test_dt_tuned)))  
print("accuracy score on validation data: " + str(accuracy_score(y_test, pred_test_dt_tuned)))  
print("confusion matrix on validation data: \n" + str(confusion_matrix(y_test, pred_test_dt_tuned)))
print('\n\n\n')
pred_test_dt_tuned_oversampled = cv_grid.predict(X_test)
print("recall score on validation data (with oversampling): " + str(recall_score(y_test, pred_test_dt_tuned_oversampled)))     
print("precision score on validation data (with oversampling): " + str(precision_score(y_test, pred_test_dt_tuned_oversampled)))  
print("roc auc score on validation data (with oversampling): " + str(roc_auc_score(y_test, pred_test_dt_tuned_oversampled)))  
print("accuracy score on validation data (with oversampling): " + str(accuracy_score(y_test, pred_test_dt_tuned_oversampled)))  
print("confusion matrix on validation data (with oversampling): \n" + str(confusion_matrix(y_test, pred_test_dt_tuned_oversampled)))

recall score on validation data: 0.954954954954955
precision score on validation data: 0.9636363636363636
roc auc score on validation data: 0.9441441441441442
accuracy score on validation data: 0.9473684210526315
confusion matrix on validation data: 
[[ 56   4]
 [  5 106]]




recall score on validation data (with oversampling): 0.9819819819819819
precision score on validation data (with oversampling): 0.9478260869565217
roc auc score on validation data (with oversampling): 0.940990990990991
accuracy score on validation data (with oversampling): 0.9532163742690059
confusion matrix on validation data (with oversampling): 
[[ 54   6]
 [  2 109]]


##### It is noted that handling data imbalance gives slightly better results.

## CLASSIFICATION USING K-NEAREST NEIGHBOR

#### Without OverSampling

#### Model Fitting for K = 3

In [38]:
model3 = KNeighborsClassifier(n_neighbors=3)
model3.fit(X_train,y_train)

KNeighborsClassifier(n_neighbors=3)

#### Model Fitting for K = 5

In [46]:
model5 = KNeighborsClassifier(n_neighbors=5)
model5.fit(X_train,y_train)

KNeighborsClassifier()

#### Model Fitting for K = 7

In [47]:
model7 = KNeighborsClassifier(n_neighbors=7)
model7.fit(X_train,y_train)

KNeighborsClassifier(n_neighbors=7)

####With OverSampling
#### Model Fitting for K = 3

In [21]:
model3_os = KNeighborsClassifier(n_neighbors=3)
model3_os.fit(X_train_res,y_train_res)

KNeighborsClassifier(n_neighbors=3)

#### Model Fitting for K = 5

In [48]:
model5_os = KNeighborsClassifier(n_neighbors=5)
model5_os.fit(X_train_res,y_train_res)

KNeighborsClassifier()

#### Model Fitting for K = 7

In [49]:
model7_os = KNeighborsClassifier(n_neighbors=7)
model7_os.fit(X_train_res,y_train_res)

KNeighborsClassifier(n_neighbors=7)

#####Comparing all results of KNN models

In [50]:
#Predict Output
pred_test_knn_3 = model3.predict(X_test)
print("recall score on validation data k = 3: " + str(recall_score(y_test, pred_test_knn_3)))     
print("precision score on validation data k = 3: " + str(precision_score(y_test, pred_test_knn_3)))  
print("roc auc score on validation data k = 3: " + str(roc_auc_score(y_test, pred_test_knn_3)))    
print("accuracy score on validation data k = 3: " + str(accuracy_score(y_test, pred_test_knn_3)))
print("confusion matrix on validation data k = 3: \n" + str(confusion_matrix(y_test, pred_test_knn_3)))
#Predict Output
pred_test_knn_3_os = model3_os.predict(X_test)
print("recall score on validation data k = 3 (with oversampling): " + str(recall_score(y_test, pred_test_knn_3_os)))     
print("precision score on validation data k = 3 (with oversampling): " + str(precision_score(y_test, pred_test_knn_3_os)))  
print("roc auc score on validation data k = 3 (with oversampling): " + str(roc_auc_score(y_test, pred_test_knn_3_os)))    
print("accuracy score on validation data k = 3 (with oversampling): " + str(accuracy_score(y_test, pred_test_knn_3_os)))
print("confusion matrix on validation data k = 3 (with oversampling): \n" + str(confusion_matrix(y_test, pred_test_knn_3_os)))
print('\n\n\n')

#Predict Output
pred_test_knn_5 = model5.predict(X_test)
print("recall score on validation data k = 5: " + str(recall_score(y_test, pred_test_knn_5)))     
print("precision score on validation data k = 5: " + str(precision_score(y_test, pred_test_knn_5)))  
print("roc auc score on validation data k = 5: " + str(roc_auc_score(y_test, pred_test_knn_5)))    
print("accuracy score on validation data k = 5: " + str(accuracy_score(y_test, pred_test_knn_5)))
print("confusion matrix on validation data k = 5: \n" + str(confusion_matrix(y_test, pred_test_knn_5)))
#Predict Output
pred_test_knn_5_os = model5_os.predict(X_test)
print("recall score on validation data k = 5 (with oversampling): " + str(recall_score(y_test, pred_test_knn_5_os)))     
print("precision score on validation data k = 5 (with oversampling): " + str(precision_score(y_test, pred_test_knn_5_os)))  
print("roc auc score on validation data k = 5 (with oversampling): " + str(roc_auc_score(y_test, pred_test_knn_5_os)))    
print("accuracy score on validation data k = 5 (with oversampling): " + str(accuracy_score(y_test, pred_test_knn_5_os)))
print("confusion matrix on validation data k = 5 (with oversampling): \n" + str(confusion_matrix(y_test, pred_test_knn_5_os)))
print('\n\n\n')

#Predict Output
pred_test_knn_7 = model7.predict(X_test)
print("recall score on validation data k = 7: " + str(recall_score(y_test, pred_test_knn_7)))     
print("precision score on validation data k = 7: " + str(precision_score(y_test, pred_test_knn_7)))  
print("roc auc score on validation data k = 7: " + str(roc_auc_score(y_test, pred_test_knn_7))) 
print("accuracy score on validation data k = 7: " + str(accuracy_score(y_test, pred_test_knn_7)))   
print("confusion matrix on validation data k = 7: \n" + str(confusion_matrix(y_test, pred_test_knn_7)))
#Predict Output
pred_test_knn_7_os = model7_os.predict(X_test)
print("recall score on validation data k = 7 (with oversampling): " + str(recall_score(y_test, pred_test_knn_7_os)))     
print("precision score on validation data k = 7 (with oversampling): " + str(precision_score(y_test, pred_test_knn_7_os)))  
print("roc auc score on validation data k = 7 (with oversampling): " + str(roc_auc_score(y_test, pred_test_knn_7_os)))    
print("accuracy score on validation data k = 7 (with oversampling): " + str(accuracy_score(y_test, pred_test_knn_7_os)))
print("confusion matrix on validation data k = 7 (with oversampling): \n" + str(confusion_matrix(y_test, pred_test_knn_7_os)))

recall score on validation data k = 3: 0.990990990990991
precision score on validation data k = 3: 0.9649122807017544
roc auc score on validation data k = 3: 0.9621621621621622
accuracy score on validation data k = 3: 0.9707602339181286
confusion matrix on validation data k = 3: 
[[ 56   4]
 [  1 110]]
recall score on validation data k = 3 (with oversampling): 0.972972972972973
precision score on validation data k = 3 (with oversampling): 0.9818181818181818
roc auc score on validation data k = 3 (with oversampling): 0.9698198198198198
accuracy score on validation data k = 3 (with oversampling): 0.9707602339181286
confusion matrix on validation data k = 3 (with oversampling): 
[[ 58   2]
 [  3 108]]




recall score on validation data k = 5: 0.990990990990991
precision score on validation data k = 5: 0.9649122807017544
roc auc score on validation data k = 5: 0.9621621621621622
accuracy score on validation data k = 5: 0.9707602339181286
confusion matrix on validation data k = 5: 
[[ 56  

It is noted that oversampling here does not help much.



## CLASSIFICATION USING GRADIENT BOOSTING MACHINE (GBM)

#### Without OverSampling

In [25]:
#Learning rate = 0.1
gradient_booster_1 = GradientBoostingClassifier(learning_rate=0.1)
gradient_booster_1.fit(X_train,y_train)
pred_test_gb_1 = gradient_booster_1.predict(X_test)

In [26]:
#Learning rate = 0.2
gradient_booster_2 = GradientBoostingClassifier(learning_rate=0.2)
gradient_booster_2.fit(X_train,y_train)
pred_test_gb_2 = gradient_booster_2.predict(X_test)

In [27]:
#Learning rate = 0.3
gradient_booster_3 = GradientBoostingClassifier(learning_rate=0.3)
gradient_booster_3.fit(X_train,y_train)
pred_test_gb_3 = gradient_booster_3.predict(X_test)

#### With OverSampling

In [51]:
#Learning rate = 0.1
gradient_booster_os_1 = GradientBoostingClassifier(learning_rate=0.1)
gradient_booster_os_1.fit(X_train_res,y_train_res)
pred_test_gb_os_1 = gradient_booster_os_1.predict(X_test)

In [52]:
#Learning rate = 0.2
gradient_booster_os_2 = GradientBoostingClassifier(learning_rate=0.2)
gradient_booster_os_2.fit(X_train_res,y_train_res)
pred_test_gb_os_2 = gradient_booster_os_2.predict(X_test)

In [53]:
#Learning rate = 0.3
gradient_booster_os_3= GradientBoostingClassifier(learning_rate=0.3)
gradient_booster_os_3.fit(X_train_res,y_train_res)
pred_test_gb_os_3 = gradient_booster_os_3.predict(X_test)

In [54]:
print("recall score on validation data (learning rate = 0.1): " + str(recall_score(y_test, pred_test_gb_1)))     
print("precision score on validation data (learning rate = 0.1): " + str(precision_score(y_test, pred_test_gb_1)))  
print("roc auc score on validation data (learning rate = 0.1): " + str(roc_auc_score(y_test, pred_test_gb_1)))  
print("accuracy score on validation data (learning rate = 0.1): " + str(accuracy_score(y_test, pred_test_gb_1)))  
print("confusion matrix on validation data (learning rate = 0.1): \n" + str(confusion_matrix(y_test, pred_test_gb_1)))

print("recall score on validation data (learning rate = 0.1 -> with oversampling): " + str(recall_score(y_test, pred_test_gb_os_1)))     
print("precision score on validation data (learning rate = 0.1 -> with oversampling): " + str(precision_score(y_test, pred_test_gb_os_1)))  
print("roc auc score on validation data (learning rate = 0.1 -> with oversampling): " + str(roc_auc_score(y_test, pred_test_gb_os_1)))  
print("accuracy score on validation data (learning rate = 0.1 -> with oversampling): " + str(accuracy_score(y_test, pred_test_gb_os_1)))  
print("confusion matrix on validation data (learning rate = 0.1 -> with oversampling): \n" + str(confusion_matrix(y_test, pred_test_gb_os_1)))

print('\n\n\n')

print("recall score on validation data (learning rate = 0.2): " + str(recall_score(y_test, pred_test_gb_2)))     
print("precision score on validation data (learning rate = 0.2): " + str(precision_score(y_test, pred_test_gb_2)))  
print("roc auc score on validation data (learning rate = 0.2): " + str(roc_auc_score(y_test, pred_test_gb_2)))  
print("accuracy score on validation data (learning rate = 0.2): " + str(accuracy_score(y_test, pred_test_gb_2)))  
print("confusion matrix on validation data (learning rate = 0.2): \n" + str(confusion_matrix(y_test, pred_test_gb_2)))

print("recall score on validation data (learning rate = 0.2 -> with oversampling): " + str(recall_score(y_test, pred_test_gb_os_2)))     
print("precision score on validation data (learning rate = 0.2 -> with oversampling): " + str(precision_score(y_test, pred_test_gb_os_2)))  
print("roc auc score on validation data (learning rate = 0.2 -> with oversampling): " + str(roc_auc_score(y_test, pred_test_gb_os_2)))  
print("accuracy score on validation data (learning rate = 0.2 -> with oversampling): " + str(accuracy_score(y_test, pred_test_gb_os_2)))  
print("confusion matrix on validation data (learning rate = 0.2 -> with oversampling): \n" + str(confusion_matrix(y_test, pred_test_gb_os_2)))

print('\n\n\n')

print("recall score on validation data (learning rate = 0.3): " + str(recall_score(y_test, pred_test_gb_3)))     
print("precision score on validation data (learning rate = 0.3): " + str(precision_score(y_test, pred_test_gb_3)))  
print("roc auc score on validation data (learning rate = 0.3): " + str(roc_auc_score(y_test, pred_test_gb_3)))  
print("accuracy score on validation data (learning rate = 0.3): " + str(accuracy_score(y_test, pred_test_gb_3)))  
print("confusion matrix on validation data (learning rate = 0.3): \n" + str(confusion_matrix(y_test, pred_test_gb_3)))

print("recall score on validation data (learning rate = 0.3 -> with oversampling): " + str(recall_score(y_test, pred_test_gb_os_3)))     
print("precision score on validation data (learning rate = 0.3 -> with oversampling): " + str(precision_score(y_test, pred_test_gb_os_3)))  
print("roc auc score on validation data (learning rate = 0.3 -> with oversampling): " + str(roc_auc_score(y_test, pred_test_gb_os_3)))  
print("accuracy score on validation data (learning rate = 0.3 -> with oversampling): " + str(accuracy_score(y_test, pred_test_gb_os_3)))  
print("confusion matrix on validation data (learning rate = 0.3 -> with oversampling): \n" + str(confusion_matrix(y_test, pred_test_gb_os_3)))

recall score on validation data (learning rate = 0.1): 0.990990990990991
precision score on validation data (learning rate = 0.1): 0.9649122807017544
roc auc score on validation data (learning rate = 0.1): 0.9621621621621622
accuracy score on validation data (learning rate = 0.1): 0.9707602339181286
confusion matrix on validation data (learning rate = 0.1): 
[[ 56   4]
 [  1 110]]
recall score on validation data (learning rate = 0.1 -> with oversampling): 0.990990990990991
precision score on validation data (learning rate = 0.1 -> with oversampling): 0.9649122807017544
roc auc score on validation data (learning rate = 0.1 -> with oversampling): 0.9621621621621622
accuracy score on validation data (learning rate = 0.1 -> with oversampling): 0.9707602339181286
confusion matrix on validation data (learning rate = 0.1 -> with oversampling): 
[[ 56   4]
 [  1 110]]




recall score on validation data (learning rate = 0.2): 0.990990990990991
precision score on validation data (learning rate 

It is noted that oversampling here does not help much.

## CLASSIFICATION USING RANDOM FOREST

####Without OverSampling

In [32]:
estimator = RandomForestClassifier(random_state=0, warm_start = True)
estimator.fit(X_train,y_train)
pred_test_rf = estimator.predict(X_test)

####With OverSampling

In [55]:
estimator_os = RandomForestClassifier(random_state=0, warm_start = True)
estimator_os.fit(X_train_res,y_train_res)
pred_test_rf_os = estimator_os.predict(X_test)

In [56]:
print("recall score on validation data: " + str(recall_score(y_test, pred_test_rf)))     
print("precision score on validation data: " + str(precision_score(y_test, pred_test_rf)))  
print("roc auc score on validation data: " + str(roc_auc_score(y_test, pred_test_rf)))  
print("accuracy score on validation data: " + str(accuracy_score(y_test, pred_test_rf)))  
print("confusion matrix on validation data: \n" + str(confusion_matrix(y_test, pred_test_rf)))
print('\n\n\n')
print("recall score on validation data (with oversampling): " + str(recall_score(y_test, pred_test_rf_os)))     
print("precision score on validation data (with oversampling): " + str(precision_score(y_test, pred_test_rf_os)))  
print("roc auc score on validation data (with oversampling): " + str(roc_auc_score(y_test, pred_test_rf_os)))  
print("accuracy score on validation data (with oversampling): " + str(accuracy_score(y_test, pred_test_rf_os)))  
print("confusion matrix on validation data (with oversampling): \n" + str(confusion_matrix(y_test, pred_test_rf_os)))

recall score on validation data: 0.990990990990991
precision score on validation data: 0.9565217391304348
roc auc score on validation data: 0.9538288288288288
accuracy score on validation data: 0.9649122807017544
confusion matrix on validation data: 
[[ 55   5]
 [  1 110]]




recall score on validation data (with oversampling): 0.990990990990991
precision score on validation data (with oversampling): 0.9734513274336283
roc auc score on validation data (with oversampling): 0.9704954954954955
accuracy score on validation data (with oversampling): 0.9766081871345029
confusion matrix on validation data (with oversampling): 
[[ 57   3]
 [  1 110]]


It is noted that oversampling improves the predictions to a marginal extent.

### COMPARING ALL MODELS

In [35]:
df_recall = pd.DataFrame(data = [recall_score(y_test, pred_test_dt_tuned),recall_score(y_test, pred_test_knn_3)\
                                 , recall_score(y_test, pred_test_knn_5),\
                             recall_score(y_test, pred_test_knn_7),\
                             recall_score(y_test, pred_test_gb_1),recall_score(y_test, pred_test_gb_2)\
                             ,recall_score(y_test, pred_test_gb_3),\
                             recall_score(y_test, pred_test_rf)], \
                     index = ['Decision Tree', 'KNN (K = 3)'\
                              ,'KNN (K = 5)','KNN (K = 7)','GB (LR = 0.1)','GB (LR = 0.2)','GB (LR = 0.3)',\
                              'Random Forest'],
                     columns = ['Recall Score'])
df_roc = pd.DataFrame(data = [roc_auc_score(y_test, pred_test_dt_tuned),roc_auc_score(y_test, pred_test_knn_3)\
                                 , roc_auc_score(y_test, pred_test_knn_5),\
                             roc_auc_score(y_test, pred_test_knn_7),\
                             roc_auc_score(y_test, pred_test_gb_1),roc_auc_score(y_test, pred_test_gb_2)\
                             ,roc_auc_score(y_test, pred_test_gb_3),\
                             roc_auc_score(y_test, pred_test_rf)], \
                     index = ['Decision Tree', 'KNN (K = 3)'\
                              ,'KNN (K = 5)','KNN (K = 7)','GB (LR = 0.1)','GB (LR = 0.2)','GB (LR = 0.3)',\
                              'Random Forest'],
                     columns = ['ROC AUC Score'])
df_precision = pd.DataFrame(data = [precision_score(y_test, pred_test_dt_tuned),precision_score(y_test, pred_test_knn_3)\
                                 , precision_score(y_test, pred_test_knn_5),\
                             precision_score(y_test, pred_test_knn_7),\
                             precision_score(y_test, pred_test_gb_1),precision_score(y_test, pred_test_gb_2)\
                             ,precision_score(y_test, pred_test_gb_3),\
                             precision_score(y_test, pred_test_rf)], \
                     index = ['Decision Tree', 'KNN (K = 3)'\
                              ,'KNN (K = 5)','KNN (K = 7)','GB (LR = 0.1)','GB (LR = 0.2)','GB (LR = 0.3)',\
                              'Random Forest'],
                     columns = ['Precision Score'])
df_accuracy =  pd.DataFrame(data = [accuracy_score(y_test, pred_test_dt_tuned),accuracy_score(y_test, pred_test_knn_3)\
                                 , accuracy_score(y_test, pred_test_knn_5),\
                             accuracy_score(y_test, pred_test_knn_7),\
                             accuracy_score(y_test, pred_test_gb_1),accuracy_score(y_test, pred_test_gb_2)\
                             ,accuracy_score(y_test, pred_test_gb_3),\
                             accuracy_score(y_test, pred_test_rf)], \
                     index = ['Decision Tree', 'KNN (K = 3)'\
                              ,'KNN (K = 5)','KNN (K = 7)','GB (LR = 0.1)','GB (LR = 0.2)','GB (LR = 0.3)',\
                              'Random Forest'],
                     columns = ['Accuracy Score'])

In [36]:
df_recall_os =  pd.DataFrame(data = [accuracy_score(y_test, pred_test_dt_tuned_oversampled),recall_score(y_test, pred_test_knn_3_os)\
                                 , recall_score(y_test, pred_test_knn_5_os),\
                             accuracy_score(y_test, pred_test_knn_7_os),\
                             recall_score(y_test, pred_test_gb_os_1),recall_score(y_test, pred_test_gb_os_3)\
                             ,recall_score(y_test, pred_test_gb_os_3),\
                             recall_score(y_test, pred_test_rf_os)], \
                     index = ['Decision Tree (OverSampled)', 'KNN (K = 3) (OverSampled)'\
                              ,'KNN (K = 5) (OverSampled)','KNN (K = 7) (OverSampled)','GB (LR = 0.1) (OverSampled)',\
                              'GB (LR = 0.2) (OverSampled)','GB (LR = 0.3) (OverSampled)',\
                              'Random Forest (OverSampled)'],
                     columns = ['Recall Score'])

df_roc_os = pd.DataFrame(data = [roc_auc_score(y_test, pred_test_dt_tuned_oversampled),roc_auc_score(y_test, pred_test_knn_3_os)\
                                 , roc_auc_score(y_test, pred_test_knn_5_os),\
                             roc_auc_score(y_test, pred_test_knn_7_os),\
                             roc_auc_score(y_test, pred_test_gb_os_1),roc_auc_score(y_test, pred_test_gb_os_2)\
                             ,roc_auc_score(y_test, pred_test_gb_os_3),\
                             roc_auc_score(y_test, pred_test_rf_os)], \
                     index = ['Decision Tree (OverSampled)', 'KNN (K = 3) (OverSampled)'\
                              ,'KNN (K = 5) (OverSampled)','KNN (K = 7) (OverSampled)','GB (LR = 0.1) (OverSampled)',\
                              'GB (LR = 0.2) (OverSampled)','GB (LR = 0.3) (OverSampled)',\
                              'Random Forest (OverSampled)'],
                     columns = ['ROC AUC Score'])

df_precision_os = pd.DataFrame(data = [precision_score(y_test, pred_test_dt_tuned_oversampled),precision_score(y_test, pred_test_knn_3_os)\
                                 , precision_score(y_test, pred_test_knn_5_os),\
                             precision_score(y_test, pred_test_knn_7_os),\
                             precision_score(y_test, pred_test_gb_os_1),precision_score(y_test, pred_test_gb_os_2)\
                             ,precision_score(y_test, pred_test_gb_os_3),\
                             precision_score(y_test, pred_test_rf_os)], \
                    index = ['Decision Tree (OverSampled)', 'KNN (K = 3) (OverSampled)'\
                              ,'KNN (K = 5) (OverSampled)','KNN (K = 7) (OverSampled)','GB (LR = 0.1) (OverSampled)',\
                              'GB (LR = 0.2) (OverSampled)','GB (LR = 0.3) (OverSampled)',\
                              'Random Forest (OverSampled)'],
                     columns = ['Precision Score'])

df_accuracy_os =  pd.DataFrame(data = [accuracy_score(y_test, pred_test_dt_tuned_oversampled),accuracy_score(y_test, pred_test_knn_3_os)\
                                 , accuracy_score(y_test, pred_test_knn_5_os),\
                             accuracy_score(y_test, pred_test_knn_7_os),\
                             accuracy_score(y_test, pred_test_gb_os_1),accuracy_score(y_test, pred_test_gb_os_2)\
                             ,accuracy_score(y_test, pred_test_gb_os_3),\
                             accuracy_score(y_test, pred_test_rf_os)], \
                     index = ['Decision Tree (OverSampled)', 'KNN (K = 3) (OverSampled)'\
                              ,'KNN (K = 5) (OverSampled)','KNN (K = 7) (OverSampled)','GB (LR = 0.1) (OverSampled)',\
                              'GB (LR = 0.2) (OverSampled)','GB (LR = 0.3) (OverSampled)',\
                              'Random Forest (OverSampled)'],
                     columns = ['Accuracy Score'])

In [57]:
df_all = pd.concat([df_recall, df_roc, df_precision, df_accuracy], axis = 1)
display(df_all)
df_all_os = pd.concat([df_recall_os, df_roc_os, df_precision_os, df_accuracy_os], axis = 1)
display(df_all_os)

Unnamed: 0,Recall Score,ROC AUC Score,Precision Score,Accuracy Score
Decision Tree,0.954955,0.944144,0.963636,0.947368
KNN (K = 3),0.990991,0.962162,0.964912,0.97076
KNN (K = 5),0.990991,0.962162,0.964912,0.97076
KNN (K = 7),0.990991,0.953829,0.956522,0.964912
GB (LR = 0.1),0.990991,0.962162,0.964912,0.97076
GB (LR = 0.2),0.990991,0.970495,0.973451,0.976608
GB (LR = 0.3),0.990991,0.953829,0.956522,0.964912
Random Forest,0.990991,0.953829,0.956522,0.964912


Unnamed: 0,Recall Score,ROC AUC Score,Precision Score,Accuracy Score
Decision Tree (OverSampled),0.953216,0.94482,0.955752,0.953216
KNN (K = 3) (OverSampled),0.972973,0.96982,0.981818,0.97076
KNN (K = 5) (OverSampled),0.972973,0.96982,0.981818,0.97076
KNN (K = 7) (OverSampled),0.964912,0.961486,0.972973,0.964912
GB (LR = 0.1) (OverSampled),0.990991,0.970495,0.973451,0.976608
GB (LR = 0.2) (OverSampled),0.990991,0.965991,0.973214,0.97076
GB (LR = 0.3) (OverSampled),0.990991,0.970495,0.973451,0.976608
Random Forest (OverSampled),0.990991,0.970495,0.973451,0.976608
