# Decision Tree Classifier

In [44]:
import pandas as pd
from sklearn.datasets import load_breast_cancer
df = pd.DataFrame(load_breast_cancer()['data'], 
columns=load_breast_cancer()['feature_names']) 
df['y'] = load_breast_cancer()['target'] 

In [45]:
df.tail(5)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,y
564,21.56,22.39,142.0,1479.0,0.111,0.1159,0.2439,0.1389,0.1726,0.05623,...,26.4,166.1,2027.0,0.141,0.2113,0.4107,0.2216,0.206,0.07115,0
565,20.13,28.25,131.2,1261.0,0.0978,0.1034,0.144,0.09791,0.1752,0.05533,...,38.25,155.0,1731.0,0.1166,0.1922,0.3215,0.1628,0.2572,0.06637,0
566,16.6,28.08,108.3,858.1,0.08455,0.1023,0.09251,0.05302,0.159,0.05648,...,34.12,126.7,1124.0,0.1139,0.3094,0.3403,0.1418,0.2218,0.0782,0
567,20.6,29.33,140.1,1265.0,0.1178,0.277,0.3514,0.152,0.2397,0.07016,...,39.42,184.6,1821.0,0.165,0.8681,0.9387,0.265,0.4087,0.124,0
568,7.76,24.54,47.92,181.0,0.05263,0.04362,0.0,0.0,0.1587,0.05884,...,30.37,59.16,268.6,0.08996,0.06444,0.0,0.0,0.2871,0.07039,1


In [46]:
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split 
from sklearn import metrics 
from sklearn.model_selection import RandomizedSearchCV

In [47]:
X = df.iloc[:, :-1] 
y = df.iloc[:, -1]

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) #splitting the data into test and train

In [49]:
clf = DecisionTreeClassifier()
param_dist = {'criterion': ['gini', 'entropy'], 'max_depth': [10, 20, 50, 100], 'min_samples_leaf':
[10, 20, 50]}
clf_cv = RandomizedSearchCV(clf, param_dist, cv = 5)
clf_cv.fit(X_train,y_train) #training
y_preddtc = clf_cv.predict(X_test) #prediction
y_train_pred = clf_cv.predict(X_train)


# Gradient Boosting Classification


In [50]:
import numpy as np
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold, train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import GradientBoostingClassifier

In [51]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [52]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size = .33, stratify = y) #splitting the data into train and test

In [53]:
gradient_booster = GradientBoostingClassifier(learning_rate=0.1)
gradient_booster.get_params()
gradient_booster.fit(Xtrain,ytrain)
print(classification_report(ytest,gradient_booster.predict(Xtest)))
from sklearn.model_selection import GridSearchCV
estimator = GradientBoostingClassifier()
rf_tuned_parameters = {"max_depth": [10, 20, 50, 100], 'n_estimators': [50, 100, 200, 500], 'min_samples_leaf': [10, 20, 50], 'learning_rate': [0.1, 0.01]}
cv_grid = GridSearchCV(estimator, param_grid = rf_tuned_parameters, scoring = 'roc_auc', verbose = 5) # 'recall', my_score
cv_grid.fit(Xtrain, ytrain)


              precision    recall  f1-score   support

           0       0.96      0.91      0.93        70
           1       0.95      0.97      0.96       118

    accuracy                           0.95       188
   macro avg       0.95      0.94      0.95       188
weighted avg       0.95      0.95      0.95       188

Fitting 5 folds for each of 96 candidates, totalling 480 fits
[CV 1/5] END learning_rate=0.1, max_depth=10, min_samples_leaf=10, n_estimators=50;, score=0.992 total time=   0.1s
[CV 2/5] END learning_rate=0.1, max_depth=10, min_samples_leaf=10, n_estimators=50;, score=0.977 total time=   0.1s
[CV 3/5] END learning_rate=0.1, max_depth=10, min_samples_leaf=10, n_estimators=50;, score=0.999 total time=   0.1s
[CV 4/5] END learning_rate=0.1, max_depth=10, min_samples_leaf=10, n_estimators=50;, score=0.974 total time=   0.1s
[CV 5/5] END learning_rate=0.1, max_depth=10, min_samples_leaf=10, n_estimators=50;, score=0.985 total time=   0.1s
[CV 1/5] END learning_rate=0.1,

[CV 3/5] END learning_rate=0.1, max_depth=20, min_samples_leaf=10, n_estimators=100;, score=0.999 total time=   0.3s
[CV 4/5] END learning_rate=0.1, max_depth=20, min_samples_leaf=10, n_estimators=100;, score=0.978 total time=   0.3s
[CV 5/5] END learning_rate=0.1, max_depth=20, min_samples_leaf=10, n_estimators=100;, score=0.982 total time=   0.3s
[CV 1/5] END learning_rate=0.1, max_depth=20, min_samples_leaf=10, n_estimators=200;, score=0.996 total time=   0.7s
[CV 2/5] END learning_rate=0.1, max_depth=20, min_samples_leaf=10, n_estimators=200;, score=0.980 total time=   0.8s
[CV 3/5] END learning_rate=0.1, max_depth=20, min_samples_leaf=10, n_estimators=200;, score=0.999 total time=   0.7s
[CV 4/5] END learning_rate=0.1, max_depth=20, min_samples_leaf=10, n_estimators=200;, score=0.980 total time=   0.8s
[CV 5/5] END learning_rate=0.1, max_depth=20, min_samples_leaf=10, n_estimators=200;, score=0.982 total time=   0.7s
[CV 1/5] END learning_rate=0.1, max_depth=20, min_samples_leaf=1

[CV 4/5] END learning_rate=0.1, max_depth=50, min_samples_leaf=10, n_estimators=500;, score=0.979 total time=   1.0s
[CV 5/5] END learning_rate=0.1, max_depth=50, min_samples_leaf=10, n_estimators=500;, score=0.981 total time=   0.9s
[CV 1/5] END learning_rate=0.1, max_depth=50, min_samples_leaf=20, n_estimators=50;, score=0.994 total time=   0.0s
[CV 2/5] END learning_rate=0.1, max_depth=50, min_samples_leaf=20, n_estimators=50;, score=0.979 total time=   0.0s
[CV 3/5] END learning_rate=0.1, max_depth=50, min_samples_leaf=20, n_estimators=50;, score=0.999 total time=   0.0s
[CV 4/5] END learning_rate=0.1, max_depth=50, min_samples_leaf=20, n_estimators=50;, score=0.972 total time=   0.0s
[CV 5/5] END learning_rate=0.1, max_depth=50, min_samples_leaf=20, n_estimators=50;, score=0.985 total time=   0.0s
[CV 1/5] END learning_rate=0.1, max_depth=50, min_samples_leaf=20, n_estimators=100;, score=0.994 total time=   0.2s
[CV 2/5] END learning_rate=0.1, max_depth=50, min_samples_leaf=20, n_

[CV 4/5] END learning_rate=0.1, max_depth=100, min_samples_leaf=20, n_estimators=100;, score=0.971 total time=   0.2s
[CV 5/5] END learning_rate=0.1, max_depth=100, min_samples_leaf=20, n_estimators=100;, score=0.984 total time=   0.2s
[CV 1/5] END learning_rate=0.1, max_depth=100, min_samples_leaf=20, n_estimators=200;, score=0.994 total time=   0.4s
[CV 2/5] END learning_rate=0.1, max_depth=100, min_samples_leaf=20, n_estimators=200;, score=0.984 total time=   0.5s
[CV 3/5] END learning_rate=0.1, max_depth=100, min_samples_leaf=20, n_estimators=200;, score=0.999 total time=   0.4s
[CV 4/5] END learning_rate=0.1, max_depth=100, min_samples_leaf=20, n_estimators=200;, score=0.980 total time=   0.4s
[CV 5/5] END learning_rate=0.1, max_depth=100, min_samples_leaf=20, n_estimators=200;, score=0.979 total time=   0.4s
[CV 1/5] END learning_rate=0.1, max_depth=100, min_samples_leaf=20, n_estimators=500;, score=0.996 total time=   0.8s
[CV 2/5] END learning_rate=0.1, max_depth=100, min_sampl

[CV 4/5] END learning_rate=0.01, max_depth=10, min_samples_leaf=20, n_estimators=500;, score=0.967 total time=   1.2s
[CV 5/5] END learning_rate=0.01, max_depth=10, min_samples_leaf=20, n_estimators=500;, score=0.986 total time=   1.2s
[CV 1/5] END learning_rate=0.01, max_depth=10, min_samples_leaf=50, n_estimators=50;, score=0.991 total time=   0.0s
[CV 2/5] END learning_rate=0.01, max_depth=10, min_samples_leaf=50, n_estimators=50;, score=0.969 total time=   0.0s
[CV 3/5] END learning_rate=0.01, max_depth=10, min_samples_leaf=50, n_estimators=50;, score=0.993 total time=   0.0s
[CV 4/5] END learning_rate=0.01, max_depth=10, min_samples_leaf=50, n_estimators=50;, score=0.951 total time=   0.0s
[CV 5/5] END learning_rate=0.01, max_depth=10, min_samples_leaf=50, n_estimators=50;, score=0.967 total time=   0.0s
[CV 1/5] END learning_rate=0.01, max_depth=10, min_samples_leaf=50, n_estimators=100;, score=0.994 total time=   0.1s
[CV 2/5] END learning_rate=0.01, max_depth=10, min_samples_le

[CV 5/5] END learning_rate=0.01, max_depth=20, min_samples_leaf=50, n_estimators=100;, score=0.969 total time=   0.1s
[CV 1/5] END learning_rate=0.01, max_depth=20, min_samples_leaf=50, n_estimators=200;, score=0.996 total time=   0.2s
[CV 2/5] END learning_rate=0.01, max_depth=20, min_samples_leaf=50, n_estimators=200;, score=0.984 total time=   0.2s
[CV 3/5] END learning_rate=0.01, max_depth=20, min_samples_leaf=50, n_estimators=200;, score=0.999 total time=   0.2s
[CV 4/5] END learning_rate=0.01, max_depth=20, min_samples_leaf=50, n_estimators=200;, score=0.965 total time=   0.2s
[CV 5/5] END learning_rate=0.01, max_depth=20, min_samples_leaf=50, n_estimators=200;, score=0.974 total time=   0.2s
[CV 1/5] END learning_rate=0.01, max_depth=20, min_samples_leaf=50, n_estimators=500;, score=0.998 total time=   0.6s
[CV 2/5] END learning_rate=0.01, max_depth=20, min_samples_leaf=50, n_estimators=500;, score=0.980 total time=   0.7s
[CV 3/5] END learning_rate=0.01, max_depth=20, min_sampl

[CV 5/5] END learning_rate=0.01, max_depth=50, min_samples_leaf=50, n_estimators=500;, score=0.979 total time=   0.6s
[CV 1/5] END learning_rate=0.01, max_depth=100, min_samples_leaf=10, n_estimators=50;, score=0.991 total time=   0.1s
[CV 2/5] END learning_rate=0.01, max_depth=100, min_samples_leaf=10, n_estimators=50;, score=0.960 total time=   0.1s
[CV 3/5] END learning_rate=0.01, max_depth=100, min_samples_leaf=10, n_estimators=50;, score=0.996 total time=   0.1s
[CV 4/5] END learning_rate=0.01, max_depth=100, min_samples_leaf=10, n_estimators=50;, score=0.970 total time=   0.1s
[CV 5/5] END learning_rate=0.01, max_depth=100, min_samples_leaf=10, n_estimators=50;, score=0.954 total time=   0.1s
[CV 1/5] END learning_rate=0.01, max_depth=100, min_samples_leaf=10, n_estimators=100;, score=0.991 total time=   0.3s
[CV 2/5] END learning_rate=0.01, max_depth=100, min_samples_leaf=10, n_estimators=100;, score=0.961 total time=   0.4s
[CV 3/5] END learning_rate=0.01, max_depth=100, min_sa

GridSearchCV(estimator=GradientBoostingClassifier(),
             param_grid={'learning_rate': [0.1, 0.01],
                         'max_depth': [10, 20, 50, 100],
                         'min_samples_leaf': [10, 20, 50],
                         'n_estimators': [50, 100, 200, 500]},
             scoring='roc_auc', verbose=5)

In [54]:
y_predgbc=cv_grid.predict(Xtest)
print(y_predgbc)

[1 0 1 0 1 0 1 1 1 1 0 1 1 0 0 1 1 0 1 1 0 1 0 1 0 1 1 1 0 1 0 1 0 1 1 0 1
 1 0 1 0 1 0 1 1 1 0 1 0 0 1 1 1 0 1 1 1 0 1 1 1 1 1 0 1 1 1 0 1 1 1 0 0 1
 0 1 0 0 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 0 1 0 1 1 1 1 0 1 1 0 0 1
 0 1 1 1 0 0 0 1 1 0 1 1 1 1 1 1 0 0 1 0 1 0 1 0 0 1 1 1 1 0 0 1 1 0 1 1 1
 0 0 1 1 1 1 1 1 1 0 0 1 1 1 1 0 0 0 1 0 0 1 1 1 1 1 0 1 0 0 1 1 1 1 1 1 1
 0 1 1]


# K-nearest Neighbour Classification

In [55]:
from sklearn import datasets
bc=datasets.load_breast_cancer()

In [56]:
X_train, X_test, y_train, y_test = train_test_split(bc.data, bc.target, test_size=0.3) #splitting 

In [57]:
from sklearn.neighbors import KNeighborsClassifier

kn = KNeighborsClassifier(n_neighbors=5)

kn.fit(X_train, y_train)

y_predknn = kn.predict(X_test)
print(y_predknn)

[1 0 0 1 0 1 1 1 0 0 0 1 0 1 1 0 1 0 1 1 0 1 1 1 1 1 0 0 0 0 1 1 0 1 1 1 0
 0 1 1 1 1 0 0 1 0 1 1 1 1 1 1 0 1 1 1 0 0 0 0 1 1 1 1 0 0 1 1 0 1 1 1 1 1
 1 0 0 0 1 1 1 1 1 0 1 1 1 0 1 1 1 0 0 1 1 1 0 1 0 0 0 1 1 1 1 1 1 1 1 0 0
 0 1 0 1 1 0 1 0 0 1 1 1 1 1 1 1 1 0 0 0 0 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1
 1 1 1 1 0 1 1 0 1 1 1 0 0 0 1 0 0 1 0 1 1 0 1]


# Random Forest Classifier

In [58]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler, Normalizer, scale
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import confusion_matrix, log_loss, auc, roc_curve, roc_auc_score, recall_score, precision_recall_curve
from sklearn.metrics import make_scorer, precision_score, fbeta_score, f1_score, classification_report
from sklearn.model_selection import cross_val_score, train_test_split, KFold, StratifiedShuffleSplit, GridSearchCV
from sklearn.linear_model import LogisticRegression

In [59]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size = .33, stratify = y, random_state = 999)

In [60]:
estimator = RandomForestClassifier(random_state=0, warm_start = True)

In [61]:
rf_tuned_parameters = {'criterion': ['gini', 'entropy'], 'max_depth': [10, 20, 50, 100], 'min_samples_leaf':
[10, 20, 50]}
cv_grid = GridSearchCV(estimator, param_grid = rf_tuned_parameters, scoring = 'roc_auc', verbose = 5) # 'recall', my_score
cv_grid.fit(Xtrain, ytrain)
y_predrfc=cv_grid.predict(Xtest)
print(y_predrfc)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 1/5] END criterion=gini, max_depth=10, min_samples_leaf=10;, score=0.998 total time=   0.1s
[CV 2/5] END criterion=gini, max_depth=10, min_samples_leaf=10;, score=0.989 total time=   0.0s
[CV 3/5] END criterion=gini, max_depth=10, min_samples_leaf=10;, score=0.997 total time=   0.0s
[CV 4/5] END criterion=gini, max_depth=10, min_samples_leaf=10;, score=0.967 total time=   0.0s
[CV 5/5] END criterion=gini, max_depth=10, min_samples_leaf=10;, score=0.984 total time=   0.0s
[CV 1/5] END criterion=gini, max_depth=10, min_samples_leaf=20;, score=0.994 total time=   0.0s
[CV 2/5] END criterion=gini, max_depth=10, min_samples_leaf=20;, score=0.988 total time=   0.0s
[CV 3/5] END criterion=gini, max_depth=10, min_samples_leaf=20;, score=0.996 total time=   0.0s
[CV 4/5] END criterion=gini, max_depth=10, min_samples_leaf=20;, score=0.961 total time=   0.0s
[CV 5/5] END criterion=gini, max_depth=10, min_samples_leaf=20;, score=0.9

[CV 1/5] END criterion=entropy, max_depth=20, min_samples_leaf=50;, score=0.993 total time=   0.0s
[CV 2/5] END criterion=entropy, max_depth=20, min_samples_leaf=50;, score=0.985 total time=   0.1s
[CV 3/5] END criterion=entropy, max_depth=20, min_samples_leaf=50;, score=0.998 total time=   0.0s
[CV 4/5] END criterion=entropy, max_depth=20, min_samples_leaf=50;, score=0.964 total time=   0.0s
[CV 5/5] END criterion=entropy, max_depth=20, min_samples_leaf=50;, score=0.983 total time=   0.0s
[CV 1/5] END criterion=entropy, max_depth=50, min_samples_leaf=10;, score=0.997 total time=   0.0s
[CV 2/5] END criterion=entropy, max_depth=50, min_samples_leaf=10;, score=0.992 total time=   0.1s
[CV 3/5] END criterion=entropy, max_depth=50, min_samples_leaf=10;, score=0.998 total time=   0.1s
[CV 4/5] END criterion=entropy, max_depth=50, min_samples_leaf=10;, score=0.963 total time=   0.1s
[CV 5/5] END criterion=entropy, max_depth=50, min_samples_leaf=10;, score=0.990 total time=   0.1s
[CV 1/5] E

# Comparision of all the predictions of all the 4 models

In [62]:
all_predictions=list(zip(y_preddtc,y_predrfc,y_predknn,y_predgbc))
print(all_predictions)


[(1, 0, 1, 1), (1, 0, 0, 0), (1, 1, 0, 1), (0, 1, 1, 0), (0, 1, 0, 1), (0, 1, 1, 0), (1, 0, 1, 1), (0, 0, 1, 1), (1, 0, 0, 1), (1, 1, 0, 1), (1, 0, 0, 0), (0, 0, 1, 1), (0, 1, 0, 1), (1, 1, 1, 0), (0, 1, 1, 0), (1, 1, 0, 1), (1, 1, 1, 1), (1, 0, 0, 0), (1, 1, 1, 1), (0, 0, 1, 1), (1, 0, 0, 0), (1, 1, 1, 1), (0, 1, 1, 0), (1, 1, 1, 1), (0, 0, 1, 0), (1, 1, 1, 1), (0, 1, 0, 1), (0, 1, 0, 1), (0, 1, 0, 0), (0, 1, 0, 1), (0, 1, 1, 0), (1, 0, 1, 1), (0, 1, 0, 0), (0, 0, 1, 1), (1, 0, 1, 1), (1, 1, 1, 0), (0, 0, 0, 1), (1, 1, 0, 1), (1, 0, 1, 0), (1, 1, 1, 1), (1, 1, 1, 0), (1, 1, 1, 1), (1, 1, 0, 0), (1, 1, 0, 1), (1, 1, 1, 1), (0, 1, 0, 1), (1, 1, 1, 0), (1, 0, 1, 1), (0, 1, 1, 0), (1, 1, 1, 0), (0, 0, 1, 1), (1, 1, 1, 1), (1, 0, 0, 1), (1, 1, 1, 0), (1, 1, 1, 1), (1, 1, 1, 1), (1, 0, 0, 1), (0, 1, 0, 0), (1, 1, 0, 1), (1, 1, 0, 1), (1, 0, 1, 1), (1, 0, 1, 1), (0, 1, 1, 1), (1, 0, 1, 0), (1, 0, 0, 1), (1, 0, 0, 1), (1, 0, 1, 1), (0, 1, 1, 0), (1, 1, 0, 1), (1, 1, 1, 1), (1, 1, 1, 1), (1, 1