In [2]:
# Data, Datasets & Utils
import pandas as pd
from pandas.plotting import scatter_matrix
import pprint
import numpy as np
from time import time
from numpy import log2 as log

# Validation methods
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

# Metrics
from sklearn import metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix

# Classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB


# Hyper-parameter optimisation
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Feature selection & feature engineering
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler

# Stats
from scipy.stats import randint as sp_randint
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from scipy.stats import shapiro     # Shapiro Wilk
from scipy.stats import normaltest  # D’Agostino’s K^2
from scipy.stats import anderson    # Anderson-Darling
from scipy.stats import ttest_ind    # independent student t-test; assumes normality
from scipy.stats import mannwhitneyu # non-parametric; doesn't assume normality

# Visualisation
import matplotlib.pyplot as plot 
import seaborn as sns
from IPython.display import SVG
from graphviz import Source
from IPython.display import display
from sklearn.tree import export_graphviz


In [3]:
#Reading data set file

df = pd.read_csv('data/breast-cancer.csv')
print("Shape:\n",df.shape) #tells us how many rows and columns the data structer has got
print(df.info())
df.head()
print("\nUnique Values in 'Diagnosis'", list(df.Diagnosis.unique()))



Shape:
 (569, 32)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   ID                          568 non-null    float64
 1   Radius (Mean)               550 non-null    float64
 2   Texture (Mean)              568 non-null    float64
 3   Perimeter (Mean)            567 non-null    float64
 4   Area (Mean)                 566 non-null    float64
 5   Smoothness (Mean)           567 non-null    float64
 6   Compactness (Mean)          566 non-null    float64
 7   Concavity (Mean)            569 non-null    float64
 8   Concave Points (Mean)       569 non-null    float64
 9   Symmetry (Mean)             566 non-null    float64
 10  Fractal Dimension (Mean)    567 non-null    float64
 11  Radius (Error)              551 non-null    float64
 12   Texture (Error)            567 non-null    float64
 13   Perimeter (Error

In [4]:
df = df.drop(columns = ['ID', 'Radius (Mean)', 'Perimeter (Mean)',
        'Smoothness (Mean)', 
        'Concave Points (Mean)', 'Symmetry (Mean)',
       'Fractal Dimension (Mean)', 'Radius (Error)', ' Texture (Error)',
       ' Perimeter (Error)', ' Area (Error)', ' Smoothness (Error)',
       ' Concavity (Error)', ' Concave Points (Error)', ' Compactness (Error)',
       ' Symmetry (Error)', ' Fractal Dimension (Error)', ' Radius (Worst)',
       ' Texture (Worst)', ' Compactness (Worst)', ' Concavity (Worst)',
       'Symmetry (Worst)', 'Fractal Dimension (Worst)'])

print(df.columns)
print(df.info())

Index(['Texture (Mean)', 'Area (Mean)', 'Compactness (Mean)',
       'Concavity (Mean)', ' Perimeter (Worst)', ' Area (Worst)',
       ' Smoothness (Worst)', 'Concave Points (Worst)', 'Diagnosis'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Texture (Mean)          568 non-null    float64
 1   Area (Mean)             566 non-null    float64
 2   Compactness (Mean)      566 non-null    float64
 3   Concavity (Mean)        569 non-null    float64
 4    Perimeter (Worst)      567 non-null    float64
 5    Area (Worst)           566 non-null    float64
 6    Smoothness (Worst)     569 non-null    float64
 7   Concave Points (Worst)  566 non-null    float64
 8   Diagnosis               569 non-null    object 
dtypes: float64(8), object(1)
memory usage: 40.1+ KB
None


In [5]:
from sklearn import preprocessing

df = df.fillna(df.mean())
df.isnull().sum()

df['Diagnosis'].replace('M', 1,inplace=True)
df['Diagnosis'].replace('B', 0,inplace=True)

names = df.columns
scaler = MinMaxScaler() 
df = scaler.fit_transform(df) 
df = pd.DataFrame(df, columns=names)

# Splits the Pandas DataFrame into a feature matrix (X) and class/label vector (y)
X = df.iloc[:,:5]
y = df['Diagnosis']

# Transform class labels to numeric labels
le = preprocessing.LabelEncoder()
le.fit(y)
y = le.transform(y)

In [6]:
#Nearest Neighbors 
from sklearn.metrics import classification_report

In [7]:
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import RepeatedStratifiedKFold #this seems pretty good one

# Creating the model, training and testing it
k = 5
model = KNeighborsClassifier(n_neighbors=k, p=3, metric='minkowski')

#random_state = 12883823 36851234
rkf = RepeatedStratifiedKFold(n_splits=6, n_repeats=2,
      random_state=4)
scores = cross_validate(model, X, y, cv=rkf, return_train_score=True, return_estimator=True)

y_pred = cross_val_predict(model, X, y, cv=6)

# Printing results
pprint.pprint(scores)
print()

print("Accuracy (Training): %0.2f (+/- %0.2f)" % (scores['train_score'].mean(), scores['train_score'].std() * 2))
print("Accuracy (Testing):  %0.2f (+/- %0.2f)" % (scores['test_score'].mean(), scores['test_score'].std() * 2))
print(confusion_matrix(y, y_pred)) 
report = classification_report(y, y_pred)
print(report)

{'estimator': (KNeighborsClassifier(p=3),
               KNeighborsClassifier(p=3),
               KNeighborsClassifier(p=3),
               KNeighborsClassifier(p=3),
               KNeighborsClassifier(p=3),
               KNeighborsClassifier(p=3),
               KNeighborsClassifier(p=3),
               KNeighborsClassifier(p=3),
               KNeighborsClassifier(p=3),
               KNeighborsClassifier(p=3),
               KNeighborsClassifier(p=3),
               KNeighborsClassifier(p=3)),
 'fit_time': array([0.0029912 , 0.00299096, 0.00199509, 0.00299191, 0.00299191,
       0.00299191, 0.00299144, 0.00299239, 0.00199437, 0.00199509,
       0.00199437, 0.00299239]),
 'score_time': array([0.00997686, 0.00897527, 0.00997424, 0.00897527, 0.00698137,
       0.0089736 , 0.00797892, 0.00897527, 0.00797796, 0.0079782 ,
       0.0079782 , 0.00698137]),
 'test_score': array([0.89473684, 0.92631579, 0.95789474, 0.93684211, 0.94736842,
       0.94680851, 0.88421053, 0.96842105, 0.936842

In [11]:
# Creating the model, training and testing it
k = 5
model1 = KNeighborsClassifier(n_neighbors=k, weights='distance', metric='manhattan')

#random_state = 12883823 36851234 random_state=12883823
rkf = RepeatedStratifiedKFold(n_splits=4, n_repeats=2)
scores = cross_validate(model1, X, y, cv=rkf, return_train_score=True, return_estimator=True)

y_pred = cross_val_predict(model1, X, y, cv=6)

# Printing results
pprint.pprint(scores)
print()
#print(X_test)
print("Accuracy (Training): %0.2f (+/- %0.2f)" % (scores['train_score'].mean(), scores['train_score'].std() * 2))
print("Accuracy (Testing):  %0.2f (+/- %0.2f)" % (scores['test_score'].mean(), scores['test_score'].std() * 2))
print(confusion_matrix(y, y_pred))
report = classification_report(y, y_pred)
print(report)

{'estimator': (KNeighborsClassifier(metric='manhattan', weights='distance'),
               KNeighborsClassifier(metric='manhattan', weights='distance'),
               KNeighborsClassifier(metric='manhattan', weights='distance'),
               KNeighborsClassifier(metric='manhattan', weights='distance'),
               KNeighborsClassifier(metric='manhattan', weights='distance'),
               KNeighborsClassifier(metric='manhattan', weights='distance'),
               KNeighborsClassifier(metric='manhattan', weights='distance'),
               KNeighborsClassifier(metric='manhattan', weights='distance')),
 'fit_time': array([0.00398898, 0.00399065, 0.00199747, 0.00399613, 0.00299168,
       0.00299239, 0.00199413, 0.00199604]),
 'score_time': array([0.00299239, 0.00299191, 0.00398827, 0.00298572, 0.0039885 ,
       0.00199461, 0.00199485, 0.00199294]),
 'test_score': array([0.93706294, 0.92253521, 0.92957746, 0.93661972, 0.94405594,
       0.91549296, 0.93661972, 0.95774648]),
 'tr

In [None]:
#Decision Trees 

In [61]:
#rank 1 after grid search 
dt = DecisionTreeClassifier(criterion='entropy', max_depth=8, min_samples_split=2, splitter='best', random_state=None)

rkf = RepeatedStratifiedKFold(n_splits=4,  n_repeats=2)
scores = cross_validate(dt, X, y, cv=rkf, return_train_score=True, return_estimator=True)
y_pred = cross_val_predict(dt, X, y, cv=6)

pprint.pprint(scores)
print()
print("Accuracy (Training): %0.2f (+/- %0.2f)" % (scores['train_score'].mean(), scores['train_score'].std() * 2))
print("Accuracy (Testing):  %0.2f (+/- %0.2f)" % (scores['test_score'].mean(), scores['test_score'].std() * 2))
print(confusion_matrix(y, y_pred))
report = classification_report(y, y_pred)
print(report)

{'estimator': (DecisionTreeClassifier(criterion='entropy', max_depth=8),
               DecisionTreeClassifier(criterion='entropy', max_depth=8),
               DecisionTreeClassifier(criterion='entropy', max_depth=8),
               DecisionTreeClassifier(criterion='entropy', max_depth=8),
               DecisionTreeClassifier(criterion='entropy', max_depth=8),
               DecisionTreeClassifier(criterion='entropy', max_depth=8),
               DecisionTreeClassifier(criterion='entropy', max_depth=8),
               DecisionTreeClassifier(criterion='entropy', max_depth=8)),
 'fit_time': array([0.00398993, 0.00702882, 0.00703764, 0.00593972, 0.0074389 ,
       0.00606322, 0.00404477, 0.00501752]),
 'score_time': array([0.00299096, 0.00299764, 0.0030756 , 0.00199533, 0.00249171,
       0.00199676, 0.00199461, 0.00302958]),
 'test_score': array([0.91608392, 0.95774648, 0.88028169, 0.93661972, 0.8951049 ,
       0.87323944, 0.91549296, 0.94366197]),
 'train_score': array([1.        , 0

In [62]:
#rank 2 
dt1 = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=4, splitter='random', random_state=None)

rkf = RepeatedStratifiedKFold(n_splits=4,  n_repeats=2)
scores = cross_validate(dt1, X, y, cv=rkf, return_train_score=True, return_estimator=True)
y_pred = cross_val_predict(dt1, X, y, cv=6)

pprint.pprint(scores)
print()
print("Accuracy (Training): %0.2f (+/- %0.2f)" % (scores['train_score'].mean(), scores['train_score'].std() * 2))
print("Accuracy (Testing):  %0.2f (+/- %0.2f)" % (scores['test_score'].mean(), scores['test_score'].std() * 2))
print(confusion_matrix(y, y_pred))
report = classification_report(y, y_pred)
print(report)

{'estimator': (DecisionTreeClassifier(min_samples_split=4, splitter='random'),
               DecisionTreeClassifier(min_samples_split=4, splitter='random'),
               DecisionTreeClassifier(min_samples_split=4, splitter='random'),
               DecisionTreeClassifier(min_samples_split=4, splitter='random'),
               DecisionTreeClassifier(min_samples_split=4, splitter='random'),
               DecisionTreeClassifier(min_samples_split=4, splitter='random'),
               DecisionTreeClassifier(min_samples_split=4, splitter='random'),
               DecisionTreeClassifier(min_samples_split=4, splitter='random')),
 'fit_time': array([0.00425196, 0.00403595, 0.00312734, 0.00502205, 0.00497556,
       0.00366545, 0.00297284, 0.00199747]),
 'score_time': array([0.00173497, 0.00194764, 0.00185919, 0.00316334, 0.00308943,
       0.00202298, 0.00199389, 0.00299239]),
 'test_score': array([0.94405594, 0.88732394, 0.93661972, 0.95070423, 0.94405594,
       0.90140845, 0.90140845, 0.

In [63]:
#rank 3
dt2 = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=4, splitter='best', random_state=None)

rkf = RepeatedStratifiedKFold(n_splits=4,  n_repeats=2)
scores = cross_validate(dt2, X, y, cv=rkf, return_train_score=True, return_estimator=True)
y_pred = cross_val_predict(dt2, X, y, cv=6)

pprint.pprint(scores)
print()
print("Accuracy (Training): %0.2f (+/- %0.2f)" % (scores['train_score'].mean(), scores['train_score'].std() * 2))
print("Accuracy (Testing):  %0.2f (+/- %0.2f)" % (scores['test_score'].mean(), scores['test_score'].std() * 2))
print(confusion_matrix(y, y_pred))
report = classification_report(y, y_pred)
print(report)

{'estimator': (DecisionTreeClassifier(min_samples_split=4),
               DecisionTreeClassifier(min_samples_split=4),
               DecisionTreeClassifier(min_samples_split=4),
               DecisionTreeClassifier(min_samples_split=4),
               DecisionTreeClassifier(min_samples_split=4),
               DecisionTreeClassifier(min_samples_split=4),
               DecisionTreeClassifier(min_samples_split=4),
               DecisionTreeClassifier(min_samples_split=4)),
 'fit_time': array([0.00596452, 0.00601482, 0.00482321, 0.0039897 , 0.00602317,
       0.00498486, 0.0050211 , 0.00715113]),
 'score_time': array([0.0040648 , 0.0039928 , 0.00199485, 0.0030489 , 0.00297999,
       0.00312996, 0.00306582, 0.00401664]),
 'test_score': array([0.91608392, 0.93661972, 0.88732394, 0.88732394, 0.93006993,
       0.91549296, 0.92253521, 0.91549296]),
 'train_score': array([0.99295775, 0.99765808, 0.99297424, 0.99297424, 0.99295775,
       0.99531616, 0.99765808, 0.9882904 ])}

Accuracy (T

In [64]:
#rank 4
dt3 = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, splitter='best', random_state=None)

rkf = RepeatedStratifiedKFold(n_splits=4,  n_repeats=2)
scores = cross_validate(dt3, X, y, cv=rkf, return_train_score=True, return_estimator=True)
y_pred = cross_val_predict(dt3, X, y, cv=6)

pprint.pprint(scores)
print()
print("Accuracy (Training): %0.2f (+/- %0.2f)" % (scores['train_score'].mean(), scores['train_score'].std() * 2))
print("Accuracy (Testing):  %0.2f (+/- %0.2f)" % (scores['test_score'].mean(), scores['test_score'].std() * 2))
print(confusion_matrix(y, y_pred))
report = classification_report(y, y_pred)
print(report)

{'estimator': (DecisionTreeClassifier(),
               DecisionTreeClassifier(),
               DecisionTreeClassifier(),
               DecisionTreeClassifier(),
               DecisionTreeClassifier(),
               DecisionTreeClassifier(),
               DecisionTreeClassifier(),
               DecisionTreeClassifier()),
 'fit_time': array([0.00595951, 0.00521755, 0.00396276, 0.00495768, 0.00401807,
       0.00605679, 0.00801492, 0.00498605]),
 'score_time': array([0.00299454, 0.0029912 , 0.0039773 , 0.00199437, 0.00301528,
       0.00302052, 0.00399256, 0.00402355]),
 'test_score': array([0.9020979 , 0.96478873, 0.91549296, 0.92957746, 0.94405594,
       0.9084507 , 0.95774648, 0.9084507 ]),
 'train_score': array([1., 1., 1., 1., 1., 1., 1., 1.])}

Accuracy (Training): 1.00 (+/- 0.00)
Accuracy (Testing):  0.93 (+/- 0.05)
[[338  20]
 [ 23 188]]
              precision    recall  f1-score   support

           0       0.94      0.94      0.94       358
           1       0.90     

In [65]:
#Random Forest

In [66]:
#creterion specified as in the other one should be gini by default
rft3 = RandomForestClassifier(criterion='entropy', n_estimators=100, max_depth=5, random_state=None)

rkf = RepeatedStratifiedKFold(n_splits=4,  n_repeats=2)
scores = cross_validate(rft3, X, y, cv=rkf, return_train_score=True, return_estimator=True)
y_pred = cross_val_predict(rft3, X, y, cv=6)

pprint.pprint(scores)
print()
print("Accuracy (Training): %0.2f (+/- %0.2f)" % (scores['train_score'].mean(), scores['train_score'].std() * 2))
print("Accuracy (Testing):  %0.2f (+/- %0.2f)" % (scores['test_score'].mean(), scores['test_score'].std() * 2))
print(confusion_matrix(y, y_pred))
report = classification_report(y, y_pred)
print(report)

{'estimator': (RandomForestClassifier(criterion='entropy', max_depth=5),
               RandomForestClassifier(criterion='entropy', max_depth=5),
               RandomForestClassifier(criterion='entropy', max_depth=5),
               RandomForestClassifier(criterion='entropy', max_depth=5),
               RandomForestClassifier(criterion='entropy', max_depth=5),
               RandomForestClassifier(criterion='entropy', max_depth=5),
               RandomForestClassifier(criterion='entropy', max_depth=5),
               RandomForestClassifier(criterion='entropy', max_depth=5)),
 'fit_time': array([0.26045775, 0.28418899, 0.26108003, 0.3148303 , 0.34177876,
       0.35049009, 0.32511854, 0.29185295]),
 'score_time': array([0.01998448, 0.01994705, 0.02500582, 0.01495433, 0.01400256,
       0.02124858, 0.01621723, 0.01592278]),
 'test_score': array([0.95104895, 0.93661972, 0.95774648, 0.90140845, 0.95804196,
       0.9084507 , 0.95774648, 0.92957746]),
 'train_score': array([0.98826291, 0

In [67]:
#creterion specified as in the other one should be gini by default
rft3 = RandomForestClassifier(criterion='gini', n_estimators=100, max_depth=15, random_state=None)

rkf = RepeatedStratifiedKFold(n_splits=4,  n_repeats=2)
scores = cross_validate(rft3, X, y, cv=rkf, return_train_score=True, return_estimator=True)
y_pred = cross_val_predict(rft3, X, y, cv=6)

pprint.pprint(scores)
print()
print("Accuracy (Training): %0.2f (+/- %0.2f)" % (scores['train_score'].mean(), scores['train_score'].std() * 2))
print("Accuracy (Testing):  %0.2f (+/- %0.2f)" % (scores['test_score'].mean(), scores['test_score'].std() * 2))
print(confusion_matrix(y, y_pred))
report = classification_report(y, y_pred)
print(report)

{'estimator': (RandomForestClassifier(max_depth=15),
               RandomForestClassifier(max_depth=15),
               RandomForestClassifier(max_depth=15),
               RandomForestClassifier(max_depth=15),
               RandomForestClassifier(max_depth=15),
               RandomForestClassifier(max_depth=15),
               RandomForestClassifier(max_depth=15),
               RandomForestClassifier(max_depth=15)),
 'fit_time': array([0.29787898, 0.34595799, 0.29820919, 0.32582974, 0.26638055,
       0.31043911, 0.28410602, 0.3074584 ]),
 'score_time': array([0.02294254, 0.02293539, 0.0179503 , 0.01954937, 0.01841187,
       0.02812076, 0.01797652, 0.02505493]),
 'test_score': array([0.96503497, 0.92253521, 0.9084507 , 0.93661972, 0.95804196,
       0.94366197, 0.8943662 , 0.95774648]),
 'train_score': array([1., 1., 1., 1., 1., 1., 1., 1.])}

Accuracy (Training): 1.00 (+/- 0.00)
Accuracy (Testing):  0.94 (+/- 0.05)
[[344  14]
 [ 19 192]]
              precision    recall  f1-sco

In [68]:
#DT VS RF

In [69]:
dt_acc = np.array([])
rf_acc = np.array([])
runs = 100

for x in range(runs):
    
    # splitting the dataset each run, just to get more variation in the results for this example
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=None, stratify=df['Diagnosis'])
    
    # DT with default config
    dt = DecisionTreeClassifier(random_state=None)
    dt = dt.fit(X_train,y_train)
    y_pred = dt.predict(X_test)
    dt_accuracy = np.append(dt_acc, metrics.accuracy_score(y_test, y_pred))
    
    # Random Forest with 10 trees (and keeping other config as per default values)
    rf = RandomForestClassifier(random_state=None)
    rf = rf.fit(X_train,y_train)
    y_pred = rf.predict(X_test)
    rf_accuracy = np.append(rf_acc, metrics.accuracy_score(y_test, y_pred))

print("Done...")
print(" - DT accuracy: %0.2f (+/- %0.2f)" % (dt_accuracy.mean(), dt_accuracy.std() * 2))
print(" - RF accuracy: %0.2f (+/- %0.2f)" % (rf_accuracy.mean(), rf_accuracy.std() * 2))


Done...
 - DT accuracy: 0.92 (+/- 0.00)
 - RF accuracy: 0.97 (+/- 0.00)


In [70]:
#Naive Bayes

In [71]:
#NB Model 1
gaussianNB = GaussianNB()

rkf = RepeatedStratifiedKFold(n_splits=4,  n_repeats=5)
scores = cross_validate(gaussianNB, X, y, cv=rkf, return_train_score=True, return_estimator=True)
y_pred = cross_val_predict(gaussianNB, X, y, cv=6)

pprint.pprint(scores)
print()
print("Accuracy (Training): %0.2f (+/- %0.2f)" % (scores['train_score'].mean(), scores['train_score'].std() * 2))
print("Accuracy (Testing):  %0.2f (+/- %0.2f)" % (scores['test_score'].mean(), scores['test_score'].std() * 2))
print(confusion_matrix(y, y_pred))
report = classification_report(y, y_pred)
print(report)

{'estimator': (GaussianNB(),
               GaussianNB(),
               GaussianNB(),
               GaussianNB(),
               GaussianNB(),
               GaussianNB(),
               GaussianNB(),
               GaussianNB(),
               GaussianNB(),
               GaussianNB(),
               GaussianNB(),
               GaussianNB(),
               GaussianNB(),
               GaussianNB(),
               GaussianNB(),
               GaussianNB(),
               GaussianNB(),
               GaussianNB(),
               GaussianNB(),
               GaussianNB()),
 'fit_time': array([0.00718141, 0.0052309 , 0.00545096, 0.00299239, 0.00698233,
       0.00419664, 0.00607252, 0.00660396, 0.00552034, 0.00291872,
       0.00350595, 0.0040679 , 0.00395131, 0.00495505, 0.00398946,
       0.00294876, 0.00408459, 0.0040226 , 0.00402761, 0.00598717]),
 'score_time': array([0.00482655, 0.00500798, 0.00258493, 0.00299168, 0.00414538,
       0.00299454, 0.00513577, 0.00397205, 0.00199556,

In [76]:
#NB Model 2
multinomialNB = MultinomialNB()

rkf = RepeatedStratifiedKFold(n_splits=4,  n_repeats=5)
scores = cross_validate(multinomialNB, X, y, cv=rkf, return_train_score=True, return_estimator=True)
y_pred = cross_val_predict(multinomialNB, X, y, cv=6)

pprint.pprint(scores)
print()
print("Accuracy (Training): %0.2f (+/- %0.2f)" % (scores['train_score'].mean(), scores['train_score'].std() * 2))
print("Accuracy (Testing):  %0.2f (+/- %0.2f)" % (scores['test_score'].mean(), scores['test_score'].std() * 2))
print(confusion_matrix(y, y_pred))
report = classification_report(y, y_pred)
print(report)

{'estimator': (MultinomialNB(),
               MultinomialNB(),
               MultinomialNB(),
               MultinomialNB(),
               MultinomialNB(),
               MultinomialNB(),
               MultinomialNB(),
               MultinomialNB(),
               MultinomialNB(),
               MultinomialNB(),
               MultinomialNB(),
               MultinomialNB(),
               MultinomialNB(),
               MultinomialNB(),
               MultinomialNB(),
               MultinomialNB(),
               MultinomialNB(),
               MultinomialNB(),
               MultinomialNB(),
               MultinomialNB()),
 'fit_time': array([0.00557232, 0.00555015, 0.00408149, 0.0048182 , 0.00398707,
       0.00398779, 0.004987  , 0.00594354, 0.00604057, 0.00399494,
       0.004987  , 0.00405002, 0.00398827, 0.00381804, 0.00598049,
       0.00511122, 0.00475812, 0.00429702, 0.00552082, 0.00398827]),
 'score_time': array([0.00253344, 0.0031774 , 0.00215054, 0.00299191, 0.0029

  _warn_prf(average, modifier, msg_start, len(result))


In [73]:
#NB model 3
bernoulliNB = BernoulliNB()
rkf = RepeatedStratifiedKFold(n_splits=4,  n_repeats=5)
scores = cross_validate(bernoulliNB, X, y, cv=rkf, return_train_score=True, return_estimator=True)
y_pred = cross_val_predict(bernoulliNB, X, y, cv=6)

pprint.pprint(scores)
print()
print("Accuracy (Training): %0.2f (+/- %0.2f)" % (scores['train_score'].mean(), scores['train_score'].std() * 2))
print("Accuracy (Testing):  %0.2f (+/- %0.2f)" % (scores['test_score'].mean(), scores['test_score'].std() * 2))
print(confusion_matrix(y, y_pred))
report = classification_report(y, y_pred)
print(report)

{'estimator': (BernoulliNB(),
               BernoulliNB(),
               BernoulliNB(),
               BernoulliNB(),
               BernoulliNB(),
               BernoulliNB(),
               BernoulliNB(),
               BernoulliNB(),
               BernoulliNB(),
               BernoulliNB(),
               BernoulliNB(),
               BernoulliNB(),
               BernoulliNB(),
               BernoulliNB(),
               BernoulliNB(),
               BernoulliNB(),
               BernoulliNB(),
               BernoulliNB(),
               BernoulliNB(),
               BernoulliNB()),
 'fit_time': array([0.00494933, 0.00594473, 0.00556064, 0.0049715 , 0.00496817,
       0.004076  , 0.00506234, 0.00702453, 0.00495195, 0.00519919,
       0.00518751, 0.0040791 , 0.00508189, 0.00498796, 0.00498581,
       0.00398898, 0.0039897 , 0.00303268, 0.00498533, 0.00402617]),
 'score_time': array([0.00305557, 0.00398898, 0.00350881, 0.00395727, 0.00202346,
       0.00307703, 0.00505066, 0.0

In [74]:
#All 3 NB comparison 

In [75]:
gaussian = np.array([])
muiltinomial = np.array([])
bernoulli = np.array([])

runs = 100
for x in range(runs):
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=None, stratify=df['Diagnosis'])
    
    # Gaussian NB
    gaussianNB = GaussianNB()
    gaussianNB = gaussianNB.fit(X_train,y_train)
    y_pred = gaussianNB.predict(X_test)
    gaussian = np.append(gaussian, metrics.accuracy_score(y_test, y_pred))
    
    # Multinomial NB
    multinomialNB = MultinomialNB()
    multinomialNB = multinomialNB.fit(X_train,y_train)
    y_pred = multinomialNB.predict(X_test)
    muiltinomial = np.append(muiltinomial, metrics.accuracy_score(y_test, y_pred))
    
    # Bernoulli NB
    bernoulliNB = BernoulliNB()
    bernoulliNB = bernoulliNB.fit(X_train,y_train)
    y_pred = bernoulliNB.predict(X_test)
    bernoulli = np.append(bernoulli, metrics.accuracy_score(y_test, y_pred))


print(" Gaussian accuracy:    %0.2f (+/- %0.2f)" % (gaussian.mean(), gaussian.std() * 2))
print(" Multinomial accuracy: %0.2f (+/- %0.2f)" % (muiltinomial.mean(), muiltinomial.std() * 2))
print(" Bernoulli accuracy:   %0.2f (+/- %0.2f)" % (bernoulli.mean(), bernoulli.std() * 2))


 Gaussian accuracy:    0.92 (+/- 0.05)
 Multinomial accuracy: 0.63 (+/- 0.00)
 Bernoulli accuracy:   0.63 (+/- 0.01)
