In [1]:
# Data, Datasets & Utils
import pandas as pd
from pandas.plotting import scatter_matrix
import pprint
import numpy as np
from time import time
from numpy import log2 as log

# Validation methods
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

# Metrics
from sklearn import metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix

# Classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB


# Hyper-parameter optimisation
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Feature selection & feature engineering
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler

# Stats
from scipy.stats import randint as sp_randint
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from scipy.stats import shapiro     # Shapiro Wilk
from scipy.stats import normaltest  # D’Agostino’s K^2
from scipy.stats import anderson    # Anderson-Darling
from scipy.stats import ttest_ind    # independent student t-test; assumes normality
from scipy.stats import mannwhitneyu # non-parametric; doesn't assume normality

# Visualisation
import matplotlib.pyplot as plot 
import seaborn as sns
from IPython.display import SVG
from graphviz import Source
from IPython.display import display
from sklearn.tree import export_graphviz


In [2]:
#Reading data set file

df = pd.read_csv('data/breast-cancer.csv')
print("Shape:\n\n",df.shape) #tells us how many rows and columns the data structer has got
print(df.info())
df.head()
print("\nUnique Values in 'Diagnosis'", list(df.Diagnosis.unique()))

Shape:

 (569, 32)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   ID                          568 non-null    float64
 1   Radius (Mean)               550 non-null    float64
 2   Texture (Mean)              568 non-null    float64
 3   Perimeter (Mean)            567 non-null    float64
 4   Area (Mean)                 566 non-null    float64
 5   Smoothness (Mean)           567 non-null    float64
 6   Compactness (Mean)          566 non-null    float64
 7   Concavity (Mean)            569 non-null    float64
 8   Concave Points (Mean)       569 non-null    float64
 9   Symmetry (Mean)             566 non-null    float64
 10  Fractal Dimension (Mean)    567 non-null    float64
 11  Radius (Error)              551 non-null    float64
 12   Texture (Error)            567 non-null    float64
 13   Perimeter (Erro

In [3]:
from sklearn import preprocessing

df = df.fillna(df.mean())
df.isnull().sum()

df['Diagnosis'].replace('M', 1,inplace=True)
df['Diagnosis'].replace('B', 0,inplace=True)

names = df.columns
scaler = MinMaxScaler() 
df = scaler.fit_transform(df) 
df = pd.DataFrame(df, columns=names)

# Splits the Pandas DataFrame into a feature matrix (X) and class/label vector (y)
X = df.iloc[:,1:31]
y = df['Diagnosis']

# Transform class labels to numeric labels
le = preprocessing.LabelEncoder()
le.fit(y)
y = le.transform(y)

In [4]:
#Nearest Neigbors
from sklearn.metrics import classification_report

In [5]:
#KNN Model 1
from sklearn.model_selection import RepeatedStratifiedKFold #this seems pretty good one
from sklearn.model_selection import cross_val_predict
# Creating the model, training and testing it
k = 5
model = KNeighborsClassifier(n_neighbors=k, p=2, metric='minkowski')

#random_state = 12883823 36851234
rkf = RepeatedStratifiedKFold(n_splits=6, n_repeats=2,
      random_state=4)
scores = cross_validate(model, X, y, cv=rkf, return_train_score=True, return_estimator=True)
y_pred = cross_val_predict(model, X, y, cv=6)

# Printing results
pprint.pprint(scores)
print()

print("Accuracy (Training): %0.2f (+/- %0.2f)" % (scores['train_score'].mean(), scores['train_score'].std() * 2))
print("Accuracy (Testing):  %0.2f (+/- %0.2f)" % (scores['test_score'].mean(), scores['test_score'].std() * 2))
print(confusion_matrix(y, y_pred))
report = classification_report(y, y_pred)
print(report)

{'estimator': (KNeighborsClassifier(),
               KNeighborsClassifier(),
               KNeighborsClassifier(),
               KNeighborsClassifier(),
               KNeighborsClassifier(),
               KNeighborsClassifier(),
               KNeighborsClassifier(),
               KNeighborsClassifier(),
               KNeighborsClassifier(),
               KNeighborsClassifier(),
               KNeighborsClassifier(),
               KNeighborsClassifier()),
 'fit_time': array([0.00398803, 0.0039885 , 0.0039897 , 0.00398946, 0.00399041,
       0.00299191, 0.00398946, 0.0039885 , 0.00398898, 0.00398898,
       0.0039885 , 0.00398827]),
 'score_time': array([0.00797749, 0.00598383, 0.00698137, 0.00598407, 0.00698256,
       0.00698137, 0.00598359, 0.00598478, 0.00598431, 0.00598478,
       0.00598526, 0.00598454]),
 'test_score': array([0.94736842, 0.96842105, 0.97894737, 0.95789474, 0.98947368,
       0.9787234 , 0.95789474, 0.97894737, 0.96842105, 0.97894737,
       0.95789474, 0

In [6]:
#KNN model 2
# Creating the model, training and testing it
k = 5
model1 = KNeighborsClassifier(n_neighbors=k, weights='distance', metric='manhattan')

#random_state = 12883823 36851234 random_state=12883823
rkf = RepeatedStratifiedKFold(n_splits=4, n_repeats=2)
scores = cross_validate(model1, X, y, cv=rkf, return_train_score=True, return_estimator=True)

y_pred = cross_val_predict(model1, X, y, cv=6)
# Printing results
pprint.pprint(scores)
print()
#print(X_test)
print("Accuracy (Training): %0.2f (+/- %0.2f)" % (scores['train_score'].mean(), scores['train_score'].std() * 2))
print("Accuracy (Testing):  %0.2f (+/- %0.2f)" % (scores['test_score'].mean(), scores['test_score'].std() * 2))
print(confusion_matrix(y, y_pred))
report = classification_report(y, y_pred)
print(report)

{'estimator': (KNeighborsClassifier(metric='manhattan', weights='distance'),
               KNeighborsClassifier(metric='manhattan', weights='distance'),
               KNeighborsClassifier(metric='manhattan', weights='distance'),
               KNeighborsClassifier(metric='manhattan', weights='distance'),
               KNeighborsClassifier(metric='manhattan', weights='distance'),
               KNeighborsClassifier(metric='manhattan', weights='distance'),
               KNeighborsClassifier(metric='manhattan', weights='distance'),
               KNeighborsClassifier(metric='manhattan', weights='distance')),
 'fit_time': array([0.00299144, 0.00398445, 0.00398922, 0.00398946, 0.00299168,
       0.00398946, 0.00299239, 0.00398946]),
 'score_time': array([0.00498629, 0.00498462, 0.00598431, 0.00498748, 0.00498676,
       0.00398898, 0.00498629, 0.00398922]),
 'test_score': array([0.99300699, 0.96478873, 0.97183099, 0.94366197, 0.95104895,
       0.98591549, 0.95774648, 0.96478873]),
 'tr

In [7]:
#Decision Trees with hyper parameter optimization 

In [8]:
#rank 1 after grid search 
dt = DecisionTreeClassifier(criterion='entropy', max_depth=8, max_features=10, min_samples_split=2, splitter='best', random_state=None)

rkf = RepeatedStratifiedKFold(n_splits=4,  n_repeats=2)
scores = cross_validate(dt, X, y, cv=rkf, return_train_score=True, return_estimator=True)
y_pred = cross_val_predict(dt, X, y, cv=6)

pprint.pprint(scores)
print()
print("Accuracy (Training): %0.2f (+/- %0.2f)" % (scores['train_score'].mean(), scores['train_score'].std() * 2))
print("Accuracy (Testing):  %0.2f (+/- %0.2f)" % (scores['test_score'].mean(), scores['test_score'].std() * 2))
print(confusion_matrix(y, y_pred))
report = classification_report(y, y_pred)
print(report)

{'estimator': (DecisionTreeClassifier(criterion='entropy', max_depth=8, max_features=10),
               DecisionTreeClassifier(criterion='entropy', max_depth=8, max_features=10),
               DecisionTreeClassifier(criterion='entropy', max_depth=8, max_features=10),
               DecisionTreeClassifier(criterion='entropy', max_depth=8, max_features=10),
               DecisionTreeClassifier(criterion='entropy', max_depth=8, max_features=10),
               DecisionTreeClassifier(criterion='entropy', max_depth=8, max_features=10),
               DecisionTreeClassifier(criterion='entropy', max_depth=8, max_features=10),
               DecisionTreeClassifier(criterion='entropy', max_depth=8, max_features=10)),
 'fit_time': array([0.00498629, 0.00399041, 0.00597191, 0.00398898, 0.00398946,
       0.00299168, 0.00398946, 0.00399208]),
 'score_time': array([0.00199461, 0.00299358, 0.00299263, 0.00099707, 0.00099659,
       0.00199437, 0.00199461, 0.00199199]),
 'test_score': array([0.902

In [14]:
#rank 2 
dt1 = DecisionTreeClassifier(criterion='gini', max_depth=None, max_features=8, min_samples_split=4, splitter='random', random_state=None)

rkf = RepeatedStratifiedKFold(n_splits=4,  n_repeats=2)
scores = cross_validate(dt1, X, y, cv=rkf, return_train_score=True, return_estimator=True)
y_pred = cross_val_predict(dt1, X, y, cv=6)

pprint.pprint(scores)
print()
print("Accuracy (Training): %0.2f (+/- %0.2f)" % (scores['train_score'].mean(), scores['train_score'].std() * 2))
print("Accuracy (Testing):  %0.2f (+/- %0.2f)" % (scores['test_score'].mean(), scores['test_score'].std() * 2))
print(confusion_matrix(y, y_pred))
report = classification_report(y, y_pred)
print(report)

{'estimator': (DecisionTreeClassifier(max_features=8, min_samples_split=4, splitter='random'),
               DecisionTreeClassifier(max_features=8, min_samples_split=4, splitter='random'),
               DecisionTreeClassifier(max_features=8, min_samples_split=4, splitter='random'),
               DecisionTreeClassifier(max_features=8, min_samples_split=4, splitter='random'),
               DecisionTreeClassifier(max_features=8, min_samples_split=4, splitter='random'),
               DecisionTreeClassifier(max_features=8, min_samples_split=4, splitter='random'),
               DecisionTreeClassifier(max_features=8, min_samples_split=4, splitter='random'),
               DecisionTreeClassifier(max_features=8, min_samples_split=4, splitter='random')),
 'fit_time': array([0.00301981, 0.00199604, 0.00299263, 0.00399041, 0.00298262,
       0.00099707, 0.00199413, 0.00299335]),
 'score_time': array([0.00100899, 0.00199294, 0.00201058, 0.00099659, 0.00099754,
       0.00199485, 0.00099778, 0

In [15]:
#rank 3
dt2 = DecisionTreeClassifier(criterion='gini', max_depth=None, max_features=8, min_samples_split=4, splitter='best', random_state=None)

rkf = RepeatedStratifiedKFold(n_splits=4,  n_repeats=2)
scores = cross_validate(dt2, X, y, cv=rkf, return_train_score=True, return_estimator=True)
y_pred = cross_val_predict(dt2, X, y, cv=6)

pprint.pprint(scores)
print()
print("Accuracy (Training): %0.2f (+/- %0.2f)" % (scores['train_score'].mean(), scores['train_score'].std() * 2))
print("Accuracy (Testing):  %0.2f (+/- %0.2f)" % (scores['test_score'].mean(), scores['test_score'].std() * 2))
print(confusion_matrix(y, y_pred))
report = classification_report(y, y_pred)
print(report)

{'estimator': (DecisionTreeClassifier(max_features=8, min_samples_split=4),
               DecisionTreeClassifier(max_features=8, min_samples_split=4),
               DecisionTreeClassifier(max_features=8, min_samples_split=4),
               DecisionTreeClassifier(max_features=8, min_samples_split=4),
               DecisionTreeClassifier(max_features=8, min_samples_split=4),
               DecisionTreeClassifier(max_features=8, min_samples_split=4),
               DecisionTreeClassifier(max_features=8, min_samples_split=4),
               DecisionTreeClassifier(max_features=8, min_samples_split=4)),
 'fit_time': array([0.00497389, 0.00397825, 0.00495815, 0.00299215, 0.00299811,
       0.00396681, 0.00298691, 0.00299168]),
 'score_time': array([0.0009973 , 0.00196648, 0.00199604, 0.0009973 , 0.00099635,
       0.00200629, 0.00099802, 0.00195742]),
 'test_score': array([0.93006993, 0.88732394, 0.92957746, 0.93661972, 0.95104895,
       0.91549296, 0.93661972, 0.95774648]),
 'train_scor

In [16]:
#rank 4
dt3 = DecisionTreeClassifier(criterion='gini', max_depth=None, max_features=10, min_samples_split=2, splitter='best', random_state=None)

rkf = RepeatedStratifiedKFold(n_splits=4,  n_repeats=2)
scores = cross_validate(dt3, X, y, cv=rkf, return_train_score=True, return_estimator=True)
y_pred = cross_val_predict(dt3, X, y, cv=6)

pprint.pprint(scores)
print()
print("Accuracy (Training): %0.2f (+/- %0.2f)" % (scores['train_score'].mean(), scores['train_score'].std() * 2))
print("Accuracy (Testing):  %0.2f (+/- %0.2f)" % (scores['test_score'].mean(), scores['test_score'].std() * 2))
print(confusion_matrix(y, y_pred))
report = classification_report(y, y_pred)
print(report)

{'estimator': (DecisionTreeClassifier(max_features=10),
               DecisionTreeClassifier(max_features=10),
               DecisionTreeClassifier(max_features=10),
               DecisionTreeClassifier(max_features=10),
               DecisionTreeClassifier(max_features=10),
               DecisionTreeClassifier(max_features=10),
               DecisionTreeClassifier(max_features=10),
               DecisionTreeClassifier(max_features=10)),
 'fit_time': array([0.00401831, 0.00500846, 0.00299335, 0.00402713, 0.00402617,
       0.00303078, 0.00396442, 0.00300574]),
 'score_time': array([0.0029645 , 0.00096011, 0.00196099, 0.00099635, 0.00096083,
       0.00199294, 0.00102162, 0.00099778]),
 'test_score': array([0.94405594, 0.93661972, 0.9084507 , 0.91549296, 0.94405594,
       0.92957746, 0.92253521, 0.88028169]),
 'train_score': array([1., 1., 1., 1., 1., 1., 1., 1.])}

Accuracy (Training): 1.00 (+/- 0.00)
Accuracy (Testing):  0.92 (+/- 0.04)
[[336  22]
 [ 20 191]]
              pre

In [17]:
#Random Forest

In [18]:
#Model1
#creterion specified as in the other one should be gini by default
rft3 = RandomForestClassifier(criterion='gini', n_estimators=100, max_features=10, max_depth=15, random_state=None)

rkf = RepeatedStratifiedKFold(n_splits=4,  n_repeats=2)
scores = cross_validate(rft3, X, y, cv=rkf, return_train_score=True, return_estimator=True)
y_pred = cross_val_predict(rft3, X, y, cv=6)

pprint.pprint(scores)
print()
print("Accuracy (Training): %0.2f (+/- %0.2f)" % (scores['train_score'].mean(), scores['train_score'].std() * 2))
print("Accuracy (Testing):  %0.2f (+/- %0.2f)" % (scores['test_score'].mean(), scores['test_score'].std() * 2))
print(confusion_matrix(y, y_pred))
report = classification_report(y, y_pred)
print(report)

{'estimator': (RandomForestClassifier(max_depth=15, max_features=10),
               RandomForestClassifier(max_depth=15, max_features=10),
               RandomForestClassifier(max_depth=15, max_features=10),
               RandomForestClassifier(max_depth=15, max_features=10),
               RandomForestClassifier(max_depth=15, max_features=10),
               RandomForestClassifier(max_depth=15, max_features=10),
               RandomForestClassifier(max_depth=15, max_features=10),
               RandomForestClassifier(max_depth=15, max_features=10)),
 'fit_time': array([0.18252158, 0.18454432, 0.18351841, 0.18948269, 0.20844126,
       0.2034936 , 0.21741986, 0.23341274]),
 'score_time': array([0.00893641, 0.00798535, 0.00797296, 0.01193094, 0.00997257,
       0.01392436, 0.00897551, 0.01395917]),
 'test_score': array([0.97902098, 0.93661972, 0.94366197, 0.97183099, 0.97902098,
       0.9084507 , 0.97183099, 0.95774648]),
 'train_score': array([1., 1., 1., 1., 1., 1., 1., 1.])}

Ac

In [19]:
#Model 2
#creterion specified as in the other one should be gini by default
rft3 = RandomForestClassifier(criterion='entropy', n_estimators=100, max_features=10, max_depth=15, random_state=None)

rkf = RepeatedStratifiedKFold(n_splits=4,  n_repeats=2)
scores = cross_validate(rft3, X, y, cv=rkf, return_train_score=True, return_estimator=True)
y_pred = cross_val_predict(rft3, X, y, cv=6)

pprint.pprint(scores)
print()
print("Accuracy (Training): %0.2f (+/- %0.2f)" % (scores['train_score'].mean(), scores['train_score'].std() * 2))
print("Accuracy (Testing):  %0.2f (+/- %0.2f)" % (scores['test_score'].mean(), scores['test_score'].std() * 2))
print(confusion_matrix(y, y_pred))
report = classification_report(y, y_pred)
print(report)

{'estimator': (RandomForestClassifier(criterion='entropy', max_depth=15, max_features=10),
               RandomForestClassifier(criterion='entropy', max_depth=15, max_features=10),
               RandomForestClassifier(criterion='entropy', max_depth=15, max_features=10),
               RandomForestClassifier(criterion='entropy', max_depth=15, max_features=10),
               RandomForestClassifier(criterion='entropy', max_depth=15, max_features=10),
               RandomForestClassifier(criterion='entropy', max_depth=15, max_features=10),
               RandomForestClassifier(criterion='entropy', max_depth=15, max_features=10),
               RandomForestClassifier(criterion='entropy', max_depth=15, max_features=10)),
 'fit_time': array([0.21143436, 0.23237634, 0.24830198, 0.21043587, 0.2074523 ,
       0.26333547, 0.22842932, 0.22743511]),
 'score_time': array([0.0100069 , 0.01292944, 0.00897717, 0.00798225, 0.00898099,
       0.01193118, 0.00898123, 0.00899029]),
 'test_score': arra

In [20]:
#Decision tree vs Random Fortes

In [1]:
dt_acc = np.array([])
rf_acc = np.array([])
runs = 100

for x in range(runs):
    
    # splitting the dataset each run, just to get more variation in the results for this example
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=None, stratify=df['Diagnosis'])
    
    # DT model 1
    dt = DecisionTreeClassifier(criterion='entropy', max_depth=8, max_features=10, min_samples_split=2, splitter='best', random_state=None)
    dt = dt.fit(X_train,y_train)
    y_pred = dt.predict(X_test)
    dt_accuracy = np.append(dt_acc, metrics.accuracy_score(y_test, y_pred))
    
    # Random Forest with 10 trees (model 2)
    rf = RandomForestClassifier(criterion='entropy', n_estimators=100, max_features=10, max_depth=15, random_state=None)
    rf = rf.fit(X_train,y_train)
    y_pred = rf.predict(X_test)
    rf_accuracy = np.append(rf_acc, metrics.accuracy_score(y_test, y_pred))

print("Done...")
print(" - DT accuracy: %0.2f (+/- %0.2f)" % (dt_accuracy.mean(), dt_accuracy.std() * 2))
print(" - RF accuracy: %0.2f (+/- %0.2f)" % (rf_accuracy.mean(), rf_accuracy.std() * 2))


NameError: name 'np' is not defined

In [22]:
#model1
gaussianNB = GaussianNB()

rkf = RepeatedStratifiedKFold(n_splits=4,  n_repeats=5)
scores = cross_validate(gaussianNB, X, y, cv=rkf, return_train_score=True, return_estimator=True)
y_pred = cross_val_predict(gaussianNB, X, y, cv=6)

pprint.pprint(scores)
print()
print("Accuracy (Training): %0.2f (+/- %0.2f)" % (scores['train_score'].mean(), scores['train_score'].std() * 2))
print("Accuracy (Testing):  %0.2f (+/- %0.2f)" % (scores['test_score'].mean(), scores['test_score'].std() * 2))
print(confusion_matrix(y, y_pred))
report = classification_report(y, y_pred)
print(report)

{'estimator': (GaussianNB(),
               GaussianNB(),
               GaussianNB(),
               GaussianNB(),
               GaussianNB(),
               GaussianNB(),
               GaussianNB(),
               GaussianNB(),
               GaussianNB(),
               GaussianNB(),
               GaussianNB(),
               GaussianNB(),
               GaussianNB(),
               GaussianNB(),
               GaussianNB(),
               GaussianNB(),
               GaussianNB(),
               GaussianNB(),
               GaussianNB(),
               GaussianNB()),
 'fit_time': array([0.00498891, 0.002985  , 0.00498319, 0.00299573, 0.00398827,
       0.00299191, 0.00299072, 0.00199437, 0.00199461, 0.00199413,
       0.00299168, 0.00199413, 0.00199509, 0.00199604, 0.00299001,
       0.00198364, 0.00299144, 0.00302196, 0.00199246, 0.00302529]),
 'score_time': array([0.00303268, 0.00199604, 0.00199509, 0.00298882, 0.00199604,
       0.00199604, 0.00199533, 0.00199533, 0.00199461,

In [23]:
#Model 2
multinomialNB = MultinomialNB()

rkf = RepeatedStratifiedKFold(n_splits=4,  n_repeats=5)
scores = cross_validate(multinomialNB, X, y, cv=rkf, return_train_score=True, return_estimator=True)
y_pred = cross_val_predict(multinomialNB, X, y, cv=6)

pprint.pprint(scores)
print()
print("Accuracy (Training): %0.2f (+/- %0.2f)" % (scores['train_score'].mean(), scores['train_score'].std() * 2))
print("Accuracy (Testing):  %0.2f (+/- %0.2f)" % (scores['test_score'].mean(), scores['test_score'].std() * 2))
print(confusion_matrix(y, y_pred))
report = classification_report(y, y_pred)
print(report)

{'estimator': (MultinomialNB(),
               MultinomialNB(),
               MultinomialNB(),
               MultinomialNB(),
               MultinomialNB(),
               MultinomialNB(),
               MultinomialNB(),
               MultinomialNB(),
               MultinomialNB(),
               MultinomialNB(),
               MultinomialNB(),
               MultinomialNB(),
               MultinomialNB(),
               MultinomialNB(),
               MultinomialNB(),
               MultinomialNB(),
               MultinomialNB(),
               MultinomialNB(),
               MultinomialNB(),
               MultinomialNB()),
 'fit_time': array([0.0099721 , 0.00202179, 0.00398803, 0.00299048, 0.00199437,
       0.00398922, 0.00199413, 0.00296497, 0.00199795, 0.00199556,
       0.00299883, 0.00199461, 0.00199294, 0.00196075, 0.00199819,
       0.00204587, 0.00199699, 0.00195408, 0.00199485, 0.00299048]),
 'score_time': array([0.00199461, 0.00100327, 0.00200248, 0.0009973 , 0.0019

In [24]:
#Model 3
bernoulliNB = BernoulliNB()
rkf = RepeatedStratifiedKFold(n_splits=4,  n_repeats=5)
scores = cross_validate(bernoulliNB, X, y, cv=rkf, return_train_score=True, return_estimator=True)
y_pred = cross_val_predict(bernoulliNB, X, y, cv=6)

pprint.pprint(scores)
print()
print("Accuracy (Training): %0.2f (+/- %0.2f)" % (scores['train_score'].mean(), scores['train_score'].std() * 2))
print("Accuracy (Testing):  %0.2f (+/- %0.2f)" % (scores['test_score'].mean(), scores['test_score'].std() * 2))
print(confusion_matrix(y, y_pred))
report = classification_report(y, y_pred)
print(report)

{'estimator': (BernoulliNB(),
               BernoulliNB(),
               BernoulliNB(),
               BernoulliNB(),
               BernoulliNB(),
               BernoulliNB(),
               BernoulliNB(),
               BernoulliNB(),
               BernoulliNB(),
               BernoulliNB(),
               BernoulliNB(),
               BernoulliNB(),
               BernoulliNB(),
               BernoulliNB(),
               BernoulliNB(),
               BernoulliNB(),
               BernoulliNB(),
               BernoulliNB(),
               BernoulliNB(),
               BernoulliNB()),
 'fit_time': array([0.00498629, 0.00299239, 0.00299215, 0.00399041, 0.00302649,
       0.00299239, 0.0020113 , 0.00199389, 0.00296617, 0.00196624,
       0.00299191, 0.00402331, 0.00203061, 0.00298834, 0.00399876,
       0.00497818, 0.00199413, 0.00299239, 0.00299001, 0.0039525 ]),
 'score_time': array([0.00398922, 0.00099826, 0.00099754, 0.00199676, 0.00099206,
       0.00196528, 0.0019865 , 0.0

In [25]:
#All three NB compared

In [26]:
gaussian = np.array([])
muiltinomial = np.array([])
bernoulli = np.array([])

runs = 100
for x in range(runs):
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=None, stratify=df['Diagnosis'])
    
    # Gaussian NB
    gaussianNB = GaussianNB()
    gaussianNB = gaussianNB.fit(X_train,y_train)
    y_pred = gaussianNB.predict(X_test)
    gaussian = np.append(gaussian, metrics.accuracy_score(y_test, y_pred))
    
    # Multinomial NB
    multinomialNB = MultinomialNB()
    multinomialNB = multinomialNB.fit(X_train,y_train)
    y_pred = multinomialNB.predict(X_test)
    muiltinomial = np.append(muiltinomial, metrics.accuracy_score(y_test, y_pred))
    
    # Bernoulli NB
    bernoulliNB = BernoulliNB()
    bernoulliNB = bernoulliNB.fit(X_train,y_train)
    y_pred = bernoulliNB.predict(X_test)
    bernoulli = np.append(bernoulli, metrics.accuracy_score(y_test, y_pred))


print(" Gaussian accuracy:    %0.2f (+/- %0.2f)" % (gaussian.mean(), gaussian.std() * 2))
print(" Multinomial accuracy: %0.2f (+/- %0.2f)" % (muiltinomial.mean(), muiltinomial.std() * 2))
print(" Bernoulli accuracy:   %0.2f (+/- %0.2f)" % (bernoulli.mean(), bernoulli.std() * 2))


 Gaussian accuracy:    0.93 (+/- 0.04)
 Multinomial accuracy: 0.84 (+/- 0.05)
 Bernoulli accuracy:   0.62 (+/- 0.02)
