In [1]:
import sklearn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
import numpy as mp

from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB

data = pd.read_csv("diabetes.csv")
features = data.columns
features = [x for x in features if x != 'Outcome']
target = "Outcome"
X = data[features]
Y = data[target]

#number of yes and no in the data set 

one_data = data[data['Outcome']==1]
zero_data = data[data['Outcome']==0]

print(one_data.shape,zero_data.shape)


(268, 9) (500, 9)


In [2]:
#over sampling to balance the number of yes and no outcome 

from imblearn.over_sampling import RandomOverSampler
from collections import Counter

ros =  RandomOverSampler(sampling_strategy=1) # to produce equal number of row
p,q = ros.fit_resample(X, Y)

print("befor sampling {}".format(Counter(Y)))
print("after sampling {}".format(Counter(q)))
print("over all shape of the data after sampling : ",p.shape,q.shape)

befor sampling Counter({0: 500, 1: 268})
after sampling Counter({1: 500, 0: 500})
over all shape of the data after sampling :  (1000, 8) (1000,)


In [3]:
#fill the missing value with mean value

from sklearn.impute import SimpleImputer
fill_null = SimpleImputer(missing_values=0, strategy = "mean")
without_missing_value = fill_null.fit_transform(p)


In [4]:
x_train, x_test, y_train, y_test = train_test_split(without_missing_value, q, test_size = 0.30)

print(x_train.shape,x_test.shape)

(700, 8) (300, 8)


In [5]:
#Naive Bayes 

nb=GaussianNB()
nb= nb.fit(x_train,y_train)
y_pred=nb.predict(x_test)

#confusion matrix
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

print("confusion matrix : \n",confusion_matrix(y_test,y_pred))
print("\nClassification report : \n",metrics.classification_report(y_test,y_pred))
print("\nAccuracy using Naive Bayes: ", round(accuracy_score(y_test, y_pred)*100), "%" )

confusion matrix : 
 [[104  24]
 [ 55 117]]

Classification report : 
               precision    recall  f1-score   support

           0       0.65      0.81      0.72       128
           1       0.83      0.68      0.75       172

    accuracy                           0.74       300
   macro avg       0.74      0.75      0.74       300
weighted avg       0.75      0.74      0.74       300


Accuracy using Naive Bayes:  74 %


In [1]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
kf = KFold(10)
result = cross_val_score(nb,x_train,y_train, cv = kf ) #cv = cross val(how many train and test are there)
print(result)
print("average Accuracy using cross validation : \n",round(mp.mean(result)*100),"%")

NameError: name 'nb' is not defined

In [7]:
#Decision Tree

from sklearn.tree import DecisionTreeClassifier
dt= DecisionTreeClassifier(min_samples_split = 100, criterion='entropy')
dt= dt.fit(x_train,y_train)
y_pred=dt.predict(x_test)

#confusion matrix
print("confusion matrix : \n",confusion_matrix(y_test,y_pred))
print("\nClassification report : \n",metrics.classification_report(y_test,y_pred))
print("\nAccuracy using DecisionTree: ", round(accuracy_score(y_test, y_pred)*100), "%" )

confusion matrix : 
 [[107  21]
 [ 45 127]]

Classification report : 
               precision    recall  f1-score   support

           0       0.70      0.84      0.76       128
           1       0.86      0.74      0.79       172

    accuracy                           0.78       300
   macro avg       0.78      0.79      0.78       300
weighted avg       0.79      0.78      0.78       300


Accuracy using DecisionTree:  78 %


In [2]:
result = cross_val_score(dt,x_train,y_train, cv = kf ) #cv = cross val(how many train and test are there)
print(result)
print("average Accuracy using cross validation : \n",round(mp.mean(result)*100),"%")

NameError: name 'dt' is not defined

In [9]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=10)
rf = rf.fit(x_train,y_train.ravel())
y_pred=rf.predict(x_test)

#confusion matrix
print("confusion matrix : \n",confusion_matrix(y_test,y_pred))
print("\nClassification report : \n",metrics.classification_report(y_test,y_pred))
print("\nAccuracy using RandomForestClassifier: ", round(accuracy_score(y_test, y_pred)*100), "%" )

confusion matrix : 
 [[109  19]
 [ 16 156]]

Classification report : 
               precision    recall  f1-score   support

           0       0.87      0.85      0.86       128
           1       0.89      0.91      0.90       172

    accuracy                           0.88       300
   macro avg       0.88      0.88      0.88       300
weighted avg       0.88      0.88      0.88       300


Accuracy using RandomForestClassifier:  88 %


In [10]:
result = cross_val_score(rf,x_train,y_train, cv = kf ) #cv = cross val(how many train and test are there)
print(result)
print("average Accuracy using cross validation : \n",round(mp.mean(result)*100),"%")

[0.82857143 0.72857143 0.77142857 0.85714286 0.82857143 0.75714286
 0.88571429 0.84285714 0.78571429 0.8       ]
average Accuracy using cross validation : 
 81 %


In [11]:
from sklearn.ensemble import VotingClassifier

vc = VotingClassifier(estimators = [('nb',nb),('dt',dt),('rf',rf)], voting = 'hard')
model = vc 
vc = vc.fit(x_train,y_train)
y_pred=vc.predict(x_test)

#confusion matrix
print("confusion matrix : \n",confusion_matrix(y_test,y_pred))
print("\nClassification report : \n",metrics.classification_report(y_test,y_pred))
print("\nAccuracy using VotingClassifier: ", round(accuracy_score(y_test, y_pred)*100), "%" )

confusion matrix : 
 [[110  18]
 [ 36 136]]

Classification report : 
               precision    recall  f1-score   support

           0       0.75      0.86      0.80       128
           1       0.88      0.79      0.83       172

    accuracy                           0.82       300
   macro avg       0.82      0.83      0.82       300
weighted avg       0.83      0.82      0.82       300


Accuracy using VotingClassifier:  82 %


In [12]:

from sklearn.model_selection import RepeatedStratifiedKFold
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(model, x_train,y_train, scoring='accuracy', cv=cv, n_jobs=-1)
print(scores)
print("average Accuracy: ",scores.mean()*100,"%")

[0.74285714 0.68571429 0.81428571 0.68571429 0.81428571 0.75714286
 0.71428571 0.78571429 0.82857143 0.78571429 0.74285714 0.81428571
 0.7        0.71428571 0.91428571 0.85714286 0.65714286 0.7
 0.77142857 0.72857143 0.72857143 0.7        0.78571429 0.72857143
 0.67142857 0.77142857 0.78571429 0.77142857 0.84285714 0.82857143]
average Accuracy:  76.09523809523809 %
