In [9]:

import numpy as np

import pandas as pd

from sklearn.model_selection import train_test_split

from xgboost import XGBClassifier

from scipy.stats import mode

from sklearn.ensemble import RandomForestClassifier

from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score

import pickle

#reading data

data = pd.read_csv('https://raw.githubusercontent.com/harshtyagimdr/diabities_detection/master/diabaties.csv')

 

feature_columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness','DiabetesPedigreeFunction', 'Insulin', 'BMI', 'Age']

predicted_class = ['Outcome']

 

X = data[feature_columns].values

y = data[predicted_class].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state=10, stratify= y)

 

#initializing models

xgb = XGBClassifier(random_state=10 ,  n_estimators =  100)

rf = RandomForestClassifier(random_state=10 ,  n_estimators = 100)

dt = DecisionTreeClassifier(criterion='entropy', max_depth=8)




#training models

xgb.fit(X_train, y_train.ravel())

rf.fit(X_train, y_train.ravel())

dt.fit(X_train, y_train.ravel())

pickle.dump(rf,open('random_forest.pkl','wb'))
pickle.dump(dt,open('decision_tree.pkl','wb'))
pickle.dump(xgb,open('xgboost.pkl','wb'))


#evaluating model performance

xgb_acc = accuracy_score(y_test, xgb.predict(X_test))

rf_acc = accuracy_score(y_test, rf.predict(X_test))

dt_acc = accuracy_score(y_test, dt.predict(X_test))


print(f'Accuracy: {xgb_acc}')

print(f'Accuracy: {rf_acc}')

print(f'Accuracy: {dt_acc}')



#making predictions

xgb_pred = xgb.predict(X_test[6].reshape(1, -1))[0]

rf_pred = rf.predict(X_test[6].reshape(1, -1))[0]

dt_pred = dt.predict(X_test[6].reshape(1, -1))[0]

xgb_pred = xgb.predict(np.array([  6,    208,     72,     35,      0,     33.6,     0.171,  50,   ]).reshape(1, -1))[0]

rf_pred = rf.predict([[  6,    208,     72,     35,      0,     33.6,     0.171,  50,   ]])[0]

dt_pred = dt.predict([[  6,    208,     72,     35,      0,     33.6,     0.171,  50,   ]])[0]



acc = accuracy_score(y_test, mode([xgb.predict(X_test),rf.predict(X_test),dt.predict(X_test)])[0][0])


print(f'Ensemble Accuracy: {acc}')

print(xgb_pred,rf_pred,dt_pred)



Accuracy: 0.7792207792207793
Accuracy: 0.8181818181818182
Accuracy: 0.7402597402597403
Ensemble Accuracy: 0.8051948051948052
1 1 1


In [9]:
import numpy as np

import pandas as pd

from sklearn.model_selection import train_test_split

from xgboost import XGBClassifier

from scipy.stats import mode

from sklearn.ensemble import RandomForestClassifier

from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score

import pickle

from statistics import median

#reading data

data = pd.read_csv('https://raw.githubusercontent.com/harshtyagimdr/diabities_detection/master/diabaties.csv')

 

feature_columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']

predicted_class = ['Outcome']

 

X = data[feature_columns].values

y = data[predicted_class].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state=10, stratify= y)

print("Median of Diabetes Pedigree fn is :",median(data['DiabetesPedigreeFunction'].values))


max_acc=0
max_acc_technique=''
max_dt_acc=0
min_dt_acc=100

max_rf_acc=0
min_rf_acc=100
max_xg_acc=0
min_xg_acc=100

max_ens_acc=0
min_ens_acc=100

for i in range(20):
#initializing models

  xgb = XGBClassifier(random_state=10 ,  n_estimators =  100)

  rf = RandomForestClassifier(random_state=10 ,  n_estimators = 100)

  dt = DecisionTreeClassifier(criterion='entropy', max_depth=8)

  

  #training models

  xgb.fit(X_train, y_train.ravel())

  rf.fit(X_train, y_train.ravel())

  dt.fit(X_train, y_train.ravel())

  

  #evaluating model performance

  xgb_acc = accuracy_score(y_test, xgb.predict(X_test))

  rf_acc = accuracy_score(y_test, rf.predict(X_test))

  dt_acc = accuracy_score(y_test, dt.predict(X_test))

  #making predictions

  xgb_pred = xgb.predict(X_test[6].reshape(1, -1))[0]

  rf_pred = rf.predict(X_test[6].reshape(1, -1))[0]

  dt_pred = dt.predict(X_test[6].reshape(1, -1))[0]


  acc = accuracy_score(y_test, mode([xgb.predict(X_test),rf.predict(X_test),dt.predict(X_test)])[0][0])

  max_ens_acc=max(max_ens_acc,acc)
  min_ens_acc=min(min_ens_acc,acc)

  max_dt_acc=max(max_dt_acc,dt_acc)
  min_dt_acc=min(min_dt_acc,dt_acc)

  max_rf_acc=max(max_rf_acc,rf_acc)
  min_rf_acc=min(min_rf_acc,rf_acc)

  max_xg_acc=max(max_xg_acc,xgb_acc)
  min_xg_acc=min(min_xg_acc,xgb_acc)

  max_acc=max([max_dt_acc,max_rf_acc,max_xg_acc,max_ens_acc])

  if(max_acc==max_dt_acc):
    max_acc_technique='Decesion Tree'
  
  if(max_acc==max_rf_acc):
    max_acc_technique='Random Forest'

  if(max_acc==max_xg_acc):
    max_acc_technique='XGBoost'

  if(max_acc==max_ens_acc):
    max_acc_technique='Ensemble'

print(f'Max Decesion Tree Accuracy: {max_dt_acc}')
print(f'Min Decesion Tree Accuracy: {min_dt_acc}')
print()
print(f'Max Random Forest Accuracy: {max_rf_acc}')
print(f'Min Random Forest Accuracy: {min_rf_acc}')
print()
print(f'Max XGBoost Accuracy: {max_xg_acc}')
print(f'Min XGBoost Accuracy: {min_xg_acc}')
print()
print(f'Max Ensemble Accuracy: {max_ens_acc}')
print(f'Min Ensemble Accuracy: {min_ens_acc}')
print()
print(f'Max Accuracy: {max_acc}')
print(f'Max Accuracy Technique: {max_acc_technique}')

Median of Diabetes Pedigree fn is : 0.3725
Max Decesion Tree Accuracy: 0.7922077922077922
Min Decesion Tree Accuracy: 0.7272727272727273

Max Random Forest Accuracy: 0.8181818181818182
Min Random Forest Accuracy: 0.8181818181818182

Max XGBoost Accuracy: 0.8441558441558441
Min XGBoost Accuracy: 0.8441558441558441

Max Ensemble Accuracy: 0.8571428571428571
Min Ensemble Accuracy: 0.8571428571428571

Max Accuracy: 0.8571428571428571
Max Accuracy Technique: Ensemble
