<a href="https://colab.research.google.com/github/NosenkoArtem/MADMO-advanced-/blob/master/%22%5Bseminar%5D_ensemble_tutorial_ipynb%22.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

In [None]:
# Load the diabetes table from the CSV file referenced by this notebook.
df = pd.read_csv('diabetes_data.csv')

# Show the first five rows (head() defaults to n=5) as the cell's output.
df.head()

Unnamed: 0,pregnancies,glucose,diastolic,triceps,insulin,bmi,dpf,age,diabetes
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
# Split the frame into the target vector and the feature matrix.
# The target is the binary 'diabetes' column.
y = df['diabetes']

# Features are every remaining column. The first drop stays strict so a missing
# target column still raises. The second drop is lenient on purpose:
# NOTE(review): the df.head() preview lists an 'Unnamed: 0' column that looks
# like a saved row index. If it is really present in the CSV it carries no
# signal and should not be a feature; if it is not present, this is a no-op.
X = (
    df.drop(columns=['diabetes'])
      .drop(columns=['Unnamed: 0'], errors='ignore')
)

In [None]:
# Repeated hold-out evaluation of four classifiers and a soft-voting ensemble.
# For each of 10 seeds: make a stratified 50/50 split, tune KNN / random forest /
# SVM with 5-fold grid search on the training half, fit a logistic regression,
# and record every model's accuracy on the held-out half.
score_knn, score_rf, score_svm, score_log_reg = [], [], [], []
score_ensemble = []

# The hyper-parameter grids do not depend on the split, so build them once.
params_knn = {'n_neighbors': np.arange(1, 50)}
params_rf = {'n_estimators': [10, 50, 100, 200, 250]}
params_svm = {'svc__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
              'svc__C': np.logspace(-1, 2, 10)}

for seed in range(10):
  # stratified split so both halves keep the class balance of y
  X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size=0.5, stratify=y, random_state=seed)

  # --- KNN: grid-search the number of neighbours ---
  knn = KNeighborsClassifier()
  knn_gs = GridSearchCV(knn, params_knn, cv=5)
  knn_gs.fit(X_train, y_train)
  knn_best = knn_gs.best_estimator_

  # --- Random forest: grid-search the number of trees ---
  # random_state ties the forest's randomness to the split seed so that a
  # re-run of this cell reproduces the same scores.
  rf = RandomForestClassifier(random_state=seed)
  rf_gs = GridSearchCV(rf, params_rf, cv=5)
  rf_gs.fit(X_train, y_train)
  rf_best = rf_gs.best_estimator_

  # --- SVM: grid-search kernel and C on standardized features ---
  # max_iter must be an integer (a float such as 1e6 is rejected by
  # scikit-learn's parameter validation). probability=True is needed for soft
  # voting and involves internal shuffling, hence the random_state.
  pipeline = Pipeline([
      ('scaler', StandardScaler()),
      ('svc', SVC(max_iter=10**6, probability=True, random_state=seed)),
  ])
  svm_gs = GridSearchCV(pipeline, params_svm, cv=5)
  svm_gs.fit(X_train, y_train)
  svm_best = svm_gs.best_estimator_
  # show which kernel / C won for this split
  print(svm_gs.best_params_)

  # --- Logistic regression: no tuning, just a generous iteration budget ---
  # (integer max_iter for the same validation reason as above)
  log_reg = LogisticRegression(max_iter=10**5)
  log_reg.fit(X_train, y_train)

  # Score each model once on the held-out half, then record and report it.
  knn_acc = knn_best.score(X_test, y_test)
  rf_acc = rf_best.score(X_test, y_test)
  svm_acc = svm_best.score(X_test, y_test)
  log_reg_acc = log_reg.score(X_test, y_test)

  score_knn.append(knn_acc)
  score_rf.append(rf_acc)
  score_svm.append(svm_acc)
  score_log_reg.append(log_reg_acc)

  print('knn: {}'.format(knn_acc))
  print('rf: {}'.format(rf_acc))
  print('svm: {}'.format(svm_acc))
  print('log_reg: {}'.format(log_reg_acc))

  # --- Soft-voting ensemble ---
  # Members are the tuned random forest, the tuned SVM pipeline and the
  # logistic regression; the tuned KNN is scored on its own above but is not
  # part of the vote.
  estimators=[('rf', rf_best), ('svm', svm_best ), ('log_reg', log_reg)]
  ensemble = VotingClassifier(estimators, voting='soft')
  ensemble.fit(X_train, y_train)
  score_ensemble.append(ensemble.score(X_test, y_test))



{'svc__C': 0.46415888336127786, 'svc__kernel': 'sigmoid'}
knn: 0.71875
rf: 0.7708333333333334
svm: 0.765625
log_reg: 0.7734375




{'svc__C': 0.46415888336127786, 'svc__kernel': 'rbf'}
knn: 0.7083333333333334
rf: 0.7395833333333334
svm: 0.7265625
log_reg: 0.7369791666666666




{'svc__C': 0.1, 'svc__kernel': 'linear'}
knn: 0.7239583333333334
rf: 0.734375
svm: 0.7682291666666666
log_reg: 0.7760416666666666
{'svc__C': 0.1, 'svc__kernel': 'linear'}
knn: 0.7630208333333334
rf: 0.7447916666666666
svm: 0.7682291666666666
log_reg: 0.765625
{'svc__C': 0.21544346900318834, 'svc__kernel': 'rbf'}
knn: 0.6979166666666666
rf: 0.7005208333333334
svm: 0.7395833333333334
log_reg: 0.7578125




{'svc__C': 2.1544346900318834, 'svc__kernel': 'linear'}
knn: 0.75
rf: 0.7552083333333334
svm: 0.78125
log_reg: 0.7760416666666666




{'svc__C': 2.1544346900318834, 'svc__kernel': 'rbf'}
knn: 0.7526041666666666
rf: 0.7447916666666666
svm: 0.7447916666666666
log_reg: 0.7838541666666666
{'svc__C': 0.1, 'svc__kernel': 'linear'}
knn: 0.7239583333333334
rf: 0.7552083333333334
svm: 0.75
log_reg: 0.7552083333333334




{'svc__C': 0.21544346900318834, 'svc__kernel': 'linear'}
knn: 0.7291666666666666
rf: 0.7057291666666666
svm: 0.7630208333333334
log_reg: 0.7604166666666666




{'svc__C': 2.1544346900318834, 'svc__kernel': 'rbf'}
knn: 0.7239583333333334
rf: 0.7734375
svm: 0.7369791666666666
log_reg: 0.7578125


In [None]:
# Report mean and standard deviation of test accuracy over all splits,
# one line per model, in the same order and wording as before.
summary_rows = (
    ('knn score mean+/-std: {:.2f}+/-{:.2f}', score_knn),
    ('Log reg score mean+/-std: {:.2f}+/-{:.2f}', score_log_reg),
    ('svm score mean+/-std: {:.2f}+/-{:.2f}', score_svm),
    ('rf score mean+/-std: {:.2f}+/-{:.2f}', score_rf),
    ('ensemble score mean+/-std: {:.2f}+/-{:.2f}', score_ensemble),
)
for template, scores in summary_rows:
    print(template.format(np.mean(scores), np.std(scores)))

knn score mean+/-std: 0.73+/-0.02
Log reg score mean+/-std: 0.76+/-0.01
svm score mean+/-std: 0.75+/-0.02
rf score mean+/-std: 0.74+/-0.02
ensemble score mean+/-std: 0.77+/-0.01
