In [39]:
#importing libraries and classes
import pickle
import numpy
from pandas import read_csv
from collections import Counter
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from scipy.stats import uniform
from sklearn.model_selection import train_test_split
#Dataset used - Pima Indians onset of diabetes dataset
url = 'https://goo.gl/bDdBiA'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(url, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
#Numerical Data is standardized using standard scaler
scaler = StandardScaler().fit(X)
rescaledX = scaler.transform(X)
#Checking of the dataset is imbalanced
print(Counter(Y))
#Algorithm Evaluation
models = []
models.append(('LR', LogisticRegression(solver='liblinear')))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('NB', GaussianNB()))
models.append(('RC', RidgeClassifier()))
models.append(('SVC', SVC()))
#Below algorithms are bagging and boosting ensemble  methods to check if the performance of model is increased
models.append(('Random Forest',RandomForestClassifier(n_estimators=100, max_features=3)))
models.append(('Extra Trees',ExtraTreesClassifier(n_estimators=100, max_features=3)))
models.append(('Gradient Boosting',GradientBoostingClassifier(n_estimators=100,learning_rate=0.5,max_features=3)))
# evaluate each model in turn
results = []
names = []
scoring = 'accuracy'
for name, model in models:
    kfold = KFold(n_splits=10, random_state=7,shuffle=True)
    cv_results = cross_val_score(model, rescaledX, Y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))

#By seeing the results , it is evident to use Logistic regression algorithm as it has best accuracy
##Now  we would improve the accuracy of the Logistic regression algorithm using grid search algorithm tuning methods
CValues = numpy.array([1,10,2,3,4,5,6,7])
param_grid = dict(C=CValues,solver=["newton-cg", "lbfgs", "liblinear", "sag", "saga"])
grid = GridSearchCV(estimator=LogisticRegression(), param_grid=param_grid,cv=10)
grid.fit(rescaledX, Y)
print('LR Accuracy using Grid Search ', grid.best_score_)
print('LR Best Parameters using Grid Search', grid.best_params_)
    
#Now  we would improve the accuracy of the Logistic regression algorithm using Random search algorithm tuning methods
##Parameter C is sampled using unifrom sampling method from scipy.stats.distributions
###Solver Saga and newton-cg has similar results in grid search, we are using solver saga as it can be tested with both L1 and L2 penalty.
distributions = dict(C=uniform(loc=0, scale=4), penalty=['l2','l1'])
random=RandomizedSearchCV(LogisticRegression(solver='saga',random_state=0), distributions, random_state=0,cv=10)
random.fit(rescaledX, Y)
print('LR Best Accuracy using Random Search', random.best_score_)
print('LR Best Parameters using Random Search', random.best_params_)

#Now  we will use tuning on ensemble algorithm  to check if the accuracy is increased
distributions = dict(n_estimators=[50,60,70,80,90,100,110,120], max_features=[3,4,5,6,7,8])
randomRF=RandomizedSearchCV(RandomForestClassifier(), distributions, random_state=0,cv=10)
randomRF.fit(rescaledX, Y)
print('Random Forest Best Accuracy using Random Search', randomRF.best_score_)
print('Random Forest Best Parameters using Random Search', randomRF.best_params_)

#After Seeing the results, we conclude to use the model logistic regresssion as it shows  best accuracy with C': 2.195254015709299, 'penalty': 'l1', and solver='Saga'
##Let us save the model
filename = 'finalized_model_Diabetes_Dataset.sav'
#We have used the same parameters as we derived from hypertuning of algorithm parameters
Final_model=LogisticRegression(solver='saga',random_state=0,C=2.1952,penalty='l1')
Final_model.fit(rescaledX, Y)
pickle.dump(Final_model, open(filename, 'wb'))

#Load the model
loaded_model = pickle.load(open(filename, 'rb'))
X_train, X_test, Y_train, Y_test = train_test_split(rescaledX, Y, test_size=0.33,random_state=7)
result = loaded_model.score(X_test, Y_test)
print(result)


Counter({0.0: 500, 1.0: 268})
LR: 0.772163 (0.047603)
LDA: 0.766969 (0.047966)
NB: 0.759142 (0.038960)
RC: 0.770865 (0.050661)
SVC: 0.756545 (0.056222)
Random Forest: 0.763038 (0.044072)
Extra Trees: 0.765567 (0.050294)
Gradient Boosting: 0.733237 (0.084247)
LR Accuracy using Grid Search  0.7721804511278195
LR Best Parameters using Grid Search {'C': 1, 'solver': 'newton-cg'}
LR Best Accuracy using Random Search 0.7734791524265209
LR Best Parameters using Random Search {'C': 2.195254015709299, 'penalty': 'l1'}
0.7913385826771654
