In [1]:
import numpy as np
import pandas as pd
import sklearn 
import matplotlib.pyplot as plt

In [2]:
# Read the diabetes CSV from the working directory into a DataFrame and
# preview its first five rows as the cell output.
dataset = pd.read_csv('diabetes.csv')
dataset.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
# Count NaN entries per column.
# NOTE(review): the preview rows contain 0 for Insulin and SkinThickness;
# if those zeros stand in for missing measurements, this NaN count will not
# reveal them -- confirm against the data source.
dataset.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [4]:
# Split the frame positionally: every column except the last becomes the
# feature matrix, and the last column becomes the target vector.
X = dataset.iloc[:, :-1].to_numpy()
Y = dataset.iloc[:, -1].to_numpy()


In [5]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
# Candidate estimators and the hyper-parameter grid to search for each one.
# Each entry maps a short model name to:
#   'model'  -- an unfitted estimator instance
#   'params' -- the grid of hyper-parameter values to try for that estimator
#
# Stochastic estimators are given a fixed random_state so that re-running the
# notebook on a fresh kernel reproduces the same scores and selected models.
model_params = {
    'svm': {
        # probability=True fits an extra internally cross-validated
        # calibration step; random_state makes that step repeatable.
        'model': svm.SVC(gamma='auto', probability=True, random_state=0),
        'params': {
            'C': [1, 10, 100, 1000],
            'kernel': ['rbf', 'linear']
        }
    },
    'random_forest': {
        # Without a seed the bootstrap samples and feature subsets differ on
        # every run, so the reported score would not be reproducible.
        'model': RandomForestClassifier(random_state=0),
        'params': {
            'n_estimators': [1, 5, 10]
        }
    },
    'logistic_regression': {
        # multi_class is left at its default ('auto' was already the default);
        # passing it explicitly only triggers a deprecation warning on recent
        # scikit-learn releases.
        'model': LogisticRegression(solver='liblinear'),
        'params': {
            'C': [1, 5, 10]
        }
    }
}

In [6]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# identical between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

In [7]:
# Run a 5-fold grid search for every candidate in model_params, remembering
# the refit best estimator per model and a summary row (name, best CV score,
# best parameters) for the comparison table shown as the cell output.
scores = []
best_estimators = {}
for name, spec in model_params.items():
    search = GridSearchCV(
        estimator=spec['model'],
        param_grid=spec['params'],
        cv=5,
        return_train_score=False,
    )
    search.fit(X_train, Y_train)

    best_estimators[name] = search.best_estimator_
    scores.append({
        'model': name,
        'best_score': search.best_score_,
        'best_params': search.best_params_,
    })

df = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
df

Unnamed: 0,model,best_score,best_params
0,svm,0.762255,"{'C': 100, 'kernel': 'linear'}"
1,random_forest,0.728029,{'n_estimators': 10}
2,logistic_regression,0.760656,{'C': 10}


In [13]:
# Score the tuned random forest on the held-out test split.
best_estimators['random_forest'].score(X=X_test, y=Y_test)

0.7532467532467533

In [14]:
# Score the tuned SVM on the held-out test split.
best_estimators['svm'].score(X=X_test, y=Y_test)

0.8051948051948052

In [15]:
# Score the tuned logistic regression on the held-out test split.
best_estimators['logistic_regression'].score(X=X_test, y=Y_test)

0.8246753246753247

In [11]:
# Keep the tuned logistic regression as the final model.
# NOTE(review): this choice follows the held-out test scores rather than the
# cross-validation scores, so the reported test score is no longer an
# unbiased estimate for the selected model -- confirm this is intended.
best_clf = best_estimators['logistic_regression']

In [12]:

import joblib
# Persist the selected model to disk; the call returns the list of written
# file names, which is displayed as the cell output.
joblib.dump(best_clf, filename='Trained_model.pkl')

['Trained_model.pkl']