In [1]:
from google.colab import drive
drive.mount("/content/drive/")

Mounted at /content/drive/


**<h1><center>Model Selection and Tunning</center></h1>**

In this file we are going to select the best model which gives good accuracy. And fine tune that model.

**<h2>Contents</h2>**

1. Importing Libraries

2. Loading Data

3. Splitting train and test set

4. Model Training

4. Selecting and fine Tuning that model

5. Saving Model

**<h3>1. Importing Libraries</h3>**

Here we will import all packages which is neccessary for training purpose.(sklearn, pandas, imblearn, XGboost, Catboost.)

**Install packages xgboost and imblearn if it is not installed.**
1. !pip install xgboost
2. !pip install catboost

In [2]:
import pandas as pd

from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.metrics import confusion_matrix,classification_report, f1_score

from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

import numpy as np

import warnings
warnings.filterwarnings("ignore")

**2. Loading Data**

Here we will load preprocessed data. 

In [3]:
# Reading data set
train_data = pd.read_csv("/content/drive/My Drive/Diabetes Prediction/\
Data Files/processed_diabetes.csv")

train_data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,1,85,66,29,0,26.6,0.351,31,0
1,1,89,66,23,94,28.1,0.167,21,0
2,5,116,74,0,0,25.6,0.201,30,0
3,10,115,0,0,0,35.3,0.134,29,0
4,4,110,92,0,0,37.6,0.191,30,0


In [4]:
train_data.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

**3. Train-Test Splits**

Here we will use Stratified Train-Test Splits 

-  Some classification problems do not have a balanced number of examples for each class label. As such, it is desirable to split the dataset into train and test sets in a way that preserves the same proportions of examples in each class as observed in the original dataset. This is called a stratified train-test split.

- We can achieve this by setting the “stratify” argument to the y component of the original dataset. This will be used by the train_test_split() function to ensure that both the train and test sets have the proportion of examples in each class that is present in the provided “y” array.

In [5]:
X = train_data.drop(['Outcome'],axis = 1)
y = train_data['Outcome']

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.20, 
                                                    random_state=1, 
                                                    stratify=y)

In [6]:
y_test.value_counts()

0    100
1     84
Name: Outcome, dtype: int64

In [7]:
y_train.value_counts()

0    398
1    338
Name: Outcome, dtype: int64

**4. Model Selection**

Here we will load various algorithms and train and we select best algorithms.

In [8]:
import pickle

prediction_data = {
    'Algorithm_name' : [],
    'Accuracy' : [],
    "F1_Score" : [],
    "Cross_validation_mean" : []
}

models = {'Logistic_Regression' : LogisticRegression(),
          "Random_forest": RandomForestClassifier(),
          "Gradient_boosting": GradientBoostingClassifier(),
          "Ada_boost": AdaBoostClassifier(),
          "KNN_algorithm": KNeighborsClassifier(),
          "XGB_boost": XGBClassifier(random_state=6,n_estimators=50)
}

for name, model in models.items():
    model.fit(X_train, y_train)

    pred = model.predict(X_test)
    result = model.score(X_test,y_test)

    cross_validation_score = cross_val_score(model,
                                             X_train, 
                                             y_train,
                                             cv=10,
                                             scoring="f1_macro"
                                             )
    prediction_data['Algorithm_name'].append(name)
    prediction_data['Accuracy'].append(result)
    prediction_data['F1_Score'].append(f1_score(y_test, pred))
    prediction_data['Cross_validation_mean'].append(np.mean(
        cross_validation_score))

    folder_path = "/content/drive/My Drive/Diabetes Prediction/Data Files/"
    algo_name = name + ".pkl"

    filepath = folder_path + algo_name

    with open(filepath, "wb") as file_pointer:
        pickle.dump(model, file_pointer)

    print("Accuracy of {} is {}".format(name, result))
    
    print("Cross validation {} mean is {}\n\n".format(name, 
                                                      np.mean(
                                                          cross_validation_score
                                                          )))


Accuracy of Logistic_Regression is 0.782608695652174
Cross validation Logistic_Regression mean is 0.7425992838583333


Accuracy of Random_forest is 0.8532608695652174
Cross validation Random_forest mean is 0.8500795787818936


Accuracy of Gradient_boosting is 0.8043478260869565
Cross validation Gradient_boosting mean is 0.821240174642921


Accuracy of Ada_boost is 0.8206521739130435
Cross validation Ada_boost mean is 0.784161988883991


Accuracy of KNN_algorithm is 0.7554347826086957
Cross validation KNN_algorithm mean is 0.7266549780195775


Accuracy of XGB_boost is 0.8152173913043478
Cross validation XGB_boost mean is 0.7928928833501109




In [9]:
prediction_data = pd.DataFrame(prediction_data)

prediction_data

Unnamed: 0,Algorithm_name,Accuracy,F1_Score,Cross_validation_mean
0,Logistic_Regression,0.782609,0.759036,0.742599
1,Random_forest,0.853261,0.847458,0.85008
2,Gradient_boosting,0.804348,0.788235,0.82124
3,Ada_boost,0.820652,0.797546,0.784162
4,KNN_algorithm,0.755435,0.745763,0.726655
5,XGB_boost,0.815217,0.804598,0.792893
