<a href="https://colab.research.google.com/github/Priangshu-18/Model-Selection-in-Machine-Learning/blob/main/Model_Selection_in_machine_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [5]:
# Load the diabetes dataset into a DataFrame.
# NOTE(review): absolute path; `/content/` looks like the Colab working directory
# (the notebook carries an "Open In Colab" badge) - adjust when running elsewhere.
diabetes_data = pd.read_csv('/content/diabetes.csv')

In [6]:
# Preview the first rows to check the column names and value types.
diabetes_data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [7]:
# Count the rows per value of the target column 'Outcome' to see the class balance.
diabetes_data['Outcome'].value_counts()

Unnamed: 0_level_0,count
Outcome,Unnamed: 1_level_1
0,500
1,268


In [8]:
# Split the frame into the feature matrix (every column except the label)
# and the target vector (the 'Outcome' column).
# `columns=` already names the axis, so the redundant `axis=1` is not passed.
X = diabetes_data.drop(columns='Outcome')
Y = diabetes_data['Outcome']

In [9]:
# Plain-text dump of the feature columns (X is still a DataFrame at this point).
print(X)

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  
0                       0.627   50  
1                       0.351   31  


In [10]:
# Plain-text dump of the target values taken from the 'Outcome' column.
print(Y)

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64


In [11]:
# Rebind X and Y to their NumPy-array form; every later cell works on these arrays.
X, Y = np.asarray(X), np.asarray(Y)

# Comparing the models with default hyperparameter values

In [12]:
# Candidate classifiers for the baseline comparison, one per line.
models = [
    LogisticRegression(max_iter=1000),
    SVC(kernel='linear'),
    KNeighborsClassifier(),
    RandomForestClassifier(random_state=0),
]

In [13]:
def compare_models():
  """Print the 5-fold cross-validation accuracy of each estimator in `models`.

  Reads the notebook-level names `models`, `X` and `Y`. For every estimator it
  prints the per-fold scores, then the mean score expressed as a percentage
  rounded to two decimals, then a separator line. Returns nothing.
  """
  for estimator in models:
    fold_scores = cross_val_score(estimator, X, Y, cv=5)
    # Mean of the fold scores, scaled to percent and rounded to 2 decimals.
    accuracy_pct = round(sum(fold_scores) / len(fold_scores) * 100, 2)
    print(f'Cross validation accuracy for the  {estimator}  =  {fold_scores}')
    print(f'Accuracy in % of the {estimator}  =  {accuracy_pct}')
    print('-----------------------------------------------------------')

In [14]:
# Run the baseline comparison: prints per-fold and mean accuracy for every model.
compare_models()

Cross validation accuracy for the  LogisticRegression(max_iter=1000)  =  [0.77272727 0.74675325 0.75324675 0.81045752 0.77777778]
Accuracy in % of the LogisticRegression(max_iter=1000)  =  77.22
-----------------------------------------------------------
Cross validation accuracy for the  SVC(kernel='linear')  =  [0.75974026 0.75324675 0.74025974 0.81045752 0.76470588]
Accuracy in % of the SVC(kernel='linear')  =  76.57
-----------------------------------------------------------
Cross validation accuracy for the  KNeighborsClassifier()  =  [0.72727273 0.72727273 0.7012987  0.75816993 0.70588235]
Accuracy in % of the KNeighborsClassifier()  =  72.4
-----------------------------------------------------------
Cross validation accuracy for the  RandomForestClassifier(random_state=0)  =  [0.77272727 0.74025974 0.77272727 0.84313725 0.76470588]
Accuracy in % of the RandomForestClassifier(random_state=0)  =  77.87
-----------------------------------------------------------


Here, RandomForestClassifier has the highest mean cross-validation accuracy (77.87%), narrowly ahead of LogisticRegression (77.22%).

# Comparing the models with different hyperparameter values using GridSearchCV

In [15]:
# Rebuild the same four candidate estimators for the grid-search comparison.
# The order matters: each model is later paired with a hyperparameter grid by position.
models = [LogisticRegression(max_iter=1000), SVC(kernel='linear'), KNeighborsClassifier(), RandomForestClassifier(random_state=0)]

In [28]:
# Hyperparameter grids to search, one entry per model.
# The entries are listed in the same order as the estimators in `models`.
parameters = {
    # Logistic regression: values tried for C.
    'log_reg_hyperparameters': {
        'C': [1, 5, 10, 20],
    },
    # Support vector classifier: kernel type crossed with C.
    'svc_hyperparameters': {
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'C': [1, 5, 10, 20],
    },
    # K-nearest neighbours: number of neighbours.
    'KNN_hyperparameters': {
        'n_neighbors': [3, 5, 10],
    },
    # Random forest: number of trees.
    'random_forest_hyperparamters': {
        'n_estimators': [10, 20, 50, 100],
    },
}

In [17]:
# Sanity check: confirm the container type of `parameters`.
type(parameters)

dict

In [29]:
# Names of the hyperparameter grids, in the order they were written into `parameters`.
model_keys = list(parameters)
print(model_keys)

['log_reg_hyperparameters', 'svc_hyperparameters', 'KNN_hyperparameters', 'random_forest_hyperparamters']


In [21]:
# Spot-check: name of the first hyperparameter grid.
model_keys[0]

'Log_reg_hyperparameters'

In [23]:
# Spot-check: name of the third hyperparameter grid.
model_keys[2]

'KNN_hyperparameters'

In [24]:
# Spot-check: look up a grid through its key name (here the second entry).
parameters[model_keys[1]]

{'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'C': [1, 5, 10, 20]}

Applying GridSearchCV

In [37]:
def ModelSelection(list_of_models, hyperparameters_dictionary):
  """Grid-search every model and tabulate the best result found for each.

  Models and grids are paired by position: the i-th model is tuned with the
  i-th entry (in insertion order) of `hyperparameters_dictionary`. Each search
  is a 5-fold `GridSearchCV` fitted on the notebook-level `X` and `Y`. The
  model and its grid are printed before each search starts.

  Parameters
  ----------
  list_of_models : iterable of estimators
      Estimators to tune, in the same order as the grids.
  hyperparameters_dictionary : dict
      Maps a label to a parameter grid (a dict of parameter name -> list of
      candidate values). Only the order of the entries is used for pairing;
      the labels themselves are free-form.

  Returns
  -------
  pandas.DataFrame
      One row per model with the columns 'model_used', 'highest_score'
      (best mean cross-validation score) and 'best_paarmeters' (the winning
      parameter combination).

  Raises
  ------
  ValueError
      If fewer grids than models are supplied.
  """
  candidates = list(list_of_models)

  # Take the grids from the dictionary that was passed in. The previous
  # version looked keys up through the notebook-level `model_keys` list, which
  # broke (or paired the wrong grid) for any other dictionary.
  param_grids = list(hyperparameters_dictionary.values())

  if len(param_grids) < len(candidates):
    raise ValueError(
        f'Got {len(candidates)} models but only {len(param_grids)} hyperparameter grids'
    )

  result = []

  # zip stops after the last model, so surplus grids are simply not used.
  for model, params in zip(candidates, param_grids):

    print(model)
    print(params)
    print('--------------------------------------------------------')

    classifier = GridSearchCV(model, params, cv=5)

    classifier.fit(X, Y)

    result.append({
        'model_used': model,
        'highest_score': classifier.best_score_,
        # (sic) misspelt column name kept so existing consumers keep working.
        'best_paarmeters': classifier.best_params_
    })

  result_dataframe = pd.DataFrame(result, columns=['model_used', 'highest_score', 'best_paarmeters'])

  return result_dataframe


In [38]:
# Tune every candidate model with its grid and display the summary table.
ModelSelection(models, parameters)

LogisticRegression(max_iter=1000)
{'C': [1, 5, 10, 20]}
--------------------------------------------------------
SVC(kernel='linear')
{'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'C': [1, 5, 10, 20]}
--------------------------------------------------------
KNeighborsClassifier()
{'n_neighbors': [3, 5, 10]}
--------------------------------------------------------
RandomForestClassifier(random_state=0)
{'n_estimators': [10, 20, 50, 100]}
--------------------------------------------------------


Unnamed: 0,model_used,highest_score,best_paarmeters
0,LogisticRegression(max_iter=1000),0.772193,{'C': 1}
1,SVC(kernel='linear'),0.770885,"{'C': 5, 'kernel': 'linear'}"
2,KNeighborsClassifier(),0.744801,{'n_neighbors': 10}
3,RandomForestClassifier(random_state=0),0.778711,{'n_estimators': 100}
