Comparing ML models based on cross-validation accuracy

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from IPython import get_ipython
from IPython.display import display
from sklearn.preprocessing import LabelEncoder

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

read dataset

In [6]:
# Load the crop-recommendation table from the CSV file next to the notebook.
data = pd.read_csv(filepath_or_buffer='Crop_recommendation.csv')

separate label and standardise the remaining columns

In [7]:
# Feature matrix: every column of `data` except the target column 'label'.
x = data.drop(columns=['label'])
print(x)

# Standardise the features. The scaler is fitted on all rows of `x`, and the
# fitted object is kept in `std`.
std = StandardScaler().fit(x)
x_std = std.transform(x)
print(x_std)

        N   P   K  temperature   humidity        ph    rainfall
0      90  42  43    20.879744  82.002744  6.502985  202.935536
1      85  58  41    21.770462  80.319644  7.038096  226.655537
2      60  55  44    23.004459  82.320763  7.840207  263.964248
3      74  35  40    26.491096  80.158363  6.980401  242.864034
4      78  42  42    20.130175  81.604873  7.628473  262.717340
...   ...  ..  ..          ...        ...       ...         ...
2195  107  34  32    26.774637  66.413269  6.780064  177.774507
2196   99  15  27    27.417112  56.636362  6.086922  127.924610
2197  118  33  30    24.131797  67.225123  6.362608  173.322839
2198  117  32  34    26.272418  52.127394  6.758793  127.175293
2199  104  18  30    23.603016  60.396475  6.779833  140.937041

[2200 rows x 7 columns]
[[ 1.0687974  -0.34455075 -0.1016875  ...  0.47266646  0.04330173
   1.8103605 ]
 [ 0.93332887  0.14061552 -0.14118477 ...  0.39705125  0.73487256
   2.24205791]
 [ 0.25598625  0.04964684 -0.08193887 ...  0.

Extract the crop label (target column)

In [8]:
# Target vector: the 'label' column of `data` (crop names as strings).
y = data.loc[:, 'label']
print(y)

0         rice
1         rice
2         rice
3         rice
4         rice
         ...  
2195    coffee
2196    coffee
2197    coffee
2198    coffee
2199    coffee
Name: label, Length: 2200, dtype: object


Encode the crop labels as integers

In [9]:
# Convert the string crop names in `y` into integer class codes; the fitted
# encoder stays available as `le`.
le = LabelEncoder()
ye = le.fit_transform(y)
# The recorded output shows e.g. 'rice' -> 20 and 'coffee' -> 5.
print(ye)

[20 20 20 ...  5  5  5]


Comparing models with 5-fold cross-validation

In [20]:
# Candidate classifiers to compare, one per line. The order is significant:
# later cells pair these estimators with their hyperparameter grids by position.
models = [
    LogisticRegression(max_iter=3000),
    SVC(),
    KNeighborsClassifier(),
    RandomForestClassifier(random_state=0),
]

In [21]:
def compare_models_cross_validation3():
  """Print the 5-fold cross-validation accuracy of every estimator in ``models``.

  Reads the module-level ``models``, ``x_std`` and ``ye``. For each estimator
  it prints the per-fold scores returned by ``cross_val_score`` and their mean
  expressed as a percentage rounded to two decimals. Returns ``None``.
  """
  for estimator in models:
    fold_scores = cross_val_score(estimator, x_std, ye, cv=5)
    # Arithmetic mean of the fold scores, scaled to percent, two decimals.
    mean_percent = round(sum(fold_scores) / len(fold_scores) * 100, 2)
    print('Cross validation accuracy for the model', estimator, 'is', fold_scores)
    print('Accuracy score of the model', estimator, 'is', mean_percent, '%')
    print(' ')
# Run the comparison; the per-model report is printed below.
compare_models_cross_validation3()

Cross validation accuracy for the model LogisticRegression(max_iter=3000) is [0.97272727 0.95454545 0.97727273 0.96818182 0.98409091]
Accuracy score of the model LogisticRegression(max_iter=3000) is 97.14 %
 
Cross validation accuracy for the model SVC() is [0.97954545 0.98409091 0.98181818 0.98181818 0.98409091]
Accuracy score of the model SVC() is 98.23 %
 
Cross validation accuracy for the model KNeighborsClassifier() is [0.96363636 0.97045455 0.97045455 0.97045455 0.98181818]
Accuracy score of the model KNeighborsClassifier() is 97.14 %
 
Cross validation accuracy for the model RandomForestClassifier(random_state=0) is [0.99772727 0.99090909 0.99772727 0.99545455 0.98636364]
Accuracy score of the model RandomForestClassifier(random_state=0) is 99.36 %
 


In [22]:
# Grid-search space for each candidate model, keyed by a descriptive name.
# Insertion order matters: grids are matched to the entries of `models` by
# position (logistic regression, SVC, KNN, random forest).
# The key 'log_reg_hyperparametera' keeps its original spelling because it is
# a runtime string that appears in the recorded outputs.
model_hyperparameters = dict(
    log_reg_hyperparametera=dict(C=[1, 5, 10, 20]),
    svc_hyperparameters=dict(
        kernel=['linear', 'poly', 'rbf', 'sigmoid'],
        C=[1, 5, 10, 20],
    ),
    KNN_hyperparameters=dict(n_neighbors=[3, 5, 10]),
    random_forest_hyperparameters=dict(n_estimators=[10, 20, 50, 100]),
)

In [23]:
# Sanity check: confirm the container type of the search-space mapping.
type(model_hyperparameters)

dict

In [24]:
# Show the grid names in insertion order.
print(model_hyperparameters.keys())

dict_keys(['log_reg_hyperparametera', 'svc_hyperparameters', 'KNN_hyperparameters', 'random_forest_hyperparameters'])


In [25]:
# Inspect the search grid registered for the SVC model.
model_hyperparameters['svc_hyperparameters']

{'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'C': [1, 5, 10, 20]}

In [26]:
# Ordered list of the grid names (dictionary insertion order).
model_keys = list(model_hyperparameters)
print(model_keys)

['log_reg_hyperparametera', 'svc_hyperparameters', 'KNN_hyperparameters', 'random_forest_hyperparameters']


In [27]:
def mod_select(models_list, hyperparameters_dictionary):
  """Grid-search each model and tabulate its best cross-validated score.

  Models and grids are paired by position: the i-th model in ``models_list``
  is tuned with the i-th value of ``hyperparameters_dictionary`` (insertion
  order). The grids come from the dictionary that is passed in, not from any
  module-level key list, so the function works with any dictionary.

  Every search uses 5-fold cross-validation and is fitted on the module-level
  ``x_std`` (features) and ``ye`` (encoded labels).

  NOTE(review): ``x_std`` was standardised on the full dataset before the
  folds are split, so the held-out folds influenced the scaling statistics.
  Consider a Pipeline(StandardScaler, model) if unbiased scores are needed.

  Parameters:
    models_list: iterable of unfitted scikit-learn estimators.
    hyperparameters_dictionary: mapping of name -> parameter grid, ordered to
      match ``models_list``. Extra trailing grids are ignored.

  Returns:
    pandas.DataFrame with one row per model and the columns
    'model used', 'highest score' and 'best parameters'.

  Raises:
    IndexError: if there are more models than parameter grids.
  """
  models_list = list(models_list)
  parameter_grids = list(hyperparameters_dictionary.values())

  # Fail before any (potentially slow) fitting starts instead of part-way
  # through the loop.
  if len(models_list) > len(parameter_grids):
    raise IndexError(
        'mod_select received %d models but only %d hyperparameter grids'
        % (len(models_list), len(parameter_grids))
    )

  results = []
  for model, parameter in zip(models_list, parameter_grids):
    print(model)
    print(parameter)
    print('')

    classif = GridSearchCV(model, parameter, cv=5)
    classif.fit(x_std, ye)
    results.append({
        'model used': model,
        'highest score': classif.best_score_,
        'best parameters': classif.best_params_,
    })

  return pd.DataFrame(results, columns=['model used', 'highest score', 'best parameters'])


In [28]:
# Tune every model in `models` against its grid; the returned DataFrame is
# the cell's displayed result.
mod_select(models, model_hyperparameters)

LogisticRegression(max_iter=3000)
{'C': [1, 5, 10, 20]}

SVC()
{'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'C': [1, 5, 10, 20]}

KNeighborsClassifier()
{'n_neighbors': [3, 5, 10]}

RandomForestClassifier(random_state=0)
{'n_estimators': [10, 20, 50, 100]}



Unnamed: 0,model used,highest score,best parameters
0,LogisticRegression(max_iter=3000),0.978636,{'C': 20}
1,SVC(),0.985909,"{'C': 5, 'kernel': 'rbf'}"
2,KNeighborsClassifier(),0.976364,{'n_neighbors': 3}
3,RandomForestClassifier(random_state=0),0.994545,{'n_estimators': 20}


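Random Forest has the highest cross-validation score (0.9945 with n_estimators=20), ahead of SVC (0.9859), logistic regression (0.9786) and KNN (0.9764).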