## Selecting the best model for the digits dataset

In [28]:
import pandas as pd
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [29]:
# Load the scikit-learn digits dataset and split it into the feature
# matrix (X) and the class labels (Y).
dataset = load_digits()
X, Y = dataset.data, dataset.target

In [30]:
# Rescale every feature to MinMaxScaler's default [0, 1] range, fitting and
# transforming in a single call.
X = MinMaxScaler().fit_transform(X)
# PCA step intentionally left disabled:
# X = PCA(n_components=0.98).fit_transform(X)

We can't use PCA here because MultinomialNB requires non-negative feature values, and PCA-transformed features can be negative.

In [31]:
# Disabled experiment: chain MinMaxScaler with PCA (n_components=0.95) in a
# single Pipeline and inspect the resulting shape. Left commented out because
# PCA output can be negative, which MultinomialNB does not accept.
# scalar = Pipeline([('Scaler',MinMaxScaler()),('pca',PCA(n_components=0.95))])
# X = scalar.fit_transform(X)
# X.shape

In [32]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.svm import SVC

In [33]:
# Candidate estimators and the hyper-parameter grid searched for each one.
# The keys are short labels that end up in the results table; an empty
# 'params' dict means the estimator is evaluated once with its defaults.
#
# The tree-based estimators are seeded so that a fresh "Restart & Run All"
# reproduces the same scores; without random_state their results vary
# from run to run.
models = {
    'dtc': {
        'model': DecisionTreeClassifier(random_state=42),
        'params': {}
    },
    'rfc': {
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [1, 10, 100]
        }
    },
    'lreg': {
        # High max_iter so the solver has room to converge for every C.
        'model': LogisticRegression(max_iter=10000),
        'params': {
            'C': [0.1, 1, 10, 100]
        }
    },
    'gnb': {
        'model': GaussianNB(),
        'params': {}
    },
    'mnb': {
        # Requires non-negative features, which is why X is only min-max scaled.
        'model': MultinomialNB(),
        'params': {}
    },
    'svm': {
        'model': SVC(),
        'params': {
            'C': [0.1, 1, 10, 100],
            'kernel': ['linear', 'rbf']
        }
    }
}

In [34]:
# Build ONE seeded splitter and reuse it for every candidate model.
# - random_state makes the shuffled folds reproducible across runs;
# - sharing the splitter means every model is scored on exactly the same
#   folds, so the best scores are directly comparable.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# For each candidate, grid-search its parameter grid with 5-fold stratified
# cross-validation and record the best parameters and mean CV score.
best_models = []
for model_name, mp in models.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=cv)
    clf.fit(X, Y)
    best_models.append({
        'model': model_name,
        'best_parameters': clf.best_params_,
        'best_score': clf.best_score_
    })
best_models

[{'model': 'dtc', 'best_parameters': {}, 'best_score': 0.8497508511296813},
 {'model': 'rfc',
  'best_parameters': {'n_estimators': 100},
  'best_score': 0.9755199628597957},
 {'model': 'lreg',
  'best_parameters': {'C': 10},
  'best_score': 0.9716171463943052},
 {'model': 'gnb', 'best_parameters': {}, 'best_score': 0.821371092541009},
 {'model': 'mnb', 'best_parameters': {}, 'best_score': 0.8987109254100897},
 {'model': 'svm',
  'best_parameters': {'C': 10, 'kernel': 'rbf'},
  'best_score': 0.9916496440730425}]

In [35]:
# Show the per-model results as a table (one row per candidate model).
pd.DataFrame(data=best_models)

Unnamed: 0,model,best_parameters,best_score
0,dtc,{},0.849751
1,rfc,{'n_estimators': 100},0.97552
2,lreg,{'C': 10},0.971617
3,gnb,{},0.821371
4,mnb,{},0.898711
5,svm,"{'C': 10, 'kernel': 'rbf'}",0.99165


We can conclude that the SVM (`C=10`, `kernel='rbf'`) is the best-performing model here, with a mean cross-validated accuracy of about 0.99.