## Finding the best model and hyper parameters for sklearn digits data classification

In [2]:
from sklearn.datasets import load_digits

In [3]:
data = load_digits()


In [4]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

In [5]:
import numpy as np
n_estimators = [int(x) for x in np.linspace(start= 200, stop= 2000, num=10)]
max_features = ["auto", "sqrt", "log2"]
max_depth = [int(x) for x in np.linspace(10, 1000,10)]
min_samples_split = [2,5,10,14]
min_samples_leaf = [1,2,4,6,8]


random_grid = {"n_estimators":n_estimators, 
               "max_depth":max_depth, 
               "min_samples_split":min_samples_split, 
               "min_samples_leaf":min_samples_leaf   }

In [6]:
model_param = {
    'svm':{'model': svm.SVC(gamma= 'auto'),
            'params': {'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
                       'C': [1,10],
                       'class_weight': ['balanced']
                
            }},
    'random_forest': {'model': RandomForestClassifier(),
                      'params': random_grid
        
    },
    'logistics_regression': {'model':LogisticRegression(),
                             'params':{'fit_intercept': [True, False],
                                        'solver': ['lbfgs', 'liblinear']
                                         
                                 
                             }
        
    },
    'Gaussian': {'model': GaussianNB(),
                 'params': {'var_smoothing':[0, 1, 2, 3, 5, 9]
                     
                 }
        
    },
    'MultinomialNB': {'model': MultinomialNB(),
                      'params':{'alpha':[1.0, 2.0, 3.0],
                                'force_alpha': [True, False]
                          
                      }
        
    },
    'Decision_Tree': {'model':DecisionTreeClassifier(),
                      'params': {'criterion': ['gini', 'entropy', 'log_loss'],
                                 'splitter': ['best', 'random'],
                          
                      }
                     }
}

In [7]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [8]:
scores = []

for model_name, np in model_param.items():
    clf = RandomizedSearchCV(np['model'], np['params'], n_iter= 4, cv=5, verbose=2, n_jobs=-1)
    clf.fit(data.data, data.target)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_param': clf.best_params_
    })
    
scores
    

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[{'model': 'svm',
  'best_score': 0.96884246363355,
  'best_param': {'kernel': 'poly', 'class_weight': 'balanced', 'C': 10}},
 {'model': 'random_forest',
  'best_score': 0.9404781801299908,
  'best_param': {'n_estimators': 600,
   'min_samples_split': 2,
   'min_samples_leaf': 1,
   'max_depth': 340}},
 {'model': 'logistics_regression',
  'best_score': 0.9221138966264315,
  'best_param': {'solver': 'liblinear', 'fit_intercept': True}},
 {'model': 'Gaussian',
  'best_score': 0.8742355307954194,
  'best_param': {'var_smoothing': 2}},
 {'model': 'MultinomialNB',
  'best_score': 0.8720210461157537,
  'best_param': {'force_alpha': False, 'alpha': 3.0}},
 {'model': 'Decision_Tree',
  'best_score': 0.8063556174558959,
  'best_param': {'splitter': 'best', 'criterion': 'entropy'}}]

In [9]:
print(clf.best_estimator_)

DecisionTreeClassifier(criterion='entropy')


In [None]:
scores = []

for model_name, np in model_param.items():
    clf = GridSearchCV(np['model'], np['params'],cv=2, return_train_score = False)
    clf.fit(data.data, data.target)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_param': clf.best_params_
    })
    
scores