### Exercise: Machine Learning Finding Optimal Model and Hyperparameters
#### For digits dataset in sklearn.dataset, please try following classifiers and find out the one that gives best performance. Also find the optimal parameters for that classifier.

1. from sklearn import svm
2. from sklearn.ensemble import RandomForestClassifier
3. from sklearn.linear_model import LogisticRegression
4. from sklearn.naive_bayes import GaussianNB
5. from sklearn.naive_bayes import MultinomialNB
6. from sklearn.tree import DecisionTreeClassifier

In [2]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_digits
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

In [3]:
digits = load_digits()

In [4]:
df = pd.DataFrame(data = digits.data, columns=digits.feature_names)
df['target'] = digits.target
df.head()

Unnamed: 0,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,pixel_1_1,...,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7,target
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0,1
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0,2
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0,3
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0,4


In [5]:
X = df.drop('target',axis=1).values
y = df.target

In [6]:
X

array([[ 0.,  0.,  5., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ..., 10.,  0.,  0.],
       [ 0.,  0.,  0., ..., 16.,  9.,  0.],
       ...,
       [ 0.,  0.,  1., ...,  6.,  0.,  0.],
       [ 0.,  0.,  2., ..., 12.,  0.,  0.],
       [ 0.,  0., 10., ..., 12.,  1.,  0.]])

In [7]:
from sklearn.model_selection import GridSearchCV

### Let's first do it saparately 

1. SVM

In [8]:
parameters = [{
    'C':[1,10,20,30,40,50,60,70,80,90,100],
    'kernel':['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma':['scale','auto']
}]
svm_clf = GridSearchCV(SVC(),
                       param_grid=parameters,
                       cv = 10,return_train_score=False) 
svm_clf.fit(X,y)

In [9]:
svm_clf.best_params_

{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}

In [10]:
svm_clf.best_score_

0.9816263190564867

2. RandomForest

In [11]:
parameters = [{
    'n_estimators':[1,10,20,30,40,50,60,70,80,90,100,1000],
    'criterion':['gini','entropy'],
    'max_features':['sqrt','log2',None]
    
}]
rf_clf = GridSearchCV(RandomForestClassifier(),
                      param_grid = parameters,
                      cv = 10,
                      n_jobs = -1
                      )
rf_clf.fit(X,y)

In [12]:
rf_clf.best_params_

{'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 70}

In [13]:
rf_clf.best_score_

0.9565766604593419

3. Logistic Regression

In [14]:
parameters = [{
    'C':[1,2,3,4,5,6,7,8,9,10,20],
    'solver':['newton-cg','lbfgs','liblinear','sag','saga'],
    'multi_class':['auto','ovr','multinimial']
}]
lg_clf = GridSearchCV(LogisticRegression(max_iter=10000),
                      param_grid=parameters,
                      cv=10,
                      n_jobs = -1)
lg_clf.fit(X,y)

550 fits failed out of a total of 1650.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
550 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.9/dist-packages/sklearn/linear_model/_logistic.py", line 1149, in fit
    multi_class = _check_multi_class(self.multi_class, solver, len(self.classes_))
  File "/usr/local/lib/python3.9/dist-packages/sklearn/linear_model/_logistic.py", line 92, in _check_multi_class
    raise ValueError(
ValueError: multi_class should be 'multinomial', 'ovr' or 'auto'. Got multinimial.

 0.92153631 

In [15]:
lg_clf.best_params_

{'C': 10, 'multi_class': 'auto', 'solver': 'saga'}

In [16]:
lg_clf.best_score_

0.9293078833022967

4. Decision Tree

In [17]:
parameters = [{'criterion':['gini','entropy'],
    'max_features':['sqrt','log2','auto'],
    'splitter':['best','random']}]

In [18]:
dt_clf = GridSearchCV(DecisionTreeClassifier(),
                      param_grid = parameters,
                      cv=10,n_jobs = -1)

In [19]:
dt_clf.fit(X,y)



In [20]:
dt_clf.best_params_

{'criterion': 'entropy', 'max_features': 'auto', 'splitter': 'best'}

In [21]:
dt_clf.best_score_

0.7891247672253259

5. MultinomialNB

In [27]:
parameters = [{'alpha': [0.00001, 0.0001, 0.001, 0.1, 1, 10, 100,1000,1010,2000]}]

In [28]:
mlf_clf = GridSearchCV(MultinomialNB(),
                       param_grid = parameters,
                       cv = 10)
mlf_clf.fit(X,y)

In [29]:
mlf_clf.best_params_

{'alpha': 1000}

In [30]:
mlf_clf.best_score_

0.8847889509621354

6. GaussianNB

In [31]:
parameters = [{'var_smoothing': np.logspace(0,-9, num=100)}]

In [35]:
gnb_clf = GridSearchCV(GaussianNB(),
                       param_grid = parameters,
                       scoring='accuracy',
                       cv = 10)
gnb_clf.fit(X,y)

In [36]:
gnb_clf.best_params_

{'var_smoothing': 0.15199110829529336}

In [37]:
gnb_clf.best_score_

0.8959342023587835

In [55]:

best_scores={
    #SVM
    'model':['SVM','RandomForestClassifier','LogisticRegression','DecisionTreeClassifier','MultinomialNB','GaussianNB'],
    'best_prams':[svm_clf.best_params_,rf_clf.best_params_,lg_clf.best_params_,dt_clf.best_params_,mlf_clf.best_params_,gnb_clf.best_params_],
    'best_score': [svm_clf.best_score_,rf_clf.best_score_,lg_clf.best_score_,dt_clf.best_score_,mlf_clf.best_score_,gnb_clf.best_score_]
}
best_scores

{'model': ['SVM',
  'RandomForestClassifier',
  'LogisticRegression',
  'DecisionTreeClassifier',
  'MultinomialNB',
  'GaussianNB'],
 'best_prams': [{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'},
  {'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 70},
  {'C': 10, 'multi_class': 'auto', 'solver': 'saga'},
  {'criterion': 'entropy', 'max_features': 'auto', 'splitter': 'best'},
  {'alpha': 1000},
  {'var_smoothing': 0.15199110829529336}],
 'best_score': [0.9816263190564867,
  0.9565766604593419,
  0.9293078833022967,
  0.7891247672253259,
  0.8847889509621354,
  0.8959342023587835]}

In [56]:
pd.DataFrame(best_scores,columns=['model','best_prams','best_score'])

Unnamed: 0,model,best_prams,best_score
0,SVM,"{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}",0.981626
1,RandomForestClassifier,"{'criterion': 'entropy', 'max_features': 'sqrt...",0.956577
2,LogisticRegression,"{'C': 10, 'multi_class': 'auto', 'solver': 'sa...",0.929308
3,DecisionTreeClassifier,"{'criterion': 'entropy', 'max_features': 'auto...",0.789125
4,MultinomialNB,{'alpha': 1000},0.884789
5,GaussianNB,{'var_smoothing': 0.15199110829529336},0.895934


### Let's try all together

In [65]:
paramaters = {
    'svm':{
        'model':SVC(),
        'params':{
            'C':[1,10,20,30,40,50,60,70,80,90,100],
            'kernel':['linear', 'poly', 'rbf', 'sigmoid'],
            'gamma':['scale','auto']
        }
    },
    'random_forst':{
        'model': RandomForestClassifier(),
        'params':{
            'n_estimators':[1,10,20,30,40,50,60,70,80,90,100,1000],
            'criterion':['gini','entropy'],
            'max_features':['sqrt','log2',None]
        }
    },
    'logistic_regression':{
        'model': LogisticRegression(solver='liblinear',multi_class='auto',max_iter=10000),
        'params': {
            'C': [1,5,10,20,30]
        }
    },
    'decision_tree':{
        'model':DecisionTreeClassifier(),
        'params':{
            'criterion':['gini','entropy'],
            'max_features':['sqrt','log2','auto'],
            'splitter':['best','random']
        }
    },
    'multinomialnb':{
        'model':MultinomialNB(),
        'params':{
            'alpha': [0.00001, 0.0001, 0.001, 0.1, 1, 10, 100,1000,1010,2000]
        }
    },
    'gaussiannb':{
        'model': GaussianNB(),
        'params':{
            'var_smoothing': np.logspace(0,-9, num=100)
        }
    }
}

In [66]:
from sklearn.model_selection import GridSearchCV

In [67]:
model_scores = []
for model_name,model_parameters in paramaters.items():
    clf = GridSearchCV(model_parameters['model'],
                       param_grid = model_parameters['params'],
                       cv = 10,
                       n_jobs = -1,
                       scoring='accuracy'
                       )
    clf.fit(X,y)
    model_scores.append({
        'model':model_name,
        'best_score':clf.best_score_,
        'best_params':clf.best_params_
    })
df = pd.DataFrame(model_scores,columns=['model','best_score','best_params'])



In [68]:
df.head()

Unnamed: 0,model,best_score,best_params
0,svm,0.981626,"{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}"
1,random_forst,0.956589,"{'criterion': 'entropy', 'max_features': 'log2..."
2,logistic_regression,0.925975,{'C': 1}
3,decision_tree,0.778513,"{'criterion': 'entropy', 'max_features': 'sqrt..."
4,multinomialnb,0.884789,{'alpha': 1000}
