In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_openml
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression

data= fetch_openml('mnist_784', version=1)#Get data from https://www.openml.org/d/554
dfData = pd.DataFrame(np.c_[data["data"],data["target"]],columns = data["feature_names"]+["target"])

In [5]:
stratSplit = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)

for train_index, test_index in stratSplit.split(dfData[data["feature_names"]], dfData["target"]):
    X_train = dfData[data["feature_names"]].iloc[train_index]
    X_test = dfData[data["feature_names"]].iloc[test_index]
    
    y_train = dfData["target"].iloc[train_index]
    y_test = dfData["target"].iloc[test_index]

In [6]:
from sklearn.model_selection import GridSearchCV

params = [{"penalty":["l1","l2"],"C":[0.1,1,10]}]#,
#          {"penalty":["elasticnet"],"C":[0.1,1,10, 100],"l1_ratio":[0.1,0.3]}] #"max_iter":[100]

logreg_clf_gscv = GridSearchCV(estimator=LogisticRegression(),
                       param_grid=params,
                        scoring = ["accuracy","roc_auc_ovr_weighted","f1_macro"],
                    
                       refit="roc_auc_ovr_weighted",#True
                        cv = 3,#If our estimator is classifier automatically do stratified CV
                        n_jobs=-1,#Num CPUs to use for calculation, -1 means all
                        verbose = 0,#Output status updates, higher number-> more messages
                        return_train_score=True#if false our results won't contain training scores
                              )
logreg_clf_gscv.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


GridSearchCV(cv=3, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid=[{'C': [0.1, 1, 10], 'penalty': ['l1', 'l2']}],
             pre_dispatch='2*n_jobs', refit='roc_auc_ovr_weighted',
             return_train_score=True,
             scoring=['accuracy', 'roc_auc_ovr_weighted', 'f1_macro'],
             verbose=0)

In [7]:
print(logreg_clf_gscv.cv_results_.keys())

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_C', 'param_penalty', 'params', 'split0_test_accuracy', 'split1_test_accuracy', 'split2_test_accuracy', 'mean_test_accuracy', 'std_test_accuracy', 'rank_test_accuracy', 'split0_train_accuracy', 'split1_train_accuracy', 'split2_train_accuracy', 'mean_train_accuracy', 'std_train_accuracy', 'split0_test_roc_auc_ovr_weighted', 'split1_test_roc_auc_ovr_weighted', 'split2_test_roc_auc_ovr_weighted', 'mean_test_roc_auc_ovr_weighted', 'std_test_roc_auc_ovr_weighted', 'rank_test_roc_auc_ovr_weighted', 'split0_train_roc_auc_ovr_weighted', 'split1_train_roc_auc_ovr_weighted', 'split2_train_roc_auc_ovr_weighted', 'mean_train_roc_auc_ovr_weighted', 'std_train_roc_auc_ovr_weighted', 'split0_test_f1_macro', 'split1_test_f1_macro', 'split2_test_f1_macro', 'mean_test_f1_macro', 'std_test_f1_macro', 'rank_test_f1_macro', 'split0_train_f1_macro', 'split1_train_f1_macro', 'split2_train_f1_macro', 'mean_train_f1_macro', 

In [8]:
resultsCVDF = pd.DataFrame(logreg_clf_gscv.cv_results_)
print(resultsCVDF.sort_values("mean_fit_time",ascending=True))

   mean_fit_time  std_fit_time  mean_score_time  std_score_time param_C  \
0       5.083324      1.852431         0.000000        0.000000     0.1   
4      10.864562      6.677109         0.000000        0.000000      10   
2      21.251525     13.246398         0.000000        0.000000       1   
1      39.736764      3.113752         9.023278        1.686875     0.1   
3      65.418435     11.234301        14.665992        4.720766       1   
5      71.349266      5.904880        13.017945        1.124738      10   

  param_penalty                       params  split0_test_accuracy  \
0            l1  {'C': 0.1, 'penalty': 'l1'}                   NaN   
4            l1   {'C': 10, 'penalty': 'l1'}                   NaN   
2            l1    {'C': 1, 'penalty': 'l1'}                   NaN   
1            l2  {'C': 0.1, 'penalty': 'l2'}              0.916484   
3            l2    {'C': 1, 'penalty': 'l2'}              0.915627   
5            l2   {'C': 10, 'penalty': 'l2'}          

In [9]:
# best_index_ #only with refit for multi-scoring cases
resultsCVDF.iloc[logreg_clf_gscv.best_index_]

mean_fit_time                                            39.7368
std_fit_time                                             3.11375
mean_score_time                                          9.02328
std_score_time                                           1.68688
param_C                                                      0.1
param_penalty                                                 l2
params                               {'C': 0.1, 'penalty': 'l2'}
split0_test_accuracy                                    0.916484
split1_test_accuracy                                    0.913591
split2_test_accuracy                                    0.913318
mean_test_accuracy                                      0.914464
std_test_accuracy                                     0.00143224
rank_test_accuracy                                             1
split0_train_accuracy                                   0.936839
split1_train_accuracy                                   0.937857
split2_train_accuracy    

In [15]:
print(logreg_clf_gscv.best_estimator_) #only with refit
logreg_clf_gscv.best_estimator_.predict(X_train)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)


array(['9', '0', '6', ..., '5', '4', '5'], dtype=object)

In [11]:
print(logreg_clf_gscv.best_score_) #only with refit for multi-scoring cases
print(logreg_clf_gscv.best_params_) #only with refit for multi-scoring cases

0.9923396074241265
{'C': 0.1, 'penalty': 'l2'}


In [12]:
logreg_clf_gscv.get_params()

{'cv': 3,
 'error_score': nan,
 'estimator__C': 1.0,
 'estimator__class_weight': None,
 'estimator__dual': False,
 'estimator__fit_intercept': True,
 'estimator__intercept_scaling': 1,
 'estimator__l1_ratio': None,
 'estimator__max_iter': 100,
 'estimator__multi_class': 'auto',
 'estimator__n_jobs': None,
 'estimator__penalty': 'l2',
 'estimator__random_state': None,
 'estimator__solver': 'lbfgs',
 'estimator__tol': 0.0001,
 'estimator__verbose': 0,
 'estimator__warm_start': False,
 'estimator': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=100,
                    multi_class='auto', n_jobs=None, penalty='l2',
                    random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                    warm_start=False),
 'iid': 'deprecated',
 'n_jobs': -1,
 'param_grid': [{'penalty': ['l1', 'l2'], 'C': [0.1, 1, 10]}],
 'pre_dispatch': '2*n_jobs',
 'refit': 'roc_auc_ovr_weighted',
 'ret

In [13]:
logreg_clf_gscv.predict(X_train)#only when refit is true

array(['9', '0', '6', ..., '5', '4', '5'], dtype=object)

In [18]:
from sklearn.model_selection import RandomizedSearchCV

logreg_clf_rscv = RandomizedSearchCV(estimator = LogisticRegression(),
                       param_distributions = params,
                        n_iter = 10,# num param settings sampled
                        random_state = None,#if not none uses this integer as seed
                        scoring = ["accuracy","roc_auc_ovr_weighted","f1_macro"],
                    
                       refit="roc_auc_ovr_weighted",#True
                        cv = 3,#If our estimator is classifier automatically do stratified CV
                        n_jobs=-1,#Num CPUs to use for calculation, -1 means all
                        verbose = 0,#Output status updates, higher number-> more messages
                        return_train_score=True#if false our results won't contain training scores
                              )
logreg_clf_rscv.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=LogisticRegression(C=1.0, class_weight=None,
                                                dual=False, fit_intercept=True,
                                                intercept_scaling=1,
                                                l1_ratio=None, max_iter=100,
                                                multi_class='auto', n_jobs=None,
                                                penalty='l2', random_state=None,
                                                solver='lbfgs', tol=0.0001,
                                                verbose=0, warm_start=False),
                   iid='deprecated', n_iter=10, n_jobs=-1,
                   param_distributions=[{'C': [0.1, 1, 10],
                                         'penalty': ['l1', 'l2']}],
                   pre_dispatch='2*n_jobs', random_state=None,
                   refit='roc_auc_ovr_weighted', return_train_score=True,
                  

In [19]:
resultsRSDF = pd.DataFrame(logreg_clf_rscv.cv_results_)
print(resultsRSDF.sort_values("mean_fit_time",ascending=True))

   mean_fit_time  std_fit_time  mean_score_time  std_score_time param_penalty  \
2      24.354877      4.996888         0.000000        0.000000            l1   
4      25.150341      1.536706         0.000000        0.000000            l1   
0      37.539734     23.818937         0.000000        0.000000            l1   
5      78.361490     14.619442        19.910446        7.244386            l2   
1     100.904246     23.412529        30.013027        8.491973            l2   
3     107.371277     35.923855        25.766912        7.818503            l2   

  param_C                       params  split0_test_accuracy  \
2       1    {'penalty': 'l1', 'C': 1}                   NaN   
4      10   {'penalty': 'l1', 'C': 10}                   NaN   
0     0.1  {'penalty': 'l1', 'C': 0.1}                   NaN   
5      10   {'penalty': 'l2', 'C': 10}              0.915841   
1     0.1  {'penalty': 'l2', 'C': 0.1}              0.916484   
3       1    {'penalty': 'l2', 'C': 1}          

In [22]:
params = [{"penalty":["l1"],"C":[1],"solver":["saga"]}]

# params = [{"penalty":["l1"],"C":[0.1,1,10],"solver":["saga"]},
#          {"penalty":["l2"],"C":[0.1,1,10],"solver":["saga"]},
#          {"penalty":["l2"],"C":[0.1,1,10],"solver":["lbfgs"]}]

logreg_clf_gscv = GridSearchCV(estimator=LogisticRegression(),
                       param_grid=params,
                        scoring = ["accuracy","roc_auc_ovr_weighted","f1_macro"],
                    
                       refit="roc_auc_ovr_weighted",#True
                        cv = 3,#If our estimator is classifier automatically do stratified CV
                        n_jobs=-1,#Num CPUs to use for calculation, -1 means all
                        verbose = 0,#Output status updates, higher number-> more messages
                        return_train_score=True#if false our results won't contain training scores
                              )
logreg_clf_gscv.fit(X_train,y_train)



GridSearchCV(cv=3, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid=[{'C': [1], 'penalty': ['l1'], 'solver': ['saga']}],
             pre_dispatch='2*n_jobs', refit='roc_auc_ovr_weighted',
             return_train_score=True,
             scoring=['accuracy', 'roc_auc_ovr_weighted', 'f1_macro'],
             verbose=0)

In [26]:
print(pd.DataFrame(logreg_clf_gscv.cv_results_).iloc[0])

mean_fit_time                                                            424.757
std_fit_time                                                             15.2792
mean_score_time                                                          3.72936
std_score_time                                                          0.586606
param_C                                                                        1
param_penalty                                                                 l1
param_solver                                                                saga
params                               {'C': 1, 'penalty': 'l1', 'solver': 'saga'}
split0_test_accuracy                                                    0.917448
split1_test_accuracy                                                     0.91418
split2_test_accuracy                                                    0.913908
mean_test_accuracy                                                      0.915179
std_test_accuracy           