#### Comparing L1-regularized versus L2-regularized logistic regression

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import linear_model

import sklearn
from sklearn.model_selection import train_test_split

In [2]:
from sklearn.datasets import fetch_openml

# Load MNIST (70000 samples x 784 pixel columns) together with its labels.
# as_frame=False pins the return type to NumPy arrays: later cells index the
# data positionally (X_train[0], X[:, useful_indices]), which does not work
# on the pandas DataFrame that newer scikit-learn releases return by default.
X, Y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)

In [3]:
# Hold out half of the data for testing. The split is seeded so that the
# scores printed below can be reproduced after a kernel restart; 1998 is the
# seed this notebook already uses for its estimators and for the later split
# of the reduced feature matrix.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.5, random_state=1998
)
print(len(X_train), len(Y_train), len(X_test))
print(len(X_train[0]))

35000 35000 35000
784


In [4]:
# L1-penalised logistic regression with a very weak penalty (C=1000),
# fitted on the first 1000 training rows only.
logit = linear_model.LogisticRegression(
    penalty='l1',
    dual=False,
    tol=0.0001,
    C=1000.0,
    fit_intercept=True,
    intercept_scaling=1,
    class_weight=None,
    random_state=1998,
    solver='liblinear',
    max_iter=100,
    multi_class='auto',
    verbose=0,
    warm_start=False,
    n_jobs=None,
    l1_ratio=None,
)
logit.fit(X_train[:1000], Y_train[:1000])

# Accuracy on the 1000 fitting rows, on the whole training half,
# and on the held-out half.
print(logit.score(X_train[:1000], Y_train[:1000]))
print(logit.score(X_train, Y_train))
print(logit.score(X_test, Y_test))

1.0
0.8335714285714285
0.8283142857142857


In [5]:
# L2-penalised counterpart of the previous fit: same weak penalty (C=1000),
# same solver, same 1000 fitting rows.
logit = linear_model.LogisticRegression(
    penalty='l2',
    dual=False,
    tol=0.0001,
    C=1000.0,
    fit_intercept=True,
    intercept_scaling=1,
    class_weight=None,
    random_state=1998,
    solver='liblinear',
    max_iter=100,
    multi_class='auto',
    verbose=0,
    warm_start=False,
    n_jobs=None,
    l1_ratio=None,
)
logit.fit(X_train[:1000], Y_train[:1000])

# Accuracy on the 1000 fitting rows, on the whole training half,
# and on the held-out half.
print(logit.score(X_train[:1000], Y_train[:1000]))
print(logit.score(X_train, Y_train))
print(logit.score(X_test, Y_test))

1.0
0.8372
0.8267142857142857


In [6]:
# Base estimator for the search. The grid below overrides `solver`,
# `penalty` and `C`, so the values given here for those three are placeholders.
logit = sklearn.linear_model.LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True,
                                                intercept_scaling=1, class_weight=None, random_state=1998,
                                                solver='lbfgs', max_iter=100, multi_class='auto',
                                                verbose=0, warm_start=False, n_jobs=None, l1_ratio=None)
# Strong L1 penalties (small C) to drive most pixel weights to exactly zero.
# NOTE(review): in the recorded run the best C is the largest value tried;
# consider extending the grid upwards.
params = {
            'solver': ['liblinear'],
            'penalty': ['l1'],
            'C': [6e-4, 5e-4, 4e-4]
         }
# `iid` is deliberately not passed: it was deprecated in scikit-learn 0.22 and
# later removed, so passing iid='deprecated' fails on newer releases while
# omitting it changes nothing on the releases that still accept it.
l1_logregcv = sklearn.model_selection.GridSearchCV(logit, params, scoring=None, n_jobs=None,
                                                   refit=True, cv=None, verbose=0,
                                                   pre_dispatch='2*n_jobs', return_train_score=False)

# Search on the first 1000 training rows, then report the refit winner's
# accuracy on those rows, on the whole training half and on the held-out half.
l1_logregcv.fit(X_train[:1000], Y_train[:1000])
display(pd.DataFrame(l1_logregcv.cv_results_).sort_values('rank_test_score').head())
print(l1_logregcv.score(X_train[0:1000], Y_train[0:1000]))
print(l1_logregcv.score(X_train, Y_train))
print(l1_logregcv.score(X_test, Y_test))

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_penalty,param_solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.067643,0.004648,0.000593,0.000484,0.0006,l1,liblinear,"{'C': 0.0006, 'penalty': 'l1', 'solver': 'libl...",0.805,0.8,0.79,0.805,0.765,0.793,0.015033,1
1,0.06064,0.001327,0.000598,0.000489,0.0005,l1,liblinear,"{'C': 0.0005, 'penalty': 'l1', 'solver': 'libl...",0.795,0.805,0.78,0.79,0.77,0.788,0.012083,2
2,0.057646,0.00461,0.000399,0.000489,0.0004,l1,liblinear,"{'C': 0.0004, 'penalty': 'l1', 'solver': 'libl...",0.77,0.76,0.745,0.775,0.765,0.763,0.010296,3


0.859
0.8220571428571428
0.8169142857142857


In [7]:
# Base estimator for the search. The grid below overrides `solver`,
# `penalty` and `C`, so the values given here for those three are placeholders.
logit = sklearn.linear_model.LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True,
                                                intercept_scaling=1, class_weight=None, random_state=1998,
                                                solver='lbfgs', max_iter=100, multi_class='auto',
                                                verbose=0, warm_start=False, n_jobs=None, l1_ratio=None)
# Very strong L2 penalties (tiny C), presumably chosen to land near the
# accuracy of the L1 search so the two coefficient patterns can be compared.
# NOTE(review): in the recorded run the best C is the larger of the two
# values tried; consider extending the grid upwards.
params = {
            'solver': ['liblinear'],
            'penalty': ['l2'],
            'C': [ 5e-8, 1e-7]
         }
# `iid` is deliberately not passed: it was deprecated in scikit-learn 0.22 and
# later removed, so passing iid='deprecated' fails on newer releases while
# omitting it changes nothing on the releases that still accept it.
l2_logregcv = sklearn.model_selection.GridSearchCV(logit, params, scoring=None, n_jobs=None,
                                                   refit=True, cv=None, verbose=0,
                                                   pre_dispatch='2*n_jobs', return_train_score=False)

# Search on the first 1000 training rows, then report the refit winner's
# accuracy on those rows, on the whole training half and on the held-out half.
l2_logregcv.fit(X_train[:1000], Y_train[:1000])
display(pd.DataFrame(l2_logregcv.cv_results_).sort_values('rank_test_score').head())
print(l2_logregcv.score(X_train[0:1000], Y_train[0:1000]))
print(l2_logregcv.score(X_train, Y_train))
print(l2_logregcv.score(X_test, Y_test))

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_penalty,param_solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
1,0.060741,0.0036,0.000796,0.000398,1e-07,l2,liblinear,"{'C': 1e-07, 'penalty': 'l2', 'solver': 'libli...",0.8,0.82,0.79,0.82,0.78,0.802,0.016,1
0,0.050982,0.002215,0.000598,0.000489,5e-08,l2,liblinear,"{'C': 5e-08, 'penalty': 'l2', 'solver': 'libli...",0.785,0.79,0.775,0.805,0.79,0.789,0.009695,2


0.85
0.8226
0.8192571428571429


In [8]:
# Show every column and (almost) unlimited cell width when frames are rendered,
# so long values such as the `params` column of cv_results_ are not truncated.
display_options = pd.options.display
display_options.max_columns = None
display_options.max_colwidth = 5000
import IPython

In [9]:
# Take the refit winner of each grid search and switch its coef_ to sparse
# storage, so that only the non-zero weights are kept and listed.
chosen_l1 = l1_logregcv.best_estimator_
chosen_l1.sparsify()

chosen_l2 = l2_logregcv.best_estimator_
chosen_l2.sparsify()

# For the L1 model: summary repr first (shape and stored-element count),
# then the entry-by-entry listing.
display(chosen_l1.coef_)
print(chosen_l1.coef_)

<10x784 sparse matrix of type '<class 'numpy.float64'>'
	with 411 stored elements in Compressed Sparse Row format>

  (0, 185)	0.000658796465853251
  (0, 186)	0.0001551847697164881
  (0, 212)	0.0002954378144834856
  (0, 232)	-0.0003115864280457127
  (0, 351)	-0.006399846471635558
  (0, 353)	-0.0010181112817155397
  (0, 378)	-0.0010024326205309306
  (0, 379)	-0.0017174052823570292
  (0, 381)	-0.00044408501053056393
  (0, 386)	0.00230102262927637
  (0, 406)	-0.0037793024305718416
  (0, 408)	-0.0010337584981581637
  (0, 409)	-0.00347223216864
  (0, 434)	-0.003068792323544469
  (0, 436)	-0.0026535225925181346
  (0, 455)	0.0009612894549161664
  (0, 461)	-0.001243958799584456
  (0, 462)	-0.00043714714494715897
  (0, 464)	-0.0033429268002897325
  (0, 482)	0.0006158399107614884
  (0, 488)	-8.480477619023793e-05
  (0, 489)	-0.0008099377429920558
  (0, 490)	-0.005871950060686973
  (0, 491)	-0.0012952356821026535
  (0, 512)	0.001667626848926752
  :	:
  (9, 354)	0.0011916009850971845
  (9, 381)	0.0009850104974022313
  (9, 429)	0.0008396181023440317
  (9, 462)	-0.0009351515700067087
  (9, 467)	-0.001325826376536

In [79]:
# Same inspection for the L2 model, to compare how many weights are non-zero.
display(chosen_l2.coef_) # rich repr of the sparse coefficient matrix (shape and stored-element count); it is not a DataFrame

print(chosen_l2.coef_)


<10x784 sparse matrix of type '<class 'numpy.float64'>'
	with 6120 stored elements in Compressed Sparse Row format>

  (0, 40)	-8.28305795923897e-07
  (0, 41)	-2.911155696215536e-06
  (0, 42)	-2.212994928054117e-06
  (0, 61)	-3.246283965760574e-07
  (0, 62)	-4.87734140338925e-07
  (0, 63)	-4.044933123302012e-07
  (0, 64)	-6.657181409427136e-07
  (0, 66)	-6.186081171335494e-07
  (0, 67)	-9.548989361500087e-06
  (0, 68)	-1.2704693749322872e-05
  (0, 69)	-6.294560530328271e-06
  (0, 70)	-2.3122815831768498e-05
  (0, 71)	-4.493123649307927e-05
  (0, 72)	-3.97119565118787e-05
  (0, 73)	-1.7912209891735256e-05
  (0, 74)	-1.6586769701371084e-05
  (0, 75)	-7.786550836087714e-06
  (0, 76)	-3.1712021388764506e-06
  (0, 77)	-5.710777864374245e-06
  (0, 78)	-9.506466701610341e-06
  (0, 79)	-6.624130156065581e-06
  (0, 80)	-4.081217631699173e-07
  (0, 89)	-3.279561013991467e-07
  (0, 90)	-7.249216971464635e-07
  (0, 91)	-1.0946206031200626e-06
  :	:
  (9, 736)	-1.2908286073690906e-05
  (9, 737)	2.035235528416246e-06
  (9, 738)	4.5749676504326406e-05
  (9, 739)	3.057374242897491e-05
  (9, 740)	9.873628512850926e-0

#### Experimenting with creating a feature mask

In [43]:
# Unpenalised logistic regression fitted on the first 1000 training rows.
# `C` is left at its default: with penalty='none' scikit-learn ignores it, and
# passing a non-default value only produces the "will ignore the C and
# l1_ratio" warning without affecting the fit.
logit = sklearn.linear_model.LogisticRegression(penalty='none', dual=False, tol=0.0001, fit_intercept=True,
                                                intercept_scaling=1, class_weight=None, random_state=1998,
                                                solver='newton-cg', max_iter=100, multi_class='auto',
                                                verbose=0, warm_start=False, n_jobs=None, l1_ratio=None)
logit.fit(X_train[:1000], Y_train[:1000])

# Accuracy on the 1000 fitting rows, on the whole training half,
# and on the held-out half.
print(logit.score(X_train[0:1000], Y_train[0:1000]))
print(logit.score(X_train, Y_train))
print(logit.score(X_test, Y_test))

  "Setting penalty='none' will ignore the C and l1_ratio "


1.0
0.8567142857142858
0.8566285714285714


In [5]:
# Show the fitted coefficient array of the current `logit` model.
display(logit.coef_)


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [44]:
# Convert coef_ to a sparse matrix in place so that only the non-zero weights
# are stored; display() gives the summary repr and print() lists the entries.
logit.sparsify()
display(logit.coef_)
print(logit.coef_)

<10x784 sparse matrix of type '<class 'numpy.float64'>'
	with 6120 stored elements in Compressed Sparse Row format>

  (0, 34)	-5.845920736760732e-07
  (0, 35)	-8.358641053438595e-07
  (0, 36)	-7.978874179973279e-07
  (0, 37)	-5.672111530603455e-06
  (0, 38)	-1.583152893112945e-06
  (0, 39)	-8.382886572749282e-07
  (0, 40)	-2.423696242260286e-06
  (0, 41)	-1.588945369216028e-06
  (0, 42)	-2.164543420331665e-06
  (0, 43)	-2.954010452704373e-06
  (0, 44)	-3.895154346041973e-07
  (0, 46)	-5.648095392791253e-07
  (0, 47)	-1.2943551941813296e-06
  (0, 48)	-1.2590545979763837e-06
  (0, 62)	-2.410160303752233e-07
  (0, 63)	0.0011726474279174475
  (0, 64)	0.0002945406447052315
  (0, 65)	-8.140029775300432e-06
  (0, 66)	-5.991081404788263e-06
  (0, 67)	-1.0286553052798281e-05
  (0, 68)	-5.962445055867579e-05
  (0, 69)	-7.265027934625205e-05
  (0, 70)	-0.00010981981219144256
  (0, 71)	-0.0008231987870706076
  (0, 72)	-0.0015886854748027497
  :	:
  (9, 721)	0.003975065414860074
  (9, 722)	0.0010510025863958374
  (9, 734)	-2.5408012176432876e-06
  (9, 735)	-1.551226006561165e-05
  (9, 736)	0.007472854348537899
 

In [52]:
# Switch coef_ back to a dense array so it can be indexed element-wise in the
# following cells; len() then reports the number of coefficient rows.
logit.densify()
len(logit.coef_)

10

In [54]:
# Zero-initialised accumulator with one slot per coefficient column.
sums = np.zeros(len(logit.coef_[0]))

In [60]:
# Total absolute weight per coefficient column, summed over all rows of
# coef_ (expects the dense coef_ array). `sums` is assigned outright rather
# than added to, so re-running this cell cannot double the totals, and the
# reduction is done by NumPy instead of a Python double loop.
sums = np.abs(logit.coef_).sum(axis=0)

In [61]:
# Inspect the per-column totals of absolute coefficients.
sums

array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 1.94376845e-03, 2.77924787e-03,
       1.87068771e-04, 1.26740046e-03, 2.77863166e-04, 1.63534485e-04,
       6.62584610e-04, 5.57574714e-04, 9.26383292e-04, 2.08364048e-03,
       3.54850982e-04, 0.00000000e+00, 2.16354090e-04, 4.95811457e-04,
       4.82289327e-04, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
      

In [72]:
# Column indices ordered from the largest to the smallest |sums|
# (argsort of the negated magnitudes gives a descending ranking).
# Despite its name, `sorted_sums` holds indices into `sums`, not sorted values.
sorted_sums = np.argsort(-abs(sums))
sorted_sums

array([350, 378, 401, 433, 487, 462, 488, 484, 345, 405, 403, 324, 349,
       346, 317, 434, 318, 240, 377, 319, 351, 516, 465, 517, 373, 657,
       353, 268, 291, 597, 599, 406, 352, 290, 658, 210, 213, 661, 294,
       571, 323, 437, 570, 545, 656, 461, 298, 299, 510, 190, 469, 379,
       490, 375, 459, 270, 356, 372, 330, 348, 486, 376, 596, 460, 404,
       260, 568, 430, 347, 489, 595, 438, 544, 413, 659, 519, 408, 322,
       371, 520, 374, 243, 296, 515, 409, 188, 407, 355, 235, 354, 233,
       325, 264, 241, 212, 130, 415, 492, 541, 301, 129, 518, 315, 128,
       463, 436, 153, 400, 435, 159, 687, 321, 329, 543, 432, 576, 342,
       631, 565, 440, 660, 237, 358, 686, 381, 652, 629, 273, 326, 685,
       211, 567, 627, 483, 491, 466, 578, 288, 457, 470, 414, 295, 538,
       154, 183, 267, 549, 320, 464, 272, 542, 458, 234, 546, 429, 485,
       263, 160, 244, 151, 178, 498, 214, 467, 598, 184, 289, 412, 569,
       331, 684, 131, 539, 182, 127, 431, 716, 572, 456, 328, 55

In [66]:
# Number of ranked column indices.
len(sorted_sums)

784

In [74]:
# Keep the 80 highest-ranked column indices (the front of the descending ranking).
useful_indices = sorted_sums[:80]
useful_indices

array([350, 378, 401, 433, 487, 462, 488, 484, 345, 405, 403, 324, 349,
       346, 317, 434, 318, 240, 377, 319, 351, 516, 465, 517, 373, 657,
       353, 268, 291, 597, 599, 406, 352, 290, 658, 210, 213, 661, 294,
       571, 323, 437, 570, 545, 656, 461, 298, 299, 510, 190, 469, 379,
       490, 375, 459, 270, 356, 372, 330, 348, 486, 376, 596, 460, 404,
       260, 568, 430, 347, 489, 595, 438, 544, 413, 659, 519, 408, 322,
       371, 520], dtype=int64)

In [82]:
# Keep only the selected columns of the full data matrix (all rows).
# NOTE(review): 2-D positional indexing like this assumes X is a NumPy array
# rather than a pandas DataFrame — confirm how X was loaded.
X_hardcut = X[:, useful_indices]

In [83]:
# Row count of the reduced matrix.
len(X_hardcut)

70000

In [84]:
# Length of the first row of the reduced matrix, i.e. how many columns were kept.
len(X_hardcut[0])

80

In [81]:
# Length of the first row of the original matrix, for comparison with the reduced one.
len(X[0])

784

In [85]:
# Re-split using the reduced feature matrix; the fixed seed makes this
# partition repeatable. This rebinds X_train / X_test / Y_train / Y_test
# for every cell that follows.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_hardcut,
    Y,
    test_size=0.5,
    random_state=1998,
)
print(f"{len(X_train)} {len(Y_train)} {len(X_test)}")
print(len(X_train[0]))

35000 35000 35000
80


In [86]:
# Unpenalised logistic regression on the first 1000 rows of the reduced
# feature set. `C` is left at its default: with penalty='none' scikit-learn
# ignores it, and passing a non-default value only produces the "will ignore
# the C and l1_ratio" warning without affecting the fit.
logit = sklearn.linear_model.LogisticRegression(penalty='none', dual=False, tol=0.0001, fit_intercept=True,
                                                intercept_scaling=1, class_weight=None, random_state=1998,
                                                solver='newton-cg', max_iter=100, multi_class='auto',
                                                verbose=0, warm_start=False, n_jobs=None, l1_ratio=None)
logit.fit(X_train[:1000], Y_train[:1000])

# Accuracy on the 1000 fitting rows, on the whole training half,
# and on the held-out half.
print(logit.score(X_train[0:1000], Y_train[0:1000]))
print(logit.score(X_train, Y_train))
print(logit.score(X_test, Y_test))

  "Setting penalty='none' will ignore the C and l1_ratio "


1.0
0.7282857142857143
0.7189714285714286


In [87]:
# Base estimator for the search. The grid below overrides `solver`,
# `penalty` and `C`, so the values given here for those three are placeholders.
logit = sklearn.linear_model.LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True,
                                                intercept_scaling=1, class_weight=None, random_state=1998,
                                                solver='lbfgs', max_iter=100, multi_class='auto',
                                                verbose=0, warm_start=False, n_jobs=None, l1_ratio=None)
# L2 grid applied to the current (reduced) X_train.
# NOTE(review): these are the same C values as the L2 search on the full
# feature set; they may not suit the reduced matrix — consider re-tuning.
params = {
            'solver': ['liblinear'],
            'penalty': ['l2'],
            'C': [ 5e-8, 1e-7]
         }
# `iid` is deliberately not passed: it was deprecated in scikit-learn 0.22 and
# later removed, so passing iid='deprecated' fails on newer releases while
# omitting it changes nothing on the releases that still accept it.
l2_logregcv = sklearn.model_selection.GridSearchCV(logit, params, scoring=None, n_jobs=None,
                                                   refit=True, cv=None, verbose=0,
                                                   pre_dispatch='2*n_jobs', return_train_score=False)

# Search on the first 1000 training rows, then report the refit winner's
# accuracy on those rows, on the whole training half and on the held-out half.
l2_logregcv.fit(X_train[:1000], Y_train[:1000])
display(pd.DataFrame(l2_logregcv.cv_results_).sort_values('rank_test_score').head())
print(l2_logregcv.score(X_train[0:1000], Y_train[0:1000]))
print(l2_logregcv.score(X_train, Y_train))
print(l2_logregcv.score(X_test, Y_test))

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_penalty,param_solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
1,0.020944,0.002821,0.000798,0.000399,1e-07,l2,liblinear,"{'C': 1e-07, 'penalty': 'l2', 'solver': 'libli...",0.68,0.69,0.71,0.705,0.685,0.694,0.011576,1
0,0.095194,0.15599,0.000798,0.000399,5e-08,l2,liblinear,"{'C': 5e-08, 'penalty': 'l2', 'solver': 'libli...",0.64,0.66,0.68,0.66,0.655,0.659,0.012806,2


0.721
0.6832
0.6809142857142857


In [88]:
# Base estimator for the search. The grid below overrides `solver`,
# `penalty` and `C`, so the values given here for those three are placeholders.
logit = sklearn.linear_model.LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True,
                                                intercept_scaling=1, class_weight=None, random_state=1998,
                                                solver='lbfgs', max_iter=100, multi_class='auto',
                                                verbose=0, warm_start=False, n_jobs=None, l1_ratio=None)
# L1 grid applied to the current (reduced) X_train.
# NOTE(review): these are the same C values as the L1 search on the full
# feature set; they may not suit the reduced matrix — consider re-tuning.
params = {
            'solver': ['liblinear'],
            'penalty': ['l1'],
            'C': [6e-4, 5e-4, 4e-4]
         }
# `iid` is deliberately not passed: it was deprecated in scikit-learn 0.22 and
# later removed, so passing iid='deprecated' fails on newer releases while
# omitting it changes nothing on the releases that still accept it.
l1_logregcv = sklearn.model_selection.GridSearchCV(logit, params, scoring=None, n_jobs=None,
                                                   refit=True, cv=None, verbose=0,
                                                   pre_dispatch='2*n_jobs', return_train_score=False)

# Search on the first 1000 training rows, then report the refit winner's
# accuracy on those rows, on the whole training half and on the held-out half.
l1_logregcv.fit(X_train[:1000], Y_train[:1000])
display(pd.DataFrame(l1_logregcv.cv_results_).sort_values('rank_test_score').head())
print(l1_logregcv.score(X_train[0:1000], Y_train[0:1000]))
print(l1_logregcv.score(X_train, Y_train))
print(l1_logregcv.score(X_test, Y_test))

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_penalty,param_solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.023392,0.001832,0.000199,0.000397,0.0006,l1,liblinear,"{'C': 0.0006, 'penalty': 'l1', 'solver': 'libl...",0.8,0.795,0.775,0.76,0.775,0.781,0.014629,1
1,0.021343,0.000798,0.000399,0.000489,0.0005,l1,liblinear,"{'C': 0.0005, 'penalty': 'l1', 'solver': 'libl...",0.78,0.77,0.755,0.75,0.78,0.767,0.01249,2
2,0.021349,0.003369,0.000399,0.000488,0.0004,l1,liblinear,"{'C': 0.0004, 'penalty': 'l1', 'solver': 'libl...",0.765,0.755,0.745,0.72,0.76,0.749,0.015937,3


0.825
0.7757142857142857
0.7707142857142857


2