## PROBLEM 5 : L1 feature selection on text

In [3]:
import pandas as pd
import mnist
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn import tree
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
news_train = fetch_20newsgroups(data_home='../data/20newsgroups/', 
                                subset='train', 
                                remove=('headers', 'footers', 'quotes'))

news_test = fetch_20newsgroups(data_home='../data/20newsgroups/', 
                                subset='test', 
                                remove=('headers', 'footers', 'quotes'))


In [5]:
vectorizer  = CountVectorizer(binary=True, stop_words='english')
train_img   = vectorizer.fit_transform(news_train.data)
test_img    = vectorizer.transform(news_test.data)
train_lbl   = news_train.target
test_lbl    = news_test.target

logisticRegr = LogisticRegression(penalty='l1', C = 0.1)
logisticRegr.fit(train_img, train_lbl)
logisticRegr.predict(test_img[0:10])
predictions = logisticRegr.predict(train_img)
score = logisticRegr.score(train_img, train_lbl)

In [9]:
coef = (logisticRegr.coef_)

In [10]:
coef.shape

(20, 101322)

In [11]:
coeff_df = pd.DataFrame(np.argsort(np.flip(coef, axis = 1)))

In [12]:
coeff_df = pd.DataFrame(np.argsort(coef))

In [23]:
for index, row in coeff_df.iterrows():
    
    train_img[index] =  coef[index][coeff_df.iloc[index].values]



In [24]:
coef[index][coeff_df.iloc[index].values]

array([-0.58308798, -0.54429948, -0.37712688, ...,  0.90004114,
        1.4306715 ,  2.00378059])

In [25]:
train_img.shape

(11314, 101322)

In [13]:
coeff_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,101312,101313,101314,101315,101316,101317,101318,101319,101320,101321
0,88277,27138,92600,1469,95488,65475,99605,46158,96461,2777,...,77072,19462,67200,84026,51063,62944,51064,20137,23401,20133
1,70436,2593,53808,56790,57109,35078,59153,57219,76269,65475,...,40237,17874,72071,84147,48972,18616,72383,48959,6880,44152
2,40961,70436,80092,72393,22913,88835,31927,99608,48959,57219,...,92637,61465,35086,63536,96428,35084,40217,22965,27333,96461
3,70436,49413,99608,58777,80092,55617,80430,83871,28151,72393,...,72205,29593,42639,22855,25754,35087,70140,7955,48521,24710
4,96461,29593,70140,70436,28405,99608,2777,81587,34776,41728,...,66649,48792,81038,55778,74958,82563,26393,72399,58777,19094
5,72393,70436,25754,35078,99608,43852,58777,34776,31927,88487,...,86095,28151,61999,98302,97482,96452,81560,98773,96316,63082
6,88487,46158,73103,76207,80430,70436,71950,49800,88835,54493,...,36809,72952,23943,50372,19831,29027,81315,67520,82051,80092
7,22729,43957,92600,1469,70436,68285,92603,96461,73220,43733,...,20549,20487,96134,37208,67645,89467,32086,41051,25863,25717
8,92600,92637,70436,34617,22223,96461,73103,71950,40961,49822,...,63109,45619,23325,78172,78157,46151,63108,22734,34594,22729
9,92600,70436,65675,46897,73103,96461,92637,65475,80092,65165,...,84769,79398,21447,61042,71364,23976,71368,18045,70948,21691


In [15]:
imp_features = set()
for index, row in coeff_df.iloc[:,0:200].iterrows():
    imp_features.update(row.unique())


In [16]:
imp_features

{0,
 1469,
 1470,
 2222,
 2425,
 2593,
 2777,
 2939,
 3247,
 4306,
 6044,
 8581,
 12265,
 13575,
 16214,
 16417,
 16715,
 17372,
 18237,
 19094,
 19689,
 20583,
 21323,
 21698,
 22223,
 22447,
 22475,
 22729,
 22913,
 23572,
 25437,
 25717,
 25754,
 25910,
 26962,
 26997,
 27138,
 27148,
 27153,
 28151,
 28355,
 28405,
 28443,
 28449,
 28881,
 29027,
 29584,
 29593,
 30091,
 30359,
 30988,
 30989,
 31820,
 31927,
 33485,
 33493,
 33540,
 33588,
 34011,
 34617,
 34619,
 34696,
 34776,
 35078,
 36234,
 36809,
 37118,
 37351,
 38371,
 39225,
 39436,
 40217,
 40961,
 41468,
 41728,
 42489,
 42913,
 43142,
 43387,
 43711,
 43733,
 43777,
 43852,
 43914,
 43957,
 44152,
 44241,
 44440,
 44711,
 45557,
 45783,
 46158,
 46527,
 46897,
 47314,
 48523,
 48959,
 49413,
 49414,
 49800,
 49822,
 50372,
 51088,
 52019,
 52392,
 52861,
 53808,
 54079,
 54083,
 54493,
 55544,
 55617,
 55691,
 55970,
 56169,
 56530,
 56591,
 56676,
 56790,
 56849,
 56988,
 57109,
 57219,
 57244,
 57250,
 57335,
 58777,

In [28]:
train_img = train_img.todense()[:,list(imp_features)]
test_img = test_img.todense()[:,list(imp_features)]

In [17]:
logisticRegr = LogisticRegression(penalty='l2', C = 0.1)
logisticRegr.fit(train_img, train_lbl)
predictions = logisticRegr.predict(train_img)
score = logisticRegr.score(train_img, train_lbl)

In [18]:
score

0.9182428849213364

In [19]:
logisticRegr.score(test_img, test_lbl)

0.6512214551248009