In [26]:
import numpy as np

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif

In [2]:
# Fetch both 20-newsgroups splits, pointing each call at the same data_home.
# Unpacking the generator evaluates the 'train' call first, then 'test'.
news_train, news_test = (
    fetch_20newsgroups(data_home='../data/20newsgroups/', subset=split)
    for split in ('train', 'test')
)


## Using the Chi-Squared Criterion

In [3]:
# Binary bag-of-words vectorizer with the built-in English stop-word list.
vectorizer = CountVectorizer(stop_words='english', binary=True)

# Fit on the training split only, then encode the test split with the
# already-fitted vectorizer.
counts_train = vectorizer.fit_transform(news_train.data)
counts_test = vectorizer.transform(news_test.data)

In [11]:
# Chi-squared statistic (and p-value) of every vocabulary term against the
# newsgroup label.
chi_20, p = chi2(counts_train, news_train.target)

# Column indices of the 200 terms with the largest statistic, best first.
imp_features = np.argsort(chi_20)[::-1][:200]

# Invert the term -> column mapping once; previously this dict was rebuilt
# from the full vocabulary for every feature that was looked up.
index_to_term = {index: term for term, index in vectorizer.vocabulary_.items()}

# Last expression of the cell: the ten highest-scoring terms.
[index_to_term[d] for d in imp_features[:10]]

['clipper',
 'encryption',
 'sale',
 'dod',
 'bike',
 'hockey',
 'windows',
 'israeli',
 'israel',
 'god']

In [13]:
# Restrict both splits to the chi-squared-selected columns. The names match
# what the model-fitting and evaluation cells below actually read
# (chi_counts_train / chi_counts_test); the earlier new_counts_* names left
# those cells dependent on stale kernel state.
chi_counts_train = counts_train[:, imp_features]
chi_counts_test = counts_test[:, imp_features]

In [14]:
# L2-penalised logistic regression on the chi-squared-selected features.
logisticRegr = LogisticRegression(penalty='l2')
# Kept as the cell's last expression so the fitted estimator is displayed.
logisticRegr.fit(X=chi_counts_train, y=news_train.target)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [25]:
# Training accuracy: fraction of training documents labelled correctly.
# np.mean over the boolean match array replaces the builtin sum()/len(),
# which iterated the NumPy array element by element in Python.
yhat = logisticRegr.predict(chi_counts_train)
print(np.mean(yhat == news_train.target))

# Accuracy on the test split, computed the same way as above.
yhat_test = logisticRegr.predict(chi_counts_test)
print(np.mean(yhat_test == news_test.target))

0.707353721053562
0.6344928305894849


In [24]:
# Sanity check: display the dimensions of the chi-squared-selected test matrix.
chi_counts_test.shape

(7532, 200)

## Using Mutual Information

In [31]:
# Mutual information between each vocabulary term and the newsgroup label.
# random_state is pinned so repeated runs use the same seed.
mi = mutual_info_classif(
    X=counts_train,
    y=news_train.target,
    random_state=666,
)

In [45]:
# Column indices of the 200 terms with the highest mutual information, best
# first. Stored under its own name so the chi-squared selection held in
# imp_features is not overwritten; otherwise re-running the chi-squared
# slicing cell after this one would silently pick up these indices.
mi_features = np.argsort(mi)[::-1][:200]

# Restrict both splits to the mutual-information-selected columns.
mi_train = counts_train[:, mi_features]
mi_test = counts_test[:, mi_features]

In [46]:
# L2-penalised logistic regression on the mutual-information-selected
# features. NOTE(review): this rebinds logisticRegr, replacing the model
# that was fitted on the chi-squared features.
logisticRegr = LogisticRegression(penalty='l2')
# Kept as the cell's last expression so the fitted estimator is displayed.
logisticRegr.fit(X=mi_train, y=news_train.target)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [48]:
# Training accuracy of the mutual-information model.
yhat = logisticRegr.predict(mi_train)
print(np.mean(yhat == news_train.target))

# Accuracy on the test split. The mean divides by this split's own size,
# instead of chi_counts_test.shape[0] borrowed from the chi-squared section,
# so this cell no longer depends on that section having been run.
yhat_test = logisticRegr.predict(mi_test)
print(np.mean(yhat_test == news_test.target))

0.7352837192858406
0.6313064259160913
