In [1]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import KFold

In [4]:
# Load only the two newsgroups being told apart; subset='all' asks for the
# complete collection rather than a single train/test portion.
news_groups = datasets.fetch_20newsgroups(categories=['alt.atheism', 'sci.space'], subset='all')

In [5]:
# Raw documents (x) and their class labels (y), taken from the loaded dataset.
x, y = news_groups.data, news_groups.target

In [7]:
# TF-IDF text vectoriser with all-default settings.
vectorizer = TfidfVectorizer()

In [8]:
# Fit the vectoriser on every document and produce the TF-IDF feature matrix.
# The fit uses the full corpus, i.e. it happens before any cross-validation
# split, so validation-fold documents contribute to the learned statistics.
# NOTE(review): `tf_tdf` is presumably a typo for `tf_idf`; the name is kept
# because later cells reference it.
tf_tdf = vectorizer.fit_transform(x)

In [18]:
# Candidate values for the SVM regularisation parameter C:
# one per decade from 10^-5 up to 10^5 (11 values).
grid = dict(C=10.0 ** np.arange(-5, 6))

In [21]:
# 5-fold splitter over all y.size samples; shuffling with a fixed
# random_state keeps the fold assignment the same on every run.
cv = KFold(y.size, n_folds=5, shuffle=True, random_state=241)

In [22]:
# Linear-kernel SVM used as the base estimator for the grid search below;
# C is not set here because the search supplies it.
clf = SVC(kernel='linear', random_state=241)

In [23]:
# Search over the candidate C values in `grid`, scoring each one by
# accuracy on the folds produced by `cv`.
gs = GridSearchCV(clf, grid, scoring='accuracy', cv=cv)

In [34]:
# Run the grid search on the TF-IDF features. As the last expression of the
# cell, the fitted GridSearchCV object is echoed as the cell output.
gs.fit(tf_tdf, y)

GridSearchCV(cv=sklearn.cross_validation.KFold(n=1786, n_folds=5, shuffle=True, random_state=241),
       error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=241, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': array([  1.00000e-05,   1.00000e-04,   1.00000e-03,   1.00000e-02,
         1.00000e-01,   1.00000e+00,   1.00000e+01,   1.00000e+02,
         1.00000e+03,   1.00000e+04,   1.00000e+05])},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)

In [38]:
# Report the mean cross-validated accuracy next to each candidate parameter
# setting. The line is built with %-formatting and passed to print(...) as a
# single argument so the cell parses on both Python 2 and Python 3; on
# Python 2 the text is identical to `print score, params`.
for entry in gs.grid_scores_:
    print('%s %s' % (entry.mean_validation_score, entry.parameters))

 0.552631578947 {'C': 1.0000000000000001e-05}
0.552631578947 {'C': 0.0001}
0.552631578947 {'C': 0.001}
0.552631578947 {'C': 0.01}
0.950167973124 {'C': 0.10000000000000001}
0.993281075028 {'C': 1.0}
0.993281075028 {'C': 10.0}
0.993281075028 {'C': 100.0}
0.993281075028 {'C': 1000.0}
0.993281075028 {'C': 10000.0}
0.993281075028 {'C': 100000.0}


In [40]:
# Show the parameter setting the search selected as best.
# NOTE: in the scores printed above, every C >= 1.0 reaches the same mean
# accuracy, so the selected value is one of several tied candidates.
gs.best_params_

{'C': 1.0}

In [41]:
# Final model: same linear SVM, but configured with the parameters chosen by
# the grid search. Passing gs.best_params_ explicitly ties this model to the
# search result instead of relying on SVC's default C happening to equal it.
release_clf = SVC(kernel='linear', random_state=241, **gs.best_params_)

In [43]:
# Train the final classifier on the entire TF-IDF matrix (no hold-out).
release_clf.fit(tf_tdf, y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=241, shrinking=True,
  tol=0.001, verbose=False)

In [82]:
# Positions of the 10 largest-magnitude coefficients of the fitted model.
# argsort is ascending, so the last 10 entries are the biggest, largest last.
np.argsort(np.abs(release_clf.coef_.toarray()).ravel())[-10:]

array([22936, 15606,  5776, 21850, 23673, 17802,  5093,  5088, 12871, 24019])

In [89]:
# Same selection via pandas: flatten the coefficients, take absolute values,
# and keep the index labels of the 10 largest (largest magnitude first).
top_idx = (
    pd.Series(np.ravel(release_clf.coef_.toarray()))
    .abs()
    .nlargest(10)
    .index
)

In [90]:
# Map the selected feature indices back to their vocabulary terms.
# The feature-name list is fetched once up front: calling
# get_feature_names() inside the comprehension would rebuild the full
# vocabulary list for every index.
feature_names = vectorizer.get_feature_names()
words = [feature_names[i] for i in top_idx]

In [91]:
# Display the selected terms in the order given by top_idx.
words

[u'space',
 u'god',
 u'atheism',
 u'atheists',
 u'moon',
 u'sky',
 u'religion',
 u'bible',
 u'keith',
 u'sci']

In [94]:
# Put the terms into ascending order, updating the existing list in place.
words[:] = sorted(words)

In [95]:
# Display the terms again, now in sorted order.
words

[u'atheism',
 u'atheists',
 u'bible',
 u'god',
 u'keith',
 u'moon',
 u'religion',
 u'sci',
 u'sky',
 u'space']