In [1]:
import pandas as pd
import numpy  as np
import matplotlib.pyplot as plt

from sklearn.grid_search      import GridSearchCV
from sklearn.svm              import SVC
from sklearn.cross_validation import KFold
from sklearn                  import datasets
from sklearn.feature_extraction.text import TfidfVectorizer

# Load every post (train and test splits together, subset='all') from the two
# newsgroups of interest; the rest of the notebook treats this as a
# two-class text classification problem.
newsgroups = datasets.fetch_20newsgroups(subset='all', categories=['alt.atheism', 'sci.space'])

# X holds the loaded posts and y the per-post class labels.
X = newsgroups.data
y = newsgroups.target

In [2]:
# Build TF-IDF features for the whole corpus: one row per post, one column
# per vocabulary term learned by the vectorizer.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(X)

# Sanity check: the number of rows in the feature matrix must match the
# number of labels. A single pre-formatted string is printed so the line
# behaves the same under the Python 2 print statement and the Python 3
# print function.
print("%s %s" % (vectors.shape, y.shape))

(1786, 28382) (1786L,)


In [3]:
# Baseline linear-kernel SVM with the default regularisation strength
# (C=1.0); the seed is pinned for reproducibility.
clf = SVC(random_state=241, kernel='linear')

# Fit on the full TF-IDF matrix; leaving the call as the last expression
# makes the notebook display the fitted estimator's parameters.
clf.fit(X=vectors, y=y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=241, shrinking=True,
  tol=0.001, verbose=False)

In [4]:
# Candidate values for the SVM regularisation parameter C:
# eleven powers of ten from 1e-5 up to 1e5.
grid = {'C': 10.0 ** np.arange(-5, 6)}

# Shuffled 5-fold split over all samples with a fixed seed (legacy
# sklearn.cross_validation API, which takes the sample count up front).
cv = KFold(n=len(y), n_folds=5, shuffle=True, random_state=241)

# Exhaustive search over the grid, scoring each C by cross-validated accuracy.
gs = GridSearchCV(
    estimator=clf,
    param_grid=grid,
    scoring='accuracy',
    cv=cv,
).fit(vectors, y)

In [5]:
# Take the weights from the model selected by the grid search, not from `clf`.
# GridSearchCV works on clones of the estimator it is given, so `clf` is still
# the default C=1.0 fit. With the default refit=True, `gs.best_estimator_` is
# the SVM retrained on the whole dataset with the best C found.
#
# For sparse training data `coef_` is a sparse 1 x n_features matrix, so it is
# densified and its only row taken. The absolute value ranks words by how
# strongly they influence the decision, regardless of which class they favour.
coef_ = np.abs(gs.best_estimator_.coef_.toarray()[0])
print(coef_)

[ 0.29258057  0.12314757  0.         ...,  0.01972862  0.05831336
  0.00297347]


In [6]:
# Column indices of the ten largest absolute weights. argsort orders
# ascending, so the tail of the result holds the top ten, with the
# heaviest weight last.
sort = np.argsort(coef_)[-10:]
print(sort)

[22936 15606  5776 21850 23673 17802  5093  5088 12871 24019]


In [7]:
categories = ['alt.atheism', 'sci.space']

# Vocabulary list indexed by feature column, used to turn the selected
# column indices back into words.
feature_mapping = vectorizer.get_feature_names()

# Look up the words first, then print them one per line in the same order
# as the indices in `sort`.
top_words = [feature_mapping[i] for i in sort]
for word in top_words:
    print(word)

sci
keith
bible
religion
sky
moon
atheists
atheism
god
space
