In [46]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold, GridSearchCV

In [47]:
# Load both the train and test portions ('all') of the two newsgroups
# that form the binary classification problem.
newsgroups = datasets.fetch_20newsgroups(
    categories=['alt.atheism', 'sci.space'],
    subset='all',
)

In [48]:
# Raw post texts and their class labels, unpacked from the fetched bunch.
data, target = newsgroups.data, newsgroups.target

In [49]:
# TF-IDF vectorizer created with library defaults (no arguments are overridden here).
vectorizer = TfidfVectorizer()

In [50]:
# Fit the vectorizer on every document and transform them into the feature matrix.
# NOTE(review): the fit uses the full corpus before the cross-validation split
# performed later, so held-out folds also contribute to the TF-IDF statistics —
# confirm this is intended.
X = vectorizer.fit_transform(data)

In [51]:
# Column index -> vocabulary term lookup for the TF-IDF matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() when the installed version provides it and fall
# back to the legacy method otherwise. list(...) keeps the original list type.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_mapping = list(vectorizer.get_feature_names_out())
else:
    feature_mapping = list(vectorizer.get_feature_names())

In [55]:
# Linear-kernel SVM whose regularisation strength C is tuned below.
clf = SVC(kernel='linear', random_state=241)

# Candidate C values: powers of ten from 1e-5 up to 1e5 inclusive.
grid = {'C': 10.0 ** np.arange(-5, 6)}

# Shuffled 5-fold split with a fixed seed so the folds are reproducible.
cv = KFold(n_splits=5, shuffle=True, random_state=241)

# Exhaustive search over the grid, scored by accuracy on each fold.
gs = GridSearchCV(estimator=clf, param_grid=grid, scoring='accuracy', cv=cv)

In [56]:
# Run the cross-validated search over every C in `grid`; the fitted search
# object is the cell's displayed value.
gs.fit(X, target)

GridSearchCV(cv=KFold(n_splits=5, random_state=241, shuffle=True),
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='linear', max_iter=-1,
                           probability=False, random_state=241, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02,
       1.e+03, 1.e+04, 1.e+05])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [81]:
# C value of the estimator that the grid search selected as best.
gs.best_estimator_.C

1.0

In [59]:
# Build the final linear SVM with the C chosen by the grid search. Reading it
# from `gs` (instead of hard-coding the value seen in a previous run) keeps this
# cell correct if the grid, the data or the seed changes.
new_clf = SVC(C=gs.best_params_['C'], random_state=241, kernel='linear')

In [60]:
# Train the final classifier on the complete feature matrix and labels.
new_clf.fit(X, target)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=241, shrinking=True, tol=0.001,
    verbose=False)

In [85]:
# Feature weights of the fitted linear SVM.
# NOTE(review): presumably a sparse (1, n_features) matrix because X is sparse —
# the following cell calls .toarray() on it; confirm.
coef = new_clf.coef_

In [98]:
# Densify the weights and transpose so each row is one feature and column 0
# holds its weight.
res = pd.DataFrame(coef.toarray()).transpose()

# Rank features by absolute weight and keep the ten largest; the surviving
# index labels are the feature (column) ids.
top = res.abs().sort_values(by=[0], ascending=False).head(10)

# Translate the feature ids back into vocabulary terms.
words = [feature_mapping[feature_id] for feature_id in top.index]

In [100]:
# Persist the answer: the top words in alphabetical order, space-separated,
# without a trailing newline. The encoding is pinned because vocabulary tokens
# may contain non-ASCII characters and the platform default encoding varies.
with open("ans2", "w", encoding="utf-8") as f:
    f.write(" ".join(sorted(words)))