In [17]:
from sklearn import datasets
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.svm import SVC

In [3]:
# Load the 20 Newsgroups corpus with subset='all', restricted to the two
# categories that form the binary classification task used below.
newsgroups = datasets.fetch_20newsgroups(
    subset='all',
    categories=['alt.atheism', 'sci.space'],
)

In [5]:
# Raw documents and their class labels, taken straight from the loaded dataset.
X_train, y_train = newsgroups.data, newsgroups.target

In [11]:
# Turn the raw documents into TF-IDF features.
# NOTE: the vectorizer is fit on the whole corpus here, before the
# cross-validation in a later cell, so every fold shares the same
# vocabulary and IDF statistics.
vectorizer = TfidfVectorizer()
X_vect = vectorizer.fit_transform(X_train)

In [27]:
# Search the SVC regularisation strength C over the powers of ten
# 10^-5 ... 10^5, scoring each candidate by accuracy on a shuffled,
# seeded 5-fold split.
cv = KFold(n_splits=5, shuffle=True, random_state=241)
parameters = {'C': 10.0 ** np.arange(-5, 6)}
clf = SVC(kernel='linear', random_state=241)
gs = GridSearchCV(
    estimator=clf,
    param_grid=parameters,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
gs.fit(X_vect, y_train)

Fitting 5 folds for each of 11 candidates, totalling 55 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   25.0s
[Parallel(n_jobs=-1)]: Done  55 out of  55 | elapsed:   33.2s finished


GridSearchCV(cv=KFold(n_splits=5, random_state=241, shuffle=True),
             estimator=SVC(kernel='linear', random_state=241), n_jobs=-1,
             param_grid={'C': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02,
       1.e+03, 1.e+04, 1.e+05])},
             scoring='accuracy', verbose=1)

In [43]:
# Best regularisation strength found by the grid search.
# Index the dict directly instead of using .get(): a missing key then raises
# KeyError here rather than silently yielding None and passing C=None to SVC.
C = gs.best_params_['C']

In [44]:
# Rebuild the linear SVC with the C selected by the grid search, keeping the
# same kernel and seed as the estimator that was searched over.
clf = SVC(C=C, kernel='linear', random_state=241)

In [46]:
# Train the final classifier on the full vectorized corpus; the fitted
# estimator is the cell's displayed value.
clf.fit(X=X_vect, y=y_train)

SVC(kernel='linear', random_state=241)

In [47]:
# Vocabulary as an array, so it can be fancy-indexed by feature column later.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installs.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = np.asarray(vectorizer.get_feature_names_out())
else:
    words = np.array(vectorizer.get_feature_names())

In [57]:
# Pair each stored coefficient with its vocabulary word. coef_ is read through
# .data / .indices, i.e. it is treated as a sparse row whose .indices are
# feature column positions (assumes a single-row sparse coef_ -- TODO confirm).
words_weights = pd.Series(
    data=clf.coef_.data,
    index=words[clf.coef_.indices],
    name='weight',
)

# Ten words with the largest absolute weight (the sign is discarded here).
top_words = (
    words_weights
    .abs()
    .sort_values(ascending=False)
    .iloc[:10]
)

In [55]:
# Show the ten highest absolute weights, largest first.
top_words

space       2.663165
god         1.920379
atheism     1.254690
atheists    1.249180
moon        1.201611
sky         1.180132
religion    1.139081
bible       1.130612
keith       1.097094
sci         1.029307
Name: weight, dtype: float64

In [67]:
# The same ten words, listed in alphabetical order.
top_words.sort_index().index

Index(['atheism', 'atheists', 'bible', 'god', 'keith', 'moon', 'religion',
       'sci', 'sky', 'space'],
      dtype='object')