In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the toy SVM dataset. header = None tells pandas the file has no header
# row, so the first line is read as data and columns get integer labels.
df = pd.read_csv("svm-data.csv", header = None)
# Display the frame as the cell's output.
df

Unnamed: 0,0,1,2
0,0.0,0.7,0.29
1,1.0,0.23,0.55
2,0.0,0.72,0.42
3,0.0,0.98,0.68
4,0.0,0.48,0.39
5,1.0,0.34,0.73
6,0.0,0.44,0.06
7,1.0,0.4,0.74
8,0.0,0.18,0.18
9,1.0,0.53,0.53


In [3]:
# Column 0 is taken as the label vector; every other column is a feature.
y = df[0].to_numpy()
x = df.drop(columns=0).to_numpy()

In [4]:
# Linear-kernel SVM. C is set very high, i.e. very weak regularization, and the
# seed is pinned so repeated runs build the same estimator.
clf = SVC(
    C=100000,
    kernel="linear",
    random_state=241,
)

In [5]:
# Train the classifier on the feature matrix x and label vector y.
clf.fit(x, y)

In [6]:
# support_ lists the 0-based row indices of the support vectors; shift them to
# 1-based object numbers and show them in ascending order.
support_numbers = np.sort(clf.support_ + 1)
print(support_numbers)

[ 4  5 10]


In [7]:
# Fetch both the train and test parts ("all") of the two selected newsgroups.
# return_X_y=True yields a (documents, targets) pair instead of a Bunch object.
x, y = fetch_20newsgroups(
    subset="all",
    categories=["alt.atheism", "sci.space"],
    return_X_y=True,
)

In [8]:
# TF-IDF text vectorizer; no arguments are overridden, so library defaults apply.
vectorizer = TfidfVectorizer()

In [9]:
# Fit the vectorizer on the raw documents and transform them in one pass.
# Note: despite the name, x_scaled is the vectorizer's TF-IDF output, not a
# rescaled copy of some numeric input.
x_scaled = vectorizer.fit_transform(x)

In [10]:
# Candidate values for C: 10^-5, 10^-4, ..., 10^5 (eleven powers of ten).
c_exponents = np.arange(-5, 6)
grid = {"C": 10.0 ** c_exponents}

# Five shuffled folds with a fixed seed so the fold assignment is repeatable.
cv = KFold(n_splits=5, shuffle=True, random_state=241)

# Score every candidate C by cross-validated accuracy of a linear SVM.
clf = SVC(kernel="linear", random_state=241)
gs = GridSearchCV(estimator=clf, param_grid=grid, scoring="accuracy", cv=cv)
gs.fit(x_scaled, y)

In [17]:
# Value of C selected by the grid search (gs) as the best-scoring candidate.
best_C = gs.best_params_["C"]

In [18]:
# Build a fresh linear SVM with the selected C and train it on every document.
final_svc_params = {"kernel": "linear", "random_state": 241, "C": best_C}
clf = SVC(**final_svc_params)
clf.fit(x_scaled, y)

In [19]:
# Report the words whose weights in the fitted linear SVM have the largest
# magnitude, printed alphabetically as a comma-separated list.
TOP_WORDS_COUNT = 10

# coef_ is densified with .toarray() (as the original cell did) and flattened
# to one weight per vocabulary entry.
absolute_data = np.abs(clf.coef_.toarray().ravel())

# A stable argsort of the negated magnitudes gives positions in descending
# weight order. Unlike looking each value up with list.index(), tied weights
# resolve to distinct positions (no duplicated words), and the weight vector
# is processed once instead of being rescanned for every selected word.
descending_order = np.argsort(-absolute_data, kind="stable")
absolute_data_sorted_desc = absolute_data[descending_order]  # name kept for later cells
weight_indexes = descending_order[:TOP_WORDS_COUNT].tolist()

# Fetch the vocabulary once rather than rebuilding it for each index.
feature_names = vectorizer.get_feature_names_out()
words = [feature_names[index] for index in weight_indexes]
print(",".join(sorted(words)))

atheism,atheists,bible,god,keith,moon,religion,sci,sky,space
