In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_recall_curve
from sklearn import datasets
from sklearn.metrics import roc_auc_score
from numpy.linalg import norm

In [2]:
data = pd.read_csv('data/svm-data.csv', header=None)

In [3]:
data.head()

Unnamed: 0,0,1,2
0,0.0,0.7,0.29
1,1.0,0.23,0.55
2,0.0,0.72,0.42
3,0.0,0.98,0.68
4,0.0,0.48,0.39


In [4]:
y = data[0]
x = data.loc[:, [1, 2]]

In [5]:
from sklearn.svm import SVC

In [6]:
model = SVC(C=100000, kernel='linear', random_state=241)

In [7]:
model.fit(x, y)

SVC(C=100000, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=241, shrinking=True,
  tol=0.001, verbose=False)

In [8]:
model.support_

array([3, 4, 9], dtype=int32)

## Задание 2

In [9]:
newsgroups = datasets.fetch_20newsgroups(
                    subset='all', 
                    categories=['alt.atheism', 'sci.space']
             )

In [10]:
x = newsgroups.data
y = newsgroups.target

In [11]:
cv = TfidfVectorizer()
cv_fit = cv.fit_transform(x)

In [12]:
grid = {'C': np.power(10.0, np.arange(-5, 6))}
k_fold = KFold(n_splits=5, shuffle=True, random_state=241)
clf = SVC(kernel='linear', random_state=241)
gs = GridSearchCV(clf, grid, scoring='accuracy', cv=k_fold)
gs.fit(cv_fit, y)

GridSearchCV(cv=KFold(n_splits=5, random_state=241, shuffle=True),
       error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=241, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': array([  1.00000e-05,   1.00000e-04,   1.00000e-03,   1.00000e-02,
         1.00000e-01,   1.00000e+00,   1.00000e+01,   1.00000e+02,
         1.00000e+03,   1.00000e+04,   1.00000e+05])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [13]:
gs.best_estimator_

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=241, shrinking=True,
  tol=0.001, verbose=False)

In [14]:
model = SVC(C=1, kernel='linear', random_state=241)

In [15]:
model.fit(cv_fit, y)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=241, shrinking=True,
  tol=0.001, verbose=False)

In [16]:
words = model.coef_[0].toarray()[0]
idx = np.argsort(abs(words))[-10:]

In [17]:
feature_mapping = cv.get_feature_names()

In [18]:
','.join(sorted([feature_mapping[i] for i in idx]))

'atheism,atheists,bible,god,keith,moon,religion,sci,sky,space'

## Задание 3

In [19]:
data = pd.read_csv('data/data-logistic.csv', header=None)

In [20]:
x = data.loc[:, [1, 2]]
y = data[0]

In [21]:
w0 = np.array([0.0, 0.0])
w = np.array([0.0, 0.0])
M = 10000
k = 0.1
eps = 1e-5
it = 0
c = 10
l = x.shape[0]

while True:
    w[0] = w0[0] + k / l * np.sum([yi * xi[0] * (1 - 1/(1 + np.exp(-yi * (w0[0] * xi[0] + w0[1] * xi[1])))) 
                                       for xi, yi in zip(x.values, y.values)]) - k*c*w0[0] 
    w[1] = w0[1] + k / l * np.sum([yi * xi[1] * (1 - 1/(1 + np.exp(-yi * (w0[0] * xi[0] + w0[1] * xi[1]))))
                                       for xi, yi in zip(x.values, y.values)]) - k*c*w0[1] 
    it += 1
    if norm(w - w0) < eps or it == M:
        break
    w0 = w.copy()
# Answer -- w

In [22]:
res2

NameError: name 'res2' is not defined

In [None]:
round(roc_auc_score(y, 1 / (1 + np.exp(-res1[0] * x[1] - res1[1] * x[2]))), 3)

In [None]:
round(roc_auc_score(y, 1 / (1 + np.exp(-res2[0] * x[1] - res2[1] * x[2]))), 3)

## Задание 3

In [None]:
data = pd.read_csv('data/classification.csv')

In [None]:
data.head()

In [None]:
pd.crosstab(data['pred'], data['true'])

In [None]:
from sklearn import metrics

In [None]:
print(metrics.accuracy_score(data['true'], data['pred']).round(2))

In [None]:
print(metrics.precision_score(data['true'], data['pred']).round(2))

In [None]:
print(metrics.recall_score(data['true'], data['pred']).round(2))

In [None]:
print(metrics.f1_score(data['true'], data['pred']).round(2))

In [None]:
data = pd.read_csv('data/scores.csv')

In [None]:
data.head()

In [None]:
for col_name in data.columns[1:]:
    score = roc_auc_score(data['true'], data[col_name])
    print(f'{col_name} : {score}')

In [None]:
for col_name in data.columns[1:]:
    score = precision_recall_curve(data['true'], data[col_name])
    print(f'{col_name} : {np.max(score[0][np.argwhere(score[1] >= 0.7)]).round(2)}')

In [None]:
round(900/1000, 2)