In [0]:
#!pip install eli5

In [0]:
import sklearn
import eli5
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier


Возьмем 4 произвольные темы из fetch_20newsgroups

In [0]:
categories = [ 'rec.autos',  'rec.sport.hockey',  'sci.space',  'talk.politics.mideast']
twenty_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True)
twenty_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True)

In [0]:
X_train = twenty_train.data
X_test = twenty_test.data
y_train = twenty_train.target
y_test = twenty_test.target

Векторизуем датасет

In [0]:
count_vect = CountVectorizer()
count_vect.fit(X_train)
X_train_vec = count_vect.transform(X_train)
X_test_vec = count_vect.transform(X_test)

Напишем функцию для grid_search

In [0]:
def evaluation (model, grid_param, X_train, y_train, X_test, y_test, folds=5):
  #  если параметр cv - целое число, то используется StratifiedKFold
  grid_search = GridSearchCV(model, param_grid=grid_param,  scoring='f1_macro', cv=folds)
  grid_search.fit(X_train, y_train)
  return grid_search

Функция оценки полученных классификаторов на тестовых данных

In [0]:
def validation(model, X_train, y_train, X_test, y_test):  
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  top_f1 = f1_score(y_test, y_pred, average='macro')
  return top_f1

Зададим параметры grid_search для разных классификаторов

In [0]:
parameter_grid_lr = {'C': [0.1, 1, 5], 'penalty' : ['l2', 'l1'], 'solver': ['liblinear', 'saga']}
parameter_grid_svc = {'C': [0.01, 0.1, 1, 5, 10], 'penalty' : ['l2', 'l1'], 'loss': ['squared_hinge', 'hinge']}
parameter_grid_sgd = {'alpha': [0.001, 0.01, 0.1, 1], 'penalty': ['l2', 'l1', 'elasticnet'], 'loss': ['log', 'hinge', 'perceptron']}

Проверим 3 классификатора

LogisticRegression

In [0]:
grid_search = evaluation(LogisticRegression(), parameter_grid_lr, X_train_vec, y_train, X_test_vec, y_test)

In [16]:
grid_search.best_params_

{'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}

In [0]:
top_model_lr = LogisticRegression(**grid_search.best_params_)
f1_lr = validation(top_model_lr, X_train_vec, y_train, X_test_vec, y_test) 

In [18]:
print('LogisticRegression train f1: {}'.format(grid_search.best_score_))
print('LogisticRegression test f1: {}'.format(f1_lr))

LogisticRegression train f1: 0.9800913785568566
LogisticRegression test f1: 0.9615435029357767


LinearSVC

In [0]:
grid_search2 = evaluation(LinearSVC(), parameter_grid_svc, X_train_vec, y_train, X_test_vec, y_test)

In [0]:
top_model_svc = LinearSVC(**grid_search2.best_params_)
f1_svc = validation(top_model_svc, X_train_vec, y_train, X_test_vec, y_test) 

In [152]:
grid_search2.best_params_

{'C': 0.01, 'loss': 'squared_hinge', 'penalty': 'l2'}

In [153]:
print('LinearSVC train f1: {}'.format(grid_search2.best_score_))
print('LinearSVC test f1: {}'.format(f1_svc))

LinearSVC train f1: 0.978437281437316
LinearSVC test f1: 0.9628275959010326


SGDClassifier

In [0]:
grid_search3 = evaluation(SGDClassifier(), parameter_grid_sgd, X_train_vec, y_train, X_test_vec, y_test)

In [144]:
grid_search3.best_params_

{'alpha': 0.1, 'loss': 'hinge', 'penalty': 'l2'}

In [0]:
top_model_sgd = SGDClassifier(**grid_search3.best_params_)

f1_sgd =validation(top_model_sgd, X_train_vec, y_train, X_test_vec, y_test) 

In [146]:
print('SGDClassifier train f1: {}'.format(grid_search3.best_score_))
print('SGDClassifier test f1: {}'.format(f1_sgd))

SGDClassifier train f1: 0.9784558145371595
SGDClassifier test f1: 0.9581390156846059


Функция analyze_features(model, n)

In [0]:
index_to_word = {v:k for k,v in count_vect.vocabulary_.items()} 

In [0]:
def analyze_features(model, n):
  df_list = []
  for i in range (len(categories)): 
    df = eli5.formatters.as_dataframe.explain_weights_df(model) 
    df_0 = df[df.target==i].sort_values(by=['weight'], ascending=False)[:n]
    df_0['feature'] = df_0['feature'].str[1:].astype(int)
    df_0['word'] =  df_0['feature'].map(index_to_word)
    df_0['category'] = categories[i]
    df_0 = df_0[['word', 'weight', 'feature', 'category']]
    df_0.index =  np.arange(1, len(df_0)+1)
    df_list += [df_0]
  main_df = pd.concat(df_list)
  return main_df

In [125]:
analyze_features(top_model_lr, 10)

Unnamed: 0,word,weight,feature,category
1,car,1.59672,9145,rec.autos
2,cars,0.983316,9213,rec.autos
3,hp,0.581088,18332,rec.autos
4,auto,0.550256,6754,rec.autos
5,my,0.546484,24010,rec.autos
6,bmw,0.545183,7922,rec.autos
7,automotive,0.494071,6778,rec.autos
8,distribution,0.473639,13010,rec.autos
9,ford,0.469328,15728,rec.autos
10,honda,0.45944,18169,rec.autos


Ошибки:
Cars: my; hockey: pittsburgh; space: access; mideast: columbia, argic, fourd

In [124]:
analyze_features(top_model_svc, 10)

Unnamed: 0,word,weight,feature,category
1,car,0.292303,9145,rec.autos
2,cars,0.176118,9213,rec.autos
3,hp,0.112859,18332,rec.autos
4,bmw,0.105531,7922,rec.autos
5,distribution,0.103119,13010,rec.autos
6,auto,0.101252,6754,rec.autos
7,usa,0.097957,35425,rec.autos
8,my,0.095346,24010,rec.autos
9,automotive,0.092963,6778,rec.autos
10,ford,0.090654,15728,rec.autos


Ошибки:
Cars: my,usa; hockey: pittsburgh; space: -; mideast: columbia, angmar

In [126]:
analyze_features(top_model_sgd, 10)

Unnamed: 0,word,weight,feature,category
1,car,0.340239,9145,rec.autos
2,cars,0.210523,9213,rec.autos
3,hp,0.112438,18332,rec.autos
4,distribution,0.111641,13010,rec.autos
5,ford,0.103135,15728,rec.autos
6,my,0.102869,24010,rec.autos
7,usa,0.101008,35425,rec.autos
8,bmw,0.093832,7922,rec.autos
9,drive,0.087718,13359,rec.autos
10,engine,0.087452,14126,rec.autos


Ошибки:
Cars: my,usa; hockey: pittsburgh; space: alaska; mideast: pro, argic, cosmo, angmar

Поменяем параметры CountVectorizer

In [0]:
cv = CountVectorizer( min_df=1, max_df=0.85)


In [0]:
cv.fit(X_train)

In [0]:
X_train_vec = cv.transform(X_train)
X_test_vec = cv.transform(X_test)

In [0]:
def compare(name, trainf1_b, testf1_b, trainf1_n, testf1_n):
  print('before:')
  print('{} train f1: {}'.format(name, trainf1_b))
  print('{} test f1: {}'.format(name, testf1_b))
  print('now:')
  print('{} train f1: {}'.format(name, trainf1_n))
  print('{} test f1: {}'.format(name, testf1_n))

Svc

In [0]:
grid_search4 = evaluation(LinearSVC(), parameter_grid_svc, X_train_vec, y_train, X_test_vec, y_test)

In [0]:
top_model_svc = LinearSVC(**grid_search4.best_params_)
f1_svc_2 = validation(top_model_svc, X_train_vec, y_train, X_test_vec, y_test) 

In [274]:
compare('LinearSVC', grid_search2.best_score_, f1_svc, grid_search4.best_score_, f1_svc_2)

before:
LinearSVC train f1: 0.978437281437316
LinearSVC test f1: 0.9628275959010326
now:
LinearSVC train f1: 0.980575175567062
LinearSVC test f1: 0.9634799973877469


SGDClassifier

In [0]:
grid_search5 = evaluation(SGDClassifier(), parameter_grid_sgd, X_train_vec, y_train, X_test_vec, y_test)

In [0]:
top_model_sgd = SGDClassifier(**grid_search5.best_params_)
f1_sgd_2 = validation(top_model_sgd, X_train_vec, y_train, X_test_vec, y_test) 

In [279]:
compare('SGDClassifier', grid_search3.best_score_, f1_sgd, grid_search5.best_score_, f1_sgd_2)

before:
SGDClassifier train f1: 0.9784558145371595
SGDClassifier test f1: 0.9581390156846059
now:
SGDClassifier train f1: 0.9780226387684386
SGDClassifier test f1: 0.9610239830911664


LogisticRegression

In [0]:
grid_search6 = evaluation(LogisticRegression(), parameter_grid_lr, X_train_vec, y_train, X_test_vec, y_test)

In [0]:
top_model_lr = LogisticRegression(**grid_search6.best_params_)
f1_lr_2 = validation(top_model_lr, X_train_vec, y_train, X_test_vec, y_test) 

In [283]:
compare('Logistic Regression', grid_search5.best_score_, f1_lr, grid_search5.best_score_, f1_lr_2)

before:
Logistic Regression train f1: 0.9780226387684386
Logistic Regression test f1: 0.9615435029357767
now:
Logistic Regression train f1: 0.9780226387684386
Logistic Regression test f1: 0.9660788217300081


Уменьшение параметра max_df до 0.85 немного улучшило результаты для всех моделей при проверке на тестовых данных