In [121]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
from eli5 import show_weights
from nltk.corpus import stopwords

In [2]:
cs = ['comp.sys.ibm.pc.hardware', 'sci.electronics', 'talk.politics.mideast', 'talk.politics.misc']
newsgroups_train = fetch_20newsgroups(subset='train', categories=cs)
newsgroups_test = fetch_20newsgroups(subset='test', categories=cs)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [41]:
cv = CountVectorizer(50)
cv_train = cv.fit_transform(newsgroups_train.data)
cv_test = cv.transform(newsgroups_test.data)

In [48]:
def print_metrics(model):
    y_pred_LogReg = model.best_estimator_.predict(cv_train)
    train_score_LogReg  = f1_score(newsgroups_train.target, y_pred_LogReg, average='macro')
    print(train_score_LogReg)
    y_pred_LogReg = model.best_estimator_.predict(cv_test)
    test_score_LogReg  = f1_score(newsgroups_test.target, y_pred_LogReg, average='macro')
    print(test_score_LogReg)

In [42]:
folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

params = {
    'weights': ('uniform', 'distance'),
    'algorithm': ('auto', 'ball_tree', 'kd_tree', 'brute'),
    'leaf_size': [10, 20, 30, 40, 50],
    'n_neighbors': list(range(4, 20)),
}
model1 = GridSearchCV(KNeighborsClassifier(), params, cv=folds, n_jobs=-1, verbose=True)
model1.fit(cv_train, newsgroups_train.target)

Fitting 10 folds for each of 640 candidates, totalling 6400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 332 tasks      | elapsed:    9.3s
[Parallel(n_jobs=-1)]: Done 832 tasks      | elapsed:   21.2s
[Parallel(n_jobs=-1)]: Done 1532 tasks      | elapsed:   37.7s
[Parallel(n_jobs=-1)]: Done 2432 tasks      | elapsed:   59.7s
[Parallel(n_jobs=-1)]: Done 3532 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 4832 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 6332 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 6393 out of 6400 | elapsed:  2.6min remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 6400 out of 6400 | elapsed:  2.6min finished


GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=0, shuffle=True),
             error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=-1,
             param_grid={'algorithm': ('auto', 'ball_tree', 'kd_tree', 'brute'),
                         'leaf_size': [10, 20, 30, 40, 50],
                         'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
                                         15, 16, 17, 18, 19],
                         'weights': ('uniform', 'distance')},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=True)

In [49]:
print_metrics(model1)

1.0
0.589462176908942


In [52]:
params2 = {
    'alpha': [x / 10 for x in range(0, 10)],
    'fit_prior': (True, False),
}
model2 = GridSearchCV(MultinomialNB(), params2, cv=folds, n_jobs=-1, verbose=True)
model2.fit(cv_train, newsgroups_train.target)

Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    1.5s finished


GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=0, shuffle=True),
             error_score='raise-deprecating',
             estimator=MultinomialNB(alpha=1.0, class_prior=None,
                                     fit_prior=True),
             iid='warn', n_jobs=-1,
             param_grid={'alpha': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8,
                                   0.9],
                         'fit_prior': (True, False)},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=True)

In [53]:
print_metrics(model2)

0.9964802157252681
0.9071745496879822


In [59]:
params3 = {
    'C': [x / 10 for x in range(1, 11)],
    'solver': ('newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'),
}
model3 = GridSearchCV(LogisticRegression(), params3, cv=folds, n_jobs=-1, verbose=True)
model3.fit(cv_train, newsgroups_train.target)

Fitting 10 folds for each of 50 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   21.6s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  4.4min finished


GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=0, shuffle=True),
             error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9,
                               1.0],
                         'solver': ('newton-cg', 'lbfgs', 'liblinear', 'sag',
                                    'saga')},
             pre_dispatch='2*n_jobs', 

In [60]:
print_metrics(model3)

1.0
0.8840173243272906


In [64]:
index_to_word = {v:k for k,v in cv.vocabulary_.items()}

In [114]:
def analyze_features(weights, n):
    model_weights = [abs(w) for w in weights]
    d = cv.get_feature_names()
    features = [(weight, word) for weight, word in zip(weights, d)]
    features.sort(reverse=True, key=lambda x: x[0])
    return features

In [117]:
show_weights(model1.best_estimator_, top=10)

In [119]:
show_weights(model2.best_estimator_, top=10)

In [120]:
show_weights(model3.best_estimator_, top=10)

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3
+0.778,x21621,,
+0.671,x24641,,
+0.643,x12559,,
+0.640,x8433,,
+0.635,x22373,,
+0.628,x15455,,
+0.624,x12407,,
+0.622,x34534,,
+0.616,x28627,,
+0.603,x2750,,

Weight?,Feature
+0.778,x21621
+0.671,x24641
+0.643,x12559
+0.640,x8433
+0.635,x22373
+0.628,x15455
+0.624,x12407
+0.622,x34534
+0.616,x28627
+0.603,x2750

Weight?,Feature
+0.751,x13120
+0.725,x9112
+0.684,x25547
+0.649,x22390
+0.602,x32610
+0.588,x26487
+0.578,x33306
+0.560,x11623
… 12896 more positive …,… 12896 more positive …
… 22353 more negative …,… 22353 more negative …

Weight?,Feature
+1.238,x18770
+1.150,x18771
+0.659,x19024
+0.532,x32581
+0.521,x28899
+0.511,x15039
+0.497,x28421
… 10694 more positive …,… 10694 more positive …
… 24555 more negative …,… 24555 more negative …
-0.487,x31704

Weight?,Feature
+0.827,x9278
+0.594,x19338
+0.516,x20979
+0.506,x20101
+0.492,x28980
+0.484,x7183
… 12134 more positive …,… 12134 more positive …
… 23115 more negative …,… 23115 more negative …
-0.569,<BIAS>
-0.591,x30763


Хорошо, что хоть регрессию поддерживает((  
Ошибко не видно, но можно взять что-то поумнее чем тупо `CountVectorizer(50)`

In [122]:
cv = CountVectorizer(ngram_range=(1,3), stop_words=stopwords.words('english'))
cv_train = cv.fit_transform(newsgroups_train.data)
cv_test = cv.transform(newsgroups_test.data)

In [123]:
params = {
    'weights': ('uniform', 'distance'),
    'algorithm': ('auto', 'ball_tree', 'kd_tree', 'brute'),
    'leaf_size': [10, 20, 30, 40, 50],
    'n_neighbors': list(range(4, 20)),
}
model1 = GridSearchCV(KNeighborsClassifier(), params, cv=folds, n_jobs=-1, verbose=True)
model1.fit(cv_train, newsgroups_train.target)

Fitting 10 folds for each of 640 candidates, totalling 6400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   10.6s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   23.0s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:   40.0s
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 4042 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 4992 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 6042 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 6400 out of 6400 | elapsed:  5.1min finished


GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=0, shuffle=True),
             error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=-1,
             param_grid={'algorithm': ('auto', 'ball_tree', 'kd_tree', 'brute'),
                         'leaf_size': [10, 20, 30, 40, 50],
                         'n_neighbors': [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
                                         15, 16, 17, 18, 19],
                         'weights': ('uniform', 'distance')},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=True)

In [124]:
print_metrics(model1)

1.0
0.5088860309452249


In [125]:
params2 = {
    'alpha': [x / 10 for x in range(0, 10)],
    'fit_prior': (True, False),
}
model2 = GridSearchCV(MultinomialNB(), params2, cv=folds, n_jobs=-1, verbose=True)
model2.fit(cv_train, newsgroups_train.target)

Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    7.1s finished


GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=0, shuffle=True),
             error_score='raise-deprecating',
             estimator=MultinomialNB(alpha=1.0, class_prior=None,
                                     fit_prior=True),
             iid='warn', n_jobs=-1,
             param_grid={'alpha': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8,
                                   0.9],
                         'fit_prior': (True, False)},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=True)

In [126]:
print_metrics(model2)

0.9995766299745978
0.9014636600890609


In [127]:
params3 = {
    'C': [x / 10 for x in range(1, 11)],
    'solver': ('newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'),
}
model3 = GridSearchCV(LogisticRegression(), params3, cv=folds, n_jobs=-1, verbose=True)
model3.fit(cv_train, newsgroups_train.target)

Fitting 10 folds for each of 50 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 21.8min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 52.2min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 59.2min finished


GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=0, shuffle=True),
             error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9,
                               1.0],
                         'solver': ('newton-cg', 'lbfgs', 'liblinear', 'sag',
                                    'saga')},
             pre_dispatch='2*n_jobs', 

In [128]:
print_metrics(model3)

1.0
0.8862395402742882
