In [3]:
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import heapq

In [101]:
all_categories = fetch_20newsgroups().target_names
all_categories

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

Возьмём темы из одного раздела, возможно, их будет сложнее отличать друг от друга

In [102]:
categories = [
    'sci.electronics',
    'sci.space',
    'sci.med',
]
train_data = fetch_20newsgroups(subset='train', categories=categories, remove=('headers', 'footers', 'quotes'))
test_data = fetch_20newsgroups(subset='test', categories=categories, remove=('headers', 'footers', 'quotes'))

Для векторизации текстов воспользуемся CountVectorizer, он представляет документ как мешок слов. Можно всячески варировать извлечение признаков (убирать редкие слова, убирать частые слова, убирать слова общей лексики, брать биграмы и т.д.)

In [103]:
from sklearn.feature_extraction.text import CountVectorizer

In [37]:
CountVectorizer()

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [38]:
count_vectorizer = CountVectorizer(min_df=5, ngram_range=(1, 2)) 

In [39]:
sparse_feature_matrix = count_vectorizer.fit_transform(train_data.data)
sparse_feature_matrix

<2372x13348 sparse matrix of type '<type 'numpy.int64'>'
	with 281155 stored elements in Compressed Sparse Row format>

In [40]:
num_2_words = {
    v: k
    for k, v in count_vectorizer.vocabulary_.iteritems()
}

In [41]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

Обучим логистическую регрессию для предсказания темы документа

In [42]:
algo = LogisticRegression()
algo.fit(sparse_feature_matrix, train_data.target)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

Слова с наибольшим положительным весом, являются характерными словами темы

In [43]:
W = algo.coef_.shape[1]
for c in algo.classes_:
    topic_words = [
        num_2_words[w_num]
        for w_num in heapq.nlargest(10, range(W), key=lambda w: algo.coef_[c, w])
    ]
    print ',  '.join(topic_words)


car,  cars,  gt,  toyota,  ford,  engine,  auto,  vw,  dealer,  oil
circuit,  electronics,  chips,  tv,  chip,  the number,  used,  power,  found,  you could
msg,  doctor,  medical,  blood,  health,  disease,  treatment,  corn,  needles,  skin
space,  orbit,  nasa,  thanks for,  launch,  earth,  solar,  moon,  spacecraft,  earlier


Сравним качество на фолдах с качеством на трейне и на отложенном тесте

In [44]:
algo = LogisticRegression()
arr = cross_val_score(algo, sparse_feature_matrix, train_data.target, cv=5, scoring='accuracy')
print arr
print np.mean(arr)

[ 0.79411765  0.78947368  0.81684211  0.7742616   0.7690678 ]
0.788752567304


Почему это неправильная кроссвалидация?

In [64]:
algo.fit(sparse_feature_matrix, train_data.target)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [46]:
accuracy_score(algo.predict(sparse_feature_matrix), train_data.target)

0.97765598650927488

In [47]:
accuracy_score(algo.predict(count_vectorizer.transform(test_data.data)), test_data.target)

0.75110829639012033

Мы видим переобучение, это проклятие размерности

In [48]:
algo = LogisticRegression(penalty='l1', C=0.1)
arr = cross_val_score(algo, sparse_feature_matrix, train_data.target, cv=5, scoring='accuracy')
print arr
print np.mean(arr)

[ 0.71638655  0.66947368  0.68842105  0.70464135  0.69067797]
0.693920121555


In [49]:
algo.fit(sparse_feature_matrix, train_data.target)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [50]:
accuracy_score(algo.predict(sparse_feature_matrix), train_data.target)

0.76096121416526141

In [51]:
accuracy_score(algo.predict(count_vectorizer.transform(test_data.data)), test_data.target)

0.66307789740341994

Добавление регуляризатора уменьшает отличие на трейне и тесте, но ухудшает качество. Поиграйтесь с параметрами регуляризации, чтобы получить максимальное качество.

Чтобы не делать векторизацию и обучение раздельно, есть удобный класс Pipeline. Он позволяет объединить в цепочку последовательность действий

In [52]:
from sklearn.pipeline import Pipeline

In [53]:
pipeline = Pipeline([("vectorizer", CountVectorizer(min_df=5, ngram_range=(1, 2))), ("algo", LogisticRegression())])

In [54]:
pipeline.fit(train_data.data, train_data.target)

Pipeline(steps=[('vectorizer', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=5,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
    ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [55]:
accuracy_score(pipeline.predict(train_data.data), train_data.target)

0.97765598650927488

In [56]:
accuracy_score(pipeline.predict(test_data.data), test_data.target)

0.75110829639012033

Значения ровно такие же как мы получали ранее, делаяя шаги раздельно.

In [57]:
from sklearn.pipeline import make_pipeline

При кроссвалидации нужно, чтобы CountVectorizer не обучался на тесте (иначе объекты становятся зависимыми). Pipeline позволяет это просто сделать.

In [58]:
pipeline = make_pipeline(CountVectorizer(min_df=5, ngram_range=(1, 2)), LogisticRegression())
arr = cross_val_score(pipeline, train_data.data, train_data.target, cv=5, scoring='accuracy')
print arr
print np.mean(arr)

[ 0.78781513  0.78526316  0.81684211  0.75527426  0.75847458]
0.780733845417


В Pipeline можно добавлять новые шаги препроцессинга данных

In [59]:
from sklearn.feature_extraction.text import TfidfTransformer

In [60]:
pipeline = make_pipeline(CountVectorizer(min_df=5, ngram_range=(1, 2)), TfidfTransformer(), LogisticRegression())
arr = cross_val_score(pipeline, train_data.data, train_data.target, cv=5, scoring='accuracy')
print arr
print np.mean(arr)

[ 0.84453782  0.84631579  0.83789474  0.81434599  0.81567797]
0.831754459821


In [61]:
pipeline.fit(train_data.data, train_data.target)

Pipeline(steps=[('countvectorizer', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=5,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [62]:
accuracy_score(pipeline.predict(train_data.data), train_data.target)

0.96543001686340646

In [63]:
accuracy_score(pipeline.predict(test_data.data), test_data.target)

0.79924002533248895

Качество стало немного лучше

#Задание

1. Поиграйтесь с параметрами регуляризации, параметрами CountVectorizer и TfidfTransformer, чтобы получить максимальное качество.
2. Постройте список важных слов и словосочетаний для каждой темы (на основе значений коэффициентов)

In [73]:
for C in np.arange(5, 15):
    print C
    pipeline = make_pipeline(CountVectorizer(min_df=1, ngram_range=(3, 3)), TfidfTransformer(), LogisticRegression(penalty='l2', max_iter=100, C=C))
    arr = cross_val_score(pipeline, train_data.data, train_data.target, cv=5, scoring='accuracy')
    print arr
    print np.mean(arr)

5
[ 0.89635854  0.87359551  0.86235955  0.86760563  0.86723164]
0.873430174364
6
[ 0.88795518  0.87359551  0.86797753  0.86760563  0.86440678]
0.872308125849
7
[ 0.88795518  0.87640449  0.86516854  0.87042254  0.86440678]
0.872871506131
8
[ 0.88795518  0.87640449  0.86516854  0.87042254  0.86158192]
0.872306534379
9
[ 0.8907563   0.87921348  0.86516854  0.87042254  0.86158192]
0.873428556222
10
[ 0.8907563   0.87921348  0.86516854  0.87042254  0.86158192]
0.873428556222
11
[ 0.89635854  0.87921348  0.86516854  0.86760563  0.86158192]
0.873985624119
12
[ 0.89635854  0.87921348  0.86516854  0.87042254  0.86158192]
0.874549004401
13
[ 0.89635854  0.87640449  0.86516854  0.86760563  0.86158192]
0.873423826366
14
[ 0.89635854  0.87921348  0.86516854  0.86760563  0.86158192]
0.873985624119


In [75]:
pipeline = make_pipeline(CountVectorizer(min_df=1, ngram_range=(1, 1)), TfidfTransformer(), LogisticRegression(penalty='l2', max_iter=100, C=12))
arr = cross_val_score(pipeline, train_data.data, train_data.target, cv=5, scoring='accuracy')
print arr
print np.mean(arr)

[ 0.91876751  0.8988764   0.89044944  0.88169014  0.86158192]
0.89027308229


In [77]:
from stop_words import get_stop_words

stop_words = get_stop_words('en')

In [76]:
for i in range(8):
    print i
    pipeline = make_pipeline(CountVectorizer(min_df=2, ngram_range=(1, 1)), TfidfTransformer(), LogisticRegression(penalty='l2', max_iter=100, C=12))
    arr = cross_val_score(pipeline, train_data.data, train_data.target, cv=5, scoring='accuracy')
    print arr
    print np.mean(arr)

0
[ 0.91876751  0.8988764   0.89044944  0.88169014  0.86158192]
0.89027308229
1
[ 0.91876751  0.8988764   0.89044944  0.88169014  0.86158192]
0.89027308229
2
[ 0.91596639  0.89606742  0.89606742  0.87887324  0.85875706]
0.88914630392
3
[ 0.90756303  0.88764045  0.88483146  0.87887324  0.86440678]
0.884662990884
4
[ 0.91036415  0.88202247  0.87921348  0.88169014  0.85310734]
0.881279517238
5
[ 0.9047619   0.88483146  0.87359551  0.87887324  0.84463277]
0.87733897577
6
[ 0.89355742  0.87359551  0.87359551  0.87323944  0.84180791]
0.871159156086
7
[ 0.89355742  0.87640449  0.86516854  0.87042254  0.83333333]
0.867777265044


In [82]:
pipeline = make_pipeline(CountVectorizer(min_df=2, ngram_range=(1, 2), stop_words=stop_words), TfidfTransformer(), LogisticRegression(penalty='l2', max_iter=1000, C=12))
arr = cross_val_score(pipeline, train_data.data, train_data.target, cv=5, scoring='accuracy')
print arr
print np.mean(arr)

[ 0.92436975  0.90449438  0.91011236  0.89014085  0.87853107]
0.901529681598


In [88]:
pipeline = make_pipeline(CountVectorizer(min_df=1, ngram_range=(1, 2), stop_words=stop_words), TfidfTransformer(), LogisticRegression(penalty='l2', max_iter=1000, C=12, solver='newton-cg'))
arr = cross_val_score(pipeline, train_data.data, train_data.target, cv=5, scoring='accuracy')
print arr
print np.mean(arr)

[ 0.93557423  0.90730337  0.90449438  0.89577465  0.88135593]
0.904900512518


In [89]:
for c in range(11, 14):
    print c
    pipeline = make_pipeline(CountVectorizer(min_df=1,
                                             ngram_range=(1, 2),
                                             stop_words=stop_words),
                                TfidfTransformer(),
                                LogisticRegression(penalty='l2',
                                                   max_iter=1000,
                                                   C=c,
                                                   solver='newton-cg'))
    arr = cross_val_score(pipeline, train_data.data, train_data.target, cv=5, scoring='accuracy')
    print arr
    print np.mean(arr)

11
[ 0.93557423  0.90730337  0.90449438  0.89859155  0.88135593]
0.9054638928
12
[ 0.93557423  0.90730337  0.90449438  0.89577465  0.88135593]
0.904900512518
13
[ 0.93557423  0.90730337  0.90449438  0.89577465  0.88135593]
0.904900512518


In [92]:
pipeline = make_pipeline(CountVectorizer(min_df=1,
                                         ngram_range=(1, 2),
                                         stop_words=stop_words),
                            TfidfTransformer(),
                            LogisticRegression(penalty='l2',
                                               max_iter=5000,
                                               C=11,
                                               solver='newton-cg'))
arr = cross_val_score(pipeline, train_data.data, train_data.target, cv=5, scoring='accuracy')
print arr
print np.mean(arr)

[ 0.93557423  0.90730337  0.90449438  0.89859155  0.88135593]
0.9054638928


In [97]:
pipeline = make_pipeline(CountVectorizer(min_df=1,
                                         ngram_range=(1, 2),
                                         stop_words=stop_words),
                            TfidfTransformer(),
                            LogisticRegression(penalty='l2',
                                               max_iter=5000,
                                               C=11)
                        )
arr = cross_val_score(pipeline, train_data.data, train_data.target, cv=5, scoring='accuracy')
print arr
print np.mean(arr)

[ 0.93557423  0.90730337  0.90449438  0.89859155  0.88135593]
0.9054638928


In [106]:
algo = LogisticRegression(penalty='l2',
                            max_iter=5000,
                            C=11)

count_vectorizer = CountVectorizer(min_df=1,
                                         ngram_range=(1, 2),
                                         stop_words=stop_words)

sparse_feature_matrix = count_vectorizer.fit_transform(train_data.data)
algo.fit(sparse_feature_matrix, train_data.target)

LogisticRegression(C=11, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=5000, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [107]:
algo.fit(sparse_feature_matrix, train_data.target)
W = algo.coef_.shape[1]
for c in algo.classes_:
    topic_words = [
        num_2_words[w_num]
        for w_num in heapq.nlargest(10, range(W), key=lambda w: algo.coef_[c, w])
    ]
    print ',  '.join(topic_words)

KeyError: 33185

In [110]:
for i in range(5):
    for j in range(i, 5):
        print i, j
        pipeline = make_pipeline(CountVectorizer(min_df=1,
                                             ngram_range=(i, j),
                                             stop_words='en'),
                                TfidfTransformer(),
                                LogisticRegression(penalty='l2',
                                                   max_iter=5000,
                                                   C=11,
                                                  random_state=41)
                            )
        arr = cross_val_score(pipeline, train_data.data, train_data.target, cv=5, scoring='accuracy')
        print arr
        print np.mean(arr)

 0 0
[ 0.33333333  0.33426966  0.33426966  0.33239437  0.33333333]
0.333520071741
0 1
[ 0.92156863  0.91292135  0.91011236  0.89295775  0.88418079]
0.904348174551
0 2
[ 0.93557423  0.91853933  0.91573034  0.89577465  0.87570621]
0.908264951038
0 3
[ 0.93277311  0.91853933  0.90449438  0.88450704  0.86158192]
0.900379156053
0 4
[ 0.92717087  0.91011236  0.8988764   0.88169014  0.86158192]
0.895886338828
1 1
[ 0.92156863  0.91292135  0.91011236  0.89295775  0.88418079]
0.904348174551
1 2
[ 0.93557423  0.90730337  0.90449438  0.89859155  0.88135593]
0.9054638928
1 3
[ 0.92997199  0.92134831  0.90730337  0.89859155  0.87288136]
0.906019315883
1 4
[ 0.93277311  0.91853933  0.90730337  0.89014085  0.87288136]
0.904327601375
2 2
[ 0.80392157  0.79494382  0.78089888  0.75211268  0.77118644]
0.780612676398
2 3
[ 0.79831933  0.79494382  0.7752809   0.74929577  0.77118644]
0.777805252432
2 4


KeyboardInterrupt: 

In [112]:
pipeline = make_pipeline(CountVectorizer(min_df=1,
                                     ngram_range=(0, 2),
                                     stop_words='english',
                                    li),
                        TfidfTransformer(),
                        LogisticRegression(penalty='l2',
                                           max_iter=5000,
                                           C=11,
                                          random_state=41)
                    )
arr = cross_val_score(pipeline, train_data.data, train_data.target, cv=5, scoring='accuracy')
print arr
print np.mean(arr)

[ 0.92717087  0.91853933  0.9241573   0.90422535  0.88700565]
0.912219699878
