Полезные ссылки
* https://habr.com/ru/post/446530/ - word2vec в картинках
* https://jalammar.github.io/ - оригиналы статей об архитектурах для NLP (часто там есть ссылки на переводы)
* https://huggingface.co/ - ссылка на библиотеку с большим количеством предобученных текстовых моделей
* https://stepik.org/course/111171/syllabus - отличный курс на русском про NLP
* https://web.stanford.edu/class/cs224n/ - один из самых полных и популярных в мире курсов по NLP (там есть ссылки на все лекции и домашки)

In [45]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import numpy as np
from gensim.models import Word2Vec
import gensim.downloader as api

In [46]:
# Load the predefined "train" and "test" subsets of the 20 Newsgroups corpus.
# No `remove=` argument is passed, so each document keeps its message headers
# (From:, Subject:, ...) as part of the text.
train = fetch_20newsgroups(subset="train")
test = fetch_20newsgroups(subset="test")

In [47]:
len(train.target_names)

20

In [48]:
train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [49]:
print(train.data[0])

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----







In [50]:
train.target

array([7, 4, 4, ..., 3, 1, 8])

In [51]:
vectorizerCount = CountVectorizer()

In [52]:
x_train_count = vectorizerCount.fit_transform(train.data)

In [53]:
x_train_count

<11314x130107 sparse matrix of type '<class 'numpy.int64'>'
	with 1787565 stored elements in Compressed Sparse Row format>

In [54]:
x_train_count.shape

(11314, 130107)

In [55]:
# Draw 10 random column indices from the whole fitted vocabulary. The bound is
# the number of columns of the count matrix rather than a hard-coded constant,
# so every feature can be sampled regardless of vocabulary size.
index = np.random.randint(x_train_count.shape[1], size=10)

In [56]:
index

array([86760, 35324, 30963, 25417, 20615, 82641, 39206, 74619, 67325,
       79303])

In [57]:
# Show the vocabulary terms at the sampled column indices.
# `get_feature_names()` was removed in scikit-learn 1.2, so prefer its
# replacement `get_feature_names_out()` and fall back only on older releases.
if hasattr(vectorizerCount, "get_feature_names_out"):
    feature_names = vectorizerCount.get_feature_names_out()
else:
    feature_names = vectorizerCount.get_feature_names()
np.asarray(feature_names)[index]

array(['netx', 'bskendigc5rbvn', 'azg6t', 'abrash', '875', 'mj75up',
       'cheesebox', 'large', 'initiate', 'mannheim'], dtype='<U180')

In [58]:
lr_count = LogisticRegression(max_iter=200, n_jobs=-1)

In [59]:
lr_count.fit(x_train_count, train.target)

LogisticRegression(max_iter=200, n_jobs=-1)

In [60]:
x_test_count = vectorizerCount.transform(test.data)

In [61]:
pred_count = lr_count.predict(x_test_count)

In [62]:
print(classification_report(test.target, pred_count))

              precision    recall  f1-score   support

           0       0.74      0.72      0.73       319
           1       0.68      0.75      0.71       389
           2       0.72      0.66      0.69       394
           3       0.66      0.68      0.67       392
           4       0.76      0.80      0.78       385
           5       0.82      0.69      0.75       395
           6       0.80      0.89      0.84       390
           7       0.81      0.84      0.83       396
           8       0.92      0.92      0.92       398
           9       0.85      0.87      0.86       397
          10       0.92      0.93      0.92       399
          11       0.91      0.86      0.88       396
          12       0.68      0.73      0.71       393
          13       0.82      0.74      0.77       396
          14       0.92      0.88      0.90       394
          15       0.84      0.93      0.88       398
          16       0.70      0.84      0.77       364
          17       0.91    

In [63]:
vectorizer_tf = TfidfVectorizer()

In [64]:
x_train_tf = vectorizer_tf.fit_transform(train.data)

In [65]:
x_train_tf.shape

(11314, 130107)

In [66]:
lr_tf = LogisticRegression(n_jobs=-1, max_iter=200)

In [67]:
lr_tf.fit(x_train_tf, train.target)

LogisticRegression(max_iter=200, n_jobs=-1)

In [68]:
x_test_tf = vectorizer_tf.transform(test.data)

In [69]:
pred_tf = lr_tf.predict(x_test_tf)

In [70]:
print(classification_report(test.target, pred_tf))

              precision    recall  f1-score   support

           0       0.80      0.74      0.77       319
           1       0.69      0.79      0.74       389
           2       0.75      0.73      0.74       394
           3       0.72      0.72      0.72       392
           4       0.81      0.83      0.82       385
           5       0.83      0.74      0.78       395
           6       0.76      0.90      0.82       390
           7       0.90      0.89      0.90       396
           8       0.95      0.95      0.95       398
           9       0.88      0.92      0.90       397
          10       0.94      0.95      0.95       399
          11       0.94      0.88      0.91       396
          12       0.76      0.80      0.78       393
          13       0.89      0.83      0.85       396
          14       0.91      0.92      0.91       394
          15       0.81      0.94      0.87       398
          16       0.72      0.88      0.79       364
          17       0.96    

In [71]:
# Tokenize each training post by plain whitespace splitting (no lowercasing or
# punctuation removal); this token list is what Word2Vec is trained on.
token_data = [text.split() for text in train.data]

In [72]:
# Train a Word2Vec model on the tokenized training posts; only the corpus is
# passed, so every hyperparameter is left at the gensim default.
model_wv = Word2Vec(token_data)

In [73]:
model_wv.wv

<gensim.models.keyedvectors.KeyedVectors at 0x7fc2f44f7208>

In [75]:
def vector_sum(text: str, embeddings) -> np.ndarray:
    """Return the sum of the embedding vectors of the known words in *text*.

    The text is tokenized with ``str.split()`` (whitespace only, no
    lowercasing or punctuation stripping). Tokens that are not found in
    *embeddings* are skipped.

    Args:
        text: Raw document string.
        embeddings: Word-vector lookup supporting ``word in embeddings`` and
            ``embeddings[word]``, and exposing a 2-D ``vectors`` array whose
            second dimension is the embedding size (e.g. gensim
            ``KeyedVectors``).

    Returns:
        A 1-D float64 array of length ``embeddings.vectors.shape[1]``. It is
        all zeros when no token of *text* is present in *embeddings*.
    """
    embeddings_dim = embeddings.vectors.shape[1]
    # np.zeros defaults to float64, so the running sum is accumulated in
    # double precision even if the stored vectors use a narrower dtype.
    features = np.zeros(embeddings_dim)
    for word in text.split():
        if word in embeddings:
            features += embeddings[word]
    return features

In [76]:
x_train_wv = np.stack([vector_sum(text, model_wv.wv) for text in train.data])

In [77]:
x_train_wv.shape

(11314, 100)

In [78]:
lr_wv = LogisticRegression(n_jobs=-1, max_iter=200)

In [79]:
lr_wv.fit(x_train_wv, train.target)

LogisticRegression(max_iter=200, n_jobs=-1)

In [80]:
x_test_wv = np.stack([vector_sum(text, model_wv.wv) for text in test.data])

In [81]:
pred_wv = lr_wv.predict(x_test_wv)

In [82]:
print(classification_report(test.target, pred_wv))

              precision    recall  f1-score   support

           0       0.33      0.32      0.32       319
           1       0.28      0.36      0.32       389
           2       0.35      0.30      0.32       394
           3       0.35      0.29      0.32       392
           4       0.29      0.22      0.25       385
           5       0.51      0.55      0.53       395
           6       0.48      0.84      0.61       390
           7       0.28      0.29      0.29       396
           8       0.39      0.46      0.42       398
           9       0.38      0.41      0.39       397
          10       0.55      0.50      0.52       399
          11       0.48      0.40      0.44       396
          12       0.32      0.23      0.27       393
          13       0.31      0.29      0.30       396
          14       0.38      0.38      0.38       394
          15       0.48      0.52      0.50       398
          16       0.36      0.33      0.34       364
          17       0.56    

In [83]:
embeddings_pretrain = api.load("word2vec-google-news-300")

In [84]:
# Rebuild the training features using the pretrained embeddings.
# NOTE: this rebinds x_train_wv, replacing the features computed earlier from
# the locally trained Word2Vec model.
x_train_wv = np.stack([vector_sum(text, embeddings_pretrain) for text in train.data])

In [85]:
lr_wv_pretrained = LogisticRegression(n_jobs=-1, max_iter=200)

In [86]:
lr_wv_pretrained.fit(x_train_wv, train.target)

LogisticRegression(max_iter=200, n_jobs=-1)

In [87]:
x_test_wv = np.stack([vector_sum(text, embeddings_pretrain) for text in test.data])

In [88]:
pred_wv = lr_wv_pretrained.predict(x_test_wv)

In [89]:
print(classification_report(test.target, pred_wv))

              precision    recall  f1-score   support

           0       0.51      0.49      0.50       319
           1       0.55      0.64      0.59       389
           2       0.57      0.56      0.56       394
           3       0.53      0.54      0.53       392
           4       0.61      0.58      0.60       385
           5       0.64      0.61      0.63       395
           6       0.72      0.75      0.74       390
           7       0.79      0.78      0.78       396
           8       0.78      0.74      0.76       398
           9       0.88      0.95      0.91       397
          10       0.95      0.93      0.94       399
          11       0.80      0.76      0.78       396
          12       0.58      0.60      0.59       393
          13       0.83      0.77      0.80       396
          14       0.85      0.80      0.82       394
          15       0.64      0.68      0.66       398
          16       0.64      0.74      0.69       364
          17       0.91    