In [170]:
import gzip
import re

import pandas as pd
import gensim
import numpy as np

from gensim.models import Word2Vec

from dataclasses import dataclass
from typing import Iterator, List

# Load the stop-word list: one entry per line of stop_words.txt.
# The context manager closes the file handle as soon as it has been read.
# `sw` stays a plain list because it is also passed to TfidfVectorizer(stop_words=...).
with open('stop_words.txt', 'r', encoding='utf-8') as sw_file:
    sw = sw_file.read().split('\n')

@dataclass
class Text:
    """One record of the corpus: a class label, a title and the body text."""

    # Class label of the document (first field of a corpus row).
    label: str
    # Document title (second field of a corpus row).
    title: str
    # Document body (third field of a corpus row).
    text: str

def read_texts(fn: str="news.txt.gz") -> Iterator[Text]:
    """Lazily read a gzip-compressed, tab-separated corpus.

    Each non-blank line is expected to hold three tab-separated fields:
    label, title and text.

    Args:
        fn: Path to the gzip file, decoded as UTF-8.

    Yields:
        One ``Text`` per non-blank line, with surrounding whitespace removed
        from every field.

    Raises:
        TypeError: If a line has fewer than three tab-separated fields
            (raised by the ``Text`` constructor).
    """
    with gzip.open(fn, "rt", encoding="utf-8") as f:
        for line in f:
            # Blank lines carry no record; skip them instead of crashing.
            if not line.strip():
                continue
            # Split BEFORE stripping: stripping the whole line first would eat a
            # leading/trailing tab and silently drop an empty label or text field.
            # maxsplit=2 keeps any extra tabs inside the text field.
            fields = line.rstrip("\r\n").split("\t", 2)
            yield Text(*(field.strip() for field in fields))
                                  
# Hash-based copy of the stop-word list so each token lookup is O(1)
# instead of a linear scan over the `sw` list.
_STOP_WORDS = frozenset(sw)


def tokenize_text(text: str) -> List[str]:
    """Lower-case ``text``, split it into word tokens and drop stop words.

    Args:
        text: Raw input text.

    Returns:
        The tokens matched by ``\\b\\w+\\b`` in the lower-cased text, in order
        of appearance, excluding every token found in the stop-word list.
    """
    # Lower-case once; the regex then extracts runs of word characters.
    tokens = re.findall(r'\b\w+\b', text.lower())
    return [w for w in tokens if w not in _STOP_WORDS]

def normalize_text(text: str) -> str:
    """Return ``text`` as a single string of its tokens joined by spaces.

    Tokenization (lower-casing, word extraction, stop-word removal) is
    delegated to ``tokenize_text``.
    """
    tokens = tokenize_text(text)
    normalized = ' '.join(tokens)
    return normalized

### Векторизация с помощью word2vec

In [171]:
# Training corpus: one list of tokens per document body.
sentences = [tokenize_text(doc.text) for doc in read_texts()]

# Train 250-dimensional word2vec embeddings on the tokenized corpus.
w2v = Word2Vec(sentences=sentences, vector_size=250)

# Persist only the word vectors in word2vec format.
# NOTE(review): `binary` is not passed, so despite the ".bin" file name this is
# presumably written in the text format — confirm against the gensim defaults.
w2v.wv.save_word2vec_format('w2v_vectors.bin')


### Векторизация текста путем усреднения векторов слов

In [172]:
# Per-document features, collected in corpus order:
#   mean_emb - average of the word2vec vectors of the in-vocabulary tokens
#   labels   - class label of the document
#   lens     - number of in-vocabulary tokens in the document
mean_emb = []
labels = []
lens = []
for text in read_texts():
    labels.append(text.label)
    words_vecs = [w2v.wv[word] for word in tokenize_text(text.text) if word in w2v.wv]
    lens.append(len(words_vecs))
    if words_vecs:
        mean_emb.append(np.mean(words_vecs, axis=0))
    else:
        # No token of this document is in the word2vec vocabulary: np.mean([])
        # would return a scalar NaN and make the feature matrix ragged, so fall
        # back to a zero vector of the embedding size.
        mean_emb.append(np.zeros(w2v.wv.vector_size, dtype=np.float32))
    

In [173]:
# Encode the string labels as integer class ids.
df = pd.DataFrame({'label': labels})
# Sort the distinct labels so that the label -> id assignment is identical on
# every run (iteration order of a set of strings changes between interpreter runs).
classes = sorted(set(labels))
# Dict lookup instead of list.index: one O(1) lookup per row.
class_to_id = {name: idx for idx, name in enumerate(classes)}
df['label'] = df['label'].map(class_to_id)


In [174]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
# Hold out 30% of the mean-embedding features for evaluation (fixed split seed).
X_train, X_test, y_train, y_test = train_test_split(
    np.array(mean_emb),
    df['label'],
    test_size=0.3,
    random_state=42,
)
# Classifier with default hyper-parameters.
svc = SVC()

In [175]:
# Fit on the training split and display the accuracy on the held-out split.
svc.fit(X_train, y_train).score(X_test, y_test)

0.687

### Добавление к векторам слов эмбеддинга длины текста

In [176]:
import torch
# The embedding table below is randomly initialised and never trained, so seed
# torch to make the resulting features (and the score that follows) reproducible.
torch.manual_seed(42)
# One 15-dimensional vector per possible document length (0 .. max length),
# each renormalised to a norm of at most 1.0.
len_emb = torch.nn.Embedding(int(np.max(lens)) + 1, 15, max_norm=1.0)
# Look up the length embedding of every document.
lens_emb = len_emb(torch.tensor(lens))
# Append the length embedding to the averaged word vectors, column-wise.
new_emb = np.concatenate((np.array(mean_emb), lens_emb.detach().numpy()), axis=1)



In [177]:
# Same 70/30 split and default SVC as before, now on the features that include
# the document-length embedding.
X_train, X_test, y_train, y_test = train_test_split(
    np.array(new_emb),
    df['label'],
    test_size=0.3,
    random_state=42,
)
svc = SVC()
# Fit, then display the accuracy on the held-out split.
svc.fit(X_train, y_train).score(X_test, y_test)

0.6803333333333333

### Взвешивание векторов слов с помощью tf-idf

In [178]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Materialise the corpus, then fit a tf-idf vocabulary on the normalized bodies.
texts = list(read_texts("news.txt.gz"))
normalized_docs = [normalize_text(doc.text) for doc in texts]
# Terms occurring in fewer than 3 documents are dropped from the vocabulary.
vectorizer = TfidfVectorizer(stop_words=sw, min_df=3)
# fit() returns the fitted vectorizer, so `tfidf` refers to the same object.
tfidf = vectorizer.fit(normalized_docs)

In [179]:
# Terms of the fitted tf-idf vocabulary.
names = tfidf.get_feature_names_out()
# Term -> idf weight lookup.
idfs = {term: weight for term, weight in zip(names, tfidf.idf_)}
# Every word known to the word2vec model.
words = list(w2v.wv.key_to_index)


##### Sklearn токенизирует по-своему, поэтому в словаре tf-idf отсутствуют некоторые токены. Вместо проверки наличия каждого слова из текста в двух словарях, один из которых может быть значительно больше другого, заранее находится перекрытие между словарями.

In [180]:
# word2vec words that are missing from the tf-idf vocabulary.
# Membership is tested against a set: `word not in names` on the NumPy array
# would rescan the whole tf-idf vocabulary for every word2vec word.
tfidf_vocab = set(names)
# Kept as a list, in word2vec vocabulary order.
wv_uni = [word for word in words if word not in tfidf_vocab]

In [181]:
from collections import Counter
# tf-idf weighted average of word vectors, one row per document.
tfidf_w2v = []
for text in read_texts():
    # `labels` is deliberately NOT appended to here: it was already filled when
    # the mean embeddings were built, and appending again would double the list.
    tok_text = tokenize_text(text.text)
    # Keep only tokens that have both a word2vec vector and an idf weight.
    # This is the same filter as "in w2v.wv and not in wv_uni", evaluated once
    # with O(1) dict lookups instead of three list scans.
    kept = [word for word in tok_text if word in w2v.wv and word in idfs]
    if not kept:
        # No usable token: the weighted average would be 0/0 (a scalar NaN) and
        # make the feature matrix ragged, so fall back to a zero vector.
        tfidf_w2v.append(np.zeros(w2v.wv.vector_size))
        continue
    tfs = Counter(tok_text)
    words_vecs = np.array([w2v.wv[word] for word in kept])
    # Weight of each kept token occurrence: (count in the document / number of
    # kept tokens) * idf. A repeated token contributes once per occurrence.
    tf_id = np.array([tfs[word] / len(kept) * idfs[word] for word in kept])
    # Weighted mean: sum(weight * vector) / sum(weight).
    tfidf_w2v.append(np.average(words_vecs, axis=0, weights=tf_id))

In [182]:
# Sanity check: display the (documents, embedding size) shape of the feature matrix.
np.shape(tfidf_w2v)


(10000, 250)

In [183]:
# Hold out 30% of the tf-idf weighted features for evaluation (fixed split seed).
X_train, X_test, y_train, y_test = train_test_split(
    np.array(tfidf_w2v),
    df['label'],
    test_size=0.3,
    random_state=42,
)
# Classifier with default hyper-parameters.
svc = SVC()

In [184]:
# Fit on the training split and display the accuracy on the held-out split.
svc.fit(X_train, y_train).score(X_test, y_test)

0.692