In [1]:
from lxml import html
import pandas as pd
from rich import print, progress
from gensim.models import Word2Vec, FastText, KeyedVectors
from string import punctuation
from razdel import tokenize as razdel_tokenize
import os
from nltk.corpus import stopwords
from pymorphy2 import MorphAnalyzer
from tqdm.auto import tqdm
import re
import numpy as np
from collections import Counter
from pymystem3 import Mystem
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

In [34]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [35]:
morph = MorphAnalyzer()
punct = punctuation+'«»—…“”*№–'
stops = set(stopwords.words('russian'))
m = Mystem()

mapping = {
     'A': 'ADJ',
     'ADV': 'ADV',
     'ADVPRO': 'ADV',
     'ANUM': 'ADJ',
     'APRO': 'DET',
     'COM': 'ADJ',
     'CONJ': 'SCONJ',
     'INTJ': 'INTJ',
     'NONLEX': 'X',
     'NUM': 'NUM',
     'PART': 'PART',
     'PR': 'ADP',
     'S': 'NOUN',
     'SPRO': 'PRON',
     'UNKN': 'X',
     'V': 'VERB'
}


def normalize(text):
    
    words = [word.strip(punct) for word in text.lower().split()]
    words = [morph.parse(word)[0].normal_form for word in words if word and word not in stops]

    return ' '.join(words)

def tokenize(text):
    
    
    tokens = [token.text for token in list(razdel_tokenize(text))]
    tokens = [token for token in tokens if token.isalnum()]

    return ' '.join(tokens)

def normalize_mystem(text):
    tokens = []
    norm_words = m.analyze(text)
    for norm_word in norm_words:
        if 'analysis' not in norm_word:
            continue
            
        if not len(norm_word['analysis']):
            lemma = norm_word['text']
            pos = 'UNKN'
        else:
            lemma = norm_word["analysis"][0]["lex"].lower().strip()
            pos = norm_word["analysis"][0]["gr"].split(',')[0]
            pos = pos.split('=')[0].strip()
        pos = mapping[pos]
        tokens.append(lemma+'_'+pos)

    return tokens

In [3]:
with open("paraphrases.xml", 'rb') as f:
    corpus_xml = html.fromstring(open('paraphrases.xml', 'rb').read())
    texts_1 = []
    texts_2 = []
    classes = []

    for p in corpus_xml.xpath('//paraphrase'):
        texts_1.append(p.xpath('./value[@name="text_1"]/text()')[0])
        texts_2.append(p.xpath('./value[@name="text_2"]/text()')[0])
        classes.append(p.xpath('./value[@name="class"]/text()')[0])

    data = pd.DataFrame({'text_1':texts_1, 'text_2':texts_2, 'label':classes})

In [4]:
data.label.value_counts()

0     2957
-1    2582
1     1688
Name: label, dtype: int64

# Задание №1

In [28]:
# попробуем учить эмбеддинги на дампе киррилических твитов (русский/украинский/болгарский)
!wget https://rexhaif.keybase.pub/ru-uk-bg_tweets.parquet?dl=1

--2021-02-12 08:44:50--  https://rexhaif.keybase.pub/ru-uk-bg_tweets.parquet?dl=1
Resolving rexhaif.keybase.pub (rexhaif.keybase.pub)... 52.1.81.129, 34.192.29.4, 34.204.113.10
Connecting to rexhaif.keybase.pub (rexhaif.keybase.pub)|52.1.81.129|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 489016724 (466M) [application/octet-stream]
Saving to: ‘ru-uk-bg_tweets.parquet?dl=1’


2021-02-12 08:51:04 (1.25 MB/s) - ‘ru-uk-bg_tweets.parquet?dl=1’ saved [489016724/489016724]



In [29]:
!mv ru-uk-bg_tweets.parquet\?dl\=1 ru-uk-bg_tweets.parquet

In [5]:
corpus = pd.read_parquet("ru-uk-bg_tweets.parquet")

In [6]:
corpus.lang.value_counts()

ru    4187232
uk     294097
bg      60794
Name: lang, dtype: int64

In [7]:
texts = []
for text in tqdm(corpus['text']):
    if "RT" not in text:
        text = re.sub(r"(?:\@|https?\://)\S+", "", text)
        text = text.replace("\n", " ").replace("\xa0", " ").strip()
        texts.append(text)

HBox(children=(FloatProgress(value=0.0, max=4542123.0), HTML(value='')))




In [8]:
norm_tweets = [normalize(tokenize(text)) for text in tqdm(texts)]

HBox(children=(FloatProgress(value=0.0, max=3344176.0), HTML(value='')))




In [9]:
print(norm_tweets[:10])

In [10]:
w2v = Word2Vec([text.split() for text in norm_tweets], size=300, sg=1, workers=48)

In [11]:
fst = FastText([text.split() for text in norm_tweets], sg=1, size=300, workers=48)

In [12]:
print(w2v.wv.most_similar("чонгук"))

In [13]:
print(fst.wv.most_similar("чонгук"))

В качетсве ещё одинх эмбеддингов используем tayga_upos_skipgram_300_2_2019

In [None]:
!wget http://vectors.nlpl.eu/repository/20/185.zip && unzip 185.zip

In [14]:
w2v_rusvec = KeyedVectors.load_word2vec_format("model.txt")

In [36]:
def get_embedding(text, model, dim):
    text = text.split()
    
    # чтобы не доставать одно слово несколько раз
    # сделаем счетчик, а потом векторы домножим на частоту
    words = Counter(text)
    total = len(text)
    vectors = np.zeros((len(words), dim))
    
    for i,word in enumerate(words):
        try:
            v = model[word]
            vectors[i] = v*(words[word]/total) # просто умножаем вектор на частоту
        except (KeyError, ValueError):
            continue
    
    if vectors.any():
        vector = np.average(vectors, axis=0)
    else:
        vector = np.zeros((dim))
    
    return vector

In [18]:
X_own, X_rusvec = [], []
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['label'])
for row in tqdm(data.itertuples()):
    x1_own = get_embedding(normalize(tokenize(row.text_1)), w2v, 300)
    x2_own = get_embedding(normalize(tokenize(row.text_2)), w2v, 300)
    X_own.append(
        np.concatenate([
            x1_own,
            x2_own,
            np.maximum(x1_own, x2_own),
            np.add(x1_own, x2_own),
            np.subtract(x1_own, x2_own)
        ])
    )
    
    x1_rusvec = get_embedding(" ".join(normalize_mystem(row.text_1)), w2v_rusvec, 300)
    x2_rusvec = get_embedding(" ".join(normalize_mystem(row.text_2)), w2v_rusvec, 300)
    X_rusvec.append(
        np.concatenate([
            x1_rusvec,
            x2_rusvec,
            np.maximum(x1_rusvec, x2_rusvec),
            np.add(x1_rusvec, x2_rusvec),
            np.subtract(x1_rusvec, x2_rusvec)
        ])
    )

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

  v = model[word]





In [19]:
X_own = np.stack(X_own)
X_rusvec = np.stack(X_rusvec)

In [20]:
cv = StratifiedKFold(n_splits=5)
scores_own = []
for i, (train_idx, test_idx) in enumerate(cv.split(X_own, y)):
    X_train, y_train = X_own[train_idx], y[train_idx]
    X_test, y_test = X_own[train_idx], y[train_idx]
    model = LogisticRegression(C=100, max_iter=100, verbose=5, n_jobs=-1)
    model.fit(X_train, y_train)
    score = f1_score(y_test, model.predict(X_test), average='macro')
    print(f"Fold - {i}| f-score: {score:.4f}")
    scores_own.append(score)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    9.8s finished


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    5.1s finished


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    5.1s finished


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    5.0s finished


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    5.2s finished


In [21]:
cv = StratifiedKFold(n_splits=5)
scores_rusvec = []
for i, (train_idx, test_idx) in enumerate(cv.split(X_rusvec, y)):
    X_train, y_train = X_rusvec[train_idx], y[train_idx]
    X_test, y_test = X_rusvec[train_idx], y[train_idx]
    model = LogisticRegression(C=100, max_iter=100, verbose=5, n_jobs=-1)
    model.fit(X_train, y_train)
    score = f1_score(y_test, model.predict(X_test), average='macro')
    print(f"Fold - {i}| f-score: {score:.4f}")
    scores_rusvec.append(score)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    5.1s finished


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    5.0s finished


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    5.0s finished


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    5.3s finished


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    5.1s finished


In [22]:
print(f"Our embeddings: {np.mean(scores_own):.4f}+-{np.std(scores_own):.4f}")

In [23]:
print(f"RusVec embeddings: {np.mean(scores_rusvec):.4f}+-{np.std(scores_rusvec):.4f}")

# Задание №2

Учим SVD и NMF

In [24]:
from sklearn.decomposition import TruncatedSVD, NMF, PCA
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_distances
from sklearn.pipeline import Pipeline

In [49]:
svd_pipeline = Pipeline(steps=[
    ('tf-idf', TfidfVectorizer(ngram_range=(1,3))),
    ('svd', TruncatedSVD(n_components=200))
])
nmf_pipeline = Pipeline(steps=[
    ('tf-idf', TfidfVectorizer(ngram_range=(1,3))),
    ('nmf', NMF(n_components=200))
])

In [50]:
train_data = pd.concat([
    data['text_1'].apply(lambda x: normalize(tokenize(x))),
    data['text_2'].apply(lambda x: normalize(tokenize(x)))
])

In [51]:
svd_pipeline.fit(train_data)
nmf_pipeline.fit(train_data)



Pipeline(steps=[('tf-idf', TfidfVectorizer(ngram_range=(1, 3))),
                ('nmf', NMF(n_components=200))])

генерим фичи(будем считать расстояния сразу, а не потом)

In [None]:
svd_distances = []
nmf_distances = []
w2v_own_distances = []
w2v_rusvec_distances = []
fasttext_distances = []
for row in tqdm(data.itertuples(), total=len(data)):
    s1 = svd_pipeline.transform([normalize(tokenize(row.text_1))])
    s2 = svd_pipeline.transform([normalize(tokenize(row.text_2))])
    svd_distances.append(cosine_similarity(s1, s2).ravel())
    
    n1 = nmf_pipeline.transform([normalize(tokenize(row.text_1))])
    n2 = nmf_pipeline.transform([normalize(tokenize(row.text_2))])
    nmf_distances.append(cosine_similarity(n1, n2).ravel())
    
    w1 = get_embedding(normalize(tokenize(row.text_1)), w2v, 300)
    w2 = get_embedding(normalize(tokenize(row.text_2)), w2v, 300)
    w2v_own_distances.append(cosine_similarity([w1], [w2]).ravel())
    
    r1 = get_embedding(" ".join(normalize_mystem(row.text_1)), w2v_rusvec, 300)
    r2 = get_embedding(" ".join(normalize_mystem(row.text_2)), w2v_rusvec, 300)
    w2v_rusvec_distances.append(cosine_similarity([w1], [w2]).ravel())
    
    f1 = get_embedding(normalize(tokenize(row.text_1)), fst, 300)
    f2 = get_embedding(normalize(tokenize(row.text_2)), fst, 300)
    fasttext_distances.append(cosine_similarity([f1], [f2]).ravel())

HBox(children=(FloatProgress(value=0.0, max=7227.0), HTML(value='')))

In [53]:
data_dist = pd.DataFrame({
    'svd': map(lambda x: x[0], svd_distances),
    'nmf': map(lambda x: x[0], nmf_distances),
    'w2v_own': map(lambda x: x[0], w2v_own_distances),
    'w2v_rusvec': map(lambda x: x[0], w2v_rusvec_distances),
    'fasttest': map(lambda x: x[0], fasttext_distances),
    'label': y
})

In [54]:
X = data_dist.loc[:, ['svd', 'nmf', 'w2v_own', 'w2v_rusvec', 'fasttest']].values
y = data_dist['label'].values

In [55]:
cv = StratifiedKFold(n_splits=5)
scores = []
for i, (train_idx, test_idx) in enumerate(cv.split(X, y)):
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[train_idx], y[train_idx]
    model = LogisticRegression(C=100, max_iter=100, verbose=5, n_jobs=-1)
    model.fit(X_train, y_train)
    score = f1_score(y_test, model.predict(X_test), average='micro')
    print(f"Fold - {i}| f-score: {score:.4f}")
    scores.append(score)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    4.7s finished


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.3s finished


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.3s finished


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.3s finished


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.1s finished


In [56]:
print(f"Scores for distfeatures: {np.mean(scores):.4f}+-{np.std(scores):.4f}")

Пробовал крутить параметры у svd и nmf - без особого результата, значения метрик остаются примернов в таком же диапазоне