In [0]:
import pandas as pd
from lxml import html
import numpy as np
from matplotlib import pyplot as plt
from sklearn.decomposition import TruncatedSVD, NMF, PCA
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_distances
from sklearn.ensemble import RandomForestClassifier
import gensim
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from collections import Counter,defaultdict
from string import punctuation
import os
from nltk.corpus import stopwords
from pymorphy2 import MorphAnalyzer
%matplotlib inline

morph = MorphAnalyzer()
punct = punctuation+'«»—…“”*№–'
stops = set(stopwords.words('russian'))

def normalize(text):
    
    words = [word.strip(punct) for word in str(text).lower().split()]
    words = [morph.parse(word)[0].normal_form for word in words if word and word not in stops]

    return ' '.join(words)

def tokenize(text):
    
    words = [word.strip(punct) for word in str(text).lower().split()]

    return ' '.join(words)

In [0]:
corpus_xml = html.fromstring(open('paraphrases.xml', 'rb').read())
texts_1 = []
texts_2 = []
classes = []

for p in corpus_xml.xpath('//paraphrase'):
    texts_1.append(p.xpath('./value[@name="text_1"]/text()')[0])
    texts_2.append(p.xpath('./value[@name="text_2"]/text()')[0])
    classes.append(p.xpath('./value[@name="class"]/text()')[0])
    
data = pd.DataFrame({'text_1':texts_1, 'text_2':texts_2, 'label':classes})

In [14]:
%%time
data['text_1_norm'] = data['text_1'].apply(normalize)
data['text_2_norm'] = data['text_2'].apply(normalize)

CPU times: user 19.7 s, sys: 18.6 ms, total: 19.7 s
Wall time: 19.7 s


Подготовим для обучения тексты с семинара.

In [21]:
%%time
data_rt = pd.read_csv('news_texts.csv')
data_rt.dropna(inplace=True)

CPU times: user 1.87 s, sys: 283 ms, total: 2.15 s
Wall time: 2.15 s


In [22]:
data_rt.shape

(7212, 2)

In [24]:
%%time
data_rt['content_norm'] = data_rt['content'].apply(normalize)

CPU times: user 16min 35s, sys: 342 ms, total: 16min 35s
Wall time: 16min 35s




---


Для начала преобразуем матрицы с помощью **TfidfVectorizer** и **CountVectorizer**

In [0]:
CV = CountVectorizer(min_df=3, max_df=0.4, max_features=1000)
Tfidf = TfidfVectorizer(min_df=3, max_df=0.4, max_features=1000)

In [0]:
X = CV.fit_transform(data_rt['content_norm'])

X_text_1_CV = CV.fit_transform(data['text_1_norm'])
X_text_2_CV = CV.fit_transform(data['text_2_norm'])
X_text_1_Tfidf = Tfidf.fit_transform(data['text_1_norm'])
X_text_2_Tfidf = Tfidf.fit_transform(data['text_2_norm'])

In [28]:
y = data['label'].values
print(y.shape)

(7227,)




---

Теперь начнем работу с моделями.

# SVD

In [26]:
%%time
SVD = TruncatedSVD(200)
SVD.fit(X)

CPU times: user 3.56 s, sys: 1.3 s, total: 4.85 s
Wall time: 3.1 s


In [27]:
%%time
X_text_SVD_CV = np.concatenate([SVD.transform(X_text_1_CV),
                                SVD.transform(X_text_2_CV)],
                               axis=1)

CPU times: user 28.8 ms, sys: 7 µs, total: 28.8 ms
Wall time: 32.3 ms


In [29]:
%%time
X_text_SVD_Tfidf = np.concatenate([SVD.transform(X_text_1_Tfidf),
                                   SVD.transform(X_text_2_Tfidf)],
                                  axis=1)

CPU times: user 26.1 ms, sys: 1e+03 µs, total: 27.1 ms
Wall time: 32 ms


# NMF

In [30]:
%%time
nmf = NMF(200)
nmf.fit(X)

CPU times: user 3min 4s, sys: 35.6 s, total: 3min 39s
Wall time: 2min 47s


In [32]:
%%time
X_text_NMF_CV = np.concatenate([nmf.transform(X_text_1_CV),
                                nmf.transform(X_text_2_CV)],
                               axis=1)

CPU times: user 7.3 s, sys: 1.5 s, total: 8.8 s
Wall time: 6.67 s


In [33]:
%%time
X_text_NMF_Tfidf = np.concatenate([nmf.transform(X_text_1_Tfidf),
                                   nmf.transform(X_text_2_Tfidf)],
                                  axis=1)

CPU times: user 7.6 s, sys: 1.69 s, total: 9.29 s
Wall time: 7.05 s


# FastText

In [60]:
%%time
corpus = [text.split() for text in data_rt['content'].apply(tokenize)]
FastText = gensim.models.FastText(corpus,
                                  size=50,
                                  min_n=4,
                                  max_n=8)

CPU times: user 8min 39s, sys: 1.02 s, total: 8min 40s
Wall time: 4min 30s


In [61]:
%%time
FastTextNorm = gensim.models.FastText([text.split() for text in data_rt['content_norm']],
                                      size=50,
                                      min_n=4,
                                      max_n=8)

CPU times: user 7min 30s, sys: 857 ms, total: 7min 31s
Wall time: 3min 51s


In [0]:
def get_embedding(text, model, dim):
    text = text.split()
    words = Counter(text)
    total = len(text)
    vectors = np.zeros((len(words), dim))
    
    for i,word in enumerate(words):
        try:
            v = model[word]
            vectors[i] = v*(words[word]/total)
        except (KeyError, ValueError):
            continue
    
    if vectors.any():
        vector = np.average(vectors, axis=0)
    else:
        vector = np.zeros((dim))
    
    return vector

In [162]:
%%time
dim = 50

data['text_1_notnorm'] = data['text_1'].apply(tokenize)
data['text_2_notnorm'] = data['text_2'].apply(tokenize)

X_text_1_ft = np.zeros((len(data['text_1_notnorm']), dim))
X_text_2_ft = np.zeros((len(data['text_2_notnorm']), dim))

for i, text in enumerate(data['text_1_notnorm'].values):
    X_text_1_ft[i] = get_embedding(text, FastText, dim)
    
for i, text in enumerate(data['text_2_notnorm'].values):
    X_text_2_ft[i] = get_embedding(text, FastText, dim)

  if __name__ == '__main__':


CPU times: user 3.36 s, sys: 48.3 ms, total: 3.41 s
Wall time: 3.37 s


In [0]:
X_text_FT = np.concatenate([X_text_1_ft, X_text_2_ft], axis=1)

In [163]:
%%time
X_text_1_ft_norm = np.zeros((len(data['text_1_norm']), dim))
X_text_2_ft_norm = np.zeros((len(data['text_2_norm']), dim))

for i, text in enumerate(data['text_1_norm'].values):
    X_text_1_ft_norm[i] = get_embedding(text, FastTextNorm, dim)
    
for i, text in enumerate(data['text_2_norm'].values):
    X_text_2_ft_norm[i] = get_embedding(text, FastTextNorm, dim)

  if __name__ == '__main__':


CPU times: user 2.19 s, sys: 12 ms, total: 2.2 s
Wall time: 2.19 s


In [0]:
X_text_FT_NORM = np.concatenate([X_text_1_ft, X_text_2_ft], axis=1)

# Word2Vec

In [46]:
%%time
Word2Vec = gensim.models.Word2Vec([text.split() for text in data_rt['content_norm']],
                                  size=50,
                                  sg=1)

CPU times: user 4min 39s, sys: 404 ms, total: 4min 39s
Wall time: 2min 21s


In [70]:
%%time
dim = 50

X_text_1_W2V = np.zeros((len(data['text_1_norm']), dim))
X_text_2_W2V = np.zeros((len(data['text_2_norm']), dim))

for i, text in enumerate(data['text_1_norm'].values):
    X_text_1_W2V[i] = get_embedding(text, Word2Vec, dim)
    
for i, text in enumerate(data['text_2_norm'].values):
    X_text_2_W2V[i] = get_embedding(text, Word2Vec, dim)

  if __name__ == '__main__':


CPU times: user 1.72 s, sys: 35.4 ms, total: 1.76 s
Wall time: 1.72 s


In [0]:
X_text_W2V = np.concatenate([X_text_1_W2V, X_text_2_W2V], axis=1)



---

# Word2vec и FastText с TfIdf

In [121]:
%%time
Tfidf = TfidfVectorizer(min_df=3, max_df=0.4, max_features=1000)

Tfidf_matrix = Tfidf.fit_transform(data['text_1_norm'])
feature_names = Tfidf.get_feature_names()

CPU times: user 106 ms, sys: 2.01 ms, total: 108 ms
Wall time: 110 ms


In [0]:
def get_score(doc, matrix, features):
  d = {}
  feature_index = matrix[doc,:].nonzero()[1]
  tfidf_scores = zip(feature_index, [matrix[doc, x] for x in feature_index])
  for w, s in [(features[i], s) for (i, s) in tfidf_scores]:
    d[w] = s
  return d

def get_embedding_tfidf(text, model, dim, num):
    text = text.split()
    words = Counter(text)
    total = len(text)
    vectors = np.zeros((len(words), dim))
    scores = get_score(num, Tfidf_matrix, feature_names)
    
    for i, word in enumerate(words):
        try:
            v = model[word]
            vectors[i] = v*scores[word]
        except (KeyError, ValueError):
            continue
    
    if vectors.any():
        vector = np.average(vectors, axis=0)
    else:
        vector = np.zeros((dim))
    
    return vector

In [129]:
%%time
dim = 50

X_text_1_W2V_TfIdf = np.zeros((len(data['text_1_norm']), dim))
X_text_2_W2V_TfIdf = np.zeros((len(data['text_2_norm']), dim))

for i, text in enumerate(data['text_1_norm'].values):
  X_text_1_W2V_TfIdf[i] = get_embedding_tfidf(text, Word2Vec, dim, i)
    
for i, text in enumerate(data['text_2_norm'].values):
  X_text_2_W2V_TfIdf[i] = get_embedding_tfidf(text, Word2Vec, dim, i)



CPU times: user 8.34 s, sys: 7.72 ms, total: 8.35 s
Wall time: 8.35 s


In [0]:
X_text_W2V_TfIdf = np.concatenate([X_text_1_W2V, X_text_2_W2V], axis=1)

In [169]:
%%time
dim = 50

X_text_1_ft_tfidf = np.zeros((len(data['text_1_notnorm']), dim))
X_text_2_ft_tfidf = np.zeros((len(data['text_2_notnorm']), dim))

for i, text in enumerate(data['text_1_notnorm'].values):
    X_text_1_ft_tfidf[i] = get_embedding_tfidf(text, FastText, dim, i)
    
for i, text in enumerate(data['text_2_notnorm'].values):
    X_text_2_ft_tfidf[i] = get_embedding_tfidf(text, FastText, dim, i)

CPU times: user 9.58 s, sys: 12.7 ms, total: 9.6 s
Wall time: 9.59 s


In [0]:
X_text_FT_TfIdf = np.concatenate([X_text_1_ft_tfidf, X_text_2_ft_tfidf], axis=1)

In [170]:
%%time
dim = 50

X_text_1_ft_NORM_tfidf = np.zeros((len(data['text_1_norm']), dim))
X_text_2_ft_NORM_tfidf = np.zeros((len(data['text_2_norm']), dim))

for i, text in enumerate(data['text_1_norm'].values):
    X_text_1_ft_NORM_tfidf[i] = get_embedding_tfidf(text, FastTextNorm, dim, i)
    
for i, text in enumerate(data['text_2_norm'].values):
    X_text_2_ft_NORM_tfidf[i] = get_embedding_tfidf(text, FastTextNorm, dim, i)



CPU times: user 8.92 s, sys: 10.7 ms, total: 8.93 s
Wall time: 8.92 s


In [0]:
X_text_W2V_NORM_TfIdf = np.concatenate([X_text_1_ft_tfidf, X_text_2_ft_tfidf], axis=1)



---

Все вектора построенные, теперь начинаем считать косинусную близость...
# Косинусная близость текстов

Создадим датафрейм, в котором будем хранить всю информацию.


In [0]:
research = data[['text_1','text_2']]

In [0]:
from sklearn.metrics.pairwise import cosine_similarity as cs

In [0]:
research['FastText'] = [cs([X_text_1_ft[i]],
                           [X_text_2_ft[i]])[0, 0] for i in range(len(X_text_1_ft))]
research['FastText + norm'] = [cs([X_text_1_ft_norm[i]],
                                  [X_text_2_ft_norm[i]])[0, 0] for i in range(len(X_text_1_ft_norm))]
research['FastText + tfidf'] = [cs([X_text_1_ft_tfidf[i]],
                                   [X_text_2_ft_tfidf[i]])[0, 0] for i in range(len(X_text_1_ft_tfidf))]
research['FastText + norm + tfidf'] = [cs([X_text_1_ft_NORM_tfidf[i]],
                                          [X_text_2_ft_NORM_tfidf[i]])[0, 0] for i in range(len(X_text_1_ft_NORM_tfidf))]

In [0]:
research['Word2Vec'] = [cs([X_text_1_W2V[i]],
                           [X_text_2_W2V[i]])[0, 0] for i in range(len(X_text_1_W2V))]
research['Word2Vec + tfidf'] = [cs([X_text_1_W2V_TfIdf[i]],
                                   [X_text_2_W2V_TfIdf[i]])[0, 0] for i in range(len(X_text_1_W2V_TfIdf))]

In [181]:
%%time

X_nmf_1_CV = nmf.transform(X_text_1_CV)
X_nmf_2_CV = nmf.transform(X_text_2_CV)

X_nmf_1_TfIDF = nmf.transform(X_text_1_Tfidf)
X_nmf_2_TfIDF = nmf.transform(X_text_2_Tfidf)

X_SVD_1_CV = SVD.transform(X_text_1_CV)
X_SVD_2_CV = SVD.transform(X_text_2_CV)

X_SVD_1_TfIDF = SVD.transform(X_text_1_Tfidf)
X_SVD_2_TfIDF = SVD.transform(X_text_2_Tfidf)

CPU times: user 15.2 s, sys: 3.15 s, total: 18.4 s
Wall time: 14 s


In [0]:
research['NMF + CountVect'] = [cs([X_nmf_1_CV[i]],
                                  [X_nmf_2_CV[i]])[0, 0] for i in range(len(X_nmf_2_CV))]
research['NMF + TfIDF'] = [cs([X_nmf_1_TfIDF[i]],
                              [X_nmf_2_TfIDF[i]])[0, 0] for i in range(len(X_nmf_1_TfIDF))]
research['SVD + CountVect'] = [cs([X_SVD_1_CV[i]],
                                  [X_SVD_2_CV[i]])[0, 0] for i in range(len(X_SVD_2_CV))]
research['SVD + TfIDF'] = [cs([X_SVD_1_TfIDF[i]],
                              [X_SVD_2_TfIDF[i]])[0, 0] for i in range(len(X_SVD_1_TfIDF))]

In [183]:
research

Unnamed: 0,text_1,text_2,FastText,FastText + norm,FastText + tfidf,FastText + norm + tfidf,Word2Vec,Word2Vec + tfidf,NMF + CountVect,NMF + TfIDF,SVD + CountVect,SVD + TfIDF
0,Полицейским разрешат стрелять на поражение по ...,Полиции могут разрешить стрелять по хулиганам ...,0.903937,0.748972,0.000000,0.719774,0.916304,0.746971,0.033658,0.027016,0.036417,0.035394
1,Право полицейских на проникновение в жилище ре...,Правила внесудебного проникновения полицейских...,0.908481,0.796095,0.000000,0.515567,0.914820,0.624405,0.017846,0.010989,0.065024,0.035315
2,Президент Египта ввел чрезвычайное положение в...,Власти Египта угрожают ввести в стране чрезвыч...,0.947987,0.843157,0.301490,0.685917,0.957996,0.919444,0.003285,0.004981,-0.001936,-0.000580
3,Вернувшихся из Сирии россиян волнует вопрос тр...,Самолеты МЧС вывезут россиян из разрушенной Си...,0.806719,0.624598,0.000000,0.792143,0.735428,0.874066,0.000135,0.000144,0.017314,0.005992
4,В Москву из Сирии вернулись 2 самолета МЧС с р...,Самолеты МЧС вывезут россиян из разрушенной Си...,0.415907,0.680341,1.000000,0.924508,0.916776,0.953417,0.002878,0.002725,0.017767,0.004108
5,Приставы соберут отпечатки пальцев российских ...,Приставы снимут отпечатки пальцев у злостных н...,0.043207,0.780682,0.000000,0.000000,0.929455,0.000000,0.054164,0.054164,-0.014344,-0.014344
6,На саратовского дебошира с борта самолета Моск...,Саратовский дебошир отказывается возвращаться ...,-0.092899,0.440541,0.000000,0.000000,0.800890,0.000000,0.000000,0.000000,0.018371,0.021710
7,ЦИК хочет отказаться от электронной системы по...,ЦИК может отказаться от электронных средств по...,0.966421,0.882176,1.000000,0.712880,0.975740,0.930758,0.013333,0.017608,0.069326,0.070226
8,Суд Петербурга оставил на потом дело о гибели ...,Лондонский Гайд-парк - это не место для митинг...,-0.225135,0.163077,0.000000,0.000000,0.683256,0.000000,0.059383,0.052727,0.081142,0.081341
9,Страны ОПЕК сократили добычу нефти на 1 млн ба...,Обама продлил полномочия НАСА по сотрудничеств...,-0.162089,0.107350,0.000000,0.000000,0.671823,0.000000,0.002369,0.005728,-0.036375,-0.050874




---

Теперь перейдем к обучению на полученных векторах. Ради интереса используем FastText без всего. 

In [184]:
!pip install catboost

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/19/ea/d6cbdf03fb1e8ea8c7c7a0b37b89c8f8b3825ec625d428d49b6230656c00/catboost-0.11.1-cp36-none-manylinux1_x86_64.whl (55.3MB)
[K    100% |████████████████████████████████| 55.3MB 684kB/s 
Collecting enum34 (from catboost)
  Downloading https://files.pythonhosted.org/packages/af/42/cb9355df32c69b553e72a2e28daee25d1611d2c0d9c272aa1d34204205b2/enum34-1.1.6-py3-none-any.whl
Installing collected packages: enum34, catboost
Successfully installed catboost-0.11.1 enum34-1.1.6


In [0]:
from sklearn.model_selection import cross_val_score
import catboost

y = data.label.map({'0': 1, '1': 2, '-1': 0})

In [0]:
%%time
scores = cross_val_score(catboost.CatBoostClassifier(),
                             X_text_FT,
                             y,
                             cv=3)

print('CatBoost + FastText\nScores:{}\nMean:{}'.format(str(scores), np.mean(scores)))

0:	learn: 0.6895626	total: 93.6ms	remaining: 1m 33s
1:	learn: 0.6861287	total: 186ms	remaining: 1m 32s
2:	learn: 0.6831140	total: 281ms	remaining: 1m 33s
3:	learn: 0.6800951	total: 377ms	remaining: 1m 33s
4:	learn: 0.6773457	total: 470ms	remaining: 1m 33s
5:	learn: 0.6750456	total: 563ms	remaining: 1m 33s
6:	learn: 0.6717998	total: 662ms	remaining: 1m 33s
7:	learn: 0.6688699	total: 758ms	remaining: 1m 33s
8:	learn: 0.6658385	total: 854ms	remaining: 1m 34s
9:	learn: 0.6629518	total: 995ms	remaining: 1m 38s
10:	learn: 0.6605288	total: 1.15s	remaining: 1m 43s
11:	learn: 0.6581699	total: 1.31s	remaining: 1m 48s
12:	learn: 0.6554849	total: 1.48s	remaining: 1m 52s
13:	learn: 0.6532579	total: 1.63s	remaining: 1m 54s
14:	learn: 0.6510840	total: 1.79s	remaining: 1m 57s
15:	learn: 0.6484547	total: 1.95s	remaining: 1m 59s
16:	learn: 0.6466164	total: 2.1s	remaining: 2m 1s
17:	learn: 0.6448134	total: 2.26s	remaining: 2m 3s
18:	learn: 0.6424351	total: 2.42s	remaining: 2m 4s
19:	learn: 0.6400345	tota