In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from pymystem3 import Mystem
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
from lxml import objectify
xml = objectify.parse('paraphrases.xml')
root = xml.getroot()
paraphrase=[]
for i in range(len(root.corpus.getchildren())):
    paraphrase.append([child.text for child in root.corpus.getchildren()[i].getchildren()])
df=pd.DataFrame(paraphrase)

In [3]:
df.drop([0,1,2],axis = 1,inplace=True)

In [4]:
df = df.rename(columns={3: 'text_1', 4: 'text_2',5:'jaccard_ind',6:'target'})
df

Unnamed: 0,text_1,text_2,jaccard_ind,target
0,Полицейским разрешат стрелять на поражение по ...,Полиции могут разрешить стрелять по хулиганам ...,0.65,0
1,Право полицейских на проникновение в жилище ре...,Правила внесудебного проникновения полицейских...,0.5,0
2,Президент Египта ввел чрезвычайное положение в...,Власти Египта угрожают ввести в стране чрезвыч...,0.611429,0
3,Вернувшихся из Сирии россиян волнует вопрос тр...,Самолеты МЧС вывезут россиян из разрушенной Си...,0.324037,-1
4,В Москву из Сирии вернулись 2 самолета МЧС с р...,Самолеты МЧС вывезут россиян из разрушенной Си...,0.606218,0
...,...,...,...,...
7222,Путин освободил от должности ряд генералов,Путин снял с должностей более 20 руководителей...,0.4624999999999999,0
7223,Облака над Москвой в День Победы разгонят девя...,Путеводитель по Дню Победы: как провести 9 мая...,0.45714285714285713,-1
7224,Любляна отпразднует День Победы вместе с Москвой,В Москве ограничат движение в связи с Днем Победы,0.5842373946721772,-1
7225,Девять самолетов ВВС разгонят облака над Москв...,В Москве ограничат движение в связи с Днем Победы,0.46188021535170065,-1


In [5]:
df['target'].unique()

array(['0', '-1', '1'], dtype=object)

In [6]:
df.groupby('target')['target'].count()

target
-1    2582
0     2957
1     1688
Name: target, dtype: int64

In [7]:
df['target'] = df['target'].astype('int')
df['jaccard_ind'] = df['jaccard_ind'].astype('float')

In [8]:
pd.options.display.max_colwidth = 100

In [9]:
df['target'] = np.where(df['target'] < 0, 0, 1)

In [10]:
df['jaccard_ind_res'] = np.where(df['jaccard_ind'] < 0.5, 0, 1)


In [11]:
df_score = pd.DataFrame(columns=['model','roc_auc','accuracy','precision','recall','f_1'])

In [12]:
# data frame with different decisions and scores
def res_valuation(model_name,target,predict,predict_proba,df=df_score):
    df_score.loc[df_score.shape[0]]=[model_name,
                      round(roc_auc_score(target,predict_proba),2),
                      round(accuracy_score(target,predict),2),
                      round(precision_score(target,predict),2),
                      round(recall_score(target,predict),2),
                      round(f1_score(target,predict),2)]
    
    return df_score


#### оценка по  индексу Жаккарда

In [13]:
res_valuation('jaccard',df['target'],df['jaccard_ind_res'],df['jaccard_ind'])

Unnamed: 0,model,roc_auc,accuracy,precision,recall,f_1
0,jaccard,0.86,0.79,0.8,0.91,0.85


## Preprocess

In [14]:
russian_stopwords = set(stopwords.words('russian'))

In [15]:
#function which delete all non-letters symbols
def delete_non_letters(words):
    new_words = []
    words = words.split()
    
    for word in words:
        new_word = "".join(c if c.isalpha() else " " for c in word )
        
        if new_word != '':
            new_words.append(new_word)
    text = ' '.join(c for c in new_words)
        
    return text

In [16]:
#delete stop-words which are in the list"russian_stopwords"
def delete_stopwords(text): 
    text = text.split()
    text = [w for w in text if not w in russian_stopwords and len(w) >= 3]
    text = " ".join(text)
    
    return text

In [17]:
#text lemmatization with MySTem. In order to make it work faster we need to join all comments with the "|" delimeter
#and than split by that symbol

def lemmatize(text): 
    text =  [i for i in text]
    m = Mystem()
    merged_text = "|".join(text)

    doc = []
    res = []
    count = 0
    lemma = m.lemmatize(merged_text)
    for t in lemma:
        
        if '|' not in t and count+1<len(lemma):
            doc.append(t)
            count+=1
          
        else:
            doc = ''.join(i for i in doc)
            res.append(doc)
            count+=1
            doc = []
    return res

In [18]:
df['cleaned_text_1'] = df['text_1'].map(lambda x: delete_stopwords(x))
df['cleaned_text_1'] = df['cleaned_text_1'].map(lambda x: delete_non_letters(x))
df['lemma_text_1'] = lemmatize(df['cleaned_text_1'])

In [19]:
df['cleaned_text_2'] = df['text_2'].map(lambda x: delete_stopwords(x))
df['cleaned_text_2'] = df['cleaned_text_2'].map(lambda x: delete_non_letters(x))
df['lemma_text_2'] = lemmatize(df['cleaned_text_2'])

In [20]:
df

Unnamed: 0,text_1,text_2,jaccard_ind,target,jaccard_ind_res,cleaned_text_1,lemma_text_1,cleaned_text_2,lemma_text_2
0,Полицейским разрешат стрелять на поражение по гражданам с травматикой.,Полиции могут разрешить стрелять по хулиганам с травматикой.,0.650000,1,1,Полицейским разрешат стрелять поражение гражданам травматикой,полицейский разрешать стрелять поражение гражданин травматика,Полиции могут разрешить стрелять хулиганам травматикой,полиция мочь разрешать стрелять хулиган травматика
1,Право полицейских на проникновение в жилище решили ограничить.,Правила внесудебного проникновения полицейских в жилище уточнят.,0.500000,1,1,Право полицейских проникновение жилище решили ограничить,право полицейский проникновение жилище решать ограничивать,Правила внесудебного проникновения полицейских жилище уточнят,правило внесудебный проникновение полицейский жилище уточнять
2,Президент Египта ввел чрезвычайное положение в мятежных городах.,Власти Египта угрожают ввести в стране чрезвычайное положение.,0.611429,1,1,Президент Египта ввел чрезвычайное положение мятежных городах,президент египет вводить чрезвычайный положение мятежный город,Власти Египта угрожают ввести стране чрезвычайное положение,власть египет угрожать вводить страна чрезвычайный положение
3,Вернувшихся из Сирии россиян волнует вопрос трудоустройства на родине.,Самолеты МЧС вывезут россиян из разрушенной Сирии.,0.324037,0,0,Вернувшихся Сирии россиян волнует вопрос трудоустройства родине,вернуться сирия россиянин волновать вопрос трудоустройство родина,Самолеты МЧС вывезут россиян разрушенной Сирии,самолет мчс вывозить россиянин разрушать сирия
4,В Москву из Сирии вернулись 2 самолета МЧС с россиянами на борту.,Самолеты МЧС вывезут россиян из разрушенной Сирии.,0.606218,1,1,Москву Сирии вернулись самолета МЧС россиянами борту,москва сирия вернуться самолет мчс россиянин борт,Самолеты МЧС вывезут россиян разрушенной Сирии,самолет мчс вывозить россиянин разрушать сирия
...,...,...,...,...,...,...,...,...,...
7222,Путин освободил от должности ряд генералов,Путин снял с должностей более 20 руководителей-силовиков,0.462500,1,0,Путин освободил должности ряд генералов,путин освобождать должность ряд генерал,Путин снял должностей руководителей силовиков,путин снимать должность руководитель силовик
7223,Облака над Москвой в День Победы разгонят девять самолетов,Путеводитель по Дню Победы: как провести 9 мая в Москве,0.457143,0,0,Облака Москвой День Победы разгонят девять самолетов,облако москва день победа разгонять девять самолет,Путеводитель Дню Победы провести мая Москве,путеводитель день победа проводить май москва
7224,Любляна отпразднует День Победы вместе с Москвой,В Москве ограничат движение в связи с Днем Победы,0.584237,0,1,Любляна отпразднует День Победы вместе Москвой,любляна отпраздновать день победа вместе москва,Москве ограничат движение связи Днем Победы,москва ограничивать движение связь день победа
7225,Девять самолетов ВВС разгонят облака над Москвой в День Победы,В Москве ограничат движение в связи с Днем Победы,0.461880,0,0,Девять самолетов ВВС разгонят облака Москвой День Победы,девять самолет ввс разгонять облако москва день победа,Москве ограничат движение связи Днем Победы,москва ограничивать движение связь день победа


## Spacy

In [21]:
import spacy
nlp = spacy.load('ru_core_news_lg')


In [22]:
# spacy_sim on text
spacy_sim=[]
for i in range(df.shape[0]):    
    spacy_sim.append(nlp(df['text_1'][i]).similarity(nlp(df['text_2'][i])))
df['spacy_sim']=spacy_sim

  spacy_sim.append(nlp(df['text_1'][i]).similarity(nlp(df['text_2'][i])))


In [23]:
df['spacy_sim_pred'] = np.where(df['spacy_sim'] < 0.5, 0, 1)

In [24]:
res_valuation('spacy_text',df['target'],df['spacy_sim_pred'],df['spacy_sim'])

Unnamed: 0,model,roc_auc,accuracy,precision,recall,f_1
0,jaccard,0.86,0.79,0.8,0.91,0.85
1,spacy_text,0.79,0.74,0.76,0.86,0.81


In [25]:
# spacy_sim on lemma_text
spacy_sim_lemma=[]
for i in range(df.shape[0]):    
    spacy_sim_lemma.append(nlp(df['lemma_text_1'][i]).similarity(nlp(df['lemma_text_2'][i])))
df['spacy_sim_lemma']=spacy_sim_lemma

In [26]:
df['spacy_sim_lemma_pred'] = np.where(df['spacy_sim_lemma'] < 0.5, 0, 1)

In [27]:
res_valuation('spacy_lemma_text',df['target'],df['spacy_sim_lemma_pred'],df['spacy_sim_lemma'])

Unnamed: 0,model,roc_auc,accuracy,precision,recall,f_1
0,jaccard,0.86,0.79,0.8,0.91,0.85
1,spacy_text,0.79,0.74,0.76,0.86,0.81
2,spacy_lemma_text,0.82,0.76,0.74,0.98,0.84


## cos_similarity , CountVectorizer, TfidfVectorizer

In [28]:
cvec_1 = CountVectorizer()

tfid_1 = TfidfVectorizer()


In [29]:
cvec_representation_1 = cvec_1.fit_transform(df['lemma_text_1'])
cvec_representation_2 = cvec_1.transform(df['lemma_text_2'])
tfid_representation_1 = tfid_1.fit_transform(df['lemma_text_1'])
tfid_representation_2 = tfid_1.transform(df['lemma_text_2'])

In [30]:
temp=[]
for i in range(df.shape[0]):
    temp.append(cosine_similarity(cvec_representation_1[i],cvec_representation_2[i]))

df['cos_sim_cv']=temp
df['cos_sim_cv_pred']=[ 0 if i<0.5 else 1 for i in df['cos_sim_cv'] ]


In [31]:
temp=[]
for i in range(df.shape[0]):
    temp.append(cosine_similarity(tfid_representation_1[i],tfid_representation_2[i]))

df['cos_sim_tf']=temp



In [32]:
df['cos_sim_tf_pred']=[ 0 if i<0.5 else 1 for i in df['cos_sim_tf'] ]

In [33]:
res_valuation('cv_cos_sim',df['target'],df['cos_sim_cv_pred'],df['cos_sim_cv'])


Unnamed: 0,model,roc_auc,accuracy,precision,recall,f_1
0,jaccard,0.86,0.79,0.8,0.91,0.85
1,spacy_text,0.79,0.74,0.76,0.86,0.81
2,spacy_lemma_text,0.82,0.76,0.74,0.98,0.84
3,cv_cos_sim,0.84,0.77,0.8,0.85,0.83


In [34]:
res_valuation('tf_cos_sim',df['target'],df['cos_sim_tf_pred'],df['cos_sim_tf'])

Unnamed: 0,model,roc_auc,accuracy,precision,recall,f_1
0,jaccard,0.86,0.79,0.8,0.91,0.85
1,spacy_text,0.79,0.74,0.76,0.86,0.81
2,spacy_lemma_text,0.82,0.76,0.74,0.98,0.84
3,cv_cos_sim,0.84,0.77,0.8,0.85,0.83
4,tf_cos_sim,0.85,0.77,0.84,0.8,0.82


In [35]:
v = CountVectorizer()


In [36]:
x1 = v.fit_transform(df['lemma_text_1'])
x2 = v.transform(df['lemma_text_2'])




In [37]:
x1.shape,x2.shape

((7227, 7985), (7227, 7985))

In [38]:
import scipy.sparse as sp
x1 = sp.csr_matrix(x1)
x2 = sp.csr_matrix(x2)
x = sp.hstack((x1,x2))


In [39]:

x.shape

(7227, 15970)

In [40]:


x_train, x_test, y_train, y_test = train_test_split(x, df['target'], test_size=0.2)

lr_1 = LogisticRegression()
lr_1.fit(x_train, y_train)


LogisticRegression()

In [41]:
lr_1.predict_proba(x_test)[1]

array([0.97696976, 0.02303024])

In [42]:
res_valuation('cv_log', y_test,lr_1.predict(x_test),lr_1.predict_proba(x_test)[:,1])

Unnamed: 0,model,roc_auc,accuracy,precision,recall,f_1
0,jaccard,0.86,0.79,0.8,0.91,0.85
1,spacy_text,0.79,0.74,0.76,0.86,0.81
2,spacy_lemma_text,0.82,0.76,0.74,0.98,0.84
3,cv_cos_sim,0.84,0.77,0.8,0.85,0.83
4,tf_cos_sim,0.85,0.77,0.84,0.8,0.82
5,cv_log,0.69,0.68,0.72,0.82,0.77


In [43]:
x.shape

(7227, 15970)

In [44]:
from sklearn.decomposition import TruncatedSVD
              
svd = TruncatedSVD(n_components=500, n_iter=5, random_state=42)
svd.fit(x.T)


TruncatedSVD(n_components=500, random_state=42)

In [45]:
x_new=svd.components_.T
x_new.shape

(7227, 500)

In [46]:
x_train, x_test, y_train, y_test = train_test_split(x_new, df['target'], test_size=0.2)

lr_2 = LogisticRegression()
lr_2.fit(x_train, y_train)

LogisticRegression()

In [47]:
res_valuation('cv_trunc_log', y_test,lr_2.predict(x_test),lr_2.predict_proba(x_test)[:,1])

Unnamed: 0,model,roc_auc,accuracy,precision,recall,f_1
0,jaccard,0.86,0.79,0.8,0.91,0.85
1,spacy_text,0.79,0.74,0.76,0.86,0.81
2,spacy_lemma_text,0.82,0.76,0.74,0.98,0.84
3,cv_cos_sim,0.84,0.77,0.8,0.85,0.83
4,tf_cos_sim,0.85,0.77,0.84,0.8,0.82
5,cv_log,0.69,0.68,0.72,0.82,0.77
6,cv_trunc_log,0.68,0.65,0.65,1.0,0.79


In [48]:
clf_gb_1 = GradientBoostingClassifier()

In [49]:
clf_gb_1.fit(x_train, y_train)

GradientBoostingClassifier()

In [50]:
res_valuation( 'cv_trunc_gbc',y_test,clf_gb_1.predict(x_test),clf_gb_1.predict_proba(x_test)[:,1])

Unnamed: 0,model,roc_auc,accuracy,precision,recall,f_1
0,jaccard,0.86,0.79,0.8,0.91,0.85
1,spacy_text,0.79,0.74,0.76,0.86,0.81
2,spacy_lemma_text,0.82,0.76,0.74,0.98,0.84
3,cv_cos_sim,0.84,0.77,0.8,0.85,0.83
4,tf_cos_sim,0.85,0.77,0.84,0.8,0.82
5,cv_log,0.69,0.68,0.72,0.82,0.77
6,cv_trunc_log,0.68,0.65,0.65,1.0,0.79
7,cv_trunc_gbc,0.69,0.69,0.69,0.93,0.79


## Doc2Vec

In [51]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize



In [53]:
data = [i for i in df['text_1']]
for i in df['text_2']:
    data.append(i)

tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(data)]

In [54]:
tagged_data

[TaggedDocument(words=['полицейским', 'разрешат', 'стрелять', 'на', 'поражение', 'по', 'гражданам', 'с', 'травматикой', '.'], tags=['0']),
 TaggedDocument(words=['право', 'полицейских', 'на', 'проникновение', 'в', 'жилище', 'решили', 'ограничить', '.'], tags=['1']),
 TaggedDocument(words=['президент', 'египта', 'ввел', 'чрезвычайное', 'положение', 'в', 'мятежных', 'городах', '.'], tags=['2']),
 TaggedDocument(words=['вернувшихся', 'из', 'сирии', 'россиян', 'волнует', 'вопрос', 'трудоустройства', 'на', 'родине', '.'], tags=['3']),
 TaggedDocument(words=['в', 'москву', 'из', 'сирии', 'вернулись', '2', 'самолета', 'мчс', 'с', 'россиянами', 'на', 'борту', '.'], tags=['4']),
 TaggedDocument(words=['приставы', 'соберут', 'отпечатки', 'пальцев', 'российских', 'должников', '.'], tags=['5']),
 TaggedDocument(words=['на', 'саратовского', 'дебошира', 'с', 'борта', 'самолета', 'москва', '-', 'хургада', 'заведено', 'дело', '.'], tags=['6']),
 TaggedDocument(words=['цик', 'хочет', 'отказаться', 'от'

In [55]:
max_epochs = 100
vec_size = 20
alpha = 0.025

model = Doc2Vec(
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =1)
  
model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=100)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

model.save("d2v.model")
print("Model Saved")

iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39
iteration 40
iteration 41
iteration 42
iteration 43
iteration 44
iteration 45
iteration 46
iteration 47
iteration 48
iteration 49
iteration 50
iteration 51
iteration 52
iteration 53
iteration 54
iteration 55
iteration 56
iteration 57
iteration 58
iteration 59
iteration 60
iteration 61
iteration 62
iteration 63
iteration 64
iteration 65
iteration 66
iteration 67
iteration 68
iteration 69
iteration 70
iteration 71
iteration 72
iteration 73
iteration 74
iteration 75
iteration 76
iteration

In [86]:
D2v_1 = [model.infer_vector(i[0]) for i in tagged_data[:7227]]

In [87]:
D2v_2 = [model.infer_vector(i[0]) for i in tagged_data[7227:]]

In [88]:
res=[]
for i in range(df.shape[0]):
    res.append(*cosine_similarity(D2v_1[i].reshape(1,100),D2v_2[i].reshape(1,100)))

In [89]:
df['d2vs']=[1 if i>0.50 else 0 for i in res]

In [90]:
res_valuation('d2v_cos_sim',df['target'],df['d2vs'], res)

Unnamed: 0,model,roc_auc,accuracy,precision,recall,f_1
0,jaccard,0.86,0.79,0.8,0.91,0.85
1,spacy_text,0.79,0.74,0.76,0.86,0.81
2,spacy_lemma_text,0.82,0.76,0.74,0.98,0.84
3,cv_cos_sim,0.84,0.77,0.8,0.85,0.83
4,tf_cos_sim,0.85,0.77,0.84,0.8,0.82
5,cv_log,0.69,0.68,0.72,0.82,0.77
6,cv_trunc_log,0.68,0.65,0.65,1.0,0.79
7,cv_trunc_gbc,0.69,0.69,0.69,0.93,0.79
8,d2v_cos_sim,0.61,0.64,0.64,1.0,0.78


In [91]:
df['text']=df['text_1'] + '-' + df['text_2']

In [92]:
# sum 2 text_vectors for using classification
sum_v=np.concatenate((D2v_1, D2v_2), axis=1)

In [93]:
sum_v.shape

(7227, 200)

In [94]:
test_data=[]
for i in df['text']:
    test_data.append(word_tokenize(i.lower()))


In [95]:
temp=[]
for i in test_data:
    temp.append(model.infer_vector(i))
    
df['doc2v'] = temp
df

Unnamed: 0,text_1,text_2,jaccard_ind,target,jaccard_ind_res,cleaned_text_1,lemma_text_1,cleaned_text_2,lemma_text_2,spacy_sim,spacy_sim_pred,spacy_sim_lemma,spacy_sim_lemma_pred,cos_sim_cv,cos_sim_cv_pred,cos_sim_tf,cos_sim_tf_pred,d2vs,text,doc2v
0,Полицейским разрешат стрелять на поражение по гражданам с травматикой.,Полиции могут разрешить стрелять по хулиганам с травматикой.,0.650000,1,1,Полицейским разрешат стрелять поражение гражданам травматикой,полицейский разрешать стрелять поражение гражданин травматика,Полиции могут разрешить стрелять хулиганам травматикой,полиция мочь разрешать стрелять хулиган травматика,0.796822,1,0.697469,1,[[0.5000000000000001]],1,[[0.5842237245699787]],1,1,Полицейским разрешат стрелять на поражение по гражданам с травматикой.-Полиции могут разрешить с...,"[0.0581856, -0.051136095, -0.014996914, -0.44138926, -0.045330986, -0.56590533, -0.14393002, 0.4..."
1,Право полицейских на проникновение в жилище решили ограничить.,Правила внесудебного проникновения полицейских в жилище уточнят.,0.500000,1,1,Право полицейских проникновение жилище решили ограничить,право полицейский проникновение жилище решать ограничивать,Правила внесудебного проникновения полицейских жилище уточнят,правило внесудебный проникновение полицейский жилище уточнять,0.714320,1,0.743022,1,[[0.6123724356957946]],1,[[0.6469904675480641]],1,1,Право полицейских на проникновение в жилище решили ограничить.-Правила внесудебного проникновени...,"[-0.10381894, 0.22946109, 0.21973428, -0.36958352, 0.018128833, -0.274807, 0.04829131, 0.1189094..."
2,Президент Египта ввел чрезвычайное положение в мятежных городах.,Власти Египта угрожают ввести в стране чрезвычайное положение.,0.611429,1,1,Президент Египта ввел чрезвычайное положение мятежных городах,президент египет вводить чрезвычайный положение мятежный город,Власти Египта угрожают ввести стране чрезвычайное положение,власть египет угрожать вводить страна чрезвычайный положение,0.793596,1,0.806203,1,[[0.5714285714285713]],1,[[0.5904189126602305]],1,1,Президент Египта ввел чрезвычайное положение в мятежных городах.-Власти Египта угрожают ввести в...,"[0.20481613, 0.16331099, 0.21121335, -0.13888514, 0.23566183, -0.16738516, 0.04084618, -0.176196..."
3,Вернувшихся из Сирии россиян волнует вопрос трудоустройства на родине.,Самолеты МЧС вывезут россиян из разрушенной Сирии.,0.324037,0,0,Вернувшихся Сирии россиян волнует вопрос трудоустройства родине,вернуться сирия россиянин волновать вопрос трудоустройство родина,Самолеты МЧС вывезут россиян разрушенной Сирии,самолет мчс вывозить россиянин разрушать сирия,0.462559,0,0.436744,0,[[0.3086066999241838]],0,[[0.2000310871956631]],0,1,Вернувшихся из Сирии россиян волнует вопрос трудоустройства на родине.-Самолеты МЧС вывезут росс...,"[-0.068795435, 0.041038956, 0.54666865, 0.058622558, -0.12085708, -0.73325324, 0.52699363, 0.625..."
4,В Москву из Сирии вернулись 2 самолета МЧС с россиянами на борту.,Самолеты МЧС вывезут россиян из разрушенной Сирии.,0.606218,1,1,Москву Сирии вернулись самолета МЧС россиянами борту,москва сирия вернуться самолет мчс россиянин борт,Самолеты МЧС вывезут россиян разрушенной Сирии,самолет мчс вывозить россиянин разрушать сирия,0.337948,0,0.705540,1,[[0.6172133998483676]],1,[[0.5172748161454573]],1,1,В Москву из Сирии вернулись 2 самолета МЧС с россиянами на борту.-Самолеты МЧС вывезут россиян и...,"[0.032162495, 0.44113573, 0.595163, 0.1368685, 0.19821404, -0.7087577, 0.558993, 1.1256306, 0.16..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7222,Путин освободил от должности ряд генералов,Путин снял с должностей более 20 руководителей-силовиков,0.462500,1,0,Путин освободил должности ряд генералов,путин освобождать должность ряд генерал,Путин снял должностей руководителей силовиков,путин снимать должность руководитель силовик,0.641312,1,0.709769,1,[[0.39999999999999997]],0,[[0.36815039428860385]],0,1,Путин освободил от должности ряд генералов-Путин снял с должностей более 20 руководителей-силовиков,"[-0.29766273, 0.07854977, -0.07667667, 0.23007677, -0.053409744, -0.31060544, -0.03411094, -0.52..."
7223,Облака над Москвой в День Победы разгонят девять самолетов,Путеводитель по Дню Победы: как провести 9 мая в Москве,0.457143,0,0,Облака Москвой День Победы разгонят девять самолетов,облако москва день победа разгонять девять самолет,Путеводитель Дню Победы провести мая Москве,путеводитель день победа проводить май москва,0.539856,1,0.650112,1,[[0.4629100498862757]],0,[[0.29796550102136143]],0,1,Облака над Москвой в День Победы разгонят девять самолетов-Путеводитель по Дню Победы: как прове...,"[-0.4576621, -0.504284, 0.1928194, -0.37246647, 0.3203741, -0.3280105, -0.18616603, 0.28312334, ..."
7224,Любляна отпразднует День Победы вместе с Москвой,В Москве ограничат движение в связи с Днем Победы,0.584237,0,1,Любляна отпразднует День Победы вместе Москвой,любляна отпраздновать день победа вместе москва,Москве ограничат движение связи Днем Победы,москва ограничивать движение связь день победа,0.461033,0,0.690240,1,[[0.5000000000000001]],1,[[0.3101614910972629]],0,1,Любляна отпразднует День Победы вместе с Москвой-В Москве ограничат движение в связи с Днем Победы,"[0.07742669, -0.043915123, -0.005536065, -0.5162239, -0.027976228, -0.6582189, -0.30212346, 0.16..."
7225,Девять самолетов ВВС разгонят облака над Москвой в День Победы,В Москве ограничат движение в связи с Днем Победы,0.461880,0,0,Девять самолетов ВВС разгонят облака Москвой День Победы,девять самолет ввс разгонять облако москва день победа,Москве ограничат движение связи Днем Победы,москва ограничивать движение связь день победа,0.395425,0,0.610246,1,[[0.4330127018922194]],0,[[0.2840240163778017]],0,1,Девять самолетов ВВС разгонят облака над Москвой в День Победы-В Москве ограничат движение в свя...,"[0.21652974, 0.20880035, 0.31230062, -0.57237566, -0.18209355, -0.3683787, -0.7250273, 0.1307776..."


In [96]:

clf_gb_2 = GradientBoostingClassifier()

In [97]:

train_features, test_features, train_labels, test_labels = train_test_split(sum_v, df['target'])

In [98]:
res=[]
t=[]
for i in train_features:
    res=[]
    for j in i:
        res.append(j)
    t.append(res)
train_features=pd.DataFrame(t)
train_features.shape

(5420, 200)

In [99]:
clf_gb_2.fit(train_features,train_labels)

GradientBoostingClassifier()

In [100]:
res=[]
t=[]
for i in test_features:
    res=[]
    for j in i:
        res.append(j)
    t.append(res)
test_features=pd.DataFrame(t)
test_features.shape

(1807, 200)

In [101]:
res_valuation('d2v_text_gbc',test_labels,clf_gb_2.predict(test_features),clf_gb_2.predict_proba(test_features)[:,1])

Unnamed: 0,model,roc_auc,accuracy,precision,recall,f_1
0,jaccard,0.86,0.79,0.8,0.91,0.85
1,spacy_text,0.79,0.74,0.76,0.86,0.81
2,spacy_lemma_text,0.82,0.76,0.74,0.98,0.84
3,cv_cos_sim,0.84,0.77,0.8,0.85,0.83
4,tf_cos_sim,0.85,0.77,0.84,0.8,0.82
5,cv_log,0.69,0.68,0.72,0.82,0.77
6,cv_trunc_log,0.68,0.65,0.65,1.0,0.79
7,cv_trunc_gbc,0.69,0.69,0.69,0.93,0.79
8,d2v_cos_sim,0.61,0.64,0.64,1.0,0.78
9,d2v_text_gbc,0.68,0.67,0.68,0.93,0.79


In [102]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score


cls_log = LogisticRegression()


param_grid = {
    #'penalty': ['l1', 'l2', 'elasticnet'],    
    'fit_intercept': [True,False],    
    'tol':[1e-4,1e-3,1e-2],#Precision of the solution.
    'solver':['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']   
}


cv = GridSearchCV(
    estimator=cls_log,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    verbose=4,
    n_jobs=-1,
)

In [103]:
cv.fit(train_features,train_labels)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


GridSearchCV(cv=5, estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'fit_intercept': [True, False],
                         'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag',
                                    'saga'],
                         'tol': [0.0001, 0.001, 0.01]},
             scoring='roc_auc', verbose=4)

In [104]:
log_2=cv.best_estimator_.fit(train_features,train_labels)

In [105]:
res_valuation('d2v_text_log',test_labels,log_2.predict(test_features),log_2.predict_proba(test_features)[:,1])

Unnamed: 0,model,roc_auc,accuracy,precision,recall,f_1
0,jaccard,0.86,0.79,0.8,0.91,0.85
1,spacy_text,0.79,0.74,0.76,0.86,0.81
2,spacy_lemma_text,0.82,0.76,0.74,0.98,0.84
3,cv_cos_sim,0.84,0.77,0.8,0.85,0.83
4,tf_cos_sim,0.85,0.77,0.84,0.8,0.82
5,cv_log,0.69,0.68,0.72,0.82,0.77
6,cv_trunc_log,0.68,0.65,0.65,1.0,0.79
7,cv_trunc_gbc,0.69,0.69,0.69,0.93,0.79
8,d2v_cos_sim,0.61,0.64,0.64,1.0,0.78
9,d2v_text_gbc,0.68,0.67,0.68,0.93,0.79


In [106]:
res=[]
t=[]
for i in df['doc2v']:
    res=[]
    for j in i:
        res.append(j)
    t.append(res)
x = pd.DataFrame(t)
test_features.shape

(1807, 200)

In [107]:
train_features, test_features, train_labels, test_labels = train_test_split(x, df['target'])

In [108]:
lr_clf = LogisticRegression()

In [109]:
lr_clf.fit(train_features,train_labels)

LogisticRegression()

In [110]:
res_valuation('d2v_text_sum_log',test_labels,lr_clf.predict(test_features),lr_clf.predict_proba(test_features)[:,1])

Unnamed: 0,model,roc_auc,accuracy,precision,recall,f_1
0,jaccard,0.86,0.79,0.8,0.91,0.85
1,spacy_text,0.79,0.74,0.76,0.86,0.81
2,spacy_lemma_text,0.82,0.76,0.74,0.98,0.84
3,cv_cos_sim,0.84,0.77,0.8,0.85,0.83
4,tf_cos_sim,0.85,0.77,0.84,0.8,0.82
5,cv_log,0.69,0.68,0.72,0.82,0.77
6,cv_trunc_log,0.68,0.65,0.65,1.0,0.79
7,cv_trunc_gbc,0.69,0.69,0.69,0.93,0.79
8,d2v_cos_sim,0.61,0.64,0.64,1.0,0.78
9,d2v_text_gbc,0.68,0.67,0.68,0.93,0.79


## Bert

In [25]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.0.0.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 1.6 MB/s eta 0:00:01
Collecting torchvision
  Downloading torchvision-0.10.0-cp38-cp38-manylinux1_x86_64.whl (22.1 MB)
[K     |████████████████████████████████| 22.1 MB 2.1 MB/s eta 0:00:01
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 709 kB/s eta 0:00:01
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25ldone
[?25h  Created wheel for sentence-transformers: filename=sentence_transformers-2.0.0-py3-none-any.whl size=126709 sha256=22a7ee5c5484ba6e459840c7955f7dcb3ff510e93758dac2ce693ed92cc06dd1
  Stored in directory: /home/sv/.cache/pip/wheels/8c/b7/50/451c9a52a337aac5521dbc10544a69e1447d28012feba30742
Successfully built sentence-transformers
Inst

In [111]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('distiluse-base-multilingual-cased')

In [112]:

sentence_embeddings_lemma_1 = model.encode([i for i in df['lemma_text_1']])

In [113]:
sentence_embeddings_text_1 = model.encode([i for i in df['text_1']])

In [114]:
sentence_embeddings_lemma_2 = model.encode([i for i in df['lemma_text_2']])

[CV 2/5] END fit_intercept=True, solver=newton-cg, tol=0.001;, score=0.604 total time=   0.3s
[CV 1/5] END fit_intercept=True, solver=newton-cg, tol=0.01;, score=0.637 total time=   0.3s
[CV 2/5] END fit_intercept=True, solver=lbfgs, tol=0.0001;, score=0.604 total time=   0.5s
[CV 5/5] END fit_intercept=True, solver=lbfgs, tol=0.001;, score=0.655 total time=   0.5s
[CV 3/5] END fit_intercept=True, solver=liblinear, tol=0.0001;, score=0.614 total time=   0.6s
[CV 2/5] END fit_intercept=True, solver=liblinear, tol=0.01;, score=0.604 total time=   0.4s
[CV 5/5] END fit_intercept=True, solver=sag, tol=0.0001;, score=0.655 total time=   0.6s
[CV 1/5] END fit_intercept=True, solver=saga, tol=0.0001;, score=0.637 total time=   1.4s
[CV 2/5] END fit_intercept=True, solver=saga, tol=0.01;, score=0.602 total time=   0.4s
[CV 4/5] END fit_intercept=False, solver=newton-cg, tol=0.0001;, score=0.591 total time=   0.3s
[CV 2/5] END fit_intercept=False, solver=newton-cg, tol=0.01;, score=0.569 total 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

[CV 5/5] END fit_intercept=True, solver=newton-cg, tol=0.0001;, score=0.655 total time=   0.6s
[CV 5/5] END fit_intercept=True, solver=newton-cg, tol=0.01;, score=0.655 total time=   0.3s
[CV 5/5] END fit_intercept=True, solver=lbfgs, tol=0.0001;, score=0.655 total time=   0.5s
[CV 3/5] END fit_intercept=True, solver=lbfgs, tol=0.01;, score=0.615 total time=   0.5s
[CV 5/5] END fit_intercept=True, solver=liblinear, tol=0.0001;, score=0.655 total time=   0.6s
[CV 5/5] END fit_intercept=True, solver=liblinear, tol=0.01;, score=0.655 total time=   0.4s
[CV 3/5] END fit_intercept=True, solver=sag, tol=0.001;, score=0.615 total time=   0.4s
[CV 2/5] END fit_intercept=True, solver=sag, tol=0.01;, score=0.603 total time=   0.2s
[CV 3/5] END fit_intercept=True, solver=saga, tol=0.0001;, score=0.615 total time=   1.5s
[CV 5/5] END fit_intercept=True, solver=saga, tol=0.01;, score=0.654 total time=   0.4s
[CV 4/5] END fit_intercept=False, solver=newton-cg, tol=0.001;, score=0.591 total time=   0

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

[CV 1/5] END fit_intercept=True, solver=newton-cg, tol=0.001;, score=0.637 total time=   0.4s
[CV 5/5] END fit_intercept=True, solver=newton-cg, tol=0.001;, score=0.655 total time=   0.6s
[CV 2/5] END fit_intercept=True, solver=lbfgs, tol=0.001;, score=0.604 total time=   0.5s
[CV 5/5] END fit_intercept=True, solver=lbfgs, tol=0.01;, score=0.655 total time=   0.5s
[CV 3/5] END fit_intercept=True, solver=liblinear, tol=0.001;, score=0.614 total time=   0.5s
[CV 1/5] END fit_intercept=True, solver=sag, tol=0.0001;, score=0.637 total time=   0.6s
[CV 4/5] END fit_intercept=True, solver=sag, tol=0.001;, score=0.643 total time=   0.4s
[CV 5/5] END fit_intercept=True, solver=saga, tol=0.0001;, score=0.655 total time=   1.5s
[CV 1/5] END fit_intercept=False, solver=newton-cg, tol=0.0001;, score=0.591 total time=   0.2s
[CV 5/5] END fit_intercept=False, solver=newton-cg, tol=0.0001;, score=0.620 total time=   0.2s
[CV 5/5] END fit_intercept=False, solver=newton-cg, tol=0.001;, score=0.620 tota

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [115]:
sentence_embeddings_text_2 = model.encode([i for i in df['text_2']])

In [116]:
bert_cos_sim_lem = []
for i in range(df.shape[0]):
    bert_cos_sim_lem.append(cosine_similarity([sentence_embeddings_lemma_1[i]],[sentence_embeddings_lemma_2[i]]))
                                    

In [117]:
df['bert_sim_lem'] = bert_cos_sim_lem   

In [119]:
df['bert_sim_pred_lem']=[ 0 if i<0.50 else 1 for i in df['bert_sim_lem'] ]

In [120]:
res_valuation('bert_cos_sim_lem',df['target'],df['bert_sim_pred_lem'],df['bert_sim_lem'])

Unnamed: 0,model,roc_auc,accuracy,precision,recall,f_1
0,jaccard,0.86,0.79,0.8,0.91,0.85
1,spacy_text,0.79,0.74,0.76,0.86,0.81
2,spacy_lemma_text,0.82,0.76,0.74,0.98,0.84
3,cv_cos_sim,0.84,0.77,0.8,0.85,0.83
4,tf_cos_sim,0.85,0.77,0.84,0.8,0.82
5,cv_log,0.69,0.68,0.72,0.82,0.77
6,cv_trunc_log,0.68,0.65,0.65,1.0,0.79
7,cv_trunc_gbc,0.69,0.69,0.69,0.93,0.79
8,d2v_cos_sim,0.61,0.64,0.64,1.0,0.78
9,d2v_text_gbc,0.68,0.67,0.68,0.93,0.79


In [121]:
bert_cos_sim_text = []
for i in range(df.shape[0]):
    bert_cos_sim_text.append(cosine_similarity([sentence_embeddings_text_1[i]],[sentence_embeddings_text_2[i]]))

In [122]:
df['bert_sim_text'] = bert_cos_sim_text   

In [123]:
df['bert_sim_pred_text']=[ 0 if i<0.5 else 1 for i in df['bert_sim_text'] ]

In [124]:
res_valuation('bert_cos_sim_text',df['target'],df['bert_sim_pred_text'],df['bert_sim_text'])

Unnamed: 0,model,roc_auc,accuracy,precision,recall,f_1
0,jaccard,0.86,0.79,0.8,0.91,0.85
1,spacy_text,0.79,0.74,0.76,0.86,0.81
2,spacy_lemma_text,0.82,0.76,0.74,0.98,0.84
3,cv_cos_sim,0.84,0.77,0.8,0.85,0.83
4,tf_cos_sim,0.85,0.77,0.84,0.8,0.82
5,cv_log,0.69,0.68,0.72,0.82,0.77
6,cv_trunc_log,0.68,0.65,0.65,1.0,0.79
7,cv_trunc_gbc,0.69,0.69,0.69,0.93,0.79
8,d2v_cos_sim,0.61,0.64,0.64,1.0,0.78
9,d2v_text_gbc,0.68,0.67,0.68,0.93,0.79


In [125]:
sentence_embeddings_text_1.shape

(7227, 512)

In [126]:
sentence_embeddings_text_2.shape

(7227, 512)

In [127]:
# concatenate embedding text1 and text2
c = np.concatenate((sentence_embeddings_text_1,sentence_embeddings_text_2), axis=1)

In [128]:
c.shape

(7227, 1024)

In [129]:
type(c)

numpy.ndarray

In [130]:
from sklearn.model_selection import train_test_split
train_features, test_features, train_labels, test_labels = train_test_split(c, df['target'])

In [131]:
clf_gb_3 = GradientBoostingClassifier()

In [132]:
clf_gb_3.fit(train_features ,train_labels)

GradientBoostingClassifier()

In [133]:
res_valuation('bert_text_gbc',test_labels,clf_gb_3.predict(test_features),clf_gb_3.predict_proba(test_features)[:,1])

Unnamed: 0,model,roc_auc,accuracy,precision,recall,f_1
0,jaccard,0.86,0.79,0.8,0.91,0.85
1,spacy_text,0.79,0.74,0.76,0.86,0.81
2,spacy_lemma_text,0.82,0.76,0.74,0.98,0.84
3,cv_cos_sim,0.84,0.77,0.8,0.85,0.83
4,tf_cos_sim,0.85,0.77,0.84,0.8,0.82
5,cv_log,0.69,0.68,0.72,0.82,0.77
6,cv_trunc_log,0.68,0.65,0.65,1.0,0.79
7,cv_trunc_gbc,0.69,0.69,0.69,0.93,0.79
8,d2v_cos_sim,0.61,0.64,0.64,1.0,0.78
9,d2v_text_gbc,0.68,0.67,0.68,0.93,0.79


In [134]:
# embedding 2 sentense together for classification models

In [135]:
sentence_embeddings = model.encode([i for i in df['text']])

In [136]:
train_features, test_features, train_labels, test_labels = train_test_split(sentence_embeddings, df['target'])

In [137]:

clf_gbc_4 = GradientBoostingClassifier()

In [138]:
clf_gbc_4.fit(train_features,train_labels)

GradientBoostingClassifier()

In [139]:
res_valuation('bert_text_sum_gdc',test_labels,clf_gbc_4.predict(test_features),clf_gbc_4.predict_proba(test_features)[:,1])

Unnamed: 0,model,roc_auc,accuracy,precision,recall,f_1
0,jaccard,0.86,0.79,0.8,0.91,0.85
1,spacy_text,0.79,0.74,0.76,0.86,0.81
2,spacy_lemma_text,0.82,0.76,0.74,0.98,0.84
3,cv_cos_sim,0.84,0.77,0.8,0.85,0.83
4,tf_cos_sim,0.85,0.77,0.84,0.8,0.82
5,cv_log,0.69,0.68,0.72,0.82,0.77
6,cv_trunc_log,0.68,0.65,0.65,1.0,0.79
7,cv_trunc_gbc,0.69,0.69,0.69,0.93,0.79
8,d2v_cos_sim,0.61,0.64,0.64,1.0,0.78
9,d2v_text_gbc,0.68,0.67,0.68,0.93,0.79


In [140]:
lr_clf_3 = LogisticRegression()
lr_clf_3.fit(train_features, train_labels)

LogisticRegression()

In [141]:
res_valuation('bert_text_sum_log',test_labels,lr_clf_3.predict(test_features),lr_clf_3.predict_proba(test_features)[:,1])

Unnamed: 0,model,roc_auc,accuracy,precision,recall,f_1
0,jaccard,0.86,0.79,0.8,0.91,0.85
1,spacy_text,0.79,0.74,0.76,0.86,0.81
2,spacy_lemma_text,0.82,0.76,0.74,0.98,0.84
3,cv_cos_sim,0.84,0.77,0.8,0.85,0.83
4,tf_cos_sim,0.85,0.77,0.84,0.8,0.82
5,cv_log,0.69,0.68,0.72,0.82,0.77
6,cv_trunc_log,0.68,0.65,0.65,1.0,0.79
7,cv_trunc_gbc,0.69,0.69,0.69,0.93,0.79
8,d2v_cos_sim,0.61,0.64,0.64,1.0,0.78
9,d2v_text_gbc,0.68,0.67,0.68,0.93,0.79


In [142]:
df_score.sort_values(by=['roc_auc','f_1'],ascending=False)

Unnamed: 0,model,roc_auc,accuracy,precision,recall,f_1
0,jaccard,0.86,0.79,0.8,0.91,0.85
13,bert_cos_sim_text,0.85,0.77,0.75,0.98,0.85
4,tf_cos_sim,0.85,0.77,0.84,0.8,0.82
14,bert_text_gbc,0.84,0.78,0.76,0.95,0.84
3,cv_cos_sim,0.84,0.77,0.8,0.85,0.83
2,spacy_lemma_text,0.82,0.76,0.74,0.98,0.84
12,bert_cos_sim_lem,0.82,0.77,0.74,0.98,0.84
1,spacy_text,0.79,0.74,0.76,0.86,0.81
16,bert_text_sum_log,0.76,0.72,0.73,0.9,0.81
15,bert_text_sum_gdc,0.73,0.71,0.72,0.91,0.8


## for web-server bert_cos_sim_text

In [144]:
sent_1 = 'завтра в москве будет дождь'

In [145]:
sent_2 = 'ясное солнце светило все лето'

In [None]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('distiluse-base-multilingual-cased')

In [160]:
def similarity_sent(sent_1,sent_2):
    sentence_embeddings_sent_1 = model.encode(sent_1).reshape(1,512)
    sentence_embeddings_sent_2 = model.encode(sent_2).reshape(1,512)
    if cosine_similarity(sentence_embeddings_sent_1,sentence_embeddings_sent_2) > 0.5:
        return 'These sentences are similar'
    return 'These sentences are not similar'

In [161]:
similarity_sent(sent_1,sent_2)

'These sentences are not similar'

In [165]:
similarity_sent('зимой заяц белый, а летом серый','зимой наряжают елку и в лесу можно встретить зайца')

'These sentences are not similar'

In [166]:
similarity_sent('зимой заяц белый, а летом серый','летом заяц меняет цвет шубки с белого на серый ')

'These sentences are similar'