In [0]:
!pip install Cython numpy
!pip install git+https://github.com/lopuhin/python-adagram.git
!pip install pymorphy2

In [0]:
import nltk
import itertools
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

import adagram
from lxml import html
from string import punctuation
from nltk.corpus import stopwords
from pymorphy2 import MorphAnalyzer
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize



import json, os
import numpy as np
from collections import Counter
from matplotlib import pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')



In [0]:
nltk.download('stopwords')
nltk.download('punkt')
morph = MorphAnalyzer()
stops = set(stopwords.words('russian'))
punct = punctuation+'«»—…“”*№–'

In [0]:
def tokenize(text):
    words = [word.strip(punct) for word in text.lower().split() if word and word not in stops]
    words = [word for word in words if word]

    return words

def normalize(text):
    words = tokenize(text)
    words = [morph.parse(word)[0].normal_form for word in words if word]

    return words

    

In [0]:
!wget https://raw.githubusercontent.com/mannefedov/compling_nlp_hse_course/master/data/corpus_ng.txt

In [0]:
corpus = open('corpus_ng.txt').read()

In [0]:
corpus = normalize(corpus)

In [0]:
f = open('corpus.txt', 'w')
f.write(' '.join(corpus))
f.close()

In [0]:
!adagram-train corpus.txt out.pkl

In [0]:
vm = adagram.VectorModel.load("out.pkl")

In [0]:
corpus_xml = html.fromstring(open("paraphrases.xml", 'rb').read())
texts_1 = []
texts_2 = []
classes = []

for p in corpus_xml.xpath('//paraphrase'):
    texts_1.append(p.xpath('./value[@name="text_1"]/text()')[0])
    texts_2.append(p.xpath('./value[@name="text_2"]/text()')[0])
    classes.append(p.xpath('./value[@name="class"]/text()')[0])
    
data = pd.DataFrame({'text_1':texts_1, 'text_2':texts_2, 'label':classes})

In [0]:
data

Unnamed: 0,text_1,text_2,label
0,Полицейским разрешат стрелять на поражение по ...,Полиции могут разрешить стрелять по хулиганам ...,0
1,Право полицейских на проникновение в жилище ре...,Правила внесудебного проникновения полицейских...,0
2,Президент Египта ввел чрезвычайное положение в...,Власти Египта угрожают ввести в стране чрезвыч...,0
3,Вернувшихся из Сирии россиян волнует вопрос тр...,Самолеты МЧС вывезут россиян из разрушенной Си...,-1
4,В Москву из Сирии вернулись 2 самолета МЧС с р...,Самолеты МЧС вывезут россиян из разрушенной Си...,0
...,...,...,...
7222,Путин освободил от должности ряд генералов,Путин снял с должностей более 20 руководителей...,0
7223,Облака над Москвой в День Победы разгонят девя...,Путеводитель по Дню Победы: как провести 9 мая...,-1
7224,Любляна отпразднует День Победы вместе с Москвой,В Москве ограничат движение в связи с Днем Победы,-1
7225,Девять самолетов ВВС разгонят облака над Москв...,В Москве ограничат движение в связи с Днем Победы,-1


In [0]:
data['text_1_norm'] = data['text_1'].apply(normalize)
data['text_2_norm'] = data['text_2'].apply(normalize)

In [0]:
words = [0,1,2,3,4,5,6,7,8,9]

def get_words_in_context(words, window=3):
    words_in_context = []

    for i in range(len(words)):
        word = words[i]
        left = words[max(0, i - window):i]
        right = words[i + 1:min(len(words), i + window + 1)]
        context = left + right
        words_in_context.append([word, context])

    return words_in_context
    

In [15]:
get_words_in_context(words)

[[0, [1, 2, 3]],
 [1, [0, 2, 3, 4]],
 [2, [0, 1, 3, 4, 5]],
 [3, [0, 1, 2, 4, 5, 6]],
 [4, [1, 2, 3, 5, 6, 7]],
 [5, [2, 3, 4, 6, 7, 8]],
 [6, [3, 4, 5, 7, 8, 9]],
 [7, [4, 5, 6, 8, 9]],
 [8, [5, 6, 7, 9]],
 [9, [6, 7, 8]]]

In [0]:
def sense_vec(word, context, model):
    sense = model.disambiguate(word, context).argmax()
    sense_vec = model.sense_vector(word, sense)
    return sense_vec

In [0]:
def get_embedding_adagram(text, model, window= 3, dim=100):
    
    word2context = get_words_in_context(text, window)
    
    
    vectors = np.zeros((len(word2context), dim))
    
    for i, (word, context) in enumerate(word2context):
        
        try:
            v = sense_vec(word, context, model)
            vectors[i] = v
        
        except (KeyError, ValueError):
            continue
    
    if vectors.any():
        vector = np.average(vectors, axis=0)
    else:
        vector = np.zeros((dim))
    
    return vector

In [0]:
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import cross_val_score

In [0]:
X_text_1 = [get_embedding_adagram(text, vm) for text in data['text_1_norm']]
X_text_2 = [get_embedding_adagram(text, vm) for text in data['text_2_norm']]

X_text = np.concatenate([X_text_1, X_text_2], axis=1)



In [0]:
clf = svm.SVC(kernel='linear', C=1)

In [0]:
y = data.label.values

In [0]:
clf = svm.SVC(kernel='linear', C=1)

In [0]:
scores = cross_val_score(clf, X_text, y, cv=5, scoring='f1_macro')

In [24]:
scores

array([0.31782897, 0.34394189, 0.35456698, 0.29280354, 0.30169771])

In [25]:
scores.mean()

0.3221678174827987

## Задание 2. Реализовать алгоритм Леска и проверить его на реальном датасете

In [0]:
nltk.download('wordnet')

In [0]:
def lesk(word, sentence):
    bestsense = 0
    maxoverlap = 0
    synsets = [wn.synsets(word)[i].definition() for i in range(len(wn.synsets(word)))]

    if type(sentence) is str:
        tokens = [token.strip() for token in sentence.split()]
    elif type(sentence) is list:
        tokens = sentence
    
    for i, syns in enumerate(synsets):
        n=0
        syns_tokens = syns.split()
        for token in tokens:
            if token in syns_tokens:
                n +=1
        if n > maxoverlap:
            bestsense = i
            maxoverlap = n
    return bestsense

In [28]:
lesk('day', 'some point or period in time'.split())

1

In [0]:
corpus_wsd = []
corpus = open('corpus_wsd_50k.txt').read().split('\n\n')
for sent in corpus:
    corpus_wsd.append([s.split('\t') for s in sent.split('\n')])

In [0]:
corpus_wsd[:5]

In [0]:
corpus_short = corpus_wsd[:1000]
wordnet_list = [] 
lesk_list = [] 

for i, sent in enumerate(corpus_short):
    if sent[0][0]:
      context = []

      for w in sent:
        if '%' in w[0]:
          context.append(w[1])
          wn_var = wn.lemma_from_key(w[0]).synset()
          wordnet_list.append(wn_var)
      
      words_in_context = get_words_in_context(context)
      
      for st in words_in_context:
        i = lesk(st[0], st[1])
        lesk_var = wn.synsets(st[0])[i]
        lesk_list.append(lesk_var)

In [0]:
lists = {'wordnet': wordnet_list, 'lesk': lesk_list}
df = pd.DataFrame(data=lists)

In [0]:
match = np.zeros(df.shape[0])

In [0]:
for i, syn in enumerate(df['wordnet'].values):
  if df['lesk'][i] == syn:
    match[i] = 1
  else:
    match[i] = 0

In [54]:
df['match'] = match
df.head(15)

Unnamed: 0,wordnet,lesk,match
0,Synset('be.v.01'),Synset('beryllium.n.01'),0.0
1,Synset('bigger.s.01'),Synset('bigger.s.01'),1.0
2,Synset('fancy.a.01'),Synset('fancy.n.02'),0.0
3,Synset('truly.r.01'),Synset('truly.r.01'),1.0
4,Synset('want.v.02'),Synset('need.n.01'),0.0
5,Synset('exist.v.01'),Synset('beryllium.n.01'),0.0
6,Synset('other.a.01'),Synset('other.a.01'),1.0
7,Synset('cheap.a.01'),Synset('cheap.a.01'),1.0
8,Synset('communication.n.01'),Synset('communication.n.01'),1.0
9,Synset('technique.n.01'),Synset('technique.n.01'),1.0


In [0]:
succeed = df[df['match'] == 1].shape[0]
allv = df['match'].shape[0]
accuracy = succeed / allv

In [56]:
accuracy

0.5414615675880349