In [0]:
#!pip install pymorphy2

In [0]:
#!pip install Cython numpy
#!pip install git+https://github.com/lopuhin/python-adagram.git

In [0]:
import nltk
nltk.download('stopwords')

In [0]:
import adagram
from lxml import html
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from pymorphy2 import MorphAnalyzer
from string import punctuation
import json, os
from collections import Counter
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import warnings
import zipfile
warnings.filterwarnings('ignore')
morph = MorphAnalyzer()
punct = punctuation+'«»—…“”*№–'
stops = set(stopwords.words('russian'))
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

def tokenize(text):
    
    words = [word.strip(punct) for word in text.lower().split() if word and word not in stops]
    words = [word for word in words if word]

    return words

def normalize(text):
    
    words = tokenize(text)
    words = [morph.parse(word)[0].normal_form for word in words if word]

    return words

In [0]:
from google.colab import files
zip_file = files.upload()

In [0]:
zipcorpus = zipfile.ZipFile('paraphraser.zip')
zipcorpus.extractall()
zipcorpus.close

##Адаграм

In [0]:
corpus_xml = html.fromstring(open('paraphrases.xml', 'rb').read())
texts_1 = []
texts_2 = []
classes = []

for p in corpus_xml.xpath('//paraphrase'):
    texts_1.append(p.xpath('./value[@name="text_1"]/text()')[0])
    texts_2.append(p.xpath('./value[@name="text_2"]/text()')[0])
    classes.append(p.xpath('./value[@name="class"]/text()')[0])
    
data = pd.DataFrame({'text_1':texts_1, 'text_2':texts_2, 'label':classes})

In [0]:
data['text_1_norm'] = data['text_1'].apply(normalize)
data['text_2_norm'] = data['text_2'].apply(normalize)

In [0]:
#!wget 'https://s3.amazonaws.com/kostia.lopuhin/all.a010.p10.d300.w5.m100.nonorm.slim.joblib'

In [0]:
vm = adagram.VectorModel.load('all.a010.p10.d300.w5.m100.nonorm.slim.joblib')

Получение целевых слов и контекста с заданным окном (по умолчанию окно -3)

In [0]:
words = [0,1,2,3,4,5,6,7,8,9]
def get_words_in_context(words, window=3):
    words_in_context = []
    for i in range(len(words)):
      word = words[i]
      before_ind = max(0, i - window) 
      after_ind = min(len(words), i + window + 1)
      before_word = words[before_ind:i]
      after_word = words[i+1:after_ind]
      context = before_word + after_word
      words_in_context += [[word, context]]
    return words_in_context

In [117]:
get_words_in_context(words)

[[0, [1, 2, 3]],
 [1, [0, 2, 3, 4]],
 [2, [0, 1, 3, 4, 5]],
 [3, [0, 1, 2, 4, 5, 6]],
 [4, [1, 2, 3, 5, 6, 7]],
 [5, [2, 3, 4, 6, 7, 8]],
 [6, [3, 4, 5, 7, 8, 9]],
 [7, [4, 5, 6, 8, 9]],
 [8, [5, 6, 7, 9]],
 [9, [6, 7, 8]]]

Векторизуем тексты с помощью Адаграма

In [0]:
def get_embedding_ad(model, text, dim=300, window=3):
  words_in_context = get_words_in_context(text, window)
  vectors = np.zeros((len(words_in_context), dim))
  for i,[word, context] in enumerate(words_in_context):
    try:
      main_sense = model.disambiguate(word, context).argmax()
      v = model.sense_vector(word, main_sense)
      vectors[i] = v
    except (KeyError, ValueError):
      #print('Что-то пошло не так')
      continue
  if vectors.any():
    vector = np.average(vectors, axis=0)
  else:
    vector = np.zeros((dim))
  return vector

In [0]:
get_embedding_ad(vm, ['ломать', 'не', 'строить'])

In [0]:
text_1_ad = [get_embedding_ad(vm, text) for text in data['text_1_norm']]
text_2_ad = [get_embedding_ad(vm, text) for text in data['text_2_norm']]

In [0]:
X_text = np.concatenate([text_1_ad, text_2_ad], axis=1)
y = data.label.values
clf = LogisticRegression()

In [152]:
np.mean(cross_val_score(clf, X_text, y, scoring='f1_micro', cv=5))

0.42451817925119767

##Алгоритм Леска


In [0]:
import nltk
nltk.download('wordnet')

Функция для алгоритма Леска

In [0]:
def lesk(word, sentence):
    norm_sent = set(sentence)
    synsets =  wn.synsets(word)
    intersections = {}
    for i, syns in enumerate(synsets):
      definition = normalize(syns.definition())
      intersections[i] = len(set(definition).intersection(norm_sent))
    bestsence = max(intersections, key = intersections.get )
    return bestsence

In [326]:
lesk('day', 'a day assigned to a particular purpose or observance'.split())

2

In [260]:
wn.synsets('day')[2].definition()

'a day assigned to a particular purpose or observance'

In [0]:
#corpus_file = files.upload()

In [0]:
corpus_wsd = []
corpus = open('corpus_wsd_50k.txt').read().split('\n\n')
for sent in corpus:
    corpus_wsd.append([s.split('\t') for s in sent.split('\n')])

Функция вычисления точности работы алгоритма Леска

In [0]:
def lesk_acc(corp):
  correct = 0
  total = 0
  for sentence in corp:
    norm_sentence = [word[1] for word in sentence]
    for word in sentence:
      if word[0]: # только многозначные слова
        t_word =  word[1] # target word
        if wn.synsets(t_word)[lesk(t_word, norm_sentence)] == wn.lemma_from_key(word[0]).synset():
          correct += 1
        total += 1
  acc = correct / total
  return acc

In [313]:
%%time
print('Accuracy =', lesk_acc(corpus_wsd[:10000]))

Accuracy = 0.3677168706758914
CPU times: user 2min 54s, sys: 3.51 s, total: 2min 58s
Wall time: 2min 58s


Поскольку рассматривались только многозначные слова(от 2 и более значений) - вероятность угадывания должна быть меньше, чем 0.5. Соответственно и результат скорее всего лучше случайного. Для проверки можно написать похожую функцию, которая будет предлагать случайное значение слова.

In [0]:
def random_acc(corp):
  correct = 0
  total = 0
  for sentence in corp:
    norm_sentence = [word[1] for word in sentence]
    for word in sentence:
      if word[0]: # только многозначные слова
        t_word =  word[1] # target word
        if wn.synsets(t_word)[np.random.choice(len(wn.synsets(t_word)))] == wn.lemma_from_key(word[0]).synset():
          correct += 1
        total += 1
  acc = correct / total
  return acc

In [335]:
%%time
print('Random accuracy =', random_acc(corpus_wsd[:10000]))

Random accuracy = 0.30203299627461416
CPU times: user 29 s, sys: 3.14 s, total: 32.1 s
Wall time: 32.2 s


Таким образом, можно констатировать, что точность для алгоритма Леска лучше, чем случайный подбор возможного значения.