In [23]:
import nltk
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from string import punctuation
import random

punct = punctuation+'«»—…“”*№–'

[nltk_data] Downloading package wordnet to /Users/vera/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Посмотрим, что вообще может значить break:

In [11]:
word = 'break'
for num, synset in enumerate(wn.synsets(word)):
    print(num+1, word + ' - ' + synset.definition())

1 break - some abrupt occurrence that interrupts an ongoing activity
2 break - an unexpected piece of good luck
3 break - (geology) a crack in the earth's crust resulting from the displacement of one side with respect to the other
4 break - a personal or social separation (as between opposing factions)
5 break - a pause from doing something (as work)
6 break - the act of breaking something
7 break - a time interval during which there is a temporary cessation of something
8 break - breaking of hard tissue such as bone
9 break - the occurrence of breaking
10 break - an abrupt change in the tone or register of the voice (as at puberty or due to emotion)
11 break - the opening shot that scatters the balls in billiards or pool
12 break - (tennis) a score consisting of winning a game when your opponent was serving
13 break - an act of delaying or interrupting the continuity
14 break - a sudden dash
15 break - any frame in which a bowler fails to make a strike or spare
16 break - an escape fr

Мда, вот это радость лексикографа...

Вспомним алгоритм Леска.

In [44]:
def get_words_in_context(words, window):
    words_in_context = []
    for i in range(len(words)):
        left = words[max(0,i-window):i]
        right = words[i+1:i+window+1]
        target = words[i]
        words_in_context.append((target, left+right))

    return words_in_context

def lesk(word, sentence):
    bestsense = 0
    maxoverlap = 0
    for i, synset in enumerate(wn.synsets(word)):
        definition = tokenize(synset.definition())
        overlap = len(set(definition) & set(sentence))
        if overlap > maxoverlap:
            maxoverlap = overlap
            bestsense = i   
    
    return bestsense

И посмотрим на данные:

In [12]:
corpus = open('corpus_eng.txt', encoding = 'utf-8').readlines()

def tokenize(text):
    words = [word.strip(punct) for word in text.lower().split() if word]
    
    return words

tokenized_corpus = [tokenize(line) for line in corpus]


In [17]:
len(tokenized_corpus)
tokenized_corpus[:5]


[['states',
  'to',
  'watch',
  'on',
  'election',
  'day',
  '11/7/2016',
  '6:00am',
  'trump',
  'and',
  'clinton',
  'will',
  'put',
  'their',
  'election',
  'strategies',
  'to',
  'the',
  'test',
  'on',
  'tuesday',
  "wsj's",
  'gerald',
  'f',
  'seib',
  'discusses',
  'which',
  'states',
  'to',
  'watch',
  'as',
  'the',
  'polls',
  'close',
  'and',
  'which',
  'states',
  'both',
  'candidates',
  'need',
  'to',
  'win',
  'in',
  'order',
  'to',
  'claim',
  'victory',
  'photo',
  'ap',
  'transcript'],
 ['this',
  'transcript',
  'has',
  'been',
  'automatically',
  'generated',
  'and',
  'may',
  'not',
  'be',
  '100',
  'accurate',
  '',
  "it's",
  'been',
  'a',
  'long',
  'road',
  'but',
  'we',
  'finally',
  'arrived',
  'at',
  'election',
  'week',
  '',
  "tuesday's",
  'election',
  'day',
  '',
  'and',
  'the',
  'question',
  'is',
  "what's",
  'happened',
  'in',
  'this',
  'race',
  'over',
  'the',
  'last',
  'few',
  'weeks',
  'w

Найдём предложения со словом break.

In [18]:
sents_with_break = [line for line in tokenized_corpus if 'break' in line]

In [21]:
len(sents_with_break)
sents_with_break[:5]

[['an',
  'indian',
  'protest',
  'for',
  'everyone',
  'by',
  'david',
  'treuer',
  'photo',
  'demonstrators',
  'near',
  'the',
  'standing',
  'rock',
  'reservation',
  'protesting',
  'the',
  'dakota',
  'access',
  'pipeline',
  'credit',
  'stephanie',
  'keith/reuters',
  'every',
  'protest',
  'contains',
  'a',
  'contradiction',
  'people',
  'stand',
  'up',
  '',
  'through',
  'speech',
  'demonstration',
  'violent',
  'or',
  'nonviolent',
  'action',
  '',
  'and',
  'urge',
  'the',
  'state',
  'to',
  'change',
  'they',
  'break',
  'the',
  'rules',
  'in',
  'order',
  'to',
  'convince',
  'the',
  'rule-makers',
  'that',
  'they',
  'need',
  'to',
  'change',
  'the',
  'rules',
  'which',
  'is',
  'itself',
  'a',
  'kind',
  'of',
  'state-approved',
  'process',
  'however',
  'at',
  'standing',
  'rock',
  'in',
  'north',
  'dakota',
  'indians',
  'from',
  'all',
  'over',
  'north',
  'america',
  'have',
  'been',
  'protesting',
  'for',
 

sample_sents_with_break = random.sample(sents_with_break, 10)

In [38]:
for num, sent in enumerate(sample_sents_with_break):
    print(num + 1, ' '.join(sent), '\n')

1 bayern munich suffered a 3-2 loss against russia's rostov in the champions league on wednesday afp photo/kirill kudryavstev more berlin afp  faltering bayern munich hope to break their second mini winless streak this season when they host bayer leverkusen on saturday to launch their counter-attack on top spot in germany 

2 a few hours after taking a break from promotional duties with iris redmayne chatted in a downtown manhattan hotel about his headlong dive into rowling's empire the film's multicultural message and just how many movies he's gotten himself into 

3 when the san francisco giants faltered after entering the all-star break with the majors’ best record the dodgers pounced and overtook them despite kershaw’s absence by the time the three-time cy young award winner returned sept 9 los angeles was ahead by five games and well on the way to a 91-71 record our picks for mlb mvp cy young rookie of the year and manager of the year 

4 the visitors however with 211 runs needed 

Выпишем значения слова break для каждого предложения:
1. Прервать, прекратить, сломать (метафорично) - 52
2. Перерыв - 4
3. Матч всех звёзд (пришлось гуглить статью, чтобы это понять). - такого вообще нет в предложенных значениях
4. Перерыв - 4, 7
5. Перерыв - 4, 7
6. Выиграть гейм в теннисе после того, как проиграл предыдущий. - 12
7. Быстрый брейк (?) - баскетбольный термин.
8. Опять что-то из бакскетбола, но возможно и перерыв (?) - 4, 7
9. прервать - 1, 13, 17
10. перемена в школе - 5, 7

Посмотрим, что нам выдаст алгоритм Леска (может из него лексикографи получше, чем из меня).

In [74]:
def define_a_word(sample_sents_with_break, window):
    for num, sent in enumerate(sample_sents_with_break):
        contexts = get_words_in_context(sent, window)
        for word, context in contexts:
            if word == "break":
                def_number = lesk(word, context)
                print(num + 1, def_number + 1, wn.synsets(word)[def_number].definition())
                
                

In [75]:
define_a_word(sample_sents_with_break, 5)

1 3 (geology) a crack in the earth's crust resulting from the displacement of one side with respect to the other
2 3 (geology) a crack in the earth's crust resulting from the displacement of one side with respect to the other
3 3 (geology) a crack in the earth's crust resulting from the displacement of one side with respect to the other
4 3 (geology) a crack in the earth's crust resulting from the displacement of one side with respect to the other
5 10 an abrupt change in the tone or register of the voice (as at puberty or due to emotion)
6 3 (geology) a crack in the earth's crust resulting from the displacement of one side with respect to the other
7 70 become fractured; break or crack on the surface only
7 3 (geology) a crack in the earth's crust resulting from the displacement of one side with respect to the other
7 70 become fractured; break or crack on the surface only
8 27 enter someone's (virtual or real) property in an unauthorized manner, usually with the intent to steal or co

## Изменим размер окна

Мда. Ничего не совпало. Попробуем изменить размер окна на 3.

In [76]:
define_a_word(sample_sents_with_break, 3)

1 3 (geology) a crack in the earth's crust resulting from the displacement of one side with respect to the other
2 3 (geology) a crack in the earth's crust resulting from the displacement of one side with respect to the other
3 3 (geology) a crack in the earth's crust resulting from the displacement of one side with respect to the other
4 3 (geology) a crack in the earth's crust resulting from the displacement of one side with respect to the other
5 10 an abrupt change in the tone or register of the voice (as at puberty or due to emotion)
6 3 (geology) a crack in the earth's crust resulting from the displacement of one side with respect to the other
7 3 (geology) a crack in the earth's crust resulting from the displacement of one side with respect to the other
7 3 (geology) a crack in the earth's crust resulting from the displacement of one side with respect to the other
7 70 become fractured; break or crack on the surface only
8 70 become fractured; break or crack on the surface only


In [83]:
define_a_word(sample_sents_with_break, 3)

1 3 (geology) a crack in the earth's crust resulting from the displacement of one side with respect to the other
2 3 (geology) a crack in the earth's crust resulting from the displacement of one side with respect to the other
3 3 (geology) a crack in the earth's crust resulting from the displacement of one side with respect to the other
4 3 (geology) a crack in the earth's crust resulting from the displacement of one side with respect to the other
5 10 an abrupt change in the tone or register of the voice (as at puberty or due to emotion)
6 3 (geology) a crack in the earth's crust resulting from the displacement of one side with respect to the other
7 3 (geology) a crack in the earth's crust resulting from the displacement of one side with respect to the other
7 3 (geology) a crack in the earth's crust resulting from the displacement of one side with respect to the other
7 70 become fractured; break or crack on the surface only
8 70 become fractured; break or crack on the surface only


Значение 7 тоже подходит под описание перемены, поэтому зачтём его. Остальные всё так же неверны (откуда столько геологии?) С другими значениями окна мы возвращаемся к предыдущим предложенным значениям.

## Берем примеры, а не значения

In [90]:
def get_words_in_context(words, window):
    words_in_context = []
    for i in range(len(words)):
        left = words[max(0,i-window):i]
        right = words[i+1:i+window+1]
        target = words[i]
        words_in_context.append((target, left+right))

    return words_in_context

def lesk(word, sentence):
    bestsense = 0
    maxoverlap = 0
    for i, synset in enumerate(wn.synsets(word)):
        example = tokenize(' '.join(synset.examples()))
        overlap = len(set(example) & set(sentence))
        if overlap > maxoverlap:
            maxoverlap = overlap
            bestsense = i   
    
    return bestsense

In [91]:
define_a_word(sample_sents_with_break, 3)

1 4 a personal or social separation (as between opposing factions)
2 47 discontinue an association or relation; go different ways
3 34 interrupt a continued activity
4 33 stop operating or functioning
5 31 make known to the public information that was previously known only to a few people or that was meant to be kept a secret
6 4 a personal or social separation (as between opposing factions)
7 1 some abrupt occurrence that interrupts an ongoing activity
7 1 some abrupt occurrence that interrupts an ongoing activity
7 1 some abrupt occurrence that interrupts an ongoing activity
8 33 stop operating or functioning
9 33 stop operating or functioning
10 27 enter someone's (virtual or real) property in an unauthorized manner, usually with the intent to steal or commit a violent act


Тут выдача поинтереснее, хотя всё равно ничего не совпало. Но в принципе некоторые значения с большой натяжкой можно и отнести к верным (4, 8, 9)

## Берём и примеры, и значения

In [92]:
def lesk(word, sentence):
    bestsense = 0
    maxoverlap = 0
    
    for i, synset in enumerate(wn.synsets(word)):
        def_and_examples = []
        definition = tokenize(synset.definition())
        def_and_examples.extend(definition)
        for example in synset.examples():
            def_and_examples.extend(tokenize(' '.join(synset.examples())))
        overlap = len(set(def_and_examples) & set(sentence))
        if overlap > maxoverlap:
            maxoverlap = overlap
            bestsense = i
    
    return bestsense

In [94]:
define_a_word(sample_sents_with_break, 5)

1 3 (geology) a crack in the earth's crust resulting from the displacement of one side with respect to the other
2 3 (geology) a crack in the earth's crust resulting from the displacement of one side with respect to the other
3 3 (geology) a crack in the earth's crust resulting from the displacement of one side with respect to the other
4 27 enter someone's (virtual or real) property in an unauthorized manner, usually with the intent to steal or commit a violent act
5 8 breaking of hard tissue such as bone
6 21 destroy the integrity of; usually by force; cause to separate into pieces or fragments
7 12 (tennis) a score consisting of winning a game when your opponent was serving
7 1 some abrupt occurrence that interrupts an ongoing activity
7 1 some abrupt occurrence that interrupts an ongoing activity
8 27 enter someone's (virtual or real) property in an unauthorized manner, usually with the intent to steal or commit a violent act
9 33 stop operating or functioning
10 27 enter someone's

Стало будто бы хуже. Хотя сейчас появилось значение из тенниса, но оно приписалось примеру из баскетбола. Попробуем и тут поиграть с размером окна.

In [95]:
define_a_word(sample_sents_with_break, 3)

1 3 (geology) a crack in the earth's crust resulting from the displacement of one side with respect to the other
2 3 (geology) a crack in the earth's crust resulting from the displacement of one side with respect to the other
3 3 (geology) a crack in the earth's crust resulting from the displacement of one side with respect to the other
4 27 enter someone's (virtual or real) property in an unauthorized manner, usually with the intent to steal or commit a violent act
5 10 an abrupt change in the tone or register of the voice (as at puberty or due to emotion)
6 3 (geology) a crack in the earth's crust resulting from the displacement of one side with respect to the other
7 1 some abrupt occurrence that interrupts an ongoing activity
7 1 some abrupt occurrence that interrupts an ongoing activity
7 1 some abrupt occurrence that interrupts an ongoing activity
8 27 enter someone's (virtual or real) property in an unauthorized manner, usually with the intent to steal or commit a violent act
9 

In [96]:
define_a_word(sample_sents_with_break, 10)

1 3 (geology) a crack in the earth's crust resulting from the displacement of one side with respect to the other
2 3 (geology) a crack in the earth's crust resulting from the displacement of one side with respect to the other
3 27 enter someone's (virtual or real) property in an unauthorized manner, usually with the intent to steal or commit a violent act
4 27 enter someone's (virtual or real) property in an unauthorized manner, usually with the intent to steal or commit a violent act
5 8 breaking of hard tissue such as bone
6 21 destroy the integrity of; usually by force; cause to separate into pieces or fragments
7 12 (tennis) a score consisting of winning a game when your opponent was serving
7 22 act in disregard of laws, rules, contracts, or promises
7 1 some abrupt occurrence that interrupts an ongoing activity
8 27 enter someone's (virtual or real) property in an unauthorized manner, usually with the intent to steal or commit a violent act
9 33 stop operating or functioning
10 2

Не помогло.

## А зачем нам контекст?

In [98]:
def define_a_word_without_a_context(sample_sents_with_break, window):
    for num, sent in enumerate(sample_sents_with_break):
        for word in sent:
            if word == "break":
                def_number = lesk(word, sent)
                print(num + 1, def_number + 1, wn.synsets(word)[def_number].definition())

In [105]:
define_a_word_without_a_context(sample_sents_with_break, 10)

1 27 enter someone's (virtual or real) property in an unauthorized manner, usually with the intent to steal or commit a violent act
2 27 enter someone's (virtual or real) property in an unauthorized manner, usually with the intent to steal or commit a violent act
3 8 breaking of hard tissue such as bone
4 3 (geology) a crack in the earth's crust resulting from the displacement of one side with respect to the other
5 3 (geology) a crack in the earth's crust resulting from the displacement of one side with respect to the other
6 8 breaking of hard tissue such as bone
7 22 act in disregard of laws, rules, contracts, or promises
7 22 act in disregard of laws, rules, contracts, or promises
7 22 act in disregard of laws, rules, contracts, or promises
8 27 enter someone's (virtual or real) property in an unauthorized manner, usually with the intent to steal or commit a violent act
9 15 any frame in which a bowler fails to make a strike or spare
10 27 enter someone's (virtual or real) property

In [None]:
Видимо, контекст нам все-таки нужен, и идея брать все предложение не так уж хороша.

## NLTK

In [109]:
from nltk.wsd import lesk
for num, sent in enumerate(sample_sents_with_break):
    definition = lesk(sent, 'break').definition()
    print(definition, sep='\n')

(geology) a crack in the earth's crust resulting from the displacement of one side with respect to the other
(geology) a crack in the earth's crust resulting from the displacement of one side with respect to the other
(geology) a crack in the earth's crust resulting from the displacement of one side with respect to the other
(geology) a crack in the earth's crust resulting from the displacement of one side with respect to the other
enter someone's (virtual or real) property in an unauthorized manner, usually with the intent to steal or commit a violent act
(geology) a crack in the earth's crust resulting from the displacement of one side with respect to the other
(geology) a crack in the earth's crust resulting from the displacement of one side with respect to the other
(geology) a crack in the earth's crust resulting from the displacement of one side with respect to the other
any frame in which a bowler fails to make a strike or spare
(geology) a crack in the earth's crust resulting f

In [None]:
Окей, все стало только хуже.