# Wordnet (Домашняя работа 4)

#### В этот раз мы работаем с WordNet, а именно: нам необходимо самим реализовать алгоритм Леска

Импортируем всё необходимое

In [245]:
from nltk.stem.snowball import EnglishStemmer
from nltk.corpus import wordnet as wn
from string import punctuation
from collections import defaultdict

import warnings
# Suppress every warning so that the notebook output stays readable.
warnings.filterwarnings('ignore')

# Single shared English Snowball stemmer, reused by the functions below.
stemmer = EnglishStemmer()

# Characters stripped from token edges: ASCII punctuation plus a few
# typographic marks (guillemets, dashes, ellipsis, curly quotes, etc.).
punct = punctuation + '«»—…“”*№–'

#### Теперь реализуем функцию, которая принимает на вход текст и для каждого слова выдаёт само слово и его контекст

In [246]:
def get_words_in_context(text, window = 3):
    """Yield every whitespace-separated token of *text* with its neighbours.

    Args:
        text: Input string; it is split on whitespace only, so punctuation
            stays attached to the tokens.
        window: Maximum number of tokens taken from each side of the target.

    Yields:
        ``(token, neighbours)`` tuples, where ``neighbours`` is a list holding
        up to ``window`` tokens before the target followed by up to ``window``
        tokens after it (the target itself is excluded).
    """
    tokens = text.split()
    for idx, token in enumerate(tokens):
        # Clamp the left edge at 0: a negative slice start would otherwise
        # count from the end of the list for tokens near the beginning.
        left = tokens[max(idx - window, 0):idx]
        right = tokens[idx + 1:idx + 1 + window]
        yield (token, left + right)

#### Давайте наберем какой-нибудь текст и проверим работу нашей функции

In [247]:
# Sample passage (three lines, punctuation left in place) used to try out
# the functions defined in this notebook.
text = '''After swearing off music due to an incident
at the middle school regional concert band competition,
euphonist Kumiko Oumae enters high school hoping for a fresh start.'''

#### Смотрим, что получается...

In [248]:
# Materialise the generator so the notebook displays every (word, context) pair.
list(get_words_in_context(text))

[('After', ['swearing', 'off', 'music']),
 ('swearing', ['After', 'off', 'music', 'due']),
 ('off', ['After', 'swearing', 'music', 'due', 'to']),
 ('music', ['After', 'swearing', 'off', 'due', 'to', 'an']),
 ('due', ['swearing', 'off', 'music', 'to', 'an', 'incident']),
 ('to', ['off', 'music', 'due', 'an', 'incident', 'at']),
 ('an', ['music', 'due', 'to', 'incident', 'at', 'the']),
 ('incident', ['due', 'to', 'an', 'at', 'the', 'middle']),
 ('at', ['to', 'an', 'incident', 'the', 'middle', 'school']),
 ('the', ['an', 'incident', 'at', 'middle', 'school', 'regional']),
 ('middle', ['incident', 'at', 'the', 'school', 'regional', 'concert']),
 ('school', ['at', 'the', 'middle', 'regional', 'concert', 'band']),
 ('regional', ['the', 'middle', 'school', 'concert', 'band', 'competition,']),
 ('concert',
  ['middle', 'school', 'regional', 'band', 'competition,', 'euphonist']),
 ('band',
  ['school', 'regional', 'concert', 'competition,', 'euphonist', 'Kumiko']),
 ('competition,',
  ['regiona

#### Неплохо, теперь можно переходить к написанию нашего алгоритма.

Замечу, что на предыдущем этапе я не чистил пунктуацию и не проводил предобработку слов, поэтому проведу это уже в самом алгоритме.

In [249]:
def lesk(word, context):
    """Choose the WordNet sense of *word* whose gloss best overlaps *context*.

    Simplified Lesk: every sense returned by ``wn.synsets`` is scored by the
    number of stems its definition shares with the context tokens, and the
    highest-scoring sense wins.

    Args:
        word: Target token; punctuation is stripped from its edges.
        context: Iterable of neighbouring tokens (raw strings).

    Returns:
        ``(word, context, (overlap, definition))`` where ``word`` is the
        stripped token, ``context`` is the set of lower-cased, stripped
        context tokens, ``overlap`` is the number of shared stems and
        ``definition`` is the gloss of the winning sense. On ties the sense
        listed first by WordNet is kept. If WordNet has no synsets for the
        word, an explanatory string is returned instead of a tuple.
    """
    word = word.strip(punct)

    synsets = wn.synsets(word)
    if not synsets:
        return 'Cannot find any definition for word "{}"'.format(word)

    context = set([w.lower().strip(punct) for w in context])
    # Punctuation-only tokens collapse to '' after stripping; leave them out
    # of the comparison so they can never count as an overlap.
    stem_context = {stemmer.stem(token) for token in context if token}

    scores = []
    for synset in synsets:
        definition = synset.definition()
        # Strip punctuation from each gloss token individually. Stripping only
        # the ends of the whole gloss would leave tokens such as "sun;" intact,
        # and those would never match a context stem.
        gloss_tokens = (token.strip(punct) for token in definition.lower().split())
        stem_def = {stemmer.stem(token) for token in gloss_tokens if token}
        scores.append((len(stem_def & stem_context), definition))

    # max() returns the first maximal item, so ties go to the earliest sense.
    best = max(scores, key=lambda score: score[0])
    return (word, context, best)
    
def get_lesk(word, context):
    """Run ``lesk`` on the arguments and render its outcome as printable text.

    Args:
        word: Target token, passed through to ``lesk``.
        context: Neighbouring tokens, passed through to ``lesk``.

    Returns:
        The string produced by ``lesk`` when it returns one, otherwise a
        multi-line report with the word, its context, the chosen definition
        and the overlap size.
    """
    outcome = lesk(word, context)

    # A plain string is already a finished message; pass it through untouched.
    if isinstance(outcome, str):
        return outcome

    template = '''
        Input word:\t{}
        Word context:\t{}
        Definition:\t{}
        Intersection:\t{}
        '''
    best = outcome[2]  # (overlap, definition) pair of the winning sense
    neighbours = ', '.join(outcome[1])
    return template.format(outcome[0], neighbours, best[1], best[0])

#### Итак, у нас есть функция *get_lesk*, которая либо выдает текст с найденной информацией, либо сообщает, что не удалось обнаружить определения для какого-то слова. Давайте опробуем, что получилось, на примере уже имеющегося текста

    Первый элемент – слово
    Второй элемент – контекст

In [250]:
# Take the (word, context) pair at index 11 and unpack it straight into the call.
print(get_lesk(*list(get_words_in_context(text))[11]))


        Input word:	school
        Word context:	band, regional, the, middle, at, concert
        Definition:	the process of being formally educated at a school
        Intersection:	2
        


In [251]:
# Disambiguate every token of the sample text and print each report.
for target, neighbours in get_words_in_context(text):
    print(get_lesk(target, list(neighbours)))


        Input word:	After
        Word context:	music, off, swearing
        Definition:	located farther aft
        Intersection:	0
        

        Input word:	swearing
        Word context:	music, after, due, off
        Definition:	profane or obscene expression usually of surprise or anger
        Intersection:	0
        

        Input word:	off
        Word context:	music, after, swearing, due, to
        Definition:	kill intentionally and with premeditation
        Intersection:	0
        

        Input word:	music
        Word context:	after, an, off, swearing, due, to
        Definition:	an artistic form of auditory communication incorporating instrumental or vocal tones in a structured and continuous manner
        Intersection:	1
        

        Input word:	due
        Word context:	music, an, off, swearing, incident, to
        Definition:	scheduled to arrive
        Intersection:	1
        
Cannot find any definition for word "to"

        Input word:	an
        Word 

Можно заметить, что очень часто определяющими становятся стоп-слова. Давайте попробуем убрать их и повторить тест.

In [257]:
from stop_words import get_stop_words

# English stop-word list obtained from the stop_words package.
stops = get_stop_words('en')

def lesk_stops(word, context):
    """Variant of ``lesk`` that ignores stop words when scoring the overlap.

    Every sense returned by ``wn.synsets`` is scored by the number of stems
    its definition shares with the non-stop-word context tokens.

    Args:
        word: Target token; punctuation is stripped from its edges.
        context: Iterable of neighbouring tokens (raw strings).

    Returns:
        ``(word, context, (overlap, definition))`` where ``word`` is the
        stripped token, ``context`` is the set of lower-cased, stripped
        context tokens (stop words are still included in this returned set),
        ``overlap`` is the number of shared non-stop-word stems and
        ``definition`` is the gloss of the winning sense. On ties the sense
        listed first by WordNet is kept. If WordNet has no synsets for the
        word, an explanatory string is returned instead of a tuple.
    """
    word = word.strip(punct)

    synsets = wn.synsets(word)
    if not synsets:
        return 'Cannot find any definition for word "{}"'.format(word)

    context = set([w.lower().strip(punct) for w in context])

    # Build the lookup set once instead of once per sense.
    stop_set = set(stops)
    # Drop stop words by surface form BEFORE stemming: some stop words change
    # when stemmed (e.g. "very" -> "veri") and would slip past a filter that
    # compares stems against the unstemmed stop list. Empty strings left by
    # punctuation-only tokens are skipped as well.
    content_words = (token for token in context if token and token not in stop_set)
    # Stems that coincide with a stop word are still removed afterwards.
    stem_context = {stemmer.stem(token) for token in content_words} - stop_set

    scores = []
    for synset in synsets:
        definition = synset.definition()
        # Strip punctuation from each gloss token individually. Stripping only
        # the ends of the whole gloss would leave tokens such as "sun;" intact,
        # and those would never match a context stem.
        gloss_tokens = (token.strip(punct) for token in definition.lower().split())
        stem_def = {stemmer.stem(token) for token in gloss_tokens if token}
        scores.append((len(stem_def & stem_context), definition))

    # max() returns the first maximal item, so ties go to the earliest sense.
    best = max(scores, key=lambda score: score[0])
    return (word, context, best)
    
def get_lesk_stops(word, context):
    """Run ``lesk_stops`` on the arguments and render its outcome as text.

    Args:
        word: Target token, passed through to ``lesk_stops``.
        context: Neighbouring tokens, passed through to ``lesk_stops``.

    Returns:
        The string produced by ``lesk_stops`` when it returns one, otherwise
        a multi-line report with the word, its context, the chosen definition
        and the overlap size.
    """
    outcome = lesk_stops(word, context)

    # A plain string is already a finished message; pass it through untouched.
    if isinstance(outcome, str):
        return outcome

    template = '''
        Input word:\t{}
        Word context:\t{}
        Definition:\t{}
        Intersection:\t{}
        '''
    best = outcome[2]  # (overlap, definition) pair of the winning sense
    neighbours = ', '.join(outcome[1])
    return template.format(outcome[0], neighbours, best[1], best[0])

In [258]:
# Take the (word, context) pair at index 11 and unpack it straight into the call.
print(get_lesk_stops(*list(get_words_in_context(text))[11]))


        Input word:	school
        Word context:	band, regional, the, middle, at, concert
        Definition:	an educational institution
        Intersection:	0
        


#### Можно заметить, что результат изменился... И, возможно, даже в лучшую сторону.

Повторим тест с целым текстом, но возьмем текст поменьше (чтобы сократить аутпут).

In [254]:
# Shorter sample sentence, used for the side-by-side comparison below.
new_text = 'The world rarely sees so much talent wrapped into one person'

In [255]:
def compareit(word, context):
    """Report the senses chosen by ``lesk`` and ``lesk_stops`` side by side.

    Args:
        word: Target token, passed to both functions.
        context: Neighbouring tokens, passed to both functions.

    Returns:
        The result of ``lesk`` unchanged when either function returned a
        string; otherwise a multi-line report with the word, its context and
        the definition/overlap pair picked by each function.
    """
    plain = lesk(word, context)
    filtered = lesk_stops(word, context)

    # If either result is a string there is nothing to compare; hand back
    # the result of the plain variant as-is.
    if isinstance(plain, str) or isinstance(filtered, str):
        return plain

    template = '''
        Input word:\t\t{}
        Word context:\t\t{}
        Definition:\t\t{}
        Intersection:\t\t{}
        Definition (stops):\t{}
        Intersection (stops):\t{}
        '''
    plain_best = plain[2]        # (overlap, definition) from lesk
    filtered_best = filtered[2]  # (overlap, definition) from lesk_stops
    return template.format(
        plain[0],
        ', '.join(plain[1]),
        plain_best[1],
        plain_best[0],
        filtered_best[1],
        filtered_best[0],
    )

In [256]:
# Compare both variants for every token of the shorter sentence.
for target, neighbours in get_words_in_context(new_text):
    print(compareit(target, list(neighbours)))

Cannot find any definition for word "The"

        Input word:		world
        Word context:		the, rarely, sees, so
        Definition:		the 3rd planet from the sun; the planet we live on
        Intersection:		1
        Definition (stops):	everything that exists anywhere
        Intersection (stops):	0
        

        Input word:		rarely
        Word context:		sees, the, so, much, world
        Definition:		not often
        Intersection:		0
        Definition (stops):	not often
        Intersection (stops):	0
        

        Input word:		sees
        Word context:		the, rarely, so, much, talent, world
        Definition:		the seat within a bishop's diocese where his cathedral is located
        Intersection:		1
        Definition (stops):	the seat within a bishop's diocese where his cathedral is located
        Intersection (stops):	0
        

        Input word:		so
        Word context:		wrapped, sees, rarely, much, talent, world
        Definition:		the syllable naming the fif

## P.S.

#### Получилась вот такая интересная работа, простите за какие-то ошибки, увлекся и заканчиваю очень поздно)