In [7]:
# importing the required packages
import pprint

import numpy as np
import nltk
from nltk.corpus import wordnet
# nltk.download('wordnet')
from nltk.corpus import stopwords
# nltk.download('stopwords')

## We now define the `tokenize` function, which takes a string and returns its set of alphabetic tokens with the stopwords and the target word removed.

In [8]:
# English stop words from the NLTK corpus, kept as a set for membership tests in tokenize
stopwords_en = {*stopwords.words('english')}


def tokenize(document: str, word: str) -> set:
    """Return the set of content tokens found in ``document``.

    The text is split into runs of word characters; a token is kept only when
    it is purely alphabetic, is not an English stop word and is not ``word``
    itself. All comparisons are exact, so they are case-sensitive.

    :param document: the text to tokenize
    :param word: the target word, excluded from the result
    :returns: the set of remaining tokens
    """
    raw_tokens = nltk.RegexpTokenizer(r'\w+').tokenize(document)

    kept = set()
    for candidate in raw_tokens:
        # dropping the target word and the stop words
        if candidate == word or candidate in stopwords_en:
            continue
        # dropping tokens that contain digits or underscores
        if candidate.isalpha():
            kept.add(candidate)
    return kept

### We now define the simple LESK algorithm, which takes the gloss and the word and returns the disambiguated sense from the WordNet corpus together with the weight computed for each candidate sense.

In [9]:
def simple_lesk(gloss: str, word: str):
    """Disambiguate ``word`` in the context ``gloss`` with the Simple LESK algorithm.

    Every WordNet sense of ``word`` is scored by the tokens that its definition
    and examples share with the gloss. Each shared token contributes a smoothed
    inverse document frequency, ``log((1 + N) / (1 + df)) + 1``, where ``N`` is
    the number of senses and ``df`` is the number of senses whose signature
    contains the token. Tokens specific to few senses therefore weigh more than
    tokens common to many, and every match contributes a positive amount.

    :param gloss: the context text in which ``word`` occurs
    :param word: the word to disambiguate
    :returns: a tuple ``(sense, weights)``: the highest-scoring synset (the
        first one on ties) and the list of scores, one per synset in the order
        returned by ``wordnet.synsets``
    :raises ValueError: if WordNet has no sense for ``word``
    """
    # matching is case-insensitive: gloss, word and every sense text are lowercased
    gloss = gloss.lower()
    word = word.lower()

    synsets = wordnet.synsets(word)
    if not synsets:
        # without this guard max() below fails with an unhelpful "empty sequence" message
        raise ValueError(f"no WordNet senses found for {word!r}")

    # obtaining tokens from the gloss
    gloss_tokens = tokenize(gloss, word)

    # building one token signature per sense from its definition and its examples
    signatures = []
    for sense in synsets:
        signature = set(tokenize(sense.definition().lower(), word))
        for example in sense.examples():
            signature.update(tokenize(example.lower(), word))
        signatures.append(signature)

    # document frequency: in how many sense signatures each gloss token occurs
    # (whole-token membership, so "is" no longer counts as a hit inside "island")
    N_t = len(synsets)
    N_w = {
        token: sum(token in signature for signature in signatures)
        for token in gloss_tokens
    }

    # scoring every sense by the smoothed IDF of the tokens it shares with the gloss
    weights = [0] * N_t
    for index, signature in enumerate(signatures):
        for token in gloss_tokens:
            if token in signature:
                weights[index] += np.log((1 + N_t) / (1 + N_w[token])) + 1

    # list.index returns the first maximum, so ties resolve to the earliest sense
    index = weights.index(max(weights))
    return synsets[index], weights

In [11]:

# Interactive demo: read the context sentence and the target word from the user,
# run simple_lesk on them, then print the chosen sense's definition and the
# per-sense weight vector.
gloss = input("Please enter gloss: ")
word = input("Please enter disambiguated word :")
# simple_lesk returns the selected synset together with the list of weights
sense, weights = simple_lesk(gloss, word)
print('The disambiguated meaning is:', sense.definition())
print('The weight vector is:', weights)

Please enter gloss: I want to visit java
Please enter disambiguated word :java
The disambiguated meaning is: an island in Indonesia to the south of Borneo; one of the world's most densely populated regions
The weight vector is: [0, 0, 0]


{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r