In [None]:
!pip install transformers[sentencepiece]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import nltk
import string
import re
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Text2TextGenerationPipeline
import sentencepiece

# Fetch the NLTK resources this notebook relies on:
#   - 'wordnet'   : backs `wn.synsets(...)` and the WordNetLemmatizer
#   - 'omw-1.4'   : NOTE(review): presumably required by the WordNet reader
#                   in this NLTK version - confirm
#   - 'stopwords' : backs `nltk.corpus.stopwords.words('english')`
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# English stop words from the NLTK corpus; `remove_stopwords` filters tokens
# against this module-level collection.
stopwords = nltk.corpus.stopwords.words('english')

In [None]:
def remove_punctuation(text):
    """Return `text` with every character found in `string.punctuation` removed.

    Args:
        text: The string to clean.

    Returns:
        A new string made of the remaining characters, in their original order.
    """
    kept_chars = []
    for char in text:
        if char in string.punctuation:
            # Drop ASCII punctuation; everything else is kept verbatim.
            continue
        kept_chars.append(char)
    return "".join(kept_chars)

def turn_to_lowercase(text):
  """Return a lower-cased copy of `text` (delegates to `str.lower`)."""
  lowered = text.lower()
  return lowered

wordnet_lemmatizer = WordNetLemmatizer()
def lemmatizer(text):
  """Lemmatize each space-separated word of a string.

  Args:
    text: Input string whose words are separated by single spaces.

  Returns:
    A string in which every word that is not a stop word has been passed
    through `wordnet_lemmatizer.lemmatize` (called without a part-of-speech
    argument), with the words re-joined by single spaces.
  """
  # Work on words, not characters: iterating over the raw string would hand
  # the lemmatizer one character at a time, making this step a de-facto no-op.
  #
  # Stop words are passed through untouched. The lemmatizer is called without
  # a part-of-speech tag, so it may rewrite short function words (NLTK is
  # known to map e.g. 'was' to 'wa'); `remove_stopwords` would then no longer
  # recognise them when it is applied after this step.
  lemmas = [
      word if word in stopwords else wordnet_lemmatizer.lemmatize(word)
      for word in text.split(" ")
  ]
  return " ".join(lemmas)

def tokenization(text):
    """Split `text` into a list of word tokens.

    Splits on any run of whitespace, so repeated spaces, tabs, newlines and
    leading/trailing whitespace never produce empty-string tokens.

    Args:
        text: The string to tokenize.

    Returns:
        A list of non-empty tokens; an empty or whitespace-only string
        yields an empty list.
    """
    # `str.split()` without an argument collapses whitespace runs, whereas
    # `split(" ")` would emit '' for every extra space.
    return text.split()

def remove_stopwords(wordList):
    """Filter out stop words from a sequence of tokens.

    Args:
        wordList: Iterable of tokens.

    Returns:
        A new list with the tokens that are not contained in the
        module-level `stopwords` collection, in their original order.
    """
    kept = []
    for token in wordList:
        if token in stopwords:
            continue
        kept.append(token)
    return kept

In [None]:
# Sample input used by the preprocessing and disambiguation cells that follow.
sentence = "I would avoid the sushi if I was you. It is a little fishy."

In [None]:
# Preprocessing pipeline, one named step at a time:
# strip punctuation -> lowercase -> lemmatize -> tokenize -> drop stop words.
s_no_punct = remove_punctuation(sentence)
s_lower = turn_to_lowercase(s_no_punct)
s_lemmatized = lemmatizer(s_lower)
s_tokens = tokenization(s_lemmatized)
s_transformed = remove_stopwords(s_tokens)
s_transformed

['would', 'avoid', 'sushi', 'little', 'fishy']

In [None]:
# Keep only the words that WordNet lists with more than one synset,
# i.e. the candidates for disambiguation.
matches = [token for token in s_transformed if len(wn.synsets(token)) > 1]
matches

['avoid', 'little', 'fishy']

In [None]:
# List every WordNet synset, with its definition, for each ambiguous word.
for word in matches:
  print(word.capitalize())
  for synset in wn.synsets(word):
    print(synset, ": ", synset.definition())
  print("\n")

Avoid
Synset('avoid.v.01') :  stay clear from; keep away from; keep out of the way of someone or something
Synset('debar.v.02') :  prevent the occurrence of; prevent from happening
Synset('avoid.v.03') :  refrain from doing something
Synset('keep_off.v.01') :  refrain from certain foods or beverages
Synset('invalidate.v.01') :  declare invalid


Little
Synset('little.n.01') :  a small amount or duration
Synset('small.a.01') :  limited or below average in number or quantity or magnitude or extent
Synset('little.a.02') :  (quantifier used with mass nouns) small in quantity or degree; not much or almost none or (with `a') at least some
Synset('little.s.03') :  (of children and animals) young, immature
Synset('fiddling.s.01') :  (informal) small and of little importance
Synset('little.s.05') :  (of a voice) faint
Synset('short.a.03') :  low in stature; not tall
Synset('little.s.07') :  lowercase
Synset('little.s.08') :  small in a way that arouses feelings (of tenderness or its opposite de

In [None]:
# Load the word-sense-disambiguation checkpoint once and wrap the model and
# its tokenizer in a text-to-text generation pipeline.
WSD_CHECKPOINT = "jpwahle/t5-word-sense-disambiguation"
wsd_model = AutoModelForSeq2SeqLM.from_pretrained(WSD_CHECKPOINT)
wsd_tokenizer = AutoTokenizer.from_pretrained(WSD_CHECKPOINT)
pipe = Text2TextGenerationPipeline(model=wsd_model, tokenizer=wsd_tokenizer)

In [None]:
def wsd(word, context, descriptions, max_length=128):
  """Ask the seq2seq model which description fits `word` in `context`.

  Builds a single text prompt from the arguments, runs it through the
  module-level `pipe`, and returns the text the model generates.

  Args:
    word: The ambiguous word to disambiguate.
    context: The sentence(s) in which `word` occurs.
    descriptions: Iterable of candidate sense descriptions
      (e.g. WordNet definitions).
    max_length: Maximum generation length forwarded to the pipeline.
      Defaults to 128 so that long definitions are not cut off.

  Returns:
    The `generated_text` of the first pipeline result.
  """
  quoted_descriptions = ", ".join(f'" {d} "' for d in descriptions)
  # The prompt previously contained a literal backslash before "descriptions"
  # (an invalid escape sequence in the source string) and a stray apostrophe
  # after the context; both are removed so the model only sees intended text.
  raw_input = (
      f'question: which description describes the word " {word} " '
      f'best in the following context? '
      f'descriptions:[ {quoted_descriptions} ] '
      f'context: {context}'
  )
  # With the library's default generation length, outputs in this notebook
  # stop mid-definition, so the limit is passed explicitly.
  result = pipe(raw_input, max_length=max_length)
  return result[0]['generated_text']

In [None]:
# Sanity check of `wsd` on a hand-written example with two candidate senses.
wsd(
    'fishy',
    'I would avoid the sushi if I were you. It is a little fishy.', 
    [
      'of or relating to or resembling fish',
      'not as expected'
    ]
)

'of or relating to or resembling fish'

In [None]:
# For each ambiguous word, collect all WordNet definitions and let the model
# choose the one that fits the sample sentence.
for word in matches:
  print(word.capitalize())
  definitions = [synset.definition() for synset in wn.synsets(word)]
  print(wsd(word, sentence, definitions))
  print("\n")

Avoid
refrain from certain foods or beverages


Little
limited or below average in number or quantity or magnitude or extent


Fishy
of or relating to or resembling fish




# Demo

In [None]:
# Read the demo sentence interactively.
# NOTE(review): every cell below depends on what is typed here, so a
# "Restart & Run All" only reproduces the shown outputs with the same input.
demo_sent = input('Enter a sentence: ')

Enter a sentence: How did the picture end up in jail? It was framed!


In [None]:
# Same preprocessing pipeline as above, applied to the user-supplied sentence:
# strip punctuation -> lowercase -> lemmatize -> tokenize -> drop stop words.
demo_no_punct = remove_punctuation(demo_sent)
demo_lower = turn_to_lowercase(demo_no_punct)
demo_lemmatized = lemmatizer(demo_lower)
demo_tokens = tokenization(demo_lemmatized)
demo_sent_clean = remove_stopwords(demo_tokens)
demo_sent_clean

['picture', 'end', 'jail', 'framed']

In [None]:
# Keep only the demo words that WordNet lists with more than one synset.
demo_matches = [token for token in demo_sent_clean if len(wn.synsets(token)) > 1]
demo_matches

['picture', 'end', 'jail', 'framed']

In [None]:
# List every WordNet synset, with its definition, for each ambiguous demo word.
for word in demo_matches:
  print(word.capitalize())
  for synset in wn.synsets(word):
    print(synset, ": ", synset.definition())
  print("\n")

Picture
Synset('picture.n.01') :  a visual representation (of an object or scene or person or abstraction) produced on a surface
Synset('painting.n.01') :  graphic art consisting of an artistic composition made by applying paints to a surface
Synset('mental_picture.n.01') :  a clear and telling mental image
Synset('picture.n.04') :  a situation treated as an observable object
Synset('picture.n.05') :  illustrations used to decorate or explain a text
Synset('movie.n.01') :  a form of entertainment that enacts a story by sound and a sequence of images giving the illusion of continuous movement
Synset('video.n.01') :  the visible part of a television transmission
Synset('word_picture.n.01') :  a graphic or vivid verbal description
Synset('picture.n.09') :  a typical example of some state or quality
Synset('photograph.n.01') :  a representation of a person or scene in the form of a print or transparent slide; recorded by a camera on light-sensitive material
Synset('visualize.v.01') :  imag

In [None]:
# For each ambiguous demo word, collect all WordNet definitions and let the
# model choose the one that fits the user-supplied sentence.
for word in demo_matches:
  print(word.capitalize())
  definitions = [synset.definition() for synset in wn.synsets(word)]
  print(wsd(word, demo_sent, definitions))
  print("\n")

Picture
a visual representation (of an object or scene or person or abstraction) produced on a


End
have an end, in a temporal, spatial, or quantitative sense; either spatial or


Jail
lock up or confine, in or as in a jail


Framed
provided with a frame


