In [1]:
import nltk
from nltk.corpus import semcor
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from nltk.corpus.reader.wordnet import Lemma
from nltk import Tree
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import random
import numpy as np
import string

In [2]:
data=[[c for c in s] for s in semcor.tagged_sents(tag='both')]

---
### Preprocessing
The following code contains a series of functions to extract and preprocess the sentences contained in SemCor.\
This way, it will be easier to access the sentences annotated with POS tags and synsets and extract a random word to disambiguate.


In [3]:
#convert nltk pos tag to wordnet pos tag
def convert_pos(pos):
    pos_map = {'J': 'a', 'N': 'n', 'R': 'r', 'V': 'v', 'M': 'v'}
    return pos_map.get(pos[0], pos)

In [4]:
# converter from SemCor to dictionary
# given an annotated sentence from the SemCor corpus in the form of a list of trees
# returns a list of dictionaries with the word, the POS tag, and the synset
# all stopwords and punctuation are ignored
# the initial trees for words with lemma are in the form ROOT[label(lemma)]-->child[label(posTag)]-->child[word]
# the initial trees for words without lemma are in the form ROOT[label(posTag)]-->child[word]
def extract_info(list):
    token_list = []
    for tree in list:
        token={"word":None,"pos":None,"syn":None}
        label=tree.label()
        if isinstance(label,Lemma): # if the token has a lemma, I derive the synset
            token["syn"]=label.synset()
            tree=tree[0] # explore the child that contains the word and the POS tag

        # ignore punctuation and compound words/entities (e.g., New York)
        if not isinstance(tree[0],Tree) and label is not None:
            token["pos"]=convert_pos(tree.label())
            token["word"]=tree[0].lower()
            token_list.append(token)

    return token_list


In [5]:
# returns a random word and the sentence from which it was extracted; if only_nouns=True, it necessarily returns a noun
def get_random_word(data,only_nouns=False):
    found=False
    while not found:
        sentence=random.choice(data)
        token_list=extract_info(sentence)
        content_word=[token for token in token_list if token["syn"]is not None]#exclude stopwords
        for token in content_word:
            if wn.synsets(token["word"]) != []:#if the word has at least one synset
                if not only_nouns or token["pos"]=="n":
                        return (token,token_list)

---
### Implementazione WSD
The following code constitutes the main part of the project, containing the functions to obtain the context, the signature, to implement the overlap, and the Lesk algorithm.


In [6]:
# preprocesses the data by removing punctuation, tokenizing, and lemmatizing
def preprocess(sentence): 
    if isinstance(sentence,str): # if it is a string, tokenize it into a list
        sentence=word_tokenize(sentence)
    sentence=[w.lower() for w in sentence if w not in string.punctuation]
    lemmatizer = WordNetLemmatizer()
    tokens=[lemmatizer.lemmatize(w) for w in sentence]
    return tokens


In [7]:
# given a list of synsets and the annotated sentences from SemCor
# checks each sentence that contains a certain synset
# returns a dictionary, where each synset is associated with a list of tokenized sentences in which it appears
# corpus_examples= {syn1: [[example1],[example2],[example3]], syn2:[...]}
def get_corpus_examples(synsets,data):
    corpus_examples={syn.name():[] for syn in synsets}
    for sentence in data:
        token_list=extract_info(sentence)
        for token in token_list:
            syn=token["syn"]
            if syn is not None and syn in synsets:
                corpus_examples[syn.name()].append([t["word"]for t in token_list])
                break
    return corpus_examples


In [8]:
# for each word in word_list increments the number of documents containing the word
# if the word is not present, it is added
def add_to_signature(signature_dict,word_list):
    for word in set(word_list): # duplicates in the same sentence are not considered (?)
        signature_dict[word]=signature_dict.get(word,0)+1
    return signature_dict

# returns the signature of a synset
# the signature is a dictionary that associates each word with its weight (idf) calculated as idf_i=log(Ndoc/Nd_i)
def get_signature(synset,corpus_examples=None,gemini_examples=None):
    signature={}

    #----------------the definition and examples from wordnet are added---------------
    Ndoc=1 # synset definition
    signature=add_to_signature(signature,preprocess(synset.definition())) # description of the synset
    for wn_ex in synset.examples(): # examples from wordnet
        signature=add_to_signature(signature,preprocess(wn_ex))
        Ndoc+=1

    #----------------the examples from the corpus are added-----------------------
    if corpus_examples: # if there are corpus examples present
        for corpus_ex in corpus_examples: # sentences in SemCor containing the synset
            signature=add_to_signature(signature,preprocess(corpus_ex))
            Ndoc+=1

    #----------------the examples from Gemini are added-----------------------
    if gemini_examples: # if there are examples from Gemini present
        for gemini_ex in gemini_examples:
            signature=add_to_signature(signature,preprocess(gemini_ex))
            Ndoc+=1
            
    # calculation of idf
    for word in signature:
        signature[word]=np.log(Ndoc/signature[word])
    sorted_signature = dict(sorted(signature.items(), key=lambda item: item[1],reverse=True))
    return sorted_signature

# returns the context of a word
# the context consists of the list of lemmatized words of the sentence in which the word to be disambiguated is located
# if the word is a content word, I use its pos tag to assist the lemmatizer
# otherwise it is a stopword and is not lemmatized
def get_context(token_list):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(t["word"], t["pos"]) if t["syn"] is not None else t["word"] for t in token_list]


In [9]:
# calculates the overlap between the context and the signature of a synset
# if weighted=True calculates the sum of the weights of the common words
# if weighted=False calculates the number of common words excluding stopwords
def compute_overlap(signature,context,weighted=False):
    if weighted:
        return sum([signature.get(word,0) for word in context])
    else:
        stopword = stopwords.words('english')
        #remove stopwords
        context= [w for w in context if w.casefold() not in stopword]
        signature= {k:v for k,v in signature.items() if k not in stopword}
        return len(set(signature.keys()).intersection(context))

In [10]:
# implementation of the Lesk algorithm
# returns the synset with the maximum overlap between the signature and the context
def lesk(word, pos, sentence, data=None, gemini=False):
    synsets = wn.synsets(word, pos)
    if synsets == []: synsets = wn.synsets(word)  # if the synset with the specified pos tag is not found, use all synsets
    max_overlap = 0
    best_syns = synsets[0]
    context = get_context(sentence)

    # -----------------if specified, extract additional examples (corpus and/or gemini)---------------------
    if data: 
        corpus_examples = get_corpus_examples(synsets, data)
    else: 
        corpus_examples = None
    if gemini:
        gemini_examples = get_gemini_examples(synsets)  # optional part at the end
    else: 
        gemini_examples = None

    # ----------------search for the synset with the greatest overlap---------------------
    for syn in synsets:
        signature = get_signature(syn, 
                                  corpus_examples.get(syn.name(), None) if corpus_examples else None,
                                  gemini_examples.get(syn.name(), None) if gemini_examples else None)
        overlap = compute_overlap(signature, context, weighted=bool(data))
        if overlap > max_overlap:
            max_overlap = overlap
            best_syns = syn

    return best_syns

# baseline, returns the most frequent synset
def naive_wsd(word):
    synsets = wn.synsets(word)
    if synsets == []: 
        return None
    else: 
        return synsets[0]

Usage examples:

In [11]:
#verbose test print
token,token_list =get_random_word(data,only_nouns=True)
print("# word: ",token["word"])
print("# sentence: ",[t["word"]for t in token_list])
print("# extracted context: ",get_context(token_list))
print("# signature of each synset (No corpus examples):")
synsets=wn.synsets(token['word'],pos=token["pos"])
for syn in synsets:
    signature=get_signature(syn)
    overlap=compute_overlap(signature,get_context(token_list))
    print(" -",syn.name()," overlap: ",overlap, " signature: ",signature)

print("\n# signature of each synset (with corpus examples):")
corpus_examples=get_corpus_examples(synsets,data)
for syn in synsets:
    signature=get_signature(syn,corpus_examples[syn.name()])
    overlap=compute_overlap(signature,get_context(token_list),weighted=True)
    print(" -",syn.name()," overlap:",round(overlap,4), " signature: ",signature)

word = token['word']
pos=token['pos']
correct_synset = token['syn'].name()
def check_prediction(prediction):
    return "V" if prediction == correct_synset else "X"
print("-"*30,"\n")
print("# correct synset:", correct_synset)
print("# predicted synset naive:", (predicted_synset := naive_wsd(word).name()), check_prediction(predicted_synset))
print("# predicted synset lesk:", (predicted_synset := lesk(word,pos,token_list).name()), check_prediction(predicted_synset))
print("# predicted synset corpus lesk:", (predicted_synset := lesk(word,pos, token_list,data).name()), check_prediction(predicted_synset))


# word:  son
# sentence:  ['they', 'have', 'a', 'son', 'and', 'a', 'daughter', 'of']
# extracted context:  ['they', 'have', 'a', 'son', 'and', 'a', 'daughter', 'of']
# signature of each synset (No corpus examples):
 - son.n.01  overlap:  1  signature:  {'offspring': 1.0986122886681098, 'human': 1.0986122886681098, 'male': 1.0986122886681098, 'their': 1.0986122886681098, 'son': 1.0986122886681098, 'famous': 1.0986122886681098, 'judge': 1.0986122886681098, 'became': 1.0986122886681098, 'boy': 1.0986122886681098, 'than': 1.0986122886681098, 'his': 1.0986122886681098, 'taller': 1.0986122886681098, 'is': 1.0986122886681098, 'he': 1.0986122886681098, 'a': 0.4054651081081644}
 - son.n.02  overlap:  0  signature:  {'jesus': 0.0, 'of': 0.0, 'divine': 0.0, 'second': 0.0, 'trinity': 0.0, 'the': 0.0, 'in': 0.0, 'word': 0.0, 'person': 0.0, 'god': 0.0, 'incarnate': 0.0}

# signature of each synset (with corpus examples):
 - son.n.01  overlap: 11.207  signature:  {'offspring': 4.1588830833596715, 'hu

---
### Testing
test on 50 phrases from SemCor


In [196]:
#taking all the data from the semcor corpus
data=[[c for c in s] for s in semcor.tagged_sents(tag='both')]

In [12]:
#test on N phrases naive,lesk and corpus_lesk
def testN(data,only_nouns=False,N=50,gemini=False):
    c_naive=0
    c_lesk=0
    c_corpus_lesk=0
    c_gemini_lesk=0
    for i in range(N):
        token,token_list=get_random_word(data,only_nouns)
        correct_syn=token['syn']
        word=token['word']
        pos=token['pos']
        if  naive_wsd(word)==correct_syn:
            c_naive+=1
        if lesk(word,pos,token_list)==correct_syn:
            c_lesk+=1
        if lesk(word,pos,token_list,data)==correct_syn:
            c_corpus_lesk+=1
        if gemini:
            if lesk(word,pos,token_list,data,gemini)==correct_syn:
                c_gemini_lesk+=1
        
    return c_naive/N,c_lesk/N,c_corpus_lesk/N,c_gemini_lesk/N

accuracy=testN(data,only_nouns=True)
print("naive:",accuracy[0],"lesk:",accuracy[1],"corpus_lesk:",accuracy[2])

naive: 0.76 lesk: 0.58 corpus_lesk: 0.96


In [213]:
#execute k=10 times the WSD algorithms on N=50 phrases, and return the average accuracy
def full_test(k=10):
    naive_results=0
    lesk_results=0
    corpus_lesk_results=0
    for i in range(k):
        n,l,cl,_ =testN(data)
        print("Test "+str(i)+" :",n,l,cl)
        naive_results+=n
        lesk_results+=l
        corpus_lesk_results+=cl

    return(round(naive_results/k,3),round(lesk_results/k,3),round(corpus_lesk_results/k,3))

accuracy=full_test() #takes >10 minutes
print("----Avarage accuracy: ----")
print("naive:",accuracy[0],"lesk:",accuracy[1],"corpus_lesk:",accuracy[2])

Test 0 : 0.42 0.52 0.7
Test 1 : 0.34 0.54 0.78
Test 2 : 0.3 0.42 0.72
Test 3 : 0.42 0.5 0.78
Test 4 : 0.36 0.46 0.74
Test 5 : 0.48 0.36 0.8
Test 6 : 0.34 0.56 0.76
Test 7 : 0.38 0.36 0.74
Test 8 : 0.52 0.5 0.74
Test 9 : 0.38 0.34 0.8
----Avarage accuracy: ----
naive: 0.394 lesk: 0.456 corpus_lesk: 0.756


---
### Gemini extension
WSD using Lesk corpus + additional examples generated by Gemini

In [None]:
import google.generativeai as genai

# IMPORTANT: USE A VPN TO ACCESS GEMINI
API_KEY = "abc123"  # insert your own API key
genai.configure(api_key=API_KEY)
model = genai.GenerativeModel('gemini-pro')
gen_config = genai.types.GenerationConfig(temperature=0.7)  # other parameters are fine by default

# often the security blocks are too restrictive, very often it didn't generate examples, so I set them to none
safe_config = [
    {
        "category": "HARM_CATEGORY_HARASSMENT",
        "threshold": "BLOCK_NONE"
    },
    {
        "category": "HARM_CATEGORY_HATE_SPEECH",
        "threshold": "BLOCK_NONE"
    },
    {
        "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "threshold": "BLOCK_NONE"
    },
    {
        "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
        "threshold": "BLOCK_NONE"
    },
  ]

#### Prompt engineering One-shot following the COSTAR framework
- (C) Context: "You are a smart lexicographer who wants to improve the world's best dictionary."
- (O) Objective: "Given a definition and some examples for a list of words, write 5 more examples that best represent the meaning of each word."
- (S) Style: "...a friendly, clear style to make the examples sound natural and easy to understand."
- (T) Tone: "Use a colloquial tone..."
- (A) Audience: "For language learners who seek clear and relatable usage of the words."
- (R) Response: "Write the examples in plain text and insert them into a dictionary in the following format: {"synset1": ["example1", "example2", "example3", "example4", "example5"], "synset2": ["example1",...],...} [...] Do not use newlines."


In [None]:
def get_prompt(synsets):
    prompt="""
    You are a smart lexicographer who wants to improve the world's best dictionary. Given a definition and some examples for a list of word, write 5 more examples that best represent the meaning of each word. 
    Use a colloquial tone and a friendly, clear style to make the examples sound natural and easy to understand for language learners who seek clear and relatable usage of the words
    Write the examples in plain text and insert them into a dictionary in the following format: {"synset1": ["example1", "example2", "example3", "example4", "example5"],"synset2": ["example1",...],...}
    Your response will be used as input for a program, so you MUST respect this format. Do not use newlines.

    **example of input**
    synset= "bank.n.01"
    Term: "bank"
    synonyms: "depository_financial_institution", "banking_concern", "banking_company"
    Definition: "a financial institution that accepts deposits and channels the money into lending activities"
    Examples: ["he cashed a check at the bank", "that bank holds the mortgage on my home"]

    **your new 5 examples**
    Example of Output: {"bank.n.01":["she opened a savings account at the local bank", "the bank approved his loan for the new car", "they went to the bank to deposit their paychecks", "the bank's interest rates for loans are very competitive", "after losing her debit card, she reported it to the bank immediately"]}
    
    **Input:**
    """
    for syn in synsets:
        prompt+="""
        Term: """ + syn.lemmas()[0].name()+ """  
        synonyms: """ + str([lemma.name() for lemma in syn.lemmas()[1:]]) + """
        Definition: """ + str(syn.definition()) + """
        Examples: """ + str(syn.examples()) + """
        """
    prompt+="""
    Output:
    """
    return prompt


In [None]:
import time
# given a list of synsets, returns a dictionary where each synset is associated with a list of usage examples generated with Gemini
def get_gemini_examples(synsets):
    prompt = get_prompt(synsets)
    # Gemini API call
    response = model.generate_content(prompt, generation_config=gen_config, safety_settings=safe_config)
    time.sleep(1)  # to avoid exceeding the request limit
    # if the generation didn't go well (e.g., safety reasons, retry up to 2 times)
    if response.candidates[0].finish_reason > 1:
        for i in range(2):
            print("Error, finish reason: ", response.candidates[0].finish_reason)
            response = model.generate_content(prompt, generation_config=gen_config)
            if response.candidates[0].finish_reason == 1:
                break
        return None
    # check that the obtained string can be converted into a list
    try:
        example_list = eval(response.text)
        return example_list
    except:
        print("Error, output not formatted correctly")
        print(response.text)
        return None

In [200]:
#Example
accuracy=testN(data,only_nouns=False,N=5,gemini=True)
print("naive:",accuracy[0],"lesk:",accuracy[1],"corpus_lesk:",accuracy[2],"gemini_corpus_lesk:",accuracy[3])

naive: 0.4 lesk: 0.6 corpus_lesk: 0.8 gemini_corpus_lesk: 0.8
