# This phase evaluates word sense disambiguation (WSD) as an iterative binary classification task.

- Steps to follow

1. Identify the senses/glosses based on the POS tag
2. Iteratively identify the sense/gloss by comparing the candidate senses two at a time and keeping the better match

In [1]:
import os, yaml, re
from openai import OpenAI, ChatCompletion

In [2]:
# Read the API credentials from a local YAML file that is expected to contain
# an 'OPENAI_API_KEY' entry.
with open('cadentials.yaml') as f:
    # NOTE(review): yaml.load with FullLoader can construct more than plain
    # data types; if this file could ever come from an untrusted source,
    # consider yaml.safe_load instead.
    cadentials = yaml.load(f, Loader=yaml.FullLoader)

# Export the key through the process environment
# (presumably so that OpenAI() can pick it up - confirm against the client docs).
os.environ['OPENAI_API_KEY'] = cadentials['OPENAI_API_KEY']

In [3]:
# Module-level client shared by complete_model(); built with no explicit arguments.
client = OpenAI()

In [4]:
def complete_model(USER_MESSAGE):
    """Send one user message to the chat model and return the reply text.

    The request uses the module-level ``client`` with a fixed system prompt,
    ``temperature=0`` and a 1500-token completion cap.

    Args:
        USER_MESSAGE: The full prompt to send as the user turn.

    Returns:
        The content of the first returned choice, converted with ``str``.
    """
    system_turn = {
        "role": "system",
        "content": "You are a helpful assitant to identify the tag for sense of a word",
    }
    user_turn = {"role": "user", "content": USER_MESSAGE}

    response = client.chat.completions.create(
        model='gpt-3.5-turbo',
        messages=[system_turn, user_turn],
        temperature=0,
        max_tokens=1500,
    )
    first_choice = response.choices[0]
    return str(first_choice.message.content)

In [5]:
# Load the raw sense inventory text in one go
with open('senses.txt', 'r',encoding="utf8") as file:
    data = file.read()

# Entries are separated by blank lines; each entry is a run of "key: value" lines
entries = data.strip().split('\n\n')
print("No of sense tags in FEWS dataset ",len(entries))

# For every entry keep only the stripped values (text after the first ':'),
# in file order. Downstream code reads index 0 as the sense id, index 2 as the
# gloss and the last index as the synonyms field.
list_of_lists = [
    [value.strip() for _, value in (detail.split(':', 1) for detail in entry.split('\n'))]
    for entry in entries
]

# Show the first parsed entry as a sanity check
print(list_of_lists[0])

No of sense tags in FEWS dataset  663730
['dictionary.noun.0', 'dictionary', "A reference work with a list of words from one or more languages, normally ordered alphabetically, explaining each word's meaning, and sometimes containing information on its etymology, pronunciation, usage, translations, and other data.", '', '1', 'wordbook']


In [6]:
# Convert a word to its base form, because the senses are keyed by base word
def word_base(word):
    """Return the base form of ``word`` (stem first, then lemmatize).

    Uses the module-level ``stemmer`` and ``lemmatizer`` objects. This is a
    best-effort normalisation: if either step fails (for example ``word`` is
    ``None``), the input is returned unchanged.

    Args:
        word: The surface form to normalise.

    Returns:
        The lemmatized stem, or ``word`` itself when normalisation fails.
    """
    try:
        stemmed = stemmer.stem(word)
        return lemmatizer.lemmatize(stemmed)
    except Exception:
        # Only ordinary errors are swallowed; a bare ``except`` would also
        # trap KeyboardInterrupt/SystemExit and make a long run uninterruptible.
        return word
    

In [7]:
# Retrieve the meanings of a word from the parsed sense inventory.
# Only the sense id and the gloss (plus synonyms, when present) are needed downstream.
def retrieve_meanings(word, data):
    """Collect every sense of ``word`` from the parsed sense inventory.

    Args:
        word: Headword to look up; compared exactly against the headword part
            of each sense id.
        data: Iterable of entries where index 0 is the sense id, index 2 is
            the gloss and the last index is the synonyms field ('' when empty).

    Returns:
        Dict mapping sense id -> gloss, with ", synonyms :<synonyms>" appended
        when the entry lists synonyms. Empty when the word has no entry.
    """
    meanings_dict = {}
    for entry in data:
        sense_id = entry[0]
        # Sense ids look like "<word>.<pos>.<index>". Splitting from the right
        # keeps headwords that themselves contain dots intact; splitting from
        # the left would truncate them and match unrelated entries.
        if sense_id.rsplit(".", 2)[0] != word:
            continue
        gloss, synonyms = entry[2], entry[-1]
        if synonyms != "":
            meanings_dict[sense_id] = gloss + ", synonyms :" + synonyms
        else:
            meanings_dict[sense_id] = gloss
    return meanings_dict

In [8]:
# Return the sentence with the <WSD>...</WSD> markers removed
def extract_word_and_index(sentence):
    """Strip the ``<WSD>``/``</WSD>`` markers from ``sentence``.

    The tagged word is kept in place and followed by one extra space, and the
    result is stripped of leading/trailing whitespace.

    Args:
        sentence: Text containing one ``<WSD>word</WSD>`` span.

    Returns:
        The cleaned sentence, or ``None`` when either marker is missing.
    """
    open_tag, close_tag = '<WSD>', '</WSD>'
    open_at = sentence.find(open_tag)
    close_at = sentence.find(close_tag)

    # Guard: both markers must be present
    if open_at == -1 or close_at == -1:
        return None

    target = sentence[open_at + len(open_tag):close_at]
    before = sentence[:open_at]
    after = sentence[close_at + len(close_tag):]
    return (before + target + " " + after).strip()

In [9]:
# Pull out the word that is wrapped in <WSD>...</WSD> in a sentence
def wsdword(text):
    """Return the text between the first ``<WSD>`` and the next ``</WSD>``.

    Args:
        text: Sentence that may contain a ``<WSD>word</WSD>`` span.

    Returns:
        The enclosed word, or ``None`` when no tagged span is found.
    """
    found = re.search(r'<WSD>(.*?)</WSD>', text)
    return found.group(1) if found else None

In [10]:
#Function to count the number of tokens (input and output tokens)
from nltk.tokenize import word_tokenize

def count_tokens(text):
    """Return how many tokens ``word_tokenize`` produces for ``text``.

    Args:
        text: The string to tokenize.

    Returns:
        The number of tokens as an int.
    """
    return len(word_tokenize(text))

In [11]:
#Word lemmatizer to get the base word of the WSD word
import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Fetch the 'wordnet' resource before the lemmatizer is used
# (the captured output shows it is a no-op when the package is already up to date).
nltk.download('wordnet')
# Module-level instances used by word_base(): stem first, then lemmatize.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\2358452\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
import json
def load_dictionary_from_file(filename):
    """Load a JSON document from ``filename``.

    The file is read as UTF-8 explicitly, matching the other ``open`` calls in
    this notebook, so non-ASCII content does not depend on the platform's
    default encoding.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The parsed JSON value, or ``None`` when the file does not exist or
        does not contain valid JSON.
    """
    try:
        with open(filename, 'r', encoding="utf8") as file:
            return json.load(file)
    except (FileNotFoundError, json.JSONDecodeError):
        # Missing file or malformed JSON: signal "nothing loaded" to the caller
        return None

In [13]:
def postag_preprocess(pos_tag):
    """Map a short POS tag to its spelled-out form.

    Args:
        pos_tag: One of "adj", "noun" or "verb"; anything else is unsupported.

    Returns:
        "adjective" for "adj", the tag itself for "noun"/"verb", else ``None``.
    """
    if pos_tag in ("noun", "verb"):
        return pos_tag
    return "adjective" if pos_tag == "adj" else None

In [14]:
# Lookup table consulted by sense_Tag_Return_pipeline() for usage examples.
# Note: load_dictionary_from_file returns None if the file is missing or invalid.
loaded_dict = load_dictionary_from_file("my_dictionary.json") 

In [20]:
def _sense_pos(sense_key):
    """Return the POS field of a ``<word>.<pos>.<index>`` sense key ('' if malformed)."""
    parts = sense_key.rsplit(".", 2)
    return parts[1] if len(parts) == 3 else ""


def _lookup_examples(sentence, postag):
    """Fetch the usage examples stored in ``loaded_dict`` for the tagged word, if any."""
    lexicon = loaded_dict or {}  # load_dictionary_from_file may have returned None
    surface = wsdword(sentence)
    # Try the surface form first, then fall back to its base form
    key = surface if surface in lexicon else word_base(surface)
    if key not in lexicon:
        return None
    examples = lexicon[key]
    try:
        return examples[postag]
    except (KeyError, IndexError, TypeError):
        # No per-POS split available for this word: use the examples as stored
        return examples


def _response_names_sense(response, sense_id):
    """Tell whether ``response`` contains ``sense_id`` as a whole id, not as a prefix."""
    pattern = r"(?<![\w.])" + re.escape(sense_id) + r"(?!\w)"
    return re.search(pattern, response) is not None


def _build_prompt(wordwsd, sentence, first, second, first_examples, second_examples):
    """Assemble the two-candidate comparison prompt sent to the model."""
    lines = [
        "You are going to identify the most suitable sense tag of an ambiguous word in English sentence.",
        f"    1. Below are two possible meanings for word {wordwsd}. Comprehend the sense tags and meanings.",
        f"        - 1st meaning {first}",
        f"        - 2nd meaning {second}",
        "    2. You can learn more on the usage of each word and the its sense through the examples below if provided. ",
        f"        - 1st meaning Examples : {first_examples}",
        f"        - 2nd meaning Examples : {second_examples}",
        f'    3. Now carefully examine the sentence below. The ambiguous word is enclosed within <WSD>."{sentence}"',
        "    4. Analyze the sentence carefully and identify the meaning of the ambiguous word. Use below techniques",
        "    - Traverse the sentence and identify the important keywords which help to identify the meaning of the ambiguous word. ",
        "    - Then you can think about the contextual meaning of the sentence and decide the most suitable meaning for the ambiguous word.",
        "    5. Based on the identified meaning, try to find the most appropriate senseID from the above two sense ids. Selection of a one sense is mandatory. ",
        "    6. Return only the identified senseid in the below JSON format.",
        '    {"sense_id":  the identified sense id}  ',
    ]
    return "\n".join(lines)


def sense_Tag_Return_pipeline(sentence,postag,wordwsd):
    """Pick a sense for the tagged word by repeated two-way comparisons.

    Candidate senses of ``wordwsd`` with part of speech ``postag`` are taken
    in inventory order. The first one is the running winner; each following
    candidate is compared against it by the chat model, and the winner of
    that comparison is carried into the next round.

    Args:
        sentence: Sentence with the target word wrapped in ``<WSD>...</WSD>``.
        postag: POS field of the wanted senses (second-to-last part of the id).
        wordwsd: Headword used to look up candidate senses.

    Returns:
        ``(winner, token_count)`` where ``winner`` is a ``(sense_id, gloss)``
        tuple and ``token_count`` sums the tokens of every prompt and every
        model response. ``(None, 0)`` when the inventory has no candidate for
        this word and POS.
    """
    meanings = retrieve_meanings(wordwsd, list_of_lists)

    # Compare the POS field exactly: a substring test would let e.g. "verb"
    # match every sense of the headword "adverb".
    meaning_list = [
        (sense_id, gloss)
        for sense_id, gloss in meanings.items()
        if _sense_pos(sense_id) == postag
    ]
    if not meaning_list:
        # Nothing to choose from; report that instead of failing on meaning_list[0]
        return None, 0

    examples = _lookup_examples(sentence, postag)

    output = meaning_list[0]
    count = 0  # running total of prompt + response tokens
    for challenger in meaning_list[1:]:
        sense_def1 = []
        sense_def2 = []

        if examples is not None:
            # NOTE(review): containment test on the raw sense id; the schema of
            # the example records is not visible here - confirm ids cannot
            # collide as prefixes (e.g. ".1" vs ".10").
            for example in examples:
                if output[0] in example:
                    sense_def1.append(example)
                elif challenger[0] in example:
                    sense_def2.append(example)

        prompt = _build_prompt(wordwsd, sentence, output, challenger, sense_def1, sense_def2)

        count += count_tokens(prompt)
        response = complete_model(prompt)
        # Count what the model actually returned (the output tokens)
        count += count_tokens(response)

        # Keep the current winner only if the reply names its id as a whole
        # token; otherwise the challenger takes over (also the fallback when
        # the reply names neither id).
        if not _response_names_sense(response, output[0]):
            output = challenger

    return output, count
    
    


In [21]:
import datetime
# Evaluate the pipeline on the new missing-data test set.
# Each input line is "<sentence>\t<gold sense id>[...]"; one prediction is
# written per processed line to a dated results file.
filename="missing_data_analysis/binaryclassificationNew"+datetime.date.today().strftime("%B %d, %Y")+".txt"
totalTokens=0

# Context managers make sure both files are closed (and the results written so
# far are flushed) even if a model call or a malformed line raises mid-run.
with open("missing_data_analysis/missing_test_data_new.txt","r",encoding="utf8") as file, \
        open(filename,"w",encoding="utf8") as file2:
    for i in file:
        if not i.strip():
            # A blank line has no sense id column; skip it instead of failing on lst[1]
            continue
        lst=i.split("\t")
        sentence,senseid=lst[0],lst[1]
        postag=senseid.split(".")[1]  # POS field of the gold sense id
        wordfromtext=senseid.split(".")[0]  # headword of the gold sense id
        foutput,token_count=sense_Tag_Return_pipeline(sentence,postag,wordfromtext)
        totalTokens+=token_count
        print(foutput)
        # Write the prediction to the results file
        file2.write(str(foutput) + "\n")

print(totalTokens)

('ergative.noun.1', 'An ergative verb or other expression.')
('pubbie.noun.3', 'Someone that one socializes with at the pub.')
('interferant.noun.1', 'A compound in the sample that produces readings which overlap those of the analyte, making analysis more difficult.')
('bean_flicker.noun.0', 'A lesbian.')
('kirtle.verb.0', 'To clothe or cover with, or as if with, a kirtle; to hitch up (a long garment) to the length of a kirtle.')
('draw_a_line_in_the_sand.verb.2', 'To indicate the threshold or level above which something will become unacceptable or will provoke a response; to create a boundary and imply or declare that its crossing will provoke a (negative) response.')
('lay_a_glove_on.verb.0', 'To hit with a boxing glove.')
('dawdle.verb.1', 'To spend (time) without haste or purpose.')
('furnish.verb.1', 'To supply or give (something).')
('splatch.verb.2', 'To move in a manner that causes splashing or spreading of material.')
('desire.verb.1', 'To put a request to (someone); to entrea

In [18]:
import datetime
# Evaluate the pipeline on the original missing-data test set.
# Each input line is "<sentence>\t<gold sense id>[...]"; one prediction is
# written per processed line to a dated results file.
filename="missing_data_analysis/binaryclassification"+datetime.date.today().strftime("%B %d, %Y")+".txt"
totalTokens=0

# Context managers make sure both files are closed (and the results written so
# far are flushed) even if a model call or a malformed line raises mid-run.
with open("missing_data_analysis/missing_test_data.txt","r",encoding="utf8") as file, \
        open(filename,"w",encoding="utf8") as file2:
    for i in file:
        if not i.strip():
            # A blank line has no sense id column; skip it instead of failing on lst[1]
            continue
        lst=i.split("\t")
        sentence,senseid=lst[0],lst[1]
        postag=senseid.split(".")[1]  # POS field of the gold sense id
        wordfromtext=senseid.split(".")[0]  # headword of the gold sense id
        foutput,token_count=sense_Tag_Return_pipeline(sentence,postag,wordfromtext)
        totalTokens+=token_count
        print(foutput)
        # Write the prediction to the results file
        file2.write(str(foutput) + "\n")

print(totalTokens)

('corbeau.noun.2', 'A very dark shade of green, almost black.')
('beast.noun.0', 'Any animal other than a human; usually only applied to land vertebrates, especially large or dangerous four-footed ones.')
('brake.noun.1', 'A thicket, or an area overgrown with briers etc. (from 15th c.)')
('godship.noun.1', '(nodot=a): the state, position, or fact of being a god.')
('eyeballer.noun.4', 'Someone who stares at another in order to intimidate them.')
('stage.noun.1', 'A platform; a surface, generally elevated, upon which show performances or other public events are given.')
('part.noun.0', 'A portion; a component.')
('lord.noun.3', 'One possessing similar mastery over others;  any feudal superior generally; any nobleman or aristocrat; any chief, prince, or sovereign ruler; in Scotland, a male member of the lowest rank of nobility (the equivalent rank in England is "baron")')
('red_shirt.noun.8', 'A member of the UDD.')
('glamourama.noun.1', 'An event or setting noted for its glamour or cele

74 instances were corrected using the iterative binary classification approach.