In [17]:
import os, yaml, re
from openai import OpenAI, ChatCompletion

In [18]:
# Read the API credentials from a local YAML file so the key never appears in the notebook.
# safe_load builds only plain YAML types (mappings, strings, numbers), which is all a
# key/value credentials file needs, and it refuses language-specific object tags.
with open('credentials.yaml', encoding='utf8') as f:
    cadentials = yaml.safe_load(f)

# Publish the key through the process environment for code that reads OPENAI_API_KEY from there.
os.environ['OPENAI_API_KEY'] = cadentials['OPENAI_API_KEY']

In [19]:
# Shared OpenAI client used by complete_model(). It is built with no arguments, so it
# presumably picks up OPENAI_API_KEY from the environment variable set by the
# credentials cell -- confirm against the openai library documentation.
client = OpenAI()

In [34]:
def complete_model(USER_MESSAGE, model='gpt-4-turbo', temperature=0, max_tokens=1500):
    """Send one user message to the chat-completions endpoint and return the reply text.

    Args:
        USER_MESSAGE: Prompt text sent as the single "user" message.
        model: Name of the chat model to query. Defaults to 'gpt-4-turbo'.
        temperature: Sampling temperature forwarded to the API. Defaults to 0.
        max_tokens: Upper bound on generated tokens forwarded to the API. Defaults to 1500.

    Returns:
        str: ``str()`` of the first choice's message content (so a missing
        content value comes back as the string "None").
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            # The system text is sent verbatim to the model; it is left unchanged on purpose.
            {"role": "system", "content" : "You are a helpful assitant to identify the tag for sense of a word"},
            {"role": "user", "content": USER_MESSAGE},
        ],
        temperature=temperature,
        max_tokens=max_tokens,
    )
    return str(response.choices[0].message.content)

In [21]:
# Load the whole sense inventory into memory
with open('senses.txt', 'r',encoding="utf8") as senses_file:
    data = senses_file.read()

# Entries are separated from one another by a blank line
entries = data.strip().split('\n\n')
print("No of sense tags in FEWS dataset ",len(entries))

# One inner list per entry: for each "field: value" line keep only the stripped
# text after the first ':' (a line without ':' raises ValueError on unpacking).
list_of_lists = [
    [value.strip() for _, value in (line.split(':', 1) for line in entry.split('\n'))]
    for entry in entries
]

# Show the first parsed entry as a sanity check
print(list_of_lists[0])

No of sense tags in FEWS dataset  663730
['dictionary.noun.0', 'dictionary', "A reference work with a list of words from one or more languages, normally ordered alphabetically, explaining each word's meaning, and sometimes containing information on its etymology, pronunciation, usage, translations, and other data.", '', '1', 'wordbook']


In [22]:
def word_base(word):
    """Reduce a word to its base form, since the sense inventory is keyed by base words.

    The word is first stemmed with the module-level ``stemmer`` and the result is
    then lemmatized with the module-level ``lemmatizer`` (both names are looked up
    when the function is called, not when it is defined).

    Args:
        word: The surface form to normalise.

    Returns:
        The stemmed-then-lemmatized word, or ``word`` unchanged if normalisation
        raises (for example when ``word`` is None).
    """
    try:
        stemmed = stemmer.stem(word)
        return lemmatizer.lemmatize(stemmed)
    except Exception:
        # Best-effort fallback: keep the original token. Unlike a bare ``except:``,
        # this lets KeyboardInterrupt/SystemExit through so a long run can be stopped.
        return word
    

In [23]:
def retrieve_meanings(word, data):
    """Collect the glosses of every sense whose id starts with ``word``.

    Args:
        word: The word to look up; compared against the text before the first
            '.' of each entry's sense id (``entry[0]``).
        data: Iterable of entry lists where index 0 is the sense id, index 2 the
            gloss and the last element the synonyms string.

    Returns:
        dict: sense id -> gloss, with ", synonyms :<synonyms>" appended when the
        entry's last element is not the empty string.
    """
    glosses = {}
    for record in data:
        sense_id = record[0]
        if sense_id.split(".")[0] != word:
            continue
        synonyms = record[-1]
        gloss = record[2]
        glosses[sense_id] = gloss if synonyms == "" else gloss + ", synonyms :" + synonyms
    return glosses

In [24]:
def extract_word_and_index(sentence):
    """Return the sentence with the <WSD>...</WSD> markers removed.

    The tagged word is kept in place and followed by one extra space; the result
    is stripped of leading/trailing whitespace. Despite the name, no index is
    returned.

    Args:
        sentence: Text expected to contain one '<WSD>' and one '</WSD>' marker.

    Returns:
        The cleaned sentence, or None when either marker is absent.
    """
    opening, closing = '<WSD>', '</WSD>'
    open_at = sentence.find(opening)
    close_at = sentence.find(closing)

    # Guard clause: without both markers there is nothing to extract.
    if open_at == -1 or close_at == -1:
        return None

    before = sentence[:open_at]
    target = sentence[open_at + len(opening):close_at]
    after = sentence[close_at + len(closing):]
    return f"{before}{target} {after}".strip()

In [25]:
def wsdword(text):
    """Return the text enclosed by the first <WSD>...</WSD> pair in ``text``.

    Args:
        text: Sentence that may contain a tagged target word.

    Returns:
        The (non-greedy) content between the tags, or None if no pair is found.
    """
    found = re.search(r'<WSD>(.*?)</WSD>', text)
    return found.group(1) if found else None

In [26]:
#Function to count the number of tokens: input and output tokens
from nltk.tokenize import word_tokenize

def count_tokens(text):
    """Return how many tokens ``word_tokenize`` produces for ``text``.

    Note that this is an NLTK word-level count, not a count produced by the
    chat model's own tokenizer.
    """
    return len(word_tokenize(text))

In [27]:
#Word lemmatizer to get the base word of the WSD word
import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Fetch the WordNet corpus through NLTK's downloader; the recorded output shows it
# only reports "already up-to-date" when the package is present.
nltk.download('wordnet')
# Module-level instances: word_base() looks up `stemmer` and `lemmatizer` by these
# names at call time, so this cell must run before word_base() is first called.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\2358452\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [28]:
import json
def load_dictionary_from_file(filename):
    """Load a JSON document from ``filename``.

    The file is read as UTF-8, matching the other file reads in this notebook,
    instead of relying on the platform's default encoding.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The decoded JSON value, or None when the file does not exist or does not
        contain valid JSON.
    """
    try:
        with open(filename, 'r', encoding='utf8') as file:
            return json.load(file)
    except (FileNotFoundError, json.JSONDecodeError):
        # Missing file or malformed JSON: signal "no dictionary" to the caller.
        return None

In [29]:
def postag_preprocess(pos_tag):
    """Map a short part-of-speech tag to the form used in sense ids.

    "adj" becomes "adjective"; "noun" and "verb" are returned unchanged; every
    other value yields None.
    """
    if pos_tag == "adj":
        return "adjective"
    return pos_tag if pos_tag in ("noun", "verb") else None

In [30]:
# Examples dictionary consulted by sense_Tag_Return_pipeline(). The loader returns
# None (not an empty dict) when the file is missing or is not valid JSON.
loaded_dict = load_dictionary_from_file("my_dictionary.json")

In [31]:
def _pos_matches(sense_id, postag):
    """Return True when ``postag`` equals a whole dot-separated field after the word in ``sense_id``."""
    return postag in sense_id.split(".")[1:]


def _examples_for(key, postag, store):
    """Return ``store[key]`` narrowed to ``postag`` when it can be indexed by it, else the whole value."""
    examples = store[key]
    try:
        return examples[postag]
    except (KeyError, IndexError, TypeError):
        return examples


def sense_Tag_Return_pipeline(sentence,postag,wordwsd):
    """Build the disambiguation prompt for one sentence and query the model.

    Args:
        sentence: Sentence with the ambiguous word wrapped in <WSD>...</WSD>.
        postag: Part-of-speech field of the sense ids to offer as candidates.
        wordwsd: Word used to look up candidate senses in ``list_of_lists`` and
            named in the prompt.

    Returns:
        tuple: ``(output, count)`` where ``output`` is the model's reply and
        ``count`` is ``count_tokens(prompt) + count_tokens(output)``.
    """
    meanings = retrieve_meanings(wordwsd, list_of_lists)

    # Compare the POS as a whole field of the sense id. A plain substring test
    # would, for the word "verb" with POS "verb", also keep "verb.noun.N" senses.
    filtered_definitions = {key: value for key, value in meanings.items() if _pos_matches(key, postag)}

    # The loader returns None when the examples file is unavailable; treat that as "no examples".
    store = loaded_dict if loaded_dict is not None else {}

    # Look up examples by the surface form first, then by its base form.
    surface = wsdword(sentence)
    if surface in store:
        examples = _examples_for(surface, postag, store)
    else:
        base = word_base(surface)
        examples = _examples_for(base, postag, store) if base in store else None

    prompt = f'''You are going to identify the corresponding sense tag of an ambiguous word in English sentences. Use multiple reasoning strategies to increase confidence in your answer.
1. The word "{wordwsd}" has different meanings. Below are possible meanings. Comprehend the sense tags and meanings. Synonyms are provided if available. {filtered_definitions}
2. You can learn more on the usage of each word and the its sense through the examples below. Each sentence is followed by its corresponding sense id. "{examples}"
3. Now carefully examine the sentence below. The ambiguous word is enclosed within <WSD>."{sentence}"
4. Analyze the sentence using the following techniques and identify the meaning of the ambiguous word.
   Focus on keywords in the sentence surrounding the ambiguous word. 
   Think about the overall topic and intent of the sentence. Decide on the sense of the word that makes the most logical sense within the context. 
5. Based on the identified meaning, try to find the most appropriate senseIDs from the below sense tag list. You are given definition of each sense tag too."{filtered_definitions}".
6. If you have more than one senseIDs identified after above steps, you can return the senseIDs in order of confident level, follow the given format to return the value .
7.Return only the finalized senseIDs. Do not add extra details and explanation. Only senseIDS expected.

if only one senseid is identified then <senseId>
if not <senseId1, senseId2,...> '''

    output = complete_model(prompt)
    # Token usage is reported as prompt tokens plus reply tokens.
    count = count_tokens(prompt) + count_tokens(output)
    return output, count


In [32]:
import json
import datetime
#evaluating the results
# Predictions go to a file named after today's date, one model reply per line.
filename="data/devFewShot"+datetime.date.today().strftime("%B %d, %Y")+".txt"
totalTokens=0

# Context managers close both files even if an API call raises part-way through,
# so the predictions written so far are flushed to disk rather than lost.
with open("data/devfewshot.txt","r",encoding="utf8") as cases_file:
    with open(filename,"w",encoding="utf8") as results_file:
        for line in cases_file:
            # Skip blank lines (e.g. a trailing empty line) instead of failing on the missing field.
            if not line.strip():
                continue
            fields=line.rstrip("\n").split("\t")
            sentence,senseid=fields[0],fields[1]
            # Gold sense ids look like word.pos.index: take the POS and the word from them.
            postag=senseid.split(".")[1]
            wordfromtext=senseid.split(".")[0]
            output,token_count=sense_Tag_Return_pipeline(sentence,postag,wordfromtext)
            totalTokens+=token_count

            print(output)
            #writing the output in the file
            results_file.write(output + "\n")

print(totalTokens)

ceremony.noun.3
rhematic.adjective.1
sign-on.noun.0
illinois.noun.1
spoof.verb.1
anacrisis.noun.1
patronymy.noun.2
scintillate.verb.0
refurb.noun.1
veronan.noun.0
breathiness.noun.0
biset.noun.1
kronecker_delta.noun.0
overcall.noun.2
blister.verb.1
ecumenopolitan.adjective.1
pencil_dick.noun.0
<baba.noun.3>
tippex.noun.0
endurable.adjective.1
christian.adjective.2
trial_by_fire.noun.0
unhumanity.noun.1
ponderous.adjective.4
epsilontics.noun.1
ultrarich.adjective.5
inflector.noun.2
zygon.noun.0
<waste.verb.4>
houri.noun.0
calorize.verb.0
<senseId1>
blancmange.noun.1
spoils.verb.1
verb.noun.3
batman.verb.1
sexperience.noun.1
<damp.noun.0>
australia.noun.0
choke.verb.7
<strong-handed.adjective.0, strong-handed.adjective.5>
<squire.verb.1>
take_a_turn.verb.2
yak.noun.0
monoblack.adjective.1
<mobot.noun.1>
herd.noun.2
armhook.noun.1
chic.noun.1
worldhood.noun.1
causal-final.adjective.0
malignant.adjective.0
cod.adjective.1
agitation.noun.3
foppery.noun.0
devious.adjective.1
fifth_wall.noun.

In [35]:
import json
import datetime
#evaluating the results
# Predictions go to a file named after today's date, one model reply per line.
filename="data/devFewShotgpt4"+datetime.date.today().strftime("%B %d, %Y")+".txt"
totalTokens=0

# Context managers close both files even if an API call raises part-way through,
# so the predictions written so far are flushed to disk rather than lost.
with open("data/devgpt4cases.txt","r",encoding="utf8") as cases_file:
    with open(filename,"w",encoding="utf8") as results_file:
        for line in cases_file:
            # Skip blank lines (e.g. a trailing empty line) instead of failing on the missing field.
            if not line.strip():
                continue
            fields=line.rstrip("\n").split("\t")
            sentence,senseid=fields[0],fields[1]
            # Gold sense ids look like word.pos.index: take the POS and the word from them.
            postag=senseid.split(".")[1]
            wordfromtext=senseid.split(".")[0]
            output,token_count=sense_Tag_Return_pipeline(sentence,postag,wordfromtext)
            totalTokens+=token_count

            print(output)
            #writing the output in the file
            results_file.write(output + "\n")

print(totalTokens)

illinois.noun.2
spoof.verb.2
anacrisis.noun.0
patronymy.noun.1
baba.noun.1
ultrarich.adjective.2
<waste.verb.4>
rooftop.noun.0
blancmange.noun.0
<batman.verb.0>
sexperience.noun.1
<strong-handed.adjective.1>
causal-final.adjective.0
foppery.noun.0
unfledged.adjective.2
anthropism.noun.0
fall_between_two_stools.verb.0
gunmetal.noun.2
dub.noun.5
downward.adjective.0
apodictic.adjective.1
never.adverb.0
<god.noun.12>
<overdo.verb.3>
patient.adjective.1
coldness.noun.3
assoon.adverb.0
<gripe.verb.5>
stagger.verb.4
light_up.verb.0
work.noun.7
lowly.adjective.1
connive.verb.0
line.noun.39
nature.noun.4
condition.verb.5
wingdom.noun.0
thread.noun.1
propriety.noun.6
exhilarate.verb.1
indifferent.adjective.4
illegitimate.adjective.3
meed.noun.0
reconvert.verb.0
edge.noun.5
law_unto_oneself.noun.1
ruthfully.adverb.1
<hand.noun.8>
<breakdown.noun.5>
tostado.noun.0
innovate.verb.0
huckle.noun.2
physics.noun.1
drive-in.noun.0
mongery.noun.1
billy_bunter.noun.0
<christian.noun.1>
<assay.verb.3>
<mis