In [1]:
import os, yaml, re
from openai import OpenAI, ChatCompletion

In [2]:
# Read the API credentials from a local YAML file so no secret is hardcoded in
# the notebook. safe_load only constructs plain YAML types (mappings, lists,
# scalars), which is all a key/value credentials file needs; the previous
# yaml.load(..., Loader=yaml.FullLoader) accepted a wider set of tags.
# NOTE: the variable name `cadentials` (sic) is kept unchanged in case other
# cells refer to it.
with open('credentials.yaml') as f:
    cadentials = yaml.safe_load(f)

# Export the key as an environment variable for the rest of the session.
os.environ['OPENAI_API_KEY'] = cadentials['OPENAI_API_KEY']

In [3]:
# Shared OpenAI client used by complete_model. No API key is passed explicitly;
# presumably the client picks up the OPENAI_API_KEY environment variable set in
# the previous cell — confirm against the SDK documentation.
client = OpenAI()

In [5]:
def complete_model(USER_MESSAGE):
    """Send one user message to the chat model and return the reply text.

    The request is issued through the module-level ``client`` against
    ``gpt-4o`` with a fixed system prompt, ``temperature=0`` and
    ``max_tokens=500``.

    Args:
        USER_MESSAGE: Text placed in the ``user`` turn of the conversation.

    Returns:
        The message content of the first returned choice, passed through
        ``str``.
    """
    # The system prompt is reproduced verbatim (including its spelling) because
    # it is part of what the model receives.
    system_turn = {
        "role": "system",
        "content": "You are a helpful assitant to identify the senseID for an ambigous word",
    }
    user_turn = {"role": "user", "content": USER_MESSAGE}

    response = client.chat.completions.create(
        model='gpt-4o',
        messages=[system_turn, user_turn],
        temperature=0,
        max_tokens=500,
    )

    first_choice = response.choices[0]
    return str(first_choice.message.content)

In [6]:
# Load the raw sense inventory as a single string
with open('newsense.txt', 'r', encoding="utf8") as file:
    data = file.read()

# Entries are separated from each other by one empty line
entries = data.strip().split('\n\n')
print("No of sense tags in FEWS dataset ",len(entries))

# Every line of an entry is split at its first ':'; only the stripped text after
# it is kept, in file order, giving one list of field values per entry. A line
# without ':' raises ValueError at the unpacking step.
list_of_lists = [
    [value.strip() for _, value in (line.split(':', 1) for line in block.split('\n'))]
    for block in entries
]

# Show the first parsed entry as a sanity check
print(list_of_lists[0])

No of sense tags in FEWS dataset  40
['apple.noun.0', 'apple', 'A common, round fruit produced by the tree "Malus domestica", cultivated in temperate climates. (from 9th c.)', '', '1', '']


In [7]:
def retrieve_meanings(word, data):
    """Collect the glosses of the mapped senses of ``word``.

    Args:
        word: Lemma to look up. It is compared with the part of each sense id
            that precedes the first ``"."``.
        data: Iterable of parsed sense entries (e.g. ``list_of_lists``). For
            each entry, index 0 is read as the sense id, index 2 as the gloss
            and the last element as the synonyms string.

    Returns:
        dict mapping sense id -> gloss. When the synonyms field is not empty,
        ``", synonyms :" + synonyms`` is appended to the gloss. Empty when no
        entry matches.

    Raises:
        OSError: if ``fewmapping.txt`` cannot be opened.
    """
    # The context manager guarantees the mapping file is closed; it was
    # previously opened with a bare open() and never closed.
    with open("fewmapping.txt", "r", encoding="utf-8") as mapping_file:
        info = mapping_file.read()

    meanings_dict = {}
    for entry in data:
        sense_id = entry[0]
        # NOTE(review): this is a substring test against the whole file text, so
        # an id such as "x.noun.1" also matches when only "x.noun.12" is listed.
        # Left as-is because the layout of fewmapping.txt is not visible here.
        if word != sense_id.split(".")[0] or sense_id not in info:
            continue

        gloss = entry[2]
        synonyms = entry[-1]
        if synonyms != "":
            meanings_dict[sense_id] = gloss + ", synonyms :" + synonyms
        else:
            meanings_dict[sense_id] = gloss
    return meanings_dict

In [8]:
import csv
from transformers import AutoTokenizer, AutoModel
import torch

# Load the BGE small model and tokenizer once at module level so that
# encode_sentence can reuse them on every call.
MODEL_NAME = "BAAI/bge-small-en"  # Example small model

# from_pretrained resolves MODEL_NAME to a checkpoint — presumably downloading
# it on first use and reading a local cache afterwards (TODO confirm for this
# environment).
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

def encode_sentence(sentence):
    """Embed ``sentence`` with the module-level tokenizer and model.

    The input is tokenized into PyTorch tensors (with truncation and padding
    enabled), run through the model without gradient tracking, and the last
    hidden state is averaged over ``dim=1``.

    Args:
        sentence: Text handed to the tokenizer.

    Returns:
        The pooled tensor after ``squeeze()``, converted with ``tolist()``.
    """
    encoded = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)

    with torch.no_grad():
        model_output = model(**encoded)
        pooled = model_output.last_hidden_state.mean(dim=1)

    return pooled.squeeze().tolist()

In [9]:
import json
import numpy as np
# Load the dictionary from the JSON file.
# find_most_relevant_sentences indexes this structure as
# word -> list of records carrying a "vector" and a "sentence" key.
# NOTE: that function re-reads the same file into a local variable of the same
# name on every call, so it does not depend on this module-level copy.
with open("my_dictionary.json", "r", encoding="utf-8") as json_file:
    data_dict = json.load(json_file)

In [10]:
def find_most_relevant_sentences(input_sentence, target_word):
    """Return the stored example sentences closest to ``input_sentence``.

    ``my_dictionary.json`` is read on every call. The records stored under
    ``target_word`` are ranked by cosine similarity between their ``"vector"``
    and the embedding of ``input_sentence`` produced by ``encode_sentence``.

    Args:
        input_sentence: Sentence to embed and compare against the examples.
        target_word: Key to look up in the dictionary file.

    Returns:
        The ``"sentence"`` values of the (at most) three best-scoring records,
        joined with newlines, best first. If ``target_word`` is not a key of
        the dictionary, a "not found" message string is returned instead.
    """
    # Read the word -> records mapping from disk
    with open("my_dictionary.json", "r", encoding="utf-8") as json_file:
        data_dict = json.load(json_file)

    # Guard clause: unknown words produce a message rather than an exception
    if target_word not in data_dict:
        return f"The word '{target_word}' is not found in the dictionary."

    # Embed the query once; its norm is the same for every comparison
    query_vec = np.array(encode_sentence(input_sentence))
    query_norm = np.linalg.norm(query_vec)

    # Pair each record's cosine similarity with its sentence
    scored = []
    for record in data_dict[target_word]:
        cosine = np.dot(query_vec, np.array(record["vector"])) / (
            query_norm * np.linalg.norm(record["vector"])
        )
        scored.append((cosine, record["sentence"]))

    # Highest similarity first; only the sentence text of the top three is kept
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return "\n".join(f"{text}" for _, text in scored[:3])

In [11]:
def sense_Tag_Return_pipeline(sentence,wordwsd):
    """Ask the chat model for the sense id of ``wordwsd`` as used in ``sentence``.

    Steps:
      1. Collect the candidate sense ids and glosses with ``retrieve_meanings``
         from the module-level ``list_of_lists``.
      2. Fetch example sentences with ``find_most_relevant_sentences``; the
         lookup key is ``wordwsd`` prefixed with a single space.
      3. Fill the prompt with the definitions, the examples and the sentence,
         and send it through ``complete_model``.

    Args:
        sentence: Sentence containing the ambiguous word.
        wordwsd: The ambiguous word.

    Returns:
        The text returned by ``complete_model`` for the assembled prompt.
    """
    filtered_definitions = retrieve_meanings(wordwsd, list_of_lists)
    lookup_key = " " + wordwsd
    examples = find_most_relevant_sentences(sentence, lookup_key)

    # The prompt is assembled from adjacent literals with explicit newlines so
    # that the trailing spaces of the original wording stay visible; the
    # rendered text ends with a newline followed by one space.
    prompt = (
        'You are going to identify the corresponding sense tag of an ambiguous word in English sentences. Use multiple reasoning strategies to increase confidence in your answer.\n'
        f'1. The word "{wordwsd}" has different meanings. Below are possible meanings. Comprehend the sense tags and meanings. Synonyms are provided if available. {filtered_definitions}\n'
        f'2. You can learn more on the usage of each word and the its sense through the examples below. Each sentence is followed by its corresponding sense id. "{examples}"\n'
        f'3. Now carefully examine the sentence below. The ambiguous word is {wordwsd}. {sentence}\n'
        '4. Analyze the sentence using the following techniques and identify the meaning of the ambiguous word. \n'
        '   Focus on keywords in the sentence surrounding the ambiguous word. \n'
        '   Think about the overall topic and intent of the sentence. Decide on the sense of the word that makes the most logical sense within the context. \n'
        '5. Based on the identified meaning, try to find the most appropriate senseIDs from the sense tag list provided.\n'
        '6. Return only the finalized senseID. Do not add extra details and explanation. Only senseIDS expected.\n'
        ' '
    )

    return complete_model(prompt)


In [12]:
import json
# Evaluate the pipeline on WSD_1.csv: one predicted sense id is printed per row.
# newline='' is what the csv module asks for when it is given a file object, so
# that quoted fields containing line breaks are parsed correctly.
with open('WSD_1.csv', 'r', encoding='latin-1', newline='') as file:
    csv_reader = csv.DictReader(file)
    for row in csv_reader:
        sentence = row['sentence']
        # The target word is the part of the 'senseid' column before the first dot
        word = row['senseid'].split(".")[0]
        output = sense_Tag_Return_pipeline(sentence, word)
        print(output)
# No explicit file.close() is needed: the with-statement has already closed the file.
    



apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.0
apple.noun.0
apple.noun.0
apple.noun.0
apple.noun.0
apple.noun.0
apple.noun.0
apple.noun.0
apple.noun.0
apple.noun.0
apple.noun.0
apple.noun.0
apple.noun.0
apple.noun.0
apple.noun.0
apple.noun.0
apple.noun.0
apple.nou

In [13]:
import json
# Evaluate the pipeline on WSD_3.csv: one predicted sense id is printed per row.
# newline='' is what the csv module asks for when it is given a file object, so
# that quoted fields containing line breaks are parsed correctly.
with open('WSD_3.csv', 'r', encoding='latin-1', newline='') as file:
    csv_reader = csv.DictReader(file)
    for row in csv_reader:
        sentence = row['sentence']
        # The target word is the part of the 'senseid' column before the first dot
        word = row['senseid'].split(".")[0]
        output = sense_Tag_Return_pipeline(sentence, word)
        print(output)
# No explicit file.close() is needed: the with-statement has already closed the file.
    

apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.0
apple.noun.12
apple.noun.0
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.0
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.12
apple.noun.0
apple.noun.0
apple.noun.0
apple.noun.0
apple.noun.0
apple.noun.0
apple.noun.0
apple.noun.0
apple.noun.0
apple.noun.0
apple.noun.0
apple.noun.0
apple.noun.0
apple.noun.0
apple.noun.0
apple.noun.12
apple.noun.0
apple.noun.