In [22]:
# Import the required libraries and download the NLTK resources used below
from lxml import etree, objectify
import nltk
import string
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
import pandas as pd 
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
!pip install swifter
import swifter

In [2]:
# Look up the senses of a word in the original dictionary
def getSenses(word, pos):
    """Return one attribute dict per sense listed for ``word.pos``.

    Queries the module-level ``Tree`` for ``lexelt`` elements whose ``item``
    attribute equals ``"<word>.<pos>"``. Only the first match is used; each
    of its children is converted to a plain ``dict`` mapping attribute name
    to attribute value.

    Returns:
        list[dict]: one dict per child element, or ``[]`` when no ``lexelt``
        matches.
    """
    matches = Tree.xpath("//lexelt[@item='%s.%s']" % (word, pos))
    if len(matches) < 1:
        return []
    # getchildren() is used on purpose: it returns the element's children.
    return [
        dict(zip(child.keys(), child.values()))
        for child in matches[0].getchildren()
    ]

In [3]:
# Dataset cleaning helpers
# Give the positional columns readable names and rebuild the index
def rename_columns(dataset):
    """Return a copy of ``dataset`` with named columns and a fresh index.

    Columns labelled ``0``, ``1`` and ``2`` become ``Target_Word``,
    ``Sense_ID`` and ``Sentence``. The index is then replaced by a default
    one (the old index is dropped). The input frame is not modified.
    """
    readable_names = {0: "Target_Word", 1: "Sense_ID", 2: "Sentence"}
    renamed = dataset.rename(columns=readable_names)
    return renamed.reset_index(drop=True)

In [4]:
# Convert the sentence column to lower case and remove digits and punctuation
def lowercase_cleaned_data(dataset, colname):
    """Add a ``lowercase_cleaned`` column derived from ``dataset[colname]``.

    Each sentence is lower-cased, every punctuation character except ``%``
    is replaced by a space, and all runs of digits are removed.

    Side effects:
        * ``dataset`` is modified in place (the new column is added) and the
          same object is returned.
        * ``string.punctuation`` is rebound without ``%``. Other functions in
          this notebook read ``string.punctuation`` afterwards, so this
          rebinding is kept on purpose.

    Args:
        dataset: pandas DataFrame holding the raw sentences.
        colname: name of the column containing the sentences (strings).

    Returns:
        The same DataFrame, now containing ``lowercase_cleaned``.
    """
    # Keep '%' out of the punctuation set so "%%" survives the cleaning.
    string.punctuation = string.punctuation.replace('%', '')
    # Build the translation table once instead of once per word.
    punctuation_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

    def _clean(sentence):
        return ' '.join(
            word.lower().translate(punctuation_to_space) for word in sentence.split()
        )

    cleaned = dataset[colname].apply(_clean)
    # regex=True is required: newer pandas treats the pattern as a literal
    # string by default, which would leave every digit in place.
    dataset["lowercase_cleaned"] = cleaned.str.replace(r'\d+', '', regex=True)
    return dataset

In [5]:
# Retrieve the WordNet POS tag and the lemma for every word of a sentence
def retreive_pos_wordnet(sentence):
    """Tokenise, POS-tag and lemmatise ``sentence``.

    The sentence is lower-cased, punctuation (except ``%``) is replaced by
    spaces and the result is split on whitespace. The whole token list is
    tagged once with ``nltk.pos_tag``; the first letter of each tag is mapped
    to a WordNet POS constant (defaulting to noun) and used to lemmatise the
    token.

    Args:
        sentence: the text to process (string).

    Returns:
        list[list]: one ``[token, wordnet_pos, lemma]`` triple per token, in
        sentence order; ``[]`` when the sentence has no tokens.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    # '%' is excluded explicitly so "%%" is preserved no matter whether
    # string.punctuation has already been altered elsewhere.
    punctuation = string.punctuation.replace('%', '')
    punctuation_to_space = str.maketrans(punctuation, ' ' * len(punctuation))
    list_words = sentence.lower().translate(punctuation_to_space).split()
    if not list_words:
        return []

    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    final_list = []
    # Tag the sentence once; tagging inside the loop repeated the same work
    # for every token.
    for word, treebank_tag in nltk.pos_tag(list_words):
        final_tag = tag_dict.get(treebank_tag[0].upper(), wordnet.NOUN)
        final_list.append([word, final_tag, lemmatizer.lemmatize(word, final_tag)])
    return final_list

In [6]:
# Remove stop words and words shorter than 3 characters
def remove_stop_words_from_pos(pos_input_list):
    """Filter a list of ``[token, pos, lemma]`` entries.

    An entry is kept when its lemma (index 2) is not an English stop word
    and is either longer than two characters or exactly ``"%%"``.

    Args:
        pos_input_list: iterable of indexable entries whose element 2 is the
            lemma string.

    Returns:
        list: the surviving entries, in their original order.
    """
    # A set gives constant-time membership tests instead of scanning the
    # stop-word list once per token.
    stop = set(stopwords.words('english'))
    return [
        pos
        for pos in pos_input_list
        if pos[2] not in stop and (len(pos[2]) > 2 or pos[2] == "%%")
    ]

In [7]:
# Lemmatise a sentence (used to clean dictionary glosses and examples)
def lemmatize_sentences(sentence):
    """Return ``sentence`` as a space-separated string of lemmas.

    The sentence is lower-cased, every character currently listed in
    ``string.punctuation`` is replaced by a space, and the text is split on
    whitespace. The token list is POS-tagged once; each token is lemmatised
    with the WordNet POS derived from the first letter of its tag (noun when
    the letter is not J/N/V/R).

    Args:
        sentence: the text to lemmatise (string).

    Returns:
        str: the lemmas joined by single spaces; ``''`` for a sentence with
        no tokens.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    punctuation_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    list_words = sentence.lower().translate(punctuation_to_space).split()
    if not list_words:
        return ''

    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    # Tag once for the whole sentence rather than once per token.
    return ' '.join(
        lemmatizer.lemmatize(word, tag_dict.get(treebank_tag[0].upper(), wordnet.NOUN))
        for word, treebank_tag in nltk.pos_tag(list_words)
    )

In [8]:
def getLemmaExamplesFromSenseDict(word_pos, sense):
    """Return the lemmatised gloss and examples of ``sense``, with caching.

    The result is cached in the module-level ``SenseLemmaDictionary`` under
    the key ``"<word_pos>_<sense id>"``. On a cache miss the gloss and each
    ``.``-separated example sentence are lemmatised with
    ``lemmatize_sentences`` and combined as ``"<gloss> | <examples>"``.

    Example sentences are joined with a space. Joining them with a bare
    ``'.'`` glued the last word of one sentence to the first word of the
    next, producing tokens that could never match a context word.

    Args:
        word_pos: dictionary item name such as ``"bank.n"``.
        sense: dict with an ``id`` entry and, normally, ``gloss`` and
            ``examples`` entries; a missing gloss or examples entry is
            treated as empty text.

    Returns:
        str: whitespace-tokenisable text for the sense.
    """
    lemmaSenseKey = word_pos.strip() + "_" + sense.get('id')
    if lemmaSenseKey in SenseLemmaDictionary:
        return SenseLemmaDictionary[lemmaSenseKey]

    gloss = lemmatize_sentences((sense.get('gloss') or '').lower())
    lemmatized_examples = [
        lemmatize_sentences(sentence.lower())
        for sentence in (sense.get('examples') or '').split(".")
    ]
    # Empty segments (e.g. after a trailing '.') contribute nothing.
    sense_examples = gloss + " | " + ' '.join(text for text in lemmatized_examples if text)
    SenseLemmaDictionary[lemmaSenseKey] = sense_examples
    return sense_examples

In [26]:
# Model 1 : Simple Lesk algorithm
def calculate_sense_model_one(target_word, pos_data):
    """Pick the sense whose dictionary text overlaps most with the context.

    ``target_word`` is split on ``"."`` into word and POS, which are passed
    to ``getSenses``. For every sense, the number of distinct tokens shared
    between its lemmatised gloss/examples and the lemmas in ``pos_data``
    (element 2 of each entry) is recorded.

    Returns:
        The id of the highest-scoring sense; ties go to the sense listed
        first.

    Raises:
        ValueError: from ``max`` when no sense is found for the target.
    """
    parts = target_word.split(".")
    candidate_senses = getSenses(parts[0].strip(), parts[1].strip())
    context_lemmas = set(entry[2] for entry in pos_data)

    overlap_by_sense = {}
    for candidate in candidate_senses:
        sense_tokens = getLemmaExamplesFromSenseDict(target_word, candidate).split()
        overlap_by_sense[candidate.get('id')] = len(set(sense_tokens) & context_lemmas)

    return max(overlap_by_sense, key=overlap_by_sense.get)

In [14]:
# Calculating accuracies of all models
def calculate_accuracy(dataframe, column_name):
    """Return the percentage of rows whose prediction equals ``Sense_ID``.

    Both the ``Sense_ID`` value and the value in ``column_name`` are
    converted with ``int`` before being compared.

    Args:
        dataframe: pandas DataFrame with a ``Sense_ID`` column.
        column_name: name of the column holding the predicted sense ids.

    Returns:
        float: accuracy in the range 0-100; ``0.0`` for an empty frame
        (previously this raised ``ZeroDivisionError``).

    Raises:
        ValueError / TypeError: when a value cannot be converted by ``int``.
    """
    total = len(dataframe)
    if total == 0:
        return 0.0
    # Zipping the two columns avoids building a Series per row.
    matches = sum(
        int(expected) == int(predicted)
        for expected, predicted in zip(dataframe['Sense_ID'], dataframe[column_name])
    )
    return (matches / total) * 100

# Write a results frame to disk as CSV
def exportToCSV(input_data_frame, csv_path):
    """Save ``input_data_frame`` to ``csv_path`` without the working columns.

    The ``Sentence``, ``lowercase_cleaned`` and ``pos_data`` columns are left
    out of the file, and the index is not written. The input frame itself is
    not modified.
    """
    working_columns = ['Sentence', 'lowercase_cleaned', 'pos_data']
    input_data_frame.drop(columns=working_columns).to_csv(csv_path, index=False)

In [None]:
# Model 2 : Original Lesk algorithm
def limitizeContextMapModelTwo(context_sense):
    """Concatenate the lemmatised dictionary text of all context senses.

    Each entry of ``context_sense`` holds the context item name at index 1
    and the list of its sense dicts at index 2. The text of every sense is
    fetched through ``getLemmaExamplesFromSenseDict``.

    The texts are joined with a space. Appending them with no separator
    fused the last token of one sense with the first token of the next.

    Args:
        context_sense: list of entries as described above.

    Returns:
        str: all sense texts separated by spaces; ``''`` when there are none.
    """
    return ' '.join(
        getLemmaExamplesFromSenseDict(context_data[1], sense_data)
        for context_data in context_sense
        for sense_data in context_data[2]
    )


def getContextDictModelTwo(target_data, pos_data, corpus=False):
    """Collect dictionary senses for the words around the target marker.

    The first entry of ``pos_data`` whose token (index 0) is ``"%%"`` is
    located and the position just before it is used as the anchor. Starting
    two positions away from the anchor, the function walks outwards on both
    sides (at most 30 steps) and looks up every lemma that has at least
    three characters and differs from the target word.

    Args:
        target_data: dictionary item name such as ``"bank.n"``; the part
            before the first ``"."`` is the target word.
        pos_data: list of ``[token, pos, lemma]`` entries.
        corpus: when true, senses come from ``getNewSenses`` instead of
            ``getSenses``.

    Returns:
        list: ``[label, "<lemma>.<pos>", senses, distance]`` entries for the
        context words that have at least one sense. Left-hand and right-hand
        neighbours alternate, nearest first. ``[]`` when ``pos_data``
        contains no ``"%%"`` marker (this used to fail with an unbound
        ``target_index``).
    """
    max_steps = 30  # upper bound on how far the window grows on each side
    target_word = target_data.split(".")[0]

    target_index = None
    for position, entry in enumerate(pos_data):
        if entry[0] == "%%":
            target_index = position - 1
            break
    if target_index is None:
        return []

    # NOTE(review): left-hand entries are labelled with the surface token at
    # the anchor, right-hand entries with the target word from target_data.
    # The asymmetry is preserved as found - confirm which one is intended.
    # NOTE(review): the anchor is the entry *before* the first marker -
    # confirm that this matches the data layout.
    anchor_token = pos_data[target_index][0]
    context_sense = []

    def _collect(position, label, distance):
        # Look up one neighbour and record it when the dictionary knows it.
        lemma = pos_data[position][2].strip()
        if len(lemma) < 3 or lemma == target_word:
            return
        context_pos = pos_data[position][1].strip()
        lookup = getNewSenses if corpus else getSenses
        senses = lookup(lemma, context_pos)
        if len(senses) >= 1:
            context_sense.append([label, lemma + "." + context_pos, senses, distance])

    left = target_index - 2
    right = target_index + 2
    steps = 0
    while (left >= 0 or right < len(pos_data)) and steps < max_steps:
        if left >= 0:
            _collect(left, anchor_token, target_index - left)
        if right < len(pos_data):
            _collect(right, target_word, right - target_index)
        left -= 1
        right += 1
        steps += 1

    return context_sense


def calculateSenseIdModelTwo(target_word_pos, pos_without_stopwords):
    """Pick the target sense that overlaps most with its neighbours' senses.

    The senses of the target come from ``getSenses``. The context text is
    the concatenated dictionary text of the surrounding words, built by
    ``getContextDictModelTwo`` and ``limitizeContextMapModelTwo``. Each
    target sense is scored by the number of distinct tokens it shares with
    that context text.

    Args:
        target_word_pos: dictionary item name such as ``"bank.n"``.
        pos_without_stopwords: list of ``[token, pos, lemma]`` entries.

    Returns:
        The id of the highest-scoring sense; ties go to the sense listed
        first.

    Raises:
        ValueError: from ``max`` when the target has no senses.
    """
    # The per-row debug print was removed: it flooded the output when this
    # function is applied to a whole DataFrame.
    target_word_details = target_word_pos.split(".")
    target_senses = getSenses(target_word_details[0].strip(), target_word_details[1].strip())
    context_sentence = limitizeContextMapModelTwo(
        getContextDictModelTwo(target_word_pos, pos_without_stopwords)
    )
    # The context tokens are the same for every sense, so build the set once.
    context_example_words = set(context_sentence.split())

    score_map = {}
    for sense in target_senses:
        sense_examples = getLemmaExamplesFromSenseDict(target_word_pos.strip(), sense)
        common = set(sense_examples.split()).intersection(context_example_words)
        score_map[sense.get('id')] = len(common)

    key_max = max(score_map, key=score_map.get)
    return key_max

In [16]:
# Build the augmented dictionary by adding the training data (for corpus Lesk)
def newDictionary():
    """Write ``new_dictionary.xml``: the dictionary plus training sentences.

    ``dictionary.xml`` is parsed with a recovering objectify parser. For each
    row of the module-level ``train_data`` (after ``rename_columns``), the
    sentence is appended to the ``examples`` attribute of the sense whose
    ``id`` equals the row's ``Sense_ID`` inside the first ``lexelt`` whose
    ``item`` equals the row's ``Target_Word``. The resulting tree is
    serialised to ``new_dictionary.xml``.

    Rows whose target word has no ``lexelt`` in the dictionary are skipped.
    """
    parser = objectify.makeparser(recover=True)
    # Use a context manager so the file handle is closed after reading.
    with open('dictionary.xml') as dictionary_file:
        tree = objectify.fromstring(dictionary_file.read(), parser)

    train_data_new = rename_columns(train_data)
    for _, row in train_data_new.iterrows():
        target_word = row['Target_Word'].strip()
        sense_id = str(row['Sense_ID'])
        sentence_to_add = row['Sentence']

        item = tree.xpath("//lexelt[@item='%s']" % (target_word))
        if len(item) < 1:
            # No dictionary entry for this target word: nothing to extend.
            continue

        for item_sense in item[0].getchildren():
            if item_sense.get('id') == sense_id:
                existing_examples = item_sense.get('examples') or ''
                # Separate the appended sentence with a space so its first
                # word is not fused with the last word already stored.
                if existing_examples:
                    item_sense.set('examples', existing_examples + " " + sentence_to_add)
                else:
                    item_sense.set('examples', sentence_to_add)

    xml_new = etree.tostring(tree, pretty_print=True)
    # Save the augmented XML
    with open(r"new_dictionary.xml", "wb") as f:
        f.write(xml_new)



In [9]:
def getLemmaExamplesFromCorpusSenseDict(word_pos, sense):
    """Return the lemmatised gloss and examples of ``sense``, with caching.

    Works on senses of the augmented dictionary and caches results in the
    module-level ``SenseLemmaCorpusDictionary`` under the key
    ``"<word_pos>_<sense id>"``. On a cache miss the gloss and each
    ``.``-separated example sentence are lemmatised with
    ``lemmatize_sentences`` and combined as ``"<gloss> | <examples>"``.

    Example sentences are joined with a space. A bare ``'.'`` separator
    fused the words on either side of each sentence boundary into a single
    token that could never match a context word.

    Args:
        word_pos: dictionary item name such as ``"bank.n"``.
        sense: dict with an ``id`` entry and, normally, ``gloss`` and
            ``examples`` entries; a missing gloss or examples entry is
            treated as empty text.

    Returns:
        str: whitespace-tokenisable text for the sense.
    """
    lemmaSenseKey = word_pos.strip() + "_" + sense.get('id')
    if lemmaSenseKey in SenseLemmaCorpusDictionary:
        return SenseLemmaCorpusDictionary[lemmaSenseKey]

    gloss = lemmatize_sentences((sense.get('gloss') or '').lower())
    lemmatized_examples = [
        lemmatize_sentences(sentence.lower())
        for sentence in (sense.get('examples') or '').split(".")
    ]
    # Skip empty segments such as the one after a trailing '.'.
    sense_examples = gloss + " | " + ' '.join(text for text in lemmatized_examples if text)
    SenseLemmaCorpusDictionary[lemmaSenseKey] = sense_examples
    return sense_examples

In [15]:
# Look up the senses of a word in the new (augmented) dictionary
## TODO: pass the tree as a parameter and merge this with getSenses
def getNewSenses(word, pos):
    """Return one attribute dict per sense of ``word.pos`` from ``TreeNew``.

    The module-level ``TreeNew`` is searched for ``lexelt`` elements whose
    ``item`` attribute is ``"<word>.<pos>"``. The children of the first
    match are turned into plain dicts of attribute name -> value. An empty
    list is returned when nothing matches.
    """
    found = TreeNew.xpath("//lexelt[@item='%s.%s']" % (word, pos))
    collected = []
    if len(found) >= 1:
        first_entry = found[0]
        for sense_node in first_entry.getchildren():
            attribute_pairs = zip(sense_node.keys(), sense_node.values())
            collected.append(dict(attribute_pairs))
    return collected

In [None]:
# Main function
if __name__ == "__main__":
    # Caches shared with the getLemmaExamples* helpers. Plain assignments at
    # module scope already create module-level names, so no `global`
    # declarations are needed here.
    SenseLemmaDictionary = {}
    SenseLemmaCorpusDictionary = {}
    # Read the dictionary file - original
    Parser = objectify.makeparser(recover=True)
    # The context manager closes the file once its content has been read.
    with open('dictionary.xml') as dictionary_file:
        Tree = objectify.fromstring(dictionary_file.read(), Parser)

    # read the train, test and validation data ('|'-delimited, no header row)
    train_data = pd.read_csv (r'train.data',header=None,delimiter = "|")
    test_data = pd.read_csv (r'test.data',header=None,delimiter = "|")
    validation_data = pd.read_csv (r'validate.data',header=None,delimiter = "|")

    #rename columns for all the datasets
    train_data_new = rename_columns(train_data)
    test_data_new = rename_columns(test_data)
    validation_data_new = rename_columns(validation_data)

    #create new dictionary
    #newDictionary()
    #ParserNew = objectify.makeparser(recover=True)
    #TreeNew = objectify.fromstring(''.join(open('new_dictionary.xml').readlines()), ParserNew)

    ################################# Validation data ###################################
    # validation set cleaning process
    # NOTE: method_one_validation_df is the same object as validation_data_new;
    # lowercase_cleaned_data adds its column to that frame in place.
    method_one_validation_df = validation_data_new
    method_one_validation_df = lowercase_cleaned_data(method_one_validation_df, 'Sentence')
    method_one_validation_df["pos_data"] = method_one_validation_df['lowercase_cleaned'].swifter.apply(lambda sentence: retreive_pos_wordnet(sentence))
    method_one_validation_df["pos_data"] = method_one_validation_df["pos_data"].swifter.apply(lambda pos_data_list: remove_stop_words_from_pos(pos_data_list))

    # Model 1 - Simple lesk
    method_one_validation_df['simple_lesk_sense_id'] = method_one_validation_df.swifter.apply(lambda x: calculate_sense_model_one(x['Target_Word'], x['pos_data']), axis=1)

    # The triple-quoted blocks below are disabled code for models 2-5; they
    # are bare string expressions and are not executed.
    """
    # Model 2 - Original Lesk
    method_two_validation_df = method_one_validation_df
    method_two_validation_df['original_lesk_sense_id'] = method_two_validation_df.swifter.apply(lambda x: calculateSenseIdModelTwo(x['Target_Word'], x['pos_data']), axis=1)
    
    # Model 3 - Advance original lesk
    method_three_validation_df = method_two_validation_df
    method_three_validation_df['adv_original_lesk_sense_id'] = method_three_validation_df.swifter.apply(lambda x: calculateSenseIdModel3(x['Target_Word'], x['pos_data']), axis=1)
    
    # Model 4 - Corpus lesk
    method_four_validation_df = method_three_validation_df
    method_four_validation_df['corpus_lesk_sense_id'] = method_four_validation_df.swifter.apply(lambda x: calculate_sense_corpus_model_one(x['Target_Word'], x['pos_data']), axis=1)
    
    # Model 5 - Adv Corpus lesk
    method_five_validation_df = method_four_validation_df
    method_five_validation_df['adv_corpus_lesk_sense_id'] = method_five_validation_df.swifter.apply(lambda x: calculateSenseIdModel3(x['Target_Word'], x['pos_data'], True), axis=1)
    """

    print("Accuracy of validation data for simple_lesk: " + str(calculate_accuracy(method_one_validation_df, "simple_lesk_sense_id")))
    """
    print("Accuracy of validation data for original_lesk: " + str(calculate_accuracy(method_two_validation_df, "original_lesk_sense_id")))
    print("Accuracy of validation data for adv_original_lesk: " + str(calculate_accuracy(method_three_validation_df, "adv_original_lesk_sense_id")))
    print("Accuracy of validation data for corpus_lesk: " + str(calculate_accuracy(method_four_validation_df, "corpus_lesk_sense_id")))
    print("Accuracy of validation data for adv_corpus_lesk: " + str(calculate_accuracy(method_five_validation_df, "adv_corpus_lesk_sense_id")))
    """

    # Export validation results to CSV
    exportToCSV(method_one_validation_df, r'validation_results_SimpleLesk.csv')
    

In [None]:
    ################################### Test data ######################################

    # Clean the test set: lower-case and strip, then tag/lemmatise, then drop stop words
    method_one_test_df = lowercase_cleaned_data(test_data_new, 'Sentence')
    method_one_test_df["pos_data"] = method_one_test_df['lowercase_cleaned'].swifter.apply(
        lambda cleaned_sentence: retreive_pos_wordnet(cleaned_sentence)
    )
    method_one_test_df["pos_data"] = method_one_test_df["pos_data"].swifter.apply(
        lambda tagged_words: remove_stop_words_from_pos(tagged_words)
    )

    # Model 1 - Simple Lesk: one predicted sense id per row
    method_one_test_df['simple_lesk_sense_id'] = method_one_test_df.swifter.apply(
        lambda row: calculate_sense_model_one(row['Target_Word'], row['pos_data']),
        axis=1,
    )

    # Export the test-set predictions to CSV
    exportToCSV(method_one_test_df, r'test_data_results_SimpleLesk.csv')
    

In [None]:
    ################################### Training data ######################################

    # Clean the training set: lower-case and strip, then tag/lemmatise, then drop stop words
    method_one_train_df = lowercase_cleaned_data(train_data_new, 'Sentence')
    method_one_train_df["pos_data"] = method_one_train_df['lowercase_cleaned'].swifter.apply(
        lambda cleaned_sentence: retreive_pos_wordnet(cleaned_sentence)
    )
    method_one_train_df["pos_data"] = method_one_train_df["pos_data"].swifter.apply(
        lambda tagged_words: remove_stop_words_from_pos(tagged_words)
    )

    # Model 1 - Simple Lesk: one predicted sense id per row
    method_one_train_df['simple_lesk_sense_id'] = method_one_train_df.swifter.apply(
        lambda row: calculate_sense_model_one(row['Target_Word'], row['pos_data']),
        axis=1,
    )

    # Report the accuracy on the training data
    train_accuracy = calculate_accuracy(method_one_train_df, "simple_lesk_sense_id")
    print("Accuracy of training data for simple_lesk: " + str(train_accuracy))

    # Export the training-set predictions to CSV
    exportToCSV(method_one_train_df, r'training_data_results.csv')
    

Pandas Apply:   0%|          | 0/22281 [00:00<?, ?it/s]