In [1]:
import nltk
import sys
import xml.etree.ElementTree as ET
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic as wn_ic
from nltk.stem.porter import PorterStemmer
from nltk.wsd import lesk
from nltk.corpus import stopwords
from nltk.corpus import brown

## Training Data Collection

In [10]:
# Gets root of the xml file
def get_root_of_training_file(training_file_name):
    xml_file = ET.parse(training_file_name)
    return xml_file.getroot()

# Get all text of a given node. before and after target word.
def get_node_text(node):
    if node.text:
        result = node.text
    else:
        result = ''
    for child in node:
        if child.tail is not None:
            result += child.tail
    return result
    
# Get all the sentences in the lexsub.xml file that have .n in their target word.
def get_noun_sentences_to_parse(root):
    sentences = {}
    for lexelt in root:
        target = lexelt.attrib['item']
        target = target.split('.')
        target_tags = target[1:len(target)]
        if 'n' in target_tags:
            target_word = target[0]
            for instance in lexelt:
                instance_id = instance.attrib['id']
                for context in instance:
                    sentence_text = []
                    if context.text:
                        sentence_text.append(context.text)
                    else: 
                        sentence_text.append('')
                    for child in context:
                        sentence_text.append(child.tail)
                        sentence_text.append(child.text)
                sentences[instance_id] = sentence_text
    return sentences
        
# Get PoS tags for every noun_sentence given
def get_pos_for_sentences(noun_sentences):
    parsed_sentences = {}
    for key in noun_sentences:
        sentence_to_find = noun_sentences[key][0] + noun_sentences[key][2] + noun_sentences[key][1]
        parsed_sentences[key] = get_pos_tags(sentence_to_find)
    return parsed_sentences

# Get PoS tags for a given sentence using nltk package
def get_pos_tags(sentence_to_find):                
    tokenized = nltk.word_tokenize(sentence_to_find)
    return nltk.pos_tag(tokenized)            

## Preprocess sentence:

In [3]:
# Use a stemmer on the target word
def stem_target_word(word):
    stemmer = PorterStemmer()
    stemword = stemmer.stem(word)
    if wn.synsets(stemword) is None:
        stemword = word
    return stemword

# Return sentence in lowercase
def sentences_to_lowercase(sentence):
    return ([segment.lower() for segment in sentence])

# Preprocess a given sentence with it stemmed target word and lowercase sentence.
def preprocess_sentence(sentence):
    lowercase_sentence = sentences_to_lowercase(sentence)
    stemmed_target_word = stem_target_word(lowercase_sentence[2])
    return lowercase_sentence,stemmed_target_word


## Word Similarity 

### Lesk Algorithm

In [23]:
# Parse whole sentence into segments
def parse_sentence_to_array(sentence):
    return (sentence[0] + sentence[2] + sentence[1]).split()

# Run the lesk algorithm on a given sentence and target word
def perform_lesk(sentence,word):
    return lesk(sentence,word,'n')

# Create a replacement dictionary for lesk.
def create_lesk_replacement_dictionary(noun_sentences):
    replacement_dictionary = {}
    # For every sentence
    for sentence in noun_sentences:
        # place sentence to lowercase and stem the target word. 
        preprocessed_sentence,target_word = preprocess_sentence(noun_sentences[sentence])     
        # Run the lesk algorithm to get the WSD and sense of the target word.
        context_synset = perform_lesk(parse_sentence_to_array(preprocessed_sentence),target_word)
        
        # this is to catch any errors from the preprocessing and make sure we get an answer back
        if context_synset is None:
            context_synset = perform_lesk(parse_sentence_to_array(preprocessed_sentence),noun_sentences[sentence][2])
            
        # For every lemma on the word sense
        for lemma in context_synset.lemmas():
            # If it doesn't equal the target word already
            if target_word != lemma.name():
                # Get the replacement word
                replacement_word = lemma.name()
                # Check if there is a split in the sentence
                if '_' in replacement_word:
                    split_word = ''
                    for word in replacement_word.split('_'):
                        split_word += ' ' + word
                    replacement_word = split_word[1:]
                # Set the dictionary sentence id to equal the replacement.
                replacement_dictionary[sentence] = replacement_word
                break;
    return replacement_dictionary

### Graph sense prediction

In [24]:
# Preprocess post tag sentences. This removes all stop words from the sentence as well as punctuation.
def preprocess_pos_tags(pos_tags):
    processed_tags = {}
    for key in pos_tags:
        current_sentence = pos_tags[key]
        text_sentence = [(sentence[0].lower(),sentence[1]) for sentence in current_sentence]
        stopwords_removed =  [(w[0],w[1]) for w in text_sentence if not w[0] in stopwords.words('english')]
        punctuation = ['!',',','.','?',')','(']
        punctuation_removed = [(w[0],w[1]) for w in stopwords_removed if not w[0] in punctuation]
        processed_tags[key] = punctuation_removed
    return processed_tags
        
# Get all synsets based on the POS tag and given word
def get_synsets(pos_tuple):
    wn_pos_code = wordnet_pos_code(pos_tuple[1])
    if wn_pos_code is not None:
        return wn.synsets(pos_tuple[0],wn_pos_code)
    
# Map POST tags to wordnet tags.
def wordnet_pos_code(tag):
    if tag.startswith('NN'):
        return wn.NOUN
    elif tag.startswith('VB'):
        return wn.VERB
    elif tag.startswith('JJ'):
        return wn.ADJ
    elif tag.startswith('RB'): 
        return wn.ADV
    else:
        return None
                
# Create a dictionary graph for every synset within the sentence.
def create_graph(sentence):
    graph = {}
    for pos_tuple in sentence:
        synsets = get_synsets(pos_tuple)
        if synsets is not None:
            for s in synsets:
                graph[s] = [];
    return graph      

# Given a synset, recursively build a tree based on its lexical and semantic relations .
def build_synset_tree(level,synset,tree,max_level=4):
    if synset not in tree and level != 0:
        tree.append(synset)
    if level <= max_level:
        # Get Hyponyms
        for s in synset.hyponyms():
            build_synset_tree(level+1,s,tree)
        # Get Norminalisations
        for l in synset.lemmas():
            related_forms = l.derivationally_related_forms()
            for rf in related_forms:
                build_synset_tree(level+1,rf.synset(),tree) 
        return tree
    else:
        return tree  
        
# With all scores being calculated, chose the highest scoring senses, if 0 then pick first.
def get_senses_for_sentence(current_sentence,scores):
    word_sensed_tags = []
    # for tuple in current sentence
    for pos_tuple in current_sentence:
        # get the synsets of tuple
        synsets = get_synsets(pos_tuple)
        # if it's not none
        if synsets is not None: 
            if len(synsets) != 0:
                max_score = 0
                max_synset = synsets[0]
                for s in synsets:
                    if scores[s] < 0 and scores[s] > max_score:
                        max_score = scores[s]
                        max_synset = s
                word_sensed_tags.append((pos_tuple,max_synset))
            else:
                word_sensed_tags.append((pos_tuple,None))
        else:
            word_sensed_tags.append((pos_tuple,None))
    return word_sensed_tags

def get_target_word_replacement(senses,noun_sentence):
    for sense_tuple in senses:
        if sense_tuple[0][0] == noun_sentence[2]:
            synset = sense_tuple[1]
            word = ''
            if synset.lemmas()[0].name() != noun_sentence[2].lower():
                word = synset.lemmas()[0].name()
            else:
                if len(synset.lemmas()) > 1:
                    word = synset.lemmas()[1].name()
                else:
                    word = synset.lemmas()[0].name()
            if '_' in word:
                word = word.split('_')
                split_word = ''
                for split in word:
                    split_word += split + ' '
                word = split_word
            return word
                         
def run_graph_word_sense(noun_sentences,pos_tags):
    pos_tags = preprocess_pos_tags(pos_tags)
    replacement_words = {}
    for key in pos_tags:
        current_sentence = pos_tags[key]
        synset_graph = create_graph(current_sentence)
        for synset in synset_graph:
            tree = build_synset_tree(0,synset,[])
            for s in tree:
                if s in synset_graph.keys():
                    edges = synset_graph[synset]
                    if s not in edges:
                        edges.append(s)
                        synset_graph[synset] = edges
                        edges = synset_graph[s]
                        edges.append(synset)
                        synset_graph[s] = edges
        scores = {}
        for synset in synset_graph:
            degree = len(synset_graph[synset])/(len(synset_graph.keys())-1)
            scores[synset] = degree
        
        word_senses = get_senses_for_sentence(current_sentence,scores)
        replacement_words[key] = get_target_word_replacement(word_senses,noun_sentences[key])
    return replacement_words

## Create Evaluation Set

In [25]:
def load_in_gold_standard():
    with open('assignment_resources/gold.trial') as f:
        content = f.readlines()
    content.pop(0)
    content = [x.strip() for x in content]
    return [x.split() for x in content]

def get_noun_answers(content_array):
    noun_sentences = {}
    for evaluation in content_array:
        given_tag = (evaluation[0].split("."))[1]
        if (given_tag is 'n'):
            instance_id = evaluation[1]
            noun_sentences[instance_id] = evaluation
    return noun_sentences

def create_answer_tuples(noun_dictionary):
    for key in noun_dictionary:
        # Gold standard row and sliced answers
        gold_standard_row = noun_dictionary[key]
        gold_standard_answers = gold_standard_row[3:len(gold_standard_row)]
        noun_dictionary[key] = answer_list_to_tuples(gold_standard_answers)
    return noun_dictionary

def answer_list_to_tuples(gold_standard_answers):   
        answer_tuples = []
        i = 0
        while i <(len(gold_standard_answers)):
            if i is 0:
                word = gold_standard_answers[i]
            elif i is len(gold_standard_answers)-1:
                break;
            else:
                word = (gold_standard_answers[i].split(';'))[1]
            if check_if_tail_word(gold_standard_answers[i+1]):
                index,end_of_tail_word = get_tail_words(i,gold_standard_answers)
                word += end_of_tail_word
                mark = gold_standard_answers[index].split(';')[0]
                i = index
            else:
                mark = gold_standard_answers[i+1].split(';')[0]
                i += 1
            answer_tuples.append((word,mark))
        return answer_tuples
            
def check_if_tail_word(possible_word):
    return ';' not in possible_word

def get_tail_words(current_index,answer_list):
    found_last_tail = False
    index = current_index + 1
    tail_word = ''
    while found_last_tail is not True:
        if check_if_tail_word(answer_list[index]):
            tail_word += (' '+ answer_list[index])
            index +=1
        else:
            found_last_tail = True
    return index, tail_word


## Score evaluation

In [26]:
def score_replacements(replacement_dictionary,answer_dictionary):
    scores = {}
    max_score = 0
    for key in replacement_dictionary:
        replacement_word = replacement_dictionary[key]
        if key in answer_dictionary.keys():
            gold_standard_replacements = answer_dictionary[key]
        else:
            continue;
        word_max_score = 0
        for word,score in gold_standard_replacements:
            if int(score) > word_max_score:
                word_max_score = int(score)
            if replacement_word == word:
                scores[key] = int(score)
        max_score += word_max_score
        if key not in scores.keys():
            scores[key] = 0
    return max_score, scores 


def get_zero_scores(result_dictionary):
    bad_scores = {}
    for key in result_dictionary:
        score = result_dictionary[key]
        if score == 0:
            bad_scores[key] = 0
    return bad_scores

## Main Method

In [31]:
def main():
    
    # Training sentences
    training_file_name = 'assignment_resources/lexsub_trial.xml'
    root_of_training = get_root_of_training_file(training_file_name)
    noun_sentences = get_noun_sentences_to_parse(root_of_training)
    
    # POS training sentences
    map_parsed_to_nouns = get_pos_for_sentences(noun_sentences)


    graph_replacement_dictionary = run_graph_word_sense(noun_sentences,map_parsed_to_nouns)    
    # lesk_replacement_dictionary = create_lesk_replacement_dictionary(noun_sentences)

    
    # Load Evaluation Set
    gold_standard_contents = load_in_gold_standard()
    noun_dictionary = get_noun_answers(gold_standard_contents)
    answer_dictionary = create_answer_tuples(noun_dictionary)
    
    # Evaluate 
    max_score, scores = score_replacements(graph_replacement_dictionary,answer_dictionary)
    final_score = sum(scores.values())
    percentage = (final_score/max_score)*100
    poor_predictions = get_zero_scores(scores)

    print ('-'*40)
    print ('The max score that can be achieved for the sentences that were changed is: ',max_score)
    print ('-'*40)
    print ('The final score of the given replacements is:', final_score, '/', max_score , '(',percentage,'%)')
    print ('-'*40)
    print ('The following sentences were given a score of zero:')
    for key in poor_predictions:
        print ('-'*40)
        if graph_replacement_dictionary[key] != None:
            print(key ,': ',noun_sentences[key][0] + graph_replacement_dictionary[key] + noun_sentences[key][1])
        else:
            print('You could not find a suitable prediction for this sentence and returned None')
        print('The Gold Standard answers included:')
        for answer,mark in answer_dictionary[key]:
            print (answer,'-', mark)
        print('The original target word for this sentence was',noun_sentences[key][2])
        print('You decided that the best word to use was', graph_replacement_dictionary[key])
        print ('-'*40)
        

In [32]:
main()

----------------------------------------
The max score that can be achieved for the sentences that were changed is:  222
----------------------------------------
The final score of the given replacements is: 58 / 222 ( 26.126126126126124 %)
----------------------------------------
The following sentences were given a score of zero:
----------------------------------------
41 :  That 's not a very high barroom .
The Gold Standard answers included:
marker - 1
level - 1
barrier - 1
pole - 1
obstruction - 1
hurdle - 1
The original target word for this sentence was bar
You decided that the best word to use was barroom
----------------------------------------
----------------------------------------
42 :  This more upright position is most easily and affordably achieved through slapping a riser bar on your setup , and only requires you to buy a bar instead of a barroom and stem .
The Gold Standard answers included:
handlebar - 1
The original target word for this sentence was bar
You decided 