In [None]:
import nltk
import sys
import xml.etree.ElementTree as ET
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic as wn_ic
from nltk.stem.porter import PorterStemmer
from nltk.wsd import lesk

## Training Data Collection

In [141]:
def get_root_of_training_file(training_file_name):
    """Parse an XML document and return the root element of its tree.

    training_file_name -- handed unchanged to ``ET.parse``.
    """
    return ET.parse(training_file_name).getroot()

def get_node_text(node):
    """Return the node's own text followed by the tail of each direct child.

    Text held inside the children is not included. A missing or empty
    ``node.text`` contributes an empty string, and children without a tail
    are skipped.
    """
    pieces = [node.text or '']
    pieces.extend(child.tail for child in node if child.tail is not None)
    return ''.join(pieces)
    
def get_noun_sentences_to_parse(root):
    """Collect the context segments of every instance under a noun lexelt.

    root -- element whose children carry an ``item`` attribute such as
            ``'film.n'``; a child is kept when ``'n'`` appears among the
            dot-separated fields after the first one.

    Returns a dict mapping each instance's ``id`` attribute to a list built
    from its context child: the context's leading text, then, for every
    child of the context, that child's tail followed by that child's text.
    With a single child this yields ``[text_before, text_after, child_text]``,
    so index 2 is the child's text.

    Missing text or tail values are stored as ``''`` rather than ``None`` so
    the segments can always be treated as strings. Instances that have no
    child element are skipped instead of inheriting the previous instance's
    segments.
    """
    sentences = {}
    for lexelt in root:
        target_tags = lexelt.attrib['item'].split('.')[1:]
        if 'n' not in target_tags:
            continue
        for instance in lexelt:
            instance_id = instance.attrib['id']
            for context in instance:
                sentence_text = [context.text or '']
                for child in context:
                    # Tail first, then text: keeps the child's text at index 2.
                    sentence_text.append(child.tail or '')
                    sentence_text.append(child.text or '')
                # Recorded inside the loop so an instance without any child
                # never stores a stale (or undefined) sentence. When several
                # children exist the last one wins.
                sentences[instance_id] = sentence_text
    return sentences
        

## Sentence Preprocessing

In [140]:
def stem_target_word(word):
    """Return the Porter stem of ``word`` if WordNet knows it, else ``word``.

    The stem is only used when ``wn.synsets`` finds at least one synset for
    it; otherwise the original word is returned unchanged.
    """
    stemword = PorterStemmer().stem(word)
    # wn.synsets returns a (possibly empty) list, never None, so the check
    # has to be for emptiness.
    if not wn.synsets(stemword):
        return word
    return stemword

def sentences_to_lowercase(sentence):
    """Return a new list holding the lower-cased form of every segment."""
    lowered = []
    for segment in sentence:
        lowered.append(segment.lower())
    return lowered

def preprocess_sentence(sentence):
    """Lower-case all segments and stem the target word.

    The segment at index 2 is treated as the target word. Returns a tuple
    ``(lower_cased_segments, stemmed_target_word)``.
    """
    lowered_segments = sentences_to_lowercase(sentence)
    target_segment = lowered_segments[2]
    return lowered_segments, stem_target_word(target_segment)

## Word Similarity 

### Lesk Algorithm

In [142]:
def parse_sentence_to_array(sentence):
    """Rebuild the sentence around its target word and split it on whitespace.

    The segments are joined in the order index 0, index 2, index 1, i.e. the
    segment at index 2 is placed between the other two.
    """
    before, target, after = sentence[0], sentence[2], sentence[1]
    rebuilt = ''.join((before, target, after))
    return rebuilt.split()

def perform_lesk(sentence,word):
    """Run ``lesk`` on ``word`` within ``sentence``, restricted to nouns ('n')."""
    noun_pos = 'n'
    return lesk(sentence, word, pos=noun_pos)

def create_replacement_dictionary(noun_sentences):
    """Choose one replacement word per sentence from its Lesk-selected synset.

    noun_sentences -- dict mapping an instance id to its list of sentence
                      segments (index 2 is used as the target word).

    For each sentence the preprocessed (lower-cased, stemmed) target word is
    disambiguated first. If that yields no synset, the untouched target word
    is tried. The replacement is the first lemma of the synset whose name
    differs from the stemmed target word, with underscores turned into
    spaces.

    Returns a dict mapping instance id to replacement word. Sentences for
    which no synset or no differing lemma is found are left out.
    """
    replacement_dictionary = {}
    for instance_id, raw_sentence in noun_sentences.items():
        preprocessed_sentence, target_word = preprocess_sentence(raw_sentence)
        tokens = parse_sentence_to_array(preprocessed_sentence)

        context_synset = perform_lesk(tokens, target_word)
        if context_synset is None:
            # Fall back to the original, unstemmed target word.
            context_synset = perform_lesk(tokens, raw_sentence[2])
        if context_synset is None:
            # Neither form could be disambiguated: nothing to propose.
            continue

        for lemma in context_synset.lemmas():
            lemma_name = lemma.name()
            if lemma_name != target_word:
                # Multi-word lemma names use '_' as the word separator.
                replacement_dictionary[instance_id] = lemma_name.replace('_', ' ')
                break
    return replacement_dictionary

### Most Frequent Sense

### Graph Sense Prediction

## Create Evaluation Set

In [101]:
def load_in_gold_standard(gold_file_name='assignment_resources/gold.trial'):
    """Read the gold-standard file and return its rows as lists of tokens.

    gold_file_name -- path of the file to read; defaults to the location the
                      notebook has always used.

    The first line of the file is discarded (presumably a header -- confirm
    against the data file). Every remaining line is split on whitespace.
    Blank lines are dropped, and an empty file yields an empty list.
    """
    with open(gold_file_name) as gold_file:
        lines = gold_file.readlines()
    # Slicing instead of pop(0) keeps an empty file from raising IndexError.
    rows = (line.split() for line in lines[1:])
    return [row for row in rows if row]

def get_noun_answers(content_array):
    """Keep only the gold-standard rows whose item is tagged as a noun.

    content_array -- list of token lists; token 0 looks like ``'film.n'`` and
                     token 1 is the instance id.

    Returns a dict mapping instance id to the full row, for rows whose second
    dot-separated field of token 0 equals ``'n'``.
    """
    noun_sentences = {}
    for evaluation in content_array:
        given_tag = evaluation[0].split(".")[1]
        # Compare by value: `is` tests object identity and only happens to
        # work for strings the interpreter has interned.
        if given_tag == 'n':
            noun_sentences[evaluation[1]] = evaluation
    return noun_sentences

def create_answer_tuples(noun_dictionary):
    """Replace every gold-standard row with its list of answer tuples.

    The first three tokens of each row are dropped and the rest is handed to
    ``answer_list_to_tuples``. The dictionary is modified in place and the
    same object is returned.
    """
    for key, gold_standard_row in list(noun_dictionary.items()):
        answers_only = gold_standard_row[3:]
        noun_dictionary[key] = answer_list_to_tuples(answers_only)
    return noun_dictionary

def answer_list_to_tuples(gold_standard_answers):
    """Turn the whitespace-split answer tokens into ``(word, mark)`` tuples.

    gold_standard_answers -- tokens such as
        ``['movie', '4;picture', '2;motion', 'picture', '1;']``.
        The first token is a word; later tokens containing ``';'`` hold the
        mark of the previous word before the ``';'`` and the next word after
        it; tokens without ``';'`` continue a multi-word answer.

    Returns a list of ``(word, mark)`` tuples where both elements are
    strings, e.g. ``[('movie', '4'), ('picture', '2'), ('motion picture', '1')]``.
    An empty input gives an empty list.
    """
    answer_tuples = []
    last_index = len(gold_standard_answers) - 1
    i = 0
    while i < len(gold_standard_answers):
        # Integer comparisons use == rather than `is`: identity only matches
        # for small cached ints, so long answer lists never reached the break.
        if i == 0:
            word = gold_standard_answers[i]
        elif i == last_index:
            # The final token only closes the previous answer's mark.
            break
        else:
            word = gold_standard_answers[i].split(';')[1]
        if check_if_tail_word(gold_standard_answers[i + 1]):
            # Multi-word answer: gather the remaining words, then read the
            # mark from the token where the tail stopped.
            index, end_of_tail_word = get_tail_words(i, gold_standard_answers)
            word += end_of_tail_word
            mark = gold_standard_answers[index].split(';')[0]
            i = index
        else:
            mark = gold_standard_answers[i + 1].split(';')[0]
            i += 1
        answer_tuples.append((word, mark))
    return answer_tuples
            
def check_if_tail_word(possible_word):
    """Return True when the token contains no ';'."""
    has_separator = ';' in possible_word
    return not has_separator

def get_tail_words(current_index,answer_list):
    """Gather the run of tail words that follows ``current_index``.

    Starting at ``current_index + 1``, tokens are consumed for as long as
    ``check_if_tail_word`` accepts them. Returns ``(index, tail_word)`` where
    ``index`` is the position of the first token that was not accepted and
    ``tail_word`` is the accepted tokens, each prefixed with one space.
    """
    index = current_index + 1
    pieces = []
    while check_if_tail_word(answer_list[index]):
        pieces.append(' ' + answer_list[index])
        index += 1
    return index, ''.join(pieces)


## Score Evaluation

In [154]:
def score_replacements(replacement_dictionary,answer_dictionary):
    """Score each proposed replacement against its gold-standard answers.

    Keys missing from ``answer_dictionary`` are ignored. For every other key
    the score is the mark of the gold answer equal to the replacement (the
    last one if it appears more than once), or 0 when there is no match.

    Returns ``(max_score, scores)``: ``max_score`` is the sum over scored
    keys of the highest mark available, ``scores`` maps key to awarded score.
    """
    scores = {}
    max_score = 0
    for key, replacement_word in replacement_dictionary.items():
        if key not in answer_dictionary:
            continue
        best_available = 0
        awarded = 0
        for word, score in answer_dictionary[key]:
            value = int(score)
            best_available = max(best_available, value)
            if word == replacement_word:
                awarded = value
        scores[key] = awarded
        max_score += best_available
    return max_score, scores


def get_zero_scores(result_dictionary):
    """Return a dict holding only the keys whose score equals 0, mapped to 0."""
    return {key: 0 for key, score in result_dictionary.items() if score == 0}

## Main Method

In [155]:
def main():
    """Run the whole pipeline and print an evaluation report.

    Loads the trial sentences, proposes one replacement per noun sentence,
    scores the proposals against the gold standard and prints the totals
    followed by details of every sentence that scored zero.
    """
    # Training sentences
    training_file_name = 'assignment_resources/lexsub_trial.xml'
    root_of_training = get_root_of_training_file(training_file_name)
    noun_sentences = get_noun_sentences_to_parse(root_of_training)

    # Training parsed sentences
    #     parsed_training_file_name = 'assignment_resources/lexsub_trial.parsed.xml'
    #     root_of_parsed_training = get_root_of_training_file(parsed_training_file_name)
    #     parsed_noun_sentences = get_parsed_noun_sentences(root_of_parsed_training,noun_sentences)

    replacement_dictionary = create_replacement_dictionary(noun_sentences)

    # Load Evaluation Set
    gold_standard_contents = load_in_gold_standard()
    noun_dictionary = get_noun_answers(gold_standard_contents)
    answer_dictionary = create_answer_tuples(noun_dictionary)

    # Evaluate
    max_score, scores = score_replacements(replacement_dictionary,answer_dictionary)
    final_score = sum(scores.values())
    # With nothing scoreable max_score is 0; report 0% instead of dividing by zero.
    percentage = (final_score/max_score)*100 if max_score else 0.0
    poor_predictions = get_zero_scores(scores)

    separator = '-'*40
    print (separator)
    print ('The max score that can be achieved for the sentences that were changed is: ',max_score)
    print (separator)
    print ('The final score of the given replacements is:', final_score, '/', max_score , '(',percentage,'%)')
    print (separator)
    print ('The following sentences were given a score of zero:')
    for key in poor_predictions:
        print (separator)
        # Segment 0 precedes the target word and segment 1 follows it.
        print(key ,': ',noun_sentences[key][0] + replacement_dictionary[key] + noun_sentences[key][1])
        print('The Gold Standard answers included:')
        for answer,mark in answer_dictionary[key]:
            print (answer,'-', mark)
        print('The original target word for this sentence was',noun_sentences[key][2])
        print('You decided that the best word to use was', replacement_dictionary[key])
        print (separator)
        

In [156]:
# Run the pipeline defined by main() and print its evaluation report.
main()

['So , unlike studio films , independent ', ' cannot be conceptually geared to a marketing campaign , or used to recruit merchandising tie-ins .', 'films']
['so , unlike studio films , independent ', ' cannot be conceptually geared to a marketing campaign , or used to recruit merchandising tie-ins .', 'films']
['The packed screening of about 100 high-level press people loved the ', ' as well .', 'film']
['the packed screening of about 100 high-level press people loved the ', ' as well .', 'film']
['I think most filmmakers for the most part right now they’re thinking about the DVD when they go in to production because the reality is DVD is where the great majority of your audience is going to experience the ', ' .', 'film']
['i think most filmmakers for the most part right now they’re thinking about the dvd when they go in to production because the reality is dvd is where the great majority of your audience is going to experience the ', ' .', 'film']
['Dune makes the second Lynch ', ' t