In [1]:
import nltk
import sys
import xml.etree.ElementTree as ET
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic as wn_ic
from nltk.stem.porter import PorterStemmer
from nltk.wsd import lesk
from nltk.corpus import stopwords
from nltk.corpus import brown

## Training Data Collection

In [2]:
def get_root_of_training_file(training_file_name):
    xml_file = ET.parse(training_file_name)
    return xml_file.getroot()

def get_node_text(node):
    if node.text:
        result = node.text
    else:
        result = ''
    for child in node:
        if child.tail is not None:
            result += child.tail
    return result
    
def get_noun_sentences_to_parse(root):
    sentences = {}
    for lexelt in root:
        target = lexelt.attrib['item']
        target = target.split('.')
        target_tags = target[1:len(target)]
        if 'n' in target_tags:
            target_word = target[0]
            for instance in lexelt:
                instance_id = instance.attrib['id']
                for context in instance:
                    sentence_text = []
                    if context.text:
                        sentence_text.append(context.text)
                    else: 
                        sentence_text.append('')
                    for child in context:
                        sentence_text.append(child.tail)
                        sentence_text.append(child.text)
                sentences[instance_id] = sentence_text
    return sentences
        
def get_pos_for_sentences(noun_sentences):
    parsed_sentences = {}
    for key in noun_sentences:
        sentence_to_find = noun_sentences[key][0] + noun_sentences[key][2] + noun_sentences[key][1]
        parsed_sentences[key] = get_sentence_from_parsed(sentence_to_find)
    return parsed_sentences
  
def get_sentence_from_parsed(sentence_to_find):                
    tokenized = nltk.word_tokenize(sentence_to_find)
    return nltk.pos_tag(tokenized)
    
                
     


## Preprocess sentence:

In [3]:
def stem_target_word(word):
    stemmer = PorterStemmer()
    stemword = stemmer.stem(word)
    if wn.synsets(stemword) is None:
        stemword = word
    return stemword

def sentences_to_lowercase(sentence):
    return ([segment.lower() for segment in sentence])

def preprocess_sentence(sentence):
    lowercase_sentence = sentences_to_lowercase(sentence)
    stemmed_target_word = stem_target_word(lowercase_sentence[2])
    return lowercase_sentence,stemmed_target_word


## Word Similarity 

### Lesk Algorithm

In [4]:
def parse_sentence_to_array(sentence):
    return (sentence[0] + sentence[2] + sentence[1]).split()

def perform_lesk(sentence,word):
    return lesk(sentence,word,'n')

def create_lesk_replacement_dictionary(noun_sentences):
    replacement_dictionary = {}
    # For every sentence
    for sentence in noun_sentences:
        # place target word to lowercase.
        preprocessed_sentence,target_word = preprocess_sentence(noun_sentences[sentence])     
        # Get the context synset first using preprocessing
        context_synset = perform_lesk(parse_sentence_to_array(preprocessed_sentence),target_word)
        
        if context_synset is None:
            context_synset = perform_lesk(parse_sentence_to_array(preprocessed_sentence),noun_sentences[sentence][2])
            
        for lemma in context_synset.lemmas():
            if target_word != lemma.name():
                replacement_word = lemma.name()
                if '_' in replacement_word:
                    split_word = ''
                    for word in replacement_word.split('_'):
                        split_word += ' ' + word
                    replacement_word = split_word[1:]
                replacement_dictionary[sentence] = replacement_word
                break;
    return replacement_dictionary

### Graph sense prediction

In [185]:
def preprocess_pos_tags(pos_tags):
    processed_tags = {}
    for key in pos_tags:
        current_sentence = pos_tags[key]
        text_sentence = [(sentence[0].lower(),sentence[1]) for sentence in current_sentence]
        stopwords_removed =  [(w[0],w[1]) for w in text_sentence if not w[0] in stopwords.words('english')]
        punctuation = ['!',',','.','?',')','(']
        punctuation_removed = [(w[0],w[1]) for w in stopwords_removed if not w[0] in punctuation]
        processed_tags[key] = punctuation_removed
    return processed_tags
        
def get_synsets(pos_tuple):
    wn_pos_code = wordnet_pos_code(pos_tuple[1])
    if wn_pos_code is not None:
        return wn.synsets(pos_tuple[0],wn_pos_code)
    
def create_graph(sentence):
    graph = {}
    for pos_tuple in sentence:
        synsets = get_synsets(pos_tuple)
        if synsets is not None:
            for s in synsets:
                graph[s] = [];
    return graph      

def build_synset_tree(level,synset,tree,max_level=2):
    if synset not in tree and level != 0:
        tree.append(synset)
    if level <= max_level:
        # Get Hyponyms
        for s in synset.hyponyms():
            build_synset_tree(level+1,s,tree)
        # Get Norminalisations
        for l in synset.lemmas():
            related_forms = l.derivationally_related_forms()
            for rf in related_forms:
                build_synset_tree(level+1,rf.synset(),tree) 
        return tree
    else:
        return tree  
        
def wordnet_pos_code(tag):
    if tag.startswith('NN'):
        return wn.NOUN
    elif tag.startswith('VB'):
        return wn.VERB
    elif tag.startswith('JJ'):
        return wn.ADJ
    elif tag.startswith('RB'): 
        return wn.ADV
    else:
        return None
                
def get_senses_for_sentence(current_sentence,scores):
    word_sensed_tags = []
    # for tuple in current sentence
    for pos_tuple in current_sentence:
        # Get the synsets of tuple
        synsets = get_synsets(pos_tuple)
        # If it's not none
        if synsets is not None: 
            if len(synsets) != 0:
                max_score = 0
                max_synset = synsets[0]
                for s in synsets:
                    if scores[s] < 0 and scores[s] > max_score:
                        max_score = scores[s]
                        max_synset = s
                word_sensed_tags.append((pos_tuple,max_synset))
            else:
                word_sensed_tags.append((pos_tuple,None))
        else:
            word_sensed_tags.append((pos_tuple,None))
    return word_sensed_tags

def get_target_word_replacement(senses,noun_sentence):
    for sense_tuple in senses:
        if sense_tuple[0][0] == noun_sentence[2]:
            synset = sense_tuple[1]
            return synset.lemmas()[0].name()
                         
def run_graph_word_sense(noun_sentences,pos_tags):
    pos_tags = preprocess_pos_tags(pos_tags)
    replacement_words = {}
    for key in pos_tags:
        current_sentence = pos_tags[key]
        synset_graph = create_graph(current_sentence)
        for synset in synset_graph:
            tree = build_synset_tree(0,synset,[])
            for s in tree:
                if s in synset_graph.keys():
                    edges = synset_graph[synset]
                    if s not in edges:
                        edges.append(s)
                        synset_graph[synset] = edges
                        edges = synset_graph[s]
                        edges.append(synset)
                        synset_graph[s] = edges
        scores = {}
        for synset in synset_graph:
            degree = len(synset_graph[synset])/(len(synset_graph.keys())-1)
            scores[synset] = degree
        
        word_senses = get_senses_for_sentence(current_sentence,scores)
        replacement_words[key] = get_target_word_replacement(word_senses,noun_sentences[key])
        print(replacement_words[key])
    return replacement_words

## Create Evaluation Set

In [186]:
def load_in_gold_standard():
    with open('assignment_resources/gold.trial') as f:
        content = f.readlines()
    content.pop(0)
    content = [x.strip() for x in content]
    return [x.split() for x in content]

def get_noun_answers(content_array):
    noun_sentences = {}
    for evaluation in content_array:
        given_tag = (evaluation[0].split("."))[1]
        if (given_tag is 'n'):
            instance_id = evaluation[1]
            noun_sentences[instance_id] = evaluation
    return noun_sentences

def create_answer_tuples(noun_dictionary):
    for key in noun_dictionary:
        # Gold standard row and sliced answers
        gold_standard_row = noun_dictionary[key]
        gold_standard_answers = gold_standard_row[3:len(gold_standard_row)]
        noun_dictionary[key] = answer_list_to_tuples(gold_standard_answers)
    return noun_dictionary

def answer_list_to_tuples(gold_standard_answers):   
        answer_tuples = []
        i = 0
        while i <(len(gold_standard_answers)):
            if i is 0:
                word = gold_standard_answers[i]
            elif i is len(gold_standard_answers)-1:
                break;
            else:
                word = (gold_standard_answers[i].split(';'))[1]
            if check_if_tail_word(gold_standard_answers[i+1]):
                index,end_of_tail_word = get_tail_words(i,gold_standard_answers)
                word += end_of_tail_word
                mark = gold_standard_answers[index].split(';')[0]
                i = index
            else:
                mark = gold_standard_answers[i+1].split(';')[0]
                i += 1
            answer_tuples.append((word,mark))
        return answer_tuples
            
def check_if_tail_word(possible_word):
    return ';' not in possible_word

def get_tail_words(current_index,answer_list):
    found_last_tail = False
    index = current_index + 1
    tail_word = ''
    while found_last_tail is not True:
        if check_if_tail_word(answer_list[index]):
            tail_word += (' '+ answer_list[index])
            index +=1
        else:
            found_last_tail = True
    return index, tail_word


## Score evaluation

In [187]:
def score_replacements(replacement_dictionary,answer_dictionary):
    scores = {}
    max_score = 0
    for key in replacement_dictionary:
        replacement_word = replacement_dictionary[key]
        if key in answer_dictionary.keys():
            gold_standard_replacements = answer_dictionary[key]
        else:
            continue;
        word_max_score = 0
        for word,score in gold_standard_replacements:
            if int(score) > word_max_score:
                word_max_score = int(score)
            if replacement_word == word:
                scores[key] = int(score)
        max_score += word_max_score
        if key not in scores.keys():
            scores[key] = 0
    return max_score, scores 


def get_zero_scores(result_dictionary):
    bad_scores = {}
    for key in result_dictionary:
        score = result_dictionary[key]
        if score == 0:
            bad_scores[key] = 0
    return bad_scores

## Main Method

In [190]:
def main():
    
    # Training sentences
    training_file_name = 'assignment_resources/lexsub_trial.xml'
    root_of_training = get_root_of_training_file(training_file_name)
    noun_sentences = get_noun_sentences_to_parse(root_of_training)
    
    # POS training sentences
#     map_parsed_to_nouns = get_pos_for_sentences(noun_sentences)
#     graph_replacement_dictionary = run_graph_word_sense(noun_sentences,map_parsed_to_nouns)
    

    
    # Training parsed sentences
#     parsed_training_file_name = 'assignment_resources/lexsub_trial.parsed.xml'
#     root_of_parsed_training = get_root_of_training_file(parsed_training_file_name)
#     parsed_noun_sentences = get_parsed_noun_sentences(root_of_parsed_training,noun_sentences)
    
    
    lesk_replacement_dictionary = create_lesk_replacement_dictionary(noun_sentences)
    print(lesk_replacement_dictionary)
    

#     # Load Evaluation Set
#     gold_standard_contents = load_in_gold_standard()
#     noun_dictionary = get_noun_answers(gold_standard_contents)
#     answer_dictionary = create_answer_tuples(noun_dictionary)
    
#     # Evaluate 
#     max_score, scores = score_replacements(replacement_dictionary,answer_dictionary)
#     final_score = sum(scores.values())
#     percentage = (final_score/max_score)*100
#     poor_predictions = get_zero_scores(scores)

#     print ('-'*40)
#     print ('The max score that can be achieved for the sentences that were changed is: ',max_score)
#     print ('-'*40)
#     print ('The final score of the given replacements is:', final_score, '/', max_score , '(',percentage,'%)')
#     print ('-'*40)
#     print ('The following sentences were given a score of zero:')
#     for key in poor_predictions:
#         print ('-'*40)
#         print(key ,': ',noun_sentences[key][0] + replacement_dictionary[key] + noun_sentences[key][1])
#         print('The Gold Standard answers included:')
#         for answer,mark in answer_dictionary[key]:
#             print (answer,'-', mark)
#         print('The original target word for this sentence was',noun_sentences[key][2])
#         print('You decided that the best word to use was', replacement_dictionary[key])
#         print ('-'*40)
        

In [191]:
main()

{'11': 'plastic film', '12': 'movie', '13': 'movie', '14': 'movie', '15': 'movie', '16': 'movie', '17': 'movie', '18': 'movie', '19': 'movie', '20': 'movie', '43': 'legal profession', '44': 'legal profession', '45': 'legal profession', '47': 'legal profession', '49': 'legal profession', '50': 'Browning automatic rifle', '48': 'legal profession', '51': 'Cross', '52': 'Cross', '53': 'Cross', '54': 'crisscross', '55': 'hybridization', '56': 'Cross', '57': 'hybridization', '58': 'hybridization', '59': 'hybridization', '60': 'hybridization', '160': 'wilderness', '161': 'canful', '162': 'canful', '163': 'canful', '164': 'tin', '165': 'buttocks', '166': 'buttocks', '167': 'toilet', '168': 'canful', '169': 'toilet', '170': 'toilet', '171': 'night', '172': 'night', '173': 'night', '174': 'night', '175': 'night', '176': 'night', '177': 'night', '178': 'night', '179': 'night', '180': 'night', '181': 'examination', '182': 'examination', '183': 'interrogation', '184': 'examination', '185': 'examina