# Assignment 1: Calculating the mutual information score

Import Packages
(remember to install SpaCy and download the language model from the terminal)

In [1]:
import os
import re
import spacy
# Loading language model into pipeline
nlp = spacy.load("en_core_web_sm")

## Load the text

In [2]:
# Loading a text from data folder - get a new text - this one is too long
filepath = os.path.join("Data", "Doyle_Hound_1902.txt")
with open(filepath, "r") as f:
    txt = f.read()

## Normalization

Remove punctuation

In [3]:
no_punct = re.sub("[^\w\s]", '', txt)
no_numbers = re.sub("\d", '', no_punct)
no_newline = re.sub("\s+", ' ', no_numbers) 

Spacy

In [4]:
doc = nlp(no_newline)

## Search Keyword

In [6]:
# Printing the keyword
keyword = "murder"

search_result = []
for token in doc: 
    if token.lemma_ == keyword:
        search_result.append((token.i, token.text, token.pos_, token.lemma_))
    else:
        pass
print(search_result)

[(44743, 'murder', 'NOUN', 'murder'), (45662, 'murdered', 'VERB', 'murder'), (50328, 'murder', 'NOUN', 'murder'), (51087, 'murders', 'NOUN', 'murder'), (55584, 'murder', 'NOUN', 'murder'), (58681, 'murder', 'NOUN', 'murder'), (58950, 'murder', 'VERB', 'murder')]


## Find Context Words

In [7]:
# Printing context words
window = 5

context_words = []
for token in doc: 
    if token.lemma_ == keyword:
        # Context words before keyword
        for before_word in doc[token.i-(window):token.i]:
            context_words.append((before_word.i, before_word.text, before_word.pos_, before_word.lemma_))
        # Context words after keyword    
        for after_word in doc[token.i+1:token.i+window+1]:
            context_words.append((after_word.i, after_word.text, after_word.pos_, after_word.lemma_))
    else:
        pass
print(context_words)          

[(44738, 'as', 'SCONJ', 'as'), (44739, 'he', 'PRON', 'he'), (44740, 'answered', 'VERB', 'answer'), (44741, 'It', 'PRON', 'it'), (44742, 'is', 'AUX', 'be'), (44744, 'Watson', 'PROPN', 'Watson'), (44745, 'refined', 'ADJ', 'refined'), (44746, 'coldblooded', 'VERB', 'coldbloode'), (44747, 'deliberate', 'ADJ', 'deliberate'), (44748, 'mur', 'ADJ', 'mur'), (45657, 'Uncle', 'PROPN', 'Uncle'), (45658, 'and', 'CCONJ', 'and'), (45659, 'nephew', 'NOUN', 'nephew'), (45660, 'have', 'AUX', 'have'), (45661, 'been', 'AUX', 'be'), (45663, 'the', 'DET', 'the'), (45664, 'one', 'NOUN', 'one'), (45665, 'frightened', 'VERB', 'frighten'), (45666, 'to', 'ADP', 'to'), (45667, 'death', 'NOUN', 'death'), (50323, 'this', 'DET', 'this'), (50324, 'case', 'NOUN', 'case'), (50325, 'as', 'ADP', 'as'), (50326, 'one', 'NUM', 'one'), (50327, 'of', 'ADP', 'of'), (50329, 'and', 'CCONJ', 'and'), (50330, 'the', 'DET', 'the'), (50331, 'evidence', 'NOUN', 'evidence'), (50332, 'may', 'AUX', 'may'), (50333, 'implicate', 'VERB', '

## Calculate Mutual Information score

In [11]:
# Creating a list of word lemmas
lemma_list = []

for token in doc: 
    lemma_list.append(token.lemma_)

print(lemma_list[0:10])

['Arthur', 'Conan', 'Doyle', 'the', 'Hound', 'of', 'the', 'Baskervilles', 'Chapter', 'Mr']


In [56]:
# Import math package
import math

# Create list of context words
context_words_lemma = [sublist[3] for sublist in context_words]

# Define variables in the MI formula that does not need a for-loop
A = lemma_list.count(keyword)
size_corpus = len(lemma_list)
span = window*2

# Defining empty list for output
collocate_info = []

for word in context_words_lemma:
    B = lemma_list.count(word)
    AB = context_words_lemma.count(word)
    MI = math.log10( (AB*size_corpus) / (A * B * span) ) / math.log10(2)
    collocate_info.append((word, B, AB, MI))

print(collocate_info)

[('as', 405, 2, 2.072012007292924), ('he', 1233, 1, -0.5341669793379281), ('answer', 33, 1, 4.689395985706458), ('it', 980, 1, -0.20284783393765946), ('be', 2505, 4, 0.4431952169823404), ('Watson', 112, 1, 2.9264351830073068), ('refined', 1, 1, 9.73379010506491), ('coldbloode', 1, 1, 9.73379010506491), ('deliberate', 1, 1, 9.73379010506491), ('mur', 1, 1, 9.73379010506491), ('Uncle', 1, 1, 9.73379010506491), ('and', 1627, 2, 0.06579156939238331), ('nephew', 1, 1, 9.73379010506491), ('have', 1225, 1, -0.5247759288250218), ('be', 2505, 4, 0.4431952169823404), ('the', 3329, 5, 0.35484504481201024), ('one', 247, 2, 2.7854228734802335), ('frighten', 6, 1, 7.148827604343754), ('to', 1409, 3, 0.8582967094764306), ('death', 64, 1, 3.733790105064911), ('this', 337, 2, 2.3371853238830527), ('case', 78, 1, 3.448387886202663), ('as', 405, 2, 2.072012007292924), ('one', 247, 2, 2.7854228734802335), ('of', 1593, 2, 0.09625955353960121), ('and', 1627, 2, 0.06579156939238331), ('the', 3329, 5, 0.35484

## Save to CSV

In [59]:
# Import pandas
import pandas as pd
# create a dataframe
collocate_df = pd.DataFrame(collocate_info, columns=["collocate_term", "app_in_text", "app_in_context", "MI"])

In [60]:
# Saving CSV
outpath = os.path.join("Output", "collocate_df.csv")
collocate_df.to_csv(outpath, index=False)