# Streamline code

Import modules

In [45]:
import os
import re
import spacy
import math
# Import pandas
import pandas as pd
# Loading language model into pipeline
nlp = spacy.load("en_core_web_sm")

## Load the text

In [3]:
# Loading a text from data folder - get a new text - this one is too long
filepath = os.path.join("in", "Doyle_Hound_1902.txt")
with open(filepath, "r") as f:
    txt = f.read()

In [5]:
# Preview
txt[:500]

'\ufeffArthur Conan Doyle\n\nThe Hound of the Baskervilles\n\nChapter 1\n\nMr. Sherlock Holmes \n\nMr. Sherlock Holmes, who was usually very late in the mornings, save upon those not infrequent occasions when he was up all night, was seated at the breakfast table. I stood upon the hearth-rug and picked up the stick which our visitor had left behind him the night before. It was a fine, thick piece of wood, bulbous-headed, of the sort which is known as a "Penang law- yer." Just under the head was a broad silver'

Write function

In [7]:
def load_txt(data):
    # Loading a text from data folder - get a new text - this one is too long
    filepath = os.path.join("in", data)
    with open(filepath, "r") as f:
        txt = f.read()
    
    return txt

In [12]:
txt = load_txt("Doyle_Hound_1902.txt")

In [13]:
# Preview
txt[:500]

'\ufeffArthur Conan Doyle\n\nThe Hound of the Baskervilles\n\nChapter 1\n\nMr. Sherlock Holmes \n\nMr. Sherlock Holmes, who was usually very late in the mornings, save upon those not infrequent occasions when he was up all night, was seated at the breakfast table. I stood upon the hearth-rug and picked up the stick which our visitor had left behind him the night before. It was a fine, thick piece of wood, bulbous-headed, of the sort which is known as a "Penang law- yer." Just under the head was a broad silver'

## Normalization

Remove punctuation

In [14]:
no_punct = re.sub("[^\w\s]", '', txt)
no_numbers = re.sub("\d", '', no_punct)
no_newline = re.sub("\s+", ' ', no_numbers) 

Spacy

ValueError: [E088] Text of length 1002050 exceeds maximum of 1000000. The parser and NER models require roughly 1GB of temporary memory per 100,000 characters in the input. This means long texts may cause memory allocation errors. If you're not using the parser or NER, it's probably safe to increase the `nlp.max_length` limit. The limit is in number of characters, so you can check whether your inputs are too long by checking `len(text)`.

In [15]:
doc = nlp(no_newline)

Write function

In [19]:
def normalize(txt):
    # Remove punctuation
    no_punct = re.sub("[^\w\s]", '', txt)
    # Remove numbers
    no_numbers = re.sub("\d", '', no_punct)
    # Remove newline
    no_newline = re.sub("\s+", ' ', no_numbers) 
    
    return no_newline

In [20]:
no_newline = normalize(txt)

In [21]:
# Preview
no_newline[:500]

'Arthur Conan Doyle The Hound of the Baskervilles Chapter Mr Sherlock Holmes Mr Sherlock Holmes who was usually very late in the mornings save upon those not infrequent occasions when he was up all night was seated at the breakfast table I stood upon the hearthrug and picked up the stick which our visitor had left behind him the night before It was a fine thick piece of wood bulbousheaded of the sort which is known as a Penang law yer Just under the head was a broad silver band nearly an inch acr'

## Search Keyword

In [27]:
# Printing the keyword
keyword = "murder"

search_result = []
for token in doc: 
    if token.lemma_ == keyword:
        search_result.append((token.i, token.lower_, token.pos_, token.lemma_))
    else:
        pass
print(search_result)

[(44743, 'murder', 'NOUN', 'murder'), (45662, 'murdered', 'VERB', 'murder'), (50328, 'murder', 'NOUN', 'murder'), (51087, 'murders', 'NOUN', 'murder'), (55584, 'murder', 'NOUN', 'murder'), (58681, 'murder', 'NOUN', 'murder'), (58950, 'murder', 'VERB', 'murder')]


Write function

In [24]:
def search_keyword(doc, keyword):
    # Define empthy list 
    search_result = []
    # Iterate through token in doc
    for token in doc:
        # find keyword in text
        if token.lower_ == keyword:
            search_result.append((token.i, token.lower_, token.pos_, token.lemma_))
        else:
            pass
        
    return search_result

In [25]:
result = search_keyword(doc, "murder")

In [26]:
result

[(44743, 'murder', 'NOUN', 'murder'),
 (50328, 'murder', 'NOUN', 'murder'),
 (55584, 'murder', 'NOUN', 'murder'),
 (58681, 'murder', 'NOUN', 'murder'),
 (58950, 'murder', 'VERB', 'murder')]

## Find Context Words

In [37]:
# Printing context words
window = 5

context_words = []
for token in doc: 
    if token.lower_ == keyword:
        # Context words before keyword
        for before_word in doc[token.i-(window):token.i]:
            context_words.append((before_word.i, before_word.text, before_word.pos_, before_word.lemma_))
        # Context words after keyword    
        for after_word in doc[token.i+1:token.i+window+1]:
            context_words.append((after_word.i, after_word.text, after_word.pos_, after_word.lemma_))
    else:
        pass
print(context_words)          

[(44738, 'as', 'SCONJ', 'as'), (44739, 'he', 'PRON', 'he'), (44740, 'answered', 'VERB', 'answer'), (44741, 'It', 'PRON', 'it'), (44742, 'is', 'AUX', 'be'), (44744, 'Watson', 'PROPN', 'Watson'), (44745, 'refined', 'VERB', 'refine'), (44746, 'coldblooded', 'VERB', 'coldbloode'), (44747, 'deliberate', 'ADJ', 'deliberate'), (44748, 'mur', 'ADJ', 'mur'), (50323, 'this', 'DET', 'this'), (50324, 'case', 'NOUN', 'case'), (50325, 'as', 'ADP', 'as'), (50326, 'one', 'NUM', 'one'), (50327, 'of', 'ADP', 'of'), (50329, 'and', 'CCONJ', 'and'), (50330, 'the', 'DET', 'the'), (50331, 'evidence', 'NOUN', 'evidence'), (50332, 'may', 'AUX', 'may'), (50333, 'implicate', 'VERB', 'implicate'), (55579, 'Montpensier', 'PROPN', 'Montpensier'), (55580, 'from', 'ADP', 'from'), (55581, 'the', 'DET', 'the'), (55582, 'charge', 'NOUN', 'charge'), (55583, 'of', 'ADP', 'of'), (55585, 'which', 'PRON', 'which'), (55586, 'hung', 'VERB', 'hang'), (55587, 'over', 'ADP', 'over'), (55588, 'her', 'PRON', 'she'), (55589, 'in', '

Write function

In [54]:
def find_context(doc, keyword, window): 
    # Define empthy list
    context_words = []
    
    # Iterate through tokens in doc
    for token in doc: 
        if token.lower_ == keyword:
            # Context words before keyword
            for before_word in doc[token.i-(window):token.i]:
                context_words.append((before_word.i, before_word.lower_))
            # Context words after keyword    
            for after_word in doc[token.i+1:token.i+window+1]:
                context_words.append((after_word.i, after_word.lower_))
        else:
            pass
        
    return context_words

In [55]:
context_words = find_context(doc, "murder", 5)

## Calculate Mutual Information score

In [43]:
# Creating a list of word lemmas
lemma_list = []

for token in doc: 
    lemma_list.append(token.lemma_)

print(lemma_list[0:10])

['Arthur', 'Conan', 'Doyle', 'the', 'Hound', 'of', 'the', 'Baskervilles', 'chapter', 'Mr']


In [44]:
# Import math package
import math

# Create list of context words
context_words_lemma = [sublist[3] for sublist in context_words]

# Define variables in the MI formula that does not need a for-loop
A = lemma_list.count(keyword)
size_corpus = len(lemma_list)
span = window*2

# Defining empty list for output
collocate_info = []

for word in context_words_lemma:
    B = lemma_list.count(word)
    AB = context_words_lemma.count(word)
    MI = math.log10( (AB*size_corpus) / (A * B * span) ) / math.log10(2)
    collocate_info.append((word, B, AB, MI))

print(collocate_info)

[('as', 405, 2, 2.072012007292924), ('he', 1232, 1, -0.5329964356299903), ('answer', 33, 1, 4.689395985706458), ('it', 980, 1, -0.20284783393765946), ('be', 2506, 2, -0.5573805942569495), ('Watson', 112, 1, 2.9264351830073068), ('refine', 1, 1, 9.73379010506491), ('coldbloode', 1, 1, 9.73379010506491), ('deliberate', 1, 1, 9.73379010506491), ('mur', 1, 1, 9.73379010506491), ('this', 337, 1, 1.3371853238830524), ('case', 78, 1, 3.448387886202663), ('as', 405, 2, 2.072012007292924), ('one', 248, 1, 1.7795937946780358), ('of', 1593, 2, 0.09625955353960121), ('and', 1627, 1, -0.9342084306076166), ('the', 3329, 3, -0.38212054935419587), ('evidence', 13, 1, 6.033350386923819), ('may', 99, 1, 3.104433484985301), ('implicate', 2, 1, 8.73379010506491), ('Montpensier', 1, 1, 9.73379010506491), ('from', 286, 1, 1.5739187682865217), ('the', 3329, 3, -0.38212054935419587), ('charge', 7, 1, 6.926435183007307), ('of', 1593, 2, 0.09625955353960121), ('which', 416, 1, 1.033350386923819), ('hang', 11, 1

Write function

In [50]:
def get_MI(window, keyword, context_words):
    # Create list of context words
    context_word_list = [sublist[1] for sublist in context_words]
    
    # Define empthy list
    word_list = []
    # Create list of words in corpus to count word frequency
    for token in doc: 
        word_list.append(token.lower_)

    # Define variables in the MI formula that does not need a for-loop
    A = word_list.count(keyword)
    size_corpus = len(word_list)
    span = window*2

    # Defining empty list for output
    collocate_info = []

    # Get mutual informaition scores
    for word in context_word_list:
        # Define variables in the MI formula that need a for-loop
        B = word_list.count(word)
        AB = context_word_list.count(word)
        # Calculate MI score
        MI = math.log10( (AB*size_corpus) / (A * B * span) ) / math.log10(2)
        # Save informatio to list 
        collocate_info.append((word, B, AB, MI))

    return collocate_info

In [56]:
collocate_info = get_MI(5, "murder", context_words)

## Save to CSV

In [59]:
# Import pandas
# import pandas as pd
# create a dataframe
# collocate_df = pd.DataFrame(collocate_info, columns=["collocate_term", "app_in_text", "app_in_context", "MI"])

In [57]:
# Saving CSV
# outpath = os.path.join("Output", "collocate_df.csv")
# collocate_df.to_csv(outpath, index=False)

Write function 

In [58]:
def save_data(collocate_info):
    # create a dataframe
    collocate_df = pd.DataFrame(collocate_info, columns=["collocate_term", "app_in_text", "app_in_context", "MI"])
    # Saving CSV
    outpath = os.path.join("output", "collocate_df.csv")
    collocate_df.to_csv(outpath, index=False)

In [59]:
save_data(collocate_info)