String processing with Python

Using a text corpus found on the cds-language GitHub repo or a corpus of your own found on a site such as Kaggle, write a Python script which calculates collocates for a specific keyword.

The script should take a directory of text files, a keyword, and a window size (number of words) as input parameters, and write its results to an output file called out/{filename}.csv
These parameters can be defined in the script itself
Find out how often each word collocates with the target across the corpus
Use this to calculate mutual information between the target word and all collocates across the corpus
Save result as a single file consisting of four columns: keyword, collocate, raw_frequency, MI


BONUS CHALLENGE: Use argparse to take inputs from the command line as parameters

__Importing libraries__

In [36]:
import os
import sys 
sys.path.append(os.path.join("..")) # enabling communication with home directory
import pandas as pd 
from pathlib import Path
import csv 
import re
import string
import numpy as np

__Defining tokenizer function__

In [34]:
def tokenize(input_string):
    """Split a string into word tokens.

    A token is a maximal run of ASCII letters (either case) and apostrophes;
    every other character acts as a separator and is discarded.

    Parameters
    ----------
    input_string : str
        The text to tokenize.

    Returns
    -------
    list of str
        The tokens in order of appearance. Never contains empty strings,
        regardless of whether the text starts or ends with a separator.
    """
    # Matching the tokens directly (instead of splitting on separators and
    # removing one "" afterwards) avoids both the ValueError raised when the
    # split produced no empty string and the stray "" left behind when the
    # text had separators at both ends.
    token_pattern = re.compile(r"[a-zA-Z']+")
    return token_pattern.findall(input_string)

In [35]:
# Quick check of the tokenizer on a sentence with punctuation and an apostrophe.
tokenize("he wasn't happy, but would never forget. test test.")

['he', "wasn't", 'happy', 'but', 'would', 'never', 'forget', 'test', 'test']

__Defining collocate function__

In [37]:
def collocates(path, keyword, window_size):
    """Compute collocates of ``keyword`` across all ``*.txt`` files in ``path``.

    Every file is read as UTF-8, lowercased and tokenized with ``tokenize``.
    For each occurrence of the keyword, the ``window_size`` tokens on either
    side of it are collected as collocates; the occurrence itself is
    excluded, and windows never span two files.

    Parameters
    ----------
    path : str or pathlib.Path
        Directory containing the ``*.txt`` files of the corpus.
    keyword : str
        Target word. It is lowercased for matching because the texts are
        lowercased before tokenizing.
    window_size : int
        Number of tokens to take on each side of the keyword.

    Returns
    -------
    pandas.DataFrame
        One row per distinct collocate with the columns ``keyword``,
        ``collocate``, ``raw_frequency`` and ``MI``, sorted by ``MI`` in
        descending order. The frame is empty if the keyword never occurs.

        NOTE: despite its name, ``raw_frequency`` is the collocate's count in
        the whole corpus divided by the total number of tokens. ``MI`` is the
        natural logarithm of ``O11 / E11``.

    Raises
    ------
    ValueError
        If ``window_size`` is negative.
    """
    # Local import so this block does not depend on the file's import cell.
    from collections import Counter

    if window_size < 0:
        raise ValueError("window_size must be non-negative")

    target = keyword.lower()
    corpus_counts = Counter()     # v: occurrences of each token in the corpus
    collocate_counts = Counter()  # O11: occurrences inside keyword windows
    keyword_count = 0             # u: occurrences of the keyword

    # Sorted so the row order does not depend on directory listing order.
    for filename in sorted(Path(path).glob("*.txt")):
        token_list = tokenize(filename.read_text(encoding="utf-8").lower())
        corpus_counts.update(token_list)

        for index, token in enumerate(token_list):
            if token != target:
                continue
            keyword_count += 1
            # Take the context left and right of the hit separately so the
            # hit itself is left out without a costly list.remove() call.
            window_start = max(0, index - window_size)
            collocate_counts.update(token_list[window_start:index])
            collocate_counts.update(
                token_list[index + 1:index + window_size + 1]
            )

    total_tokens = sum(corpus_counts.values())  # N

    rows = []
    for collocate, observed in collocate_counts.items():
        corpus_frequency = corpus_counts[collocate]
        # R1 = O11 + O12 = u and C1 = O11 + O21 = v, hence E11 = u * v / N.
        expected = keyword_count * corpus_frequency / total_tokens
        rows.append({
            "keyword": keyword,
            "collocate": collocate,
            "raw_frequency": corpus_frequency / total_tokens,
            "MI": np.log(observed / expected),
        })

    # Build the frame once; DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame on every call.
    data = pd.DataFrame(
        rows, columns=["keyword", "collocate", "raw_frequency", "MI"]
    )
    return data.sort_values("MI", ascending=False)

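O11 = u & v = the number of times the collocate occurs in the keyword's context windows (KWIC lines) <br>
O12 = u & !v = total number of u’s - O11 <br>
O21 = !u & v = total number of v’s - O11 <br>
R1 = O11 + O12 <br>
C1 = O11 + O21 <br>
E11 = R1 * C1 / N, where N is the total number of tokens in the corpus <br>
MI = log(O11 / E11)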

In [38]:
# Corpus directory, relative to the notebook's working directory.
path = os.path.join("..", "data", "100_english_novels", "test_corpus")
# Collocates of "he" using two tokens of context on each side.
collocates_df = collocates(path, keyword="he", window_size=2)
# Write the table to disk without the DataFrame index column.
collocates_df.to_csv("Collocates.csv", index=False)

In [39]:
# Display the collocate table computed above.
collocates_df

Unnamed: 0,keyword,collocate,raw_frequency,MI
4358,he,prospered,0.000001,5.209240
3228,he,hope's,0.000001,5.209240
719,he,lisped,0.000001,5.209240
1357,he,relents,0.000001,5.209240
672,he,adores,0.000001,5.209240
...,...,...,...,...
1938,he,she,0.008047,-0.780724
2423,he,sweet,0.000269,-0.782225
1105,he,are,0.002154,-0.784722
3068,he,clear,0.000285,-0.840494
