In [2]:
import re
from collections import Counter
import numpy as np
import pandas as pd
import w1_unittest

In [3]:
def process_data(file_name):
    """
    Read a text file and split it into lower-case word tokens.

    Input: 
        file_name: path of the text file to read (opened in text mode with the
                   platform's default encoding).
    Output: 
        words: a list of every word in the file, lower-cased, in order of
               appearance. A "word" is a maximal run of regex word characters
               (letters, digits and underscore); punctuation and whitespace
               act as separators and are dropped.
    """
    # Read the whole file into one string; the context manager closes the handle.
    with open(file_name) as f:
        file_name_data = f.read()

    # Lower-case the text up front so every extracted token is already lower case.
    file_name_data = file_name_data.lower()

    # Raw string literal: in a plain literal '\w' is an invalid escape sequence,
    # which newer Python versions warn about at compile time.
    words = re.findall(r'\w+', file_name_data)

    return words

In [7]:
# Tokenize the corpus file into a list of lower-case words (duplicates kept, in file order).
word_l = process_data('./data/shakespeare.txt')
vocab = set(word_l)  # the vocabulary: the distinct words that occur in the corpus
# Show a small sample of the tokens and the vocabulary size as a sanity check.
print(f"The first ten words in the text are: \n{word_l[0:10]}")
print(f"There are {len(vocab)} unique words in the vocabulary.")

The first ten words in the text are: 
['o', 'for', 'a', 'muse', 'of', 'fire', 'that', 'would', 'ascend', 'the']
There are 6116 unique words in the vocabulary.


In [8]:
# Hand process_data to the external w1_unittest checker.
# NOTE(review): w1_unittest is not part of this notebook; what exactly it verifies is not visible here.
w1_unittest.test_process_data(process_data)

[92m All tests passed


In [9]:
def get_count(word_l):
    '''
    Tally how many times each word occurs.

    Input:
        word_l: an iterable of words representing the corpus, typically the
                token list with repeats (a set would make every count 1).
    Output:
        word_count_dict: a Counter (dict subclass) mapping each distinct word
                         to the number of times it appears in word_l.
    '''
    tally = Counter()
    # Feed the whole corpus in one call; Counter does the per-word bookkeeping.
    tally.update(word_l)
    return tally

In [10]:
# Count how often each word occurs in the corpus token list.
word_count_dict = get_count(word_l)
# One entry per distinct word, so this should equal the vocabulary size.
print(f"There are {len(word_count_dict)} key values pairs")
# .get with a default of 0 reports a zero count if 'thee' is absent.
print(f"The count for the word 'thee' is {word_count_dict.get('thee',0)}")

There are 6116 key values pairs
The count for the word 'thee' is 240


In [11]:
# Hand get_count and the corpus token list to the external w1_unittest checker.
# NOTE(review): w1_unittest is not part of this notebook; what exactly it verifies is not visible here.
w1_unittest.test_get_count(get_count, word_l)

[92m All tests passed


In [12]:
def get_probs(word_count_dict):
    '''
    Convert raw word counts into relative frequencies.

    Input:
        word_count_dict: dictionary mapping each word to its frequency (count).
    Output:
        probs: a plain dict mapping each word to count / total, where total is
               the sum of all counts in word_count_dict.
    '''
    # Total number of word occurrences across the whole dictionary.
    total = sum(word_count_dict.values())

    # Each probability is the word's share of the total.
    return {word: count / total for word, count in word_count_dict.items()}

In [13]:
# Turn the raw counts into relative frequencies (count / total number of word occurrences).
probs = get_probs(word_count_dict)
# One probability per distinct word.
print(f"Length of probs is {len(probs)}")
# Direct indexing: this raises KeyError if 'thee' never occurs in the corpus.
print(f"P('thee') is {probs['thee']:.4f}")

Length of probs is 6116
P('thee') is 0.0045


In [14]:
# Hand get_probs and the word-count dictionary to the external w1_unittest checker.
# NOTE(review): w1_unittest is not part of this notebook; what exactly it verifies is not visible here.
w1_unittest.test_get_probs(get_probs, word_count_dict)

[92m All tests passed
