## Week 9: Defining Functions and Moving Windows

In [None]:
import re
# Function definitions start with 'def' followed by the name of the function
# and the list of arguments. The body of the function is indented.
# You should write a comment that explains what the function does and gives an
# example of how to use it, along with the expected return value.

def clean_tokens(text):
    """Split text into lower-case word tokens with punctuation removed.

    The text is lower-cased first; then every character that is not an
    ASCII letter or digit is replaced by a space, and the result is split
    on whitespace.

    Example:
    >>> clean_tokens("How's it going?")
    ['how', 's', 'it', 'going']
    """
    lowered = text.lower()
    # Turning punctuation into spaces (rather than deleting it) means that
    # "How's" becomes the two tokens 'how' and 's'.
    return re.sub("[^a-zA-Z0-9]", " ", lowered).split()


In [None]:
# Now let's try calling our new function.
# The apostrophe is replaced by a space, so "How's" splits into two tokens
# and the result is ['how', 's', 'it', 'going'].
clean_tokens("How's it going")

## Moving windows

In [None]:
# Tokenize a short sample: the commas are dropped and the words lower-cased,
# so this prints ['one', 'fish', 'two', 'fish', 'red', 'fish', 'blue', 'fish'].
print(clean_tokens("One fish, Two fish, Red fish, Blue fish,"))

Let's write a function to compute the moving window TTR where the window size is an argument.  This will allow us to experiment with different window sizes.

In [None]:
def moving_TTR(tokens, window_size):
    """Return the average type-token ratio (TTR) of tokens, as a percentage,
    taken over every window of window_size consecutive tokens.

    tokens is a list of words and window_size is the number of tokens in
    each window. The windows start at index 0, 1, 2, ... so consecutive
    windows overlap in all but one token.

    If window_size is not positive, or is greater than the length of tokens,
    return 0.

    Example:
    >>> moving_TTR(['one', 'fish', 'two', 'fish'], 4)
    75.0
    """
    # A window of size 0 (or less) has no tokens, so there is no ratio to
    # compute; without this check the division below would fail.
    if window_size <= 0 or window_size > len(tokens):
        return 0

    num_windows = len(tokens) - window_size + 1
    types_sum = 0

    # loop over each window; window_start is the index of its first token
    for window_start in range(num_windows):
        window = tokens[window_start:window_start + window_size]

        # a set keeps one copy of each distinct word, so its length is the
        # number of types in this window
        types_sum += len(set(window))

    # compute the average: total types over total tokens examined
    average_ttr = types_sum / (num_windows * window_size) * 100
    return average_ttr

# Try it on a small example with a window of 3 tokens. The six windows hold
# 3, 2, 3, 2, 3 and 2 types, so the result is 15 / 18 * 100, about 83.33.
moving_TTR(['one', 'fish', 'two', 'fish', 'red', 'fish', 'blue', 'fish'], 3)

Yikes!  When I tried to run this on the CSAL texts, I estimated that it would take about 16 hours to finish.

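There is a more efficient way: instead of recounting the types in every window from scratch, we keep a running count of each word and update it as the window slides forward by one token. We won't cover this in class, but you can talk to the TAs or me about this approach.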

In [None]:
from collections import Counter

def moving_TTR(tokens, window_size):
    """Return the average type-token ratio (TTR) of tokens, as a percentage,
    taken over every window of window_size consecutive tokens.

    Rather than recounting each window from scratch, this keeps a count of
    every word in the current window and updates it as the window slides
    forward one token at a time.

    If window_size is not positive, or is greater than the length of tokens,
    return 0.
    """
    total = len(tokens)
    if not 0 < window_size <= total:
        return 0

    # freq maps each distinct word in the current window to the number of
    # times that word appears; it starts out describing the first window
    freq = Counter(tokens[:window_size])
    distinct = len(freq)        # number of types in the current window
    running_types = distinct    # sum of the type counts of all windows so far

    # Each step drops the leftmost token of the window and takes in the next
    # token to its right. Pairing tokens with a copy of itself shifted by
    # window_size yields exactly those (leaving, entering) pairs.
    for leaving, entering in zip(tokens, tokens[window_size:]):
        # remove the leftmost token; a word whose count would reach zero is
        # no longer one of the window's types
        if freq[leaving] == 1:
            del freq[leaving]
            distinct -= 1
        else:
            freq[leaving] -= 1

        # add the rightmost token; a word not yet in the window is a new type
        if entering not in freq:
            distinct += 1
        freq[entering] += 1

        running_types += distinct

    window_count = total - window_size + 1
    return running_types / (window_count * window_size) * 100

# With a window as long as the whole list there is only one window:
# 5 types / 8 tokens * 100 = 62.5
moving_TTR(['one', 'fish', 'two', 'fish', 'red', 'fish', 'blue', 'fish'], 8)

In [None]:
import re
from pathlib import Path

folder_path = "csal/" # We're telling the code to look in the "csal/" subfolder, where the CSAL files all live.

# The window sizes to try; each one becomes a "TTR <size>" column in the output
windows = [500, 1000, 2000, 4051]

# Open the output file and write the headers.
# Using 'with' guarantees the file is closed (and everything written so far is
# saved) even if something goes wrong part-way through.
with open("ttr-windows.csv", mode="w", encoding="utf-8") as file:

    # Build the header from the windows list so the column titles always
    # match the window sizes that are actually computed below.
    ttr_headers = ','.join(f'"TTR {window_size}"' for window_size in windows)
    file.write('"Text","Total Tokens",' + ttr_headers + '\n')

    for file_path in sorted(Path(folder_path).glob('*.txt')):
        # read_text opens the file, reads all of it, and closes it again
        text = file_path.read_text(encoding='utf-8')
        text = re.sub("[^a-zA-Z0-9]", " ", text)
        text = text.lower()

        text_words = text.split()
        # write the start of this line to the file
        file.write(f'"{file_path.name}", {len(text_words)}')

        # compute and write each ttr to the file
        # (moving_TTR returns 0 when the text is shorter than the window)
        for window_size in windows:
            print(f"Computing window size {window_size} for {file_path.name}")
            ttr = moving_TTR(text_words, window_size)
            file.write(f',{ttr:.2f}')
        file.write('\n')

## Research question

There is one book where the TTR averaged over the moving window is very different from the standardized TTR.  Let's investigate why.

In [None]:
from collections import Counter

# Read the whole book; 'with' makes sure the file is closed once it is read
with open("csal/M-Frere-Old-Deccan-Days-or-Hindoo-Fariy-Legends-Current-in-Southern-India-1870-Fiction.txt", encoding='utf-8') as book_file:
    book = book_file.read()

# Same cleaning as for the CSV: punctuation to spaces, lower case, split
book = re.sub("[^a-zA-Z0-9]", " ", book)
book = book.lower()
tokens = book.split()
window_size = 4051

# ttr_list collects the TTR (as a percentage) of individual windows so that
# we can see how it changes through the book
ttr_list = []

# initialize first window
window = tokens[:window_size]
counts = Counter(window)            # word -> frequency in current window
# counts is a dictionary that maps each distinct word in window to the number of times that word appears

num_types = len(counts)              # number of types in current window
ttr_list.append(num_types/window_size *100)
types_sum = num_types
n = len(tokens)

# slide the window once per step
for i in range(window_size, n):
    out_tok = tokens[i - window_size]
    in_tok  = tokens[i]

    # remove leftmost token
    counts[out_tok] -= 1
    if counts[out_tok] == 0:
        del counts[out_tok]
        num_types -= 1

    # add rightmost token
    prev = counts.get(in_tok, 0)
    counts[in_tok] = prev + 1
    if prev == 0:
        num_types += 1

    # record the TTR of the current window only when i is a multiple of 100,
    # which keeps the list short enough to print and plot
    if i % 100 == 0:
        ttr_list.append(num_types/window_size *100)
    types_sum += num_types

# the average over every window (not just the recorded ones)
num_windows = n - window_size + 1
average_ttr = types_sum / (num_windows * window_size) * 100
print(ttr_list)
  


In [None]:
import matplotlib.pyplot as plt
# Plot the recorded window TTR values in the order they were collected
plt.plot(ttr_list)
plt.title("Moving Window TTR")
plt.xlabel("Index")
plt.ylabel("TTR")

# Dashed red line marking the average TTR over all windows
plt.axhline(average_ttr, color='red', linestyle='--', label=f'Average = {average_ttr:.2f}')

# Without a legend the label given to axhline above is never displayed
plt.legend()

plt.show()