In [None]:
import spacy
import seaborn as sns

from spacy.vocab import Vocab
from collections import Counter
from collections import defaultdict
from matplotlib import pyplot as plt

sns.set_style("darkgrid")

# Warmup: Computing word frequencies

In [None]:
def count_word_frequencies(YOUR_FILE, ENCODING):
    """Count how often each token occurs in a text file.

    Every line is lowercased and split on whitespace; each resulting
    token contributes one count.

    Args:
        YOUR_FILE: Path of the text file to read.
        ENCODING: Text encoding passed to ``open``.

    Returns:
        A ``Counter`` mapping each token to its number of occurrences.
    """
    token_counts = Counter()
    with open(YOUR_FILE, encoding=ENCODING) as corpus:
        for raw_line in corpus:
            # Counter.update adds one count per element of the iterable.
            token_counts.update(raw_line.lower().split())
    return token_counts

In [None]:
def count_word_pairs(YOUR_FILE, ENCODING):
    """Count, for every token, which tokens directly follow it on the same line.

    Every line is lowercased and split on whitespace. Pairs never span
    two lines.

    Args:
        YOUR_FILE: Path of the text file to read.
        ENCODING: Text encoding passed to ``open``.

    Returns:
        A ``defaultdict`` of ``Counter`` objects: ``result[w1][w2]`` is the
        number of times ``w2`` immediately followed ``w1``.
    """
    follower_counts = defaultdict(Counter)
    with open(YOUR_FILE, encoding=ENCODING) as corpus:
        for raw_line in corpus:
            words = raw_line.lower().split()
            # Walk over every position that has a right-hand neighbour.
            for position in range(len(words) - 1):
                follower_counts[words[position]][words[position + 1]] += 1
    return follower_counts

In [None]:
# Token frequencies of the Wikipedia corpus; show the ten most frequent tokens.
wiki_freqs = count_word_frequencies("a1_data/wikipedia.txt", 'utf-8')
wiki_freqs.most_common(10)

In [None]:
# Token frequencies of the Europarl corpus; show the ten most frequent tokens.
euro_freqs = count_word_frequencies("a1_data/europarl.txt", 'utf-8')
euro_freqs.most_common(10)

In [None]:
# Token frequencies of the books corpus (read as ISO-8859-1); show the ten most frequent tokens.
book_freqs = count_word_frequencies("a1_data/books.txt", 'ISO-8859-1')
book_freqs.most_common(10)

In [None]:
# Tokens that most often follow "red" in the Wikipedia corpus.
# NOTE(review): the slice starts at index 1, so the single most frequent
# follower is not displayed (ranks 2-10 are shown) - confirm this is intended.
wiki_pairs = count_word_pairs("a1_data/wikipedia.txt", 'utf-8')
wiki_pairs["red"].most_common(10)[1:]

In [None]:
# Tokens that most often follow "red" in the Europarl corpus.
# NOTE(review): the slice starts at index 1, so the single most frequent
# follower is not displayed (ranks 2-10 are shown) - confirm this is intended.
euro_pairs = count_word_pairs("a1_data/europarl.txt", 'utf-8')
euro_pairs["red"].most_common(10)[1:]

In [None]:
# Tokens that most often follow "red" in the books corpus.
# NOTE(review): the slice starts at index 1, so the single most frequent
# follower is not displayed (ranks 2-10 are shown) - confirm this is intended.
book_pairs = count_word_pairs("a1_data/books.txt", 'ISO-8859-1')
book_pairs["red"].most_common(10)[1:]

# Investigating the word frequency distribution 

In [None]:
def plot_rank_frequency(freqs, plot_length):
    """Plot word frequency against frequency rank on linear and log-log axes.

    Args:
        freqs: A ``Counter`` mapping words to occurrence counts.
        plot_length: Maximum number of most common words to plot. If the
            counter holds fewer words, all of them are plotted.

    Returns:
        None. Two side-by-side axes are drawn on a new matplotlib figure:
        a linear rank/frequency plot and the same data in log-log scale.
    """
    fig_size = (15, 5)
    font_size = 15

    # most_common(n) yields at most n entries, so a counter with fewer than
    # plot_length words cannot cause an IndexError below.
    common_words = freqs.most_common(plot_length)
    counts = [count for _, count in common_words]
    # Ranks are 1-based: rank 0 has no position on a logarithmic axis, which
    # would silently drop the most frequent word from the log-log plot.
    ranks = list(range(1, len(counts) + 1))

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=fig_size)
    ax1.plot(ranks, counts)
    ax1.set_xlabel("Rank of the word")
    ax1.set_ylabel("Frequency of the word")
    ax1.set_title("Rank/frequency plot of the " + str(len(ranks)) + " most common words.", fontsize=font_size)

    ax2.loglog(ranks, counts)
    ax2.set_xlabel("Rank of the word")
    ax2.set_ylabel("Frequency of the word")
    ax2.set_title("Rank/frequency plot, in log-log scale.", fontsize=font_size)

In [None]:
# Rank/frequency plots (linear and log-log) for up to 100 of the most common tokens in the books corpus.
plot_rank_frequency(book_freqs, 100)

In [None]:
# Rank/frequency plots (linear and log-log) for up to 100 of the most common tokens in the Wikipedia corpus.
plot_rank_frequency(wiki_freqs, 100)

In [None]:
# Rank/frequency plots (linear and log-log) for up to 100 of the most common tokens in the Europarl corpus.
plot_rank_frequency(euro_freqs, 100)

## Reflection

The rank/frequency plots for each corpus show that a few short words occur very frequently, but that the frequency decreases quickly for less common words. This uneven distribution should make it harder to train machine learning models on language data: key words occur rarely in the training data, yet they are very important for the model's understanding.

Zipf's law states that for a given corpus, the frequency of any word is inversely proportional to its rank in the frequency table. Thus, on a log-log scale we expect the rank/frequency plot to take the form of a straight line (with a negative slope). This appears to be a good fit for the Wikipedia corpus. For both the European Parliament and the Book Reviews corpora, the rank/frequency plot has a shallower slope initially but follows Zipf's law well for higher-rank (i.e. less frequent) words.

# Comparing corpora

In [None]:
# What words are "typical" of the European Parliament corpus when we compare it to the book review corpus,
# or vice versa? You will have to come up with your own operationalization of the notion of "typical" here.

# My idea: find the 50 most common words for each corpus and
# check which words are on the top 50 list of only one of the corpora.

compare_length = 50

# most_common(n) returns exactly the n most frequent (word, count) pairs
# (fewer if the corpus has fewer distinct words).
euro_top = euro_freqs.most_common(compare_length)
book_top = book_freqs.most_common(compare_length)

# Sets give constant-time membership tests instead of nested list scans.
euro_top_words = {word for word, _ in euro_top}
book_top_words = {word for word, _ in book_top}

# Words that occur on both top lists, in book-corpus rank order
common_words = [word for word, _ in book_top if word in euro_top_words]

# (word, count) entries that are dropped from each list because the word is shared
book_remove = [entry for entry in book_top if entry[0] in euro_top_words]
euro_remove = [entry for entry in euro_top if entry[0] in book_top_words]

# What is left is "typical" for one corpus relative to the other;
# the original frequency order is preserved.
book_common = [entry for entry in book_top if entry[0] not in euro_top_words]
euro_common = [entry for entry in euro_top if entry[0] not in book_top_words]

In [None]:
# These words are typical for the European Parliament corpus, compared to the Book Review corpus:
euro_common

In [None]:
# These words are typical for the Book Review corpus, compared to the European Parliament corpus:
book_common

# Preprocessing text for machine learning

In [None]:
# Build a word -> integer id mapping for the most frequent Wikipedia tokens.
max_voc_size = 1000
word_freqs = wiki_freqs

# most_common(n) returns at most n (word, count) pairs, so a corpus with fewer
# than max_voc_size distinct words needs no special case (the previous else
# branch indexed the Counter itself and raised a TypeError).
word_list = [word for word, _ in word_freqs.most_common(max_voc_size)]

# Ids follow frequency rank and start at 1, so they never collide with 0,
# the value defaultdict(int) yields for words outside the vocabulary.
vocab = {word: index for index, word in enumerate(word_list, start=1)}

# Same mapping, but lookups of unknown words return 0 instead of raising KeyError.
vocab2 = defaultdict(int)
vocab2.update(vocab)

In [None]:
# Id assigned to "the"; a result of 0 would mean the word is not in the vocabulary.
vocab2['the']

In [2]:
# Count the lines of the Wikipedia corpus. Iterating over the handle inside a
# `with` block closes the file afterwards and avoids holding every line in memory.
with open("a1_data/wikipedia.txt", encoding = 'utf-8') as f:
    count = sum(1 for _ in f)

In [3]:
# Number of lines in the Wikipedia corpus file.
count

2200000

In [None]:
# Build a vocabulary from the Wikipedia corpus, then iterate over the corpus in batches.
# NOTE(review): `Vocab` is imported from `spacy.vocab` at the top of this notebook, but
# spaCy's Vocab does not appear to accept `prune_by_total`/`batch_size` or to provide a
# `batches()` method. This cell presumably targets a different (course-provided?) Vocab
# class - confirm the import before running.
#dataset = ... something ...

# First pass over the file: construct the vocabulary object.
with open("a1_data/wikipedia.txt", encoding = 'utf-8') as f:
    voc = Vocab(f, prune_by_total=1000, batch_size=8)

    # go through the lines and build the vocabulary
    #voc.build_vocab(f)

# Second pass: the file is reopened so that reading starts from the first line again.
with open("a1_data/wikipedia.txt", encoding = 'utf-8') as f:
    for b in voc.batches(f):
        # b is a matrix of shape (max_length, batch_size)
        # where max_length is the length of the longest
        # line in the batch
        # NOTE(review): this prints every batch of the whole corpus - likely very
        # verbose output; consider limiting it while exploring.
        print(b)

In [None]:
# Display the vocabulary object created in the previous cell.
voc