In [1]:
import string
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import random

# Read in Data

In [2]:
# Read the corpus one line per list entry, with the newline characters removed.
# Iterating over the file object (instead of calling readlines()) avoids holding
# a second, throwaway list of all raw lines in memory while the stripped list
# is being built.
with open('data/training/training-data.1m') as f:
    train_data = [line.strip('\n') for line in f]

In [3]:
# Number of lines read from the training file.
len(train_data)

1000000

In [4]:
# Peek at the first line as it was read from disk (no pre-processing yet).
train_data[0]

'The U.S. Centers for Disease Control and Prevention initially advised school systems to close if outbreaks occurred , then reversed itself , saying the apparent mildness of the virus meant most schools and day care centers should stay open , even if they had confirmed cases of swine flu .'

# Pre-processing

In [5]:
# remove punctuation and lower case all words
def prep(data):
    """Strip ASCII punctuation from every string in `data` and lower-case it.

    data -- iterable of strings.

    Returns a new list with one cleaned string per input string. Whitespace is
    left untouched, so deleting a stand-alone punctuation token leaves a
    doubled space behind.
    """
    if hasattr(str, 'maketrans'):
        # Python 3: str.translate() takes a single table argument, so build a
        # table whose third argument lists the characters to delete.
        table = str.maketrans('', '', string.punctuation)
        return [x.translate(table).lower() for x in data]
    # Python 2 byte strings: translate(None, chars) deletes the given characters.
    return [x.translate(None, string.punctuation).lower() for x in data]

In [6]:
# Replace the raw lines with their cleaned versions (punctuation stripped,
# lower-cased). This rebinds train_data, so the raw text is gone afterwards.
train_data = prep(train_data)

In [7]:
# The first line again, now after prep(): lower-cased and without punctuation.
# Doubled spaces remain where a stand-alone punctuation token was removed.
train_data[0]

'the us centers for disease control and prevention initially advised school systems to close if outbreaks occurred  then reversed itself  saying the apparent mildness of the virus meant most schools and day care centers should stay open  even if they had confirmed cases of swine flu '

# Build Word Dictionary
Map every word in the vocabulary to the number of times it appears in the training data.

In [8]:
def create_dict(data):
    """Count how often each whitespace-separated token occurs across `data`.

    data -- iterable of strings.

    Returns a dict mapping token -> number of occurrences.
    """
    counts = {}
    for sentence in data:
        for token in sentence.split():
            # Unseen tokens start from 0, so the first sighting stores 1.
            counts[token] = counts.get(token, 0) + 1
    return counts

In [9]:
# word -> occurrence count over the whole pre-processed corpus.
dictionary_counts = create_dict(train_data)

In [10]:
# Number of distinct words in the pre-processed corpus.
len(dictionary_counts) # vocab size = 299293 in the recorded run

299293

## Prune Word Dictionary

In [11]:
# Keep only the words whose count is strictly greater than a threshold.
def prune_dict(dictionary, threshold):
    """Return a new dict holding the entries of `dictionary` whose value exceeds `threshold`.

    Entries whose value equals the threshold are dropped as well. The input
    dict is not modified.
    """
    return {word: count for word, count in dictionary.items() if count > threshold}

In [12]:
def get_avg_count(dictionary):
    """Return the arithmetic mean of the values in `dictionary` as a float.

    Raises ZeroDivisionError when `dictionary` is empty.
    """
    total = sum(dictionary.values())
    return total / float(len(dictionary))

In [13]:
# Prune the dictionary with threshold = mean word count: only words that occur
# more often than the average word are kept.
pruned_dictionary = prune_dict(dictionary_counts, get_avg_count(dictionary_counts))

In [14]:
# Vocabulary size after pruning.
len(pruned_dictionary) # 15,675 in the recorded run

15675

In [15]:
# The full count table is not used again in the cells shown here; drop the
# name so the ~300k-entry dict can be garbage-collected.
del dictionary_counts

# Subsampling
Randomly drop words with a probability determined by their frequency: the more common a word is, the more likely it is to be removed. This is similar to removing stop words.

In [16]:
# Returns True with probability p (always for p >= 1, never for p <= 0).
def should_remove(p, rng=None):
    """Draw once from a uniform [0, 1) generator and report whether it fell below `p`.

    p   -- removal probability; values outside [0, 1] simply saturate.
    rng -- optional object with a random() method (e.g. random.Random(seed)).
           The module-level `random` generator is used when omitted.

    Always returns a plain Python bool, even when `p` is a numpy scalar.
    """
    source = random if rng is None else rng
    return bool(source.random() < p)


# Remove words based on a probability determined by their frequency.
# The dictionary returned holds an index (sequence number) for each kept word.
def subsample_dict(dictionary, t=1e-5, seed=None):
    """Randomly drop frequent words and assign an index to every survivor.

    dictionary -- maps word -> occurrence count.
    t          -- frequency threshold of the subsampling formula
                  (defaults to the previously hard-coded 1e-5).
    seed       -- optional seed. When given, a private random.Random(seed) is
                  used so the result is reproducible and the module-level
                  generator is left untouched. When None, draws come from the
                  module-level `random` generator as before.

    Returns a dict that maps 'UNK' to 0 and every kept word to consecutive
    indexes 1, 2, ... in the iteration order of `dictionary`.
    """
    rng = None if seed is None else random.Random(seed)
    new_dict = {'UNK': 0}
    next_index = 1
    total_counts = sum(dictionary.values())
    for word, occurrences in dictionary.items():
        f = occurrences / float(total_counts)  # the word's relative frequency
        # Probability that the word will get removed. It is non-positive once
        # f drops to about 2.6 * t or below, so such rare words always survive.
        p = ((f - t) / f) - np.sqrt(t / f)
        if not should_remove(p, rng):
            # store `occurrences` here instead to keep counts rather than indexes
            new_dict[word] = next_index
            next_index += 1
    return new_dict


In [17]:
# Seed the module-level RNG first: the subsampling step draws from `random`,
# so without a fixed seed the surviving vocabulary (and every word index)
# changes on each run of the notebook.
SUBSAMPLE_SEED = 42
random.seed(SUBSAMPLE_SEED)
word_dictionary = subsample_dict(pruned_dictionary)

In [18]:
# Vocabulary size after subsampling, counting the 'UNK' entry. An earlier run
# gave 13,890 while the recorded output is 13888: the subsampling is random,
# so the exact figure depends on the RNG state.
len(word_dictionary)

13888

In [19]:
# The context vocabulary starts out as a shallow copy of the word vocabulary:
# same word -> index mapping, but a separate dict object.
context_dictionary = word_dictionary.copy()

# Pickle Dictionaries

In [20]:
import cPickle as pickle

In [21]:
# Pickle both vocabularies into a single file.
# The `with` block closes the file even if pickle.dump() raises, so a failed
# dump no longer leaks the open handle.
with open('word-equal-context_subsampled.p', 'wb') as f:   # 'wb' instead of 'w' for binary file
    pickle.dump({"word_pruned_dictionary" : word_dictionary,
                 "context_pruned_dictionary": context_dictionary}, f, -1)       # -1 specifies highest binary protocol