In [1]:
import string
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

# Read in Data

In [2]:
# Load the training corpus: one list entry per line of the file,
# with the newline characters stripped from both ends.
train_data = []
with open('data/training/training-data.1m') as f:
    for line in f:
        train_data.append(line.strip('\n'))

In [3]:
len(train_data)


1000000

In [4]:
train_data[0]

'The U.S. Centers for Disease Control and Prevention initially advised school systems to close if outbreaks occurred , then reversed itself , saying the apparent mildness of the virus meant most schools and day care centers should stay open , even if they had confirmed cases of swine flu .'

# Pre-processing

In [5]:
def prep(data):
    """Strip ASCII punctuation from each string and lowercase it.

    Works on both Python 2 and Python 3: the two-argument
    ``str.translate(None, deletechars)`` form only exists on Python 2, so a
    deletion table is built with ``str.maketrans`` when that is available.

    Args:
        data: iterable of strings.

    Returns:
        A new list in which every character of ``string.punctuation`` has
        been removed from each input string and the rest lowercased.
        Whitespace is left untouched, so removing a stand-alone punctuation
        token can leave two consecutive spaces.
    """
    try:
        # Python 3: translate() takes a single table; map every punctuation
        # character to "delete".
        table = str.maketrans('', '', string.punctuation)
    except AttributeError:
        # Python 2: str has no maketrans attribute; use the original
        # translate(table, deletechars) form with no mapping table.
        return [x.translate(None, string.punctuation).lower() for x in data]
    return [x.translate(table).lower() for x in data]

In [6]:
train_data = prep(train_data)

In [7]:
train_data[0]

'the us centers for disease control and prevention initially advised school systems to close if outbreaks occurred  then reversed itself  saying the apparent mildness of the virus meant most schools and day care centers should stay open  even if they had confirmed cases of swine flu '

# Build Word And Context Dictionary
The Word Dictionary maps every word in the vocabulary to the number of times it appears in the training data.
For now, the Context Dictionary is going to be the same as the Word Dictionary.

In [8]:
def create_dict(data):
    """Count occurrences of every whitespace-separated token in ``data``.

    Args:
        data: iterable of strings; each string is split on whitespace.

    Returns:
        dict mapping each token to the number of times it was seen.
    """
    counts = {}
    for sentence in data:
        for token in sentence.split():
            # Unseen tokens start from 0, so the first sighting stores 1.
            counts[token] = counts.get(token, 0) + 1
    return counts

In [9]:
dictionary_counts = create_dict(train_data)

In [10]:
len(dictionary_counts) # vocab size = 299293

299293

## Prune Word Dictionary

In [11]:
# Create a new, shorter dictionary from which words that appear less often than a given threshold are removed.
# The removed words are represented by a single 'UNK' entry instead.
# Instead of storing counts, the new dictionary stores a unique integer index for each word.
def prune_dict(dictionary, threshold):
    """Build a word -> index mapping that keeps only frequent words.

    Index 0 is reserved for the placeholder key ``'UNK'``. Every word whose
    count is greater than or equal to ``threshold`` receives the next free
    index (1, 2, 3, ...) in the iteration order of ``dictionary``.

    Args:
        dictionary: dict mapping each word to its count.
        threshold: minimum count (inclusive) a word needs to be kept.

    Returns:
        dict mapping ``'UNK'`` to 0 and every kept word to a unique
        positive integer index.
    """
    pruned_dict = {'UNK': 0}
    count = 1
    for word in dictionary:
        # A token literally spelled 'UNK' must not overwrite the reserved
        # index 0; it stays folded into the placeholder entry.
        if word == 'UNK':
            continue
        if dictionary[word] >= threshold:
            pruned_dict[word] = count
            count += 1
    return pruned_dict
    
# To Store counts:
#
#     pruned_dict = {'UNK':0}
#     for word in dictionary:
#         if dictionary[word] < threshold:
#             pruned_dict['UNK'] +=1
#         else:
#             pruned_dict[word] = dictionary[word]
#     return pruned_dict

In [12]:
def get_avg_count(dictionary):
    """Return the mean of the dictionary's values as a float.

    Args:
        dictionary: dict mapping each word to a numeric count.

    Returns:
        Sum of all values divided by the number of keys.
    """
    total = sum(dictionary.values())
    n_words = float(len(dictionary))
    return total / n_words

In [13]:
get_avg_count(dictionary_counts)

75.05992455553589

In [14]:
# Use the mean word count as the pruning threshold: only words that occur
# at least as often as the average get their own index.
word_pruned_dictionary = prune_dict(dictionary_counts, get_avg_count(dictionary_counts))

In [None]:
# Temporarily until we get the other version of the pruning.
# NOTE: this binds a second name to the same dict object (no copy is made),
# so an in-place change made through either name is visible through both.
context_pruned_dictionary = word_pruned_dictionary

In [60]:
# Inspect the pruned word -> index mapping (the notebook defines it as
# word_pruned_dictionary; the bare name pruned_dictionary is never assigned).
word_pruned_dictionary

{'writings': 4881,
 'yellow': 1028,
 'four': 3117,
 'prices': 2039,
 'woods': 1,
 'commented': 9836,
 'insiders': 4079,
 'woody': 2,
 'cyprus': 2040,
 'gabrielle': 1029,
 'papandreou': 9834,
 'increase': 10191,
 'granting': 3032,
 'eligible': 6769,
 'electricity': 4882,
 'unanswered': 4883,
 'list': 4770,
 'lord': 2053,
 'meadows': 5825,
 'sinking': 2043,
 'hormone': 10897,
 'hacked': 11729,
 'regional': 1030,
 'dell': 12807,
 'foul': 2038,
 'taj': 15627,
 'fur': 2044,
 'stabbed': 5826,
 'bringing': 3,
 'wooded': 4,
 'basics': 2045,
 'internally': 3034,
 'grueling': 5,
 'andre': 3516,
 'wooden': 6,
 'wednesday': 7,
 'coventry': 15334,
 'delivered': 10288,
 'jihad': 6768,
 'succession': 13766,
 'straight': 6720,
 'charter': 12808,
 'specially': 4167,
 'tired': 3035,
 'miller': 5849,
 'hanging': 7718,
 'bacon': 10899,
 'frederick': 8037,
 'pulse': 3036,
 'budget': 7893,
 '270': 8,
 'elegant': 2069,
 'second': 10900,
 'summer': 10557,
 '275': 9,
 'firstquarter': 5828,
 'sustaining': 11901

In [61]:
# Size of the pruned vocabulary, including the 'UNK' placeholder entry.
len(word_pruned_dictionary)

15676

# Pickle Dictionaries

In [1]:
import cPickle

In [None]:
# # Pickle:
# f = open('dictionary_pickle.p', 'wb')   # 'wb' instead of 'w' for a binary file
# cPickle.dump(dictionary, f, -1)       # -1 specifies the highest binary protocol
# f.close()

In [None]:
# # Unpickle:
# f = open('dictionary_pickle.p', 'rb')   # 'rb' for reading a binary file
# dictionary = cPickle.load(f)
# f.close()