In [1]:
import sys
import io
import os

import numpy as np
from random import randint
from random import random

from collections import *

import re

from os import listdir
from os.path import isfile, join

In [2]:
def clean_text(text):
    '''Lower-case text, expand common English contractions and strip punctuation.

    Args:
        text: The raw string to normalise.

    Returns:
        The lower-cased string with the contraction rules below applied in
        order, followed by removal of the punctuation characters listed in
        the final character class.
    '''

    text = text.lower()

    # (pattern, replacement) pairs, applied strictly in this order.
    # Word-specific "...'s" rules come before the catch-all "'s" rule;
    # placed after it they can never match, because the catch-all has
    # already consumed every "'s".
    substitutions = (
        (r"i'm", "i am"),
        (r"he's", "he is"),
        (r"she's", "she is"),
        (r"it's", "it is"),
        (r"that's", "that is"),
        (r"what's", "what is"),
        (r"where's", "where is"),
        (r"how's", "how is"),
        # Catch-all: every remaining "'s" (possessives included) becomes " is".
        (r"'s", " is"),
        (r"\'ll", " will"),
        (r"\'ve", " have"),
        (r"\'re", " are"),
        (r"\'d", " would"),
        # Irregular negations must precede the generic "n't" rule.
        (r"won't", "will not"),
        (r"can't", "cannot"),
        (r"n't", " not"),
        # Dropped-g forms such as "goin'" -> "going".
        (r"n'", "ng"),
        (r"'bout", "about"),
        (r"'til", "until"),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    # Strip remaining punctuation. Inside this class "'-(" is a character
    # RANGE covering only "'" and "(", so a literal hyphen is NOT removed.
    # Typographic quotes are removed here without being expanded: the rules
    # above only match the straight apostrophe.
    # NOTE(review): confirm whether hyphens were meant to be stripped too.
    text = re.sub(r"[”“’‘'-()\"#/@;:<>{}`+=~|!?]", "", text)

    return text

In [3]:
### Read/process inputs
input_path = './raw_data/'
all_data_sents = []

# os.listdir returns entries in arbitrary order; sorting makes the corpus
# (and therefore everything derived from it) identical on every run.
files = sorted(join(input_path, f) for f in listdir(input_path) if isfile(join(input_path, f)))
for filename in files:
    if not filename.endswith('.txt'):
        continue

    # Undecodable bytes are silently dropped (errors='ignore').
    with open(filename, encoding='utf-8', errors='ignore') as f:
        text = f.read().lower()

    # Collapse blank lines, drop double quotes / ellipses / em dashes, and
    # put spaces around hyphens, full stops and commas.
    text = (
        text.replace('\n\n', '\n')
        .replace('"', '')
        .replace('-', ' - ')
        .replace("...", "")
        .replace("…", "")
        .replace("—", "")
        .replace(". ", " . ")
        .replace(", ", " , ")
    )
    # Flatten each file onto a single line.
    text = ' '.join(text.split("\n"))
    all_data_sents.append(text)

# Number of .txt files that were read.
print(len(all_data_sents))

all_data_string = ' '.join(all_data_sents)
print(len(all_data_string))

# Character count after contraction expansion and punctuation stripping.
all_data_string = clean_text(all_data_string)
print(len(all_data_string))

5
9609400
9534701


In [4]:
# Number of sentences held out as prompts for the generation demo.
TEST_SET_SIZE = 20

# Split the cleaned corpus on '.' and drop fragments that are only whitespace.
sents = [sent for sent in all_data_string.split('.') if sent.strip() != '' or sent == '\n']
# train_set is the same list object as sents, so the pops below shrink both.
train_set, test_set = sents, []

for x in range(TEST_SET_SIZE):
    # randint includes BOTH endpoints, so the upper bound has to be the last
    # valid index; len(...) itself would make pop() raise IndexError.
    index = randint(0, len(train_set) - 1)
    test_set.append(train_set.pop(index).strip())

# Re-assemble the remaining sentences into one training string.
train_set = '.'.join(train_set)

In [5]:
def xrange(x):
    """Python 2 style ``xrange``: return an iterator over ``0 .. x - 1``."""
    counter = range(x)
    return iter(counter)

def train_char_lm(data, order=4):
    """Fit a character-level n-gram model on ``data``.

    The text is left-padded with ``order`` copies of ``"~"`` so that the first
    characters of ``data`` also have a full-length history.

    Args:
        data: Training text.
        order: Number of preceding characters used as the history.

    Returns:
        A dict mapping each ``order``-character history seen in the padded
        text to a list of ``(next_char, probability)`` pairs, where the
        probability is the character's relative frequency after that history.
    """
    padded = "~" * order + data
    counts = defaultdict(Counter)
    # `end` is the index of the character being predicted; the history is the
    # `order` characters immediately before it.
    for end in range(order, len(padded)):
        counts[padded[end - order:end]][padded[end]] += 1

    def normalize(counter):
        # Turn raw follower counts into relative frequencies.
        total = float(sum(counter.values()))
        return [(char, n / total) for char, n in counter.items()]

    return {history: normalize(followers) for history, followers in counts.items()}

In [6]:
def generate_letter(lm, history, order):
    """Sample the next character given the last ``order`` characters of ``history``.

    Args:
        lm: Mapping from history string to a list of ``(char, probability)``
            pairs, as returned by ``train_char_lm``.
        history: Text generated so far; only its last ``order`` characters
            are used as the lookup key.
        order: History length the model was trained with.

    Returns:
        One character drawn according to the listed probabilities. ``None``
        only if the distribution for this history is empty.

    Raises:
        KeyError: If the history was never seen during training.
    """
    # history[-0:] is the WHOLE string, so order == 0 needs the explicit
    # empty context.
    context = history[-order:] if order > 0 else ""
    dist = lm[context]

    remaining = random()
    char = None
    for char, prob in dist:
        remaining -= prob
        if remaining <= 0:
            return char
    # Floating-point rounding can leave the probabilities summing to slightly
    # less than the drawn number; fall back to the last candidate instead of
    # implicitly returning None.
    return char

In [14]:
def generate_text(lm, order, history=None, nletters=1000):
    """Generate ``nletters`` characters from the character model ``lm``.

    Args:
        lm: Model returned by ``train_char_lm``.
        order: History length the model was trained with.
        history: Optional seed text. Only its FIRST ``order`` characters are
            used (left-padded with ``"~"`` when shorter), so the output
            continues from that prefix rather than from the end of the seed.
            Defaults to the all-``"~"`` start-of-text history.
        nletters: Number of characters to generate.

    Returns:
        The generated characters joined into one string; the seed itself is
        not included.
    """
    if history is None:
        history = "~" * order
    else:
        history = history[:order].rjust(order, "~")

    out = []
    for _ in range(nletters):
        c = generate_letter(lm, history, order)
        out.append(c)
        # Slide the window: keep only the last `order` characters. The
        # order == 0 case is explicit because s[-0:] would keep everything.
        history = (history + c)[-order:] if order > 0 else ""
    return "".join(out)

In [10]:
# History length, in characters, that the model conditions on.
ORDER = 20

# NOTE(review): the model is fitted on the full cleaned corpus rather than on
# train_set, so the held-out test_set sentences are part of the training
# text -- confirm this is intended.
lm = train_char_lm(data=all_data_string, order=ORDER)

In [24]:
# randint includes both endpoints, so the upper bound must be the last valid
# index of test_set; using the set size itself can raise IndexError.
test_index = randint(0, len(test_set) - 1)
test_input = test_set[test_index]
print("TEST INPUT:\n{}\n".format(test_input))
print("GENERATED TEXT:\n")
# Only the first ORDER characters of test_input seed the generation.
print(generate_text(lm, ORDER, test_input))

TEST INPUT:
it was the best counsel he had for her , though he wished it sounded wiser

GENERATED TEXT:

sel he had for her , though he wished it sounded wiser. he had written to each of his three surviving sons as well , to help them remember the father who had sent him to the wall , insisting that he had to tell him something , but oft as not he would have forgotten what he meant to say by the time that sam arrived . even when he did recall , his talk was all a jumble . he spoke of dreams and never named the dreamer , of a glass candle that could not be helped , he told his nephew . our fleet was doomed in any case. even from atop the merlonhe had been too short to see over the ramparts , so hed had them boost him upthe flames and smoke and chaos of battle made it impossible for tyrion to see what was happening , riders were pouring over the ditch banks and galloping through the woods and picking flowers, cersei said . i have a kingdom to rule. only one , your grace who rules the oth

In [23]:
# Sample from the model using a hand-written prompt as the seed history.
print(generate_text(lm=lm, order=ORDER, history='another two thousand'))

. in a year i shall be in westeros , said dany when she had heard the translation . only then did she stop to shake the water from his coat and bare his teeth at the rain . when at last robb gave jeyne one final kiss , dispatched a dozen men to take her back to riverrun , and he was rid of the big homely wench as well . i could not have named them all myself . that was a lie , but if joffrey is the lawful king and robb a traitor . among the brotherhood of the failed and the fallen , the disgraced and the disinherited . this is my army . this is our best hope. he turned to harry strickland. homeless harry looked little like a warrior . portly , with a big round head , mild grey eyes , and thinning hair that he brushed sideways to conceal a bald spot , strickland sat in a camp chair soaking his feet in a tub of salt water . you will pardon me if i do not weep for shae . i confess , i do not understand . words he had never meant to speak came tumbling out of her . yes . i will . i would l