In [1]:
import sys
import io
import os

import numpy as np
from random import randint
from random import random

from collections import *

import re

from os import listdir
from os.path import isfile, join

In [2]:
def clean_text(text):
    '''Clean text by removing unnecessary characters and altering the format of words.

    The text is lower-cased, common English contractions written with a
    straight apostrophe are expanded, and quote/bracket/punctuation characters
    are deleted.

    Args:
        text: the string to normalise.

    Returns:
        The normalised string. Hyphens, commas and full stops are kept.

    Notes:
        * Every remaining "'s" becomes " is", so possessives are expanded too
          ("mountain's" -> "mountain is").
        * Contractions written with a curly apostrophe are not expanded; the
          apostrophe is simply deleted by the final step.
        * "~" is deleted as well.
    '''
    text = text.lower()

    # Ordered rewrite rules: each one is applied to the output of the previous
    # one, so specific forms ("won't", "can't") must precede the generic "n't".
    # No separate rules are needed for "he's", "that's", "where's", ... because
    # the generic "'s" rule already turns all of them into "<word> is".
    substitutions = (
        (r"i'm", "i am"),
        (r"'s", " is"),
        (r"'ll", " will"),
        (r"'ve", " have"),
        (r"'re", " are"),
        (r"'d", " would"),
        (r"won't", "will not"),
        (r"can't", "cannot"),
        (r"n't", " not"),
        (r"n'", "ng"),          # dropped g: "goin'" -> "going"
        (r"'bout", "about"),
        (r"'til", "until"),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    # Characters to delete. They are listed one by one on purpose: writing
    # "'-(" inside a character class would be parsed as a range.
    return re.sub(r"[”“’‘'()\"#/@;:<>{}`+=~|!?]", "", text)

In [3]:
### Read/process inputs
input_path = './raw_data/'
all_data_sents = []

# listdir() returns names in arbitrary order; sorting makes the concatenated
# corpus (and everything derived from it) the same on every run and machine.
files = [join(input_path, f) for f in sorted(listdir(input_path)) if isfile(join(input_path, f))]
for filename in files:
    if filename.endswith('.txt'):
        # Undecodable bytes are dropped rather than aborting the whole load.
        with open(filename, encoding='utf-8', errors='ignore') as f:
            text = f.read().lower()
        # Strip double quotes and ellipses/dashes, and put spaces around
        # hyphens, full stops and commas so they become separate tokens.
        text = text.replace('\n\n', '\n')\
            .replace('"', '')\
            .replace('-', ' - ')\
            .replace("...", "")\
            .replace("…", "")\
            .replace("—", "")\
            .replace(". ", " . ")\
            .replace(", ", " , ")
        # Flatten each file to a single line.
        text = ' '.join(text.split("\n"))
        all_data_sents.append(text)

# Number of files loaded.
print(len(all_data_sents))

all_data_string = ' '.join(all_data_sents)
# Corpus size in characters before and after clean_text().
print(len(all_data_string))

all_data_string = clean_text(all_data_string)
print(len(all_data_string))

5
9609400
9534701


In [4]:
# Prepare test set
TEST_SET_SIZE = 20

# Sentences are the non-blank pieces between full stops.
sents = [sent for sent in all_data_string.split('.') if sent.strip() != '']

# Work on a copy so `sents` keeps every sentence and re-running this cell
# always starts from the same list.
train_sents, test_set = list(sents), []

for _ in range(TEST_SET_SIZE):
    # randint() includes both end points, so the upper bound has to be the
    # last valid index of the (shrinking) list, not its length.
    index = randint(0, len(train_sents) - 1)
    test_set.append(train_sents.pop(index).strip())

# Re-join the remaining sentences into one training string.
train_set = '.'.join(train_sents)

In [5]:
def xrange(x):
    """Python 2 style ``xrange`` stand-in: an iterator over 0, 1, ..., x - 1."""
    numbers = range(x)
    return iter(numbers)

def train_char_lm(data, order=4):
    """Build a character-level n-gram model from a string.

    The data is prefixed with `order` copies of "~" so that the first
    characters also have a full-length context.

    Args:
        data: training text.
        order: number of preceding characters used as context.

    Returns:
        A dict mapping each context string of length `order` to a list of
        (next_char, probability) pairs, in order of first occurrence. The
        probabilities are the relative frequencies of the characters that
        followed that context.
    """
    padded = "~" * order + data

    # Count which character follows each window of `order` characters.
    follower_counts = defaultdict(Counter)
    for start in range(len(padded) - order):
        end = start + order
        follower_counts[padded[start:end]][padded[end]] += 1

    # Turn the raw counts into relative frequencies.
    model = {}
    for context, followers in follower_counts.items():
        total = float(sum(followers.values()))
        model[context] = [(ch, n / total) for ch, n in followers.items()]
    return model

In [6]:
def generate_letter(lm, history, order):
    """Sample the next character given the most recent context.

    Args:
        lm: mapping from a context string to a sequence of
            (char, probability) pairs, as returned by train_char_lm.
        history: text generated so far; only its last `order` characters
            are used.
        order: context length the model was trained with.

    Returns:
        One sampled character, or None if the distribution for the context
        is empty.

    Raises:
        KeyError: if `lm` is a plain dict and the context was never seen.
    """
    # history[-0:] would be the whole string, so order 0 needs an explicit
    # empty context.
    history = history[-order:] if order > 0 else ""
    dist = lm[history]
    x = random()
    c = None
    for c, v in dist:
        x -= v
        if x <= 0:
            break
    # If rounding left the probabilities summing to slightly less than the
    # drawn number, the loop ends without a hit; fall back to the last
    # candidate instead of returning None.
    return c

In [7]:
def generate_text(lm, order, history=None, nletters=1000):
    """Generate `nletters` characters from a character-level model.

    Args:
        lm: model as returned by train_char_lm.
        order: context length the model was trained with.
        history: optional seed text. Only its FIRST `order` characters are
            used as the starting context (anything after them is ignored);
            a shorter seed is left-padded with "~". With no seed the context
            is all "~", i.e. the start of the training text.
        nletters: number of characters to generate.

    Returns:
        The generated characters joined into one string. The seed itself is
        not included.

    Raises:
        KeyError: propagated from generate_letter's lookup in `lm` when a
            context is not a key of the model.
    """
    if history is None:
        history = "~" * order
    else:
        history = history[0:order].rjust(order, "~")

    out = []
    for _ in range(nletters):
        c = generate_letter(lm, history, order)
        # Keep exactly `order` characters of context. text[-0:] would be the
        # whole string, so order 0 is handled explicitly.
        history = (history + c)[-order:] if order > 0 else ""
        out.append(c)
    return "".join(out)

In [8]:
# Number of preceding characters the model conditions on.
ORDER = 20

# NOTE(review): the model is fitted on all_data_string, which still contains
# the sentences that were moved into test_set; train_set is not used here.
lm = train_char_lm(data=all_data_string, order=ORDER)

In [9]:
# Pick one held-out sentence at random. randint() includes both end points,
# so the upper bound must be the last valid index of test_set.
test_index = randint(0, len(test_set) - 1)
test_input = test_set[test_index]
print("TEST INPUT:\n{}\n".format(test_input))
print("GENERATED TEXT:\n")
# generate_text seeds from the first ORDER characters of the sentence.
print(generate_text(lm, ORDER, test_input))

TEST INPUT:
metal screamed on metal as the spearhead slid off the mountain is chest , slicing through the surcoat and leaving a long bright scratch on the steel beneath

GENERATED TEXT:

tal as the spearhead slid off the mountain is chest , slicing through the grey - green waters on billowing purple wings . arya could hear the rush of water and the creak of the mill is great wooden waterwheel . there was a smell of rain in the dawn air , but no drops were falling yet . flaming arrows flew through the morning mists , trailing pale ribbons of fire , and thudded into the wood , one two three , as he flung it down on top of his king . robin flint was ringed by freys , their daggers rising and falling in perfect time . ser rodrik held the rail and looked out over the half - mile swath of cleared land that lay between the wall and the edge of the forest they had raised their tents of hide and fur , even a crude longhall of logs and woven branches there were horselines to the east , mammoths 

In [11]:
# Generate from a hand-written seed; generate_text only uses its first ORDER
# characters as the starting context.
print(generate_text(lm, ORDER, history='tyrion drank it in his window seat,'))

is window seat , where he sat drinking and watching the sea while the sun darkened over pyke . i have no place here , sam thought anxiously , when her red eyes fell upon him . someone had to help maester aemon up the steps . do not look at me , ever since that time i lost my horse . as if that could be helped . he was white and it was snowing , what did they expect the wind took that one , said grenn , another friend of lord snow is . try to hold the bow steady , sam. it is heavy , the fat boy complained , but he pulled the second arrow all the same . this one went high , sailing through the branches overhead , across the starry sky. snow, the moon murmured . the wolf made no answer . snow crunched beneath his feet . as  as you say , mlady. roose is not pleased . tell your bastard that. he is not my bastard , he wanted to say . who in seven hells do you think goes there did the others take your eyes he rode between the gateposts , one bearing a ram is skull and the other the skull of a