Steps
1. Load training corpus
2. Tokenize
   1. Normalize
3. Create n-grams
4. Count n-grams as history + continuation
5. Estimate probabilities
6. Sample text from model

In [3]:
import pandas as pd
import pathlib

# Read txt files in data/gutenberg and put into a df
def read_txt_files():
    # Get all txt files in data/gutenberg
    
    ################################################ 
    # Data folder path
    path = pathlib.Path("../data/gutenberg")
    ################################################
    
    txt_files = path.glob("*.txt")

    # Read each txt file and put into a df
    data = []
    for txt_file in txt_files:
        with open(txt_file, "r", encoding="utf-8") as f:
            lines = f.readlines()
            # Find title, author, and language
            title = None
            author = None
            language = None
            start = 0
            end = len(lines)
            for i, line in enumerate(lines[:100]):
                if "Title: " in line:
                    title = line.split("Title: ")[1].strip()
                if "Author: " in line:
                    author = line.split("Author: ")[1].strip()
                if "Language: " in line:
                    language = line.split("Language: ")[1].strip()
                
                if (line.__contains__("*** START OF THE PROJECT GUTENBERG EBOOK")):
                    start = i
                    break
                    
            for i, line in enumerate(lines[-1000:]):
                if (line.__contains__("*** END OF THE PROJECT GUTENBERG EBOOK")):
                    end = i
                    break
                
                
            content = "".join(lines[start+1:end-1])
            data.append({"title": title, "author": author, "language": language, "content": content})
    df = pd.DataFrame(data)
    df.set_index("title", inplace=True)
    return df

df = read_txt_files()

In [4]:
from nltk.tokenize import word_tokenize
import nltk.downloader
nltk.download("punkt")

# Tokenization
df["tokens"] = df["content"].apply(word_tokenize)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rabjho\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
from nltk import ngrams
from collections import defaultdict, Counter

df["ngrams"] = df["tokens"].apply(lambda x: list(ngrams(x, 3)))

def create_ngram_model(tokens, n):
    ngram_model = defaultdict(Counter)
    for ngram in nltk.ngrams(tokens, n):
        history = ngram[:-1]
        continuation = ngram[-1]
        ngram_model[history][continuation] += 1
    return ngram_model

df["raw_model"] = df["tokens"].apply(lambda tokens: create_ngram_model(tokens, 3))

In [23]:
# Normalize probabilities by deviding history+continuation count by history count
def normalize(counter):
    total = float(sum(counter.values()))
    return [(key, val/total) for key, val in counter.items()]

df["model"] = df["raw_model"].apply(lambda x: {history: normalize(continuations) for history, continuations in x.items()})


In [166]:
import random

def generate_text(model, seed, n):
    history = tuple(seed)
    text = []
    for i in range(n):
        history = tuple(history)
        if history not in model:
            break
        possibilities = model[history]
        continuation = random.choices([x[0] for x in possibilities], [x[1] for x in possibilities])[0]
        text.append(continuation)
        history = list(history[1:]) + [continuation]
    return " ".join(seed+tuple(text))

document = 0
# Choose seed from likely ngrams in col raw_model
seed = random.choice(list(df["raw_model"][document].keys()))
generate_text(df["model"][document], seed, 100)


  seed = random.choice(list(df["raw_model"][document].keys()))
  generate_text(df["model"][document], seed, 100)


'go in sailor-hats instead . Every one takes them for English , Miss Honeychurch will be ready , ” his advice concluded . “ Don ’ t think much harm would have come of accepting. ” “ Mother wouldn ’ t you feel , too , to fling wide the windows , pinching the fingers in unfamiliar fastenings , to open the eyes upon a bright bare room , sighed heavily according to her cousin had permitted it . “ Charlotte , being poor. ” Fortunately one of them—one of the narrow world at Tunbridge Wells , she would be hard indeed'