### Import the Necessary Libraries

In [1]:
import numpy as np
import pandas as pd
import os
import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import random

nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\Anthony
[nltk_data]     Lam\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Data Collection: Reading the Dostoevsky Novels

In [2]:
story_path = "./Dostoevsky/"

def read_all_chapters(story_path):
    txt = []
    for _, _, files in os.walk(story_path):
        for file in files:
            with open(story_path+file, "r", encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    if line != '':
                        txt.append(line)

    return txt

stories = read_all_chapters(story_path)
print("Number of lines = ", len(stories))

Number of lines =  869


# Data Cleaning/Wrangling: Cleaning the Texts

In [3]:
def clean_txt(txt):
    cleaned_txt = []
    for line in txt:
        line = line.lower()
        line = re.sub(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=-\\]", "", line)
        tokens = word_tokenize(line)
        words = [word for word in tokens if word.isalpha()]
        cleaned_txt += words
    return cleaned_txt

cleaned_stories = clean_txt(stories)
print("Number of words = ", len(cleaned_stories))

Number of words =  63365


# Model Building: Creating the Markov Model

In [4]:
def make_markov_model(cleaned_stories, n_gram=2):
    markov_model = {}
    for i in range(len(cleaned_stories)-n_gram-1):
        curr_state = ""
        next_state = ""
        for j in range(0, n_gram):
            curr_state += cleaned_stories[i+j] + " "
            next_state += cleaned_stories[i+j+n_gram] + " "
        curr_state = curr_state[:-1]
        next_state = next_state[:-1]
        if curr_state not in markov_model:
            markov_model[curr_state] = {}
            markov_model[curr_state][next_state] = 1
        else:
            if next_state in markov_model[curr_state]:
                markov_model[curr_state][next_state] += 1
            else:
                markov_model[curr_state][next_state] = 1
    
    for curr_state, transition in markov_model.items():
        total = sum(transition.values())
        for state, count in transition.items():
            markov_model[curr_state][state] = count/total
            
    return markov_model

markov_model = make_markov_model(cleaned_stories)
print("Number of states = ", len(markov_model.keys()))

Number of states =  32760


# Production: Generating a Dostoevsky Story!

In [5]:
def generate_story(markov_model, limit=100, start='i am'):
    n = 0
    curr_state = start
    next_state = None
    story = ""
    story += curr_state + " "
    while n < limit:
        next_state = random.choices(list(markov_model[curr_state].keys()),
                                    list(markov_model[curr_state].values()))
        curr_state = next_state[0]
        story += curr_state + " "
        n+=1
    return story

print(generate_story(markov_model, limit=500))

i am the more persuaded of that suspicion if one must live one had better live in a minute you do not forsake me because i am an old schoolfellow of his success as a lieutenant and of course the devil only knows what habit can do with your wishes and whether you go as early as possible morning and gave it to me that you are sad and dreary i have been in the street it was just as any coarse peasant but as a whole with everything that he is here and everything had been taken by another already when nothing was mine when in fact not a if you catch anything you may have sincerity but you are evidently well satisfied with their husbands just because they have received the letter have you been waiting long trudolyubov inquired i arrived at nine oclock in the nevsky went to the point of bliss that i should bear you a grudge nastenka that you almost loved me only that he seems like that well if you havent why do you suppose he really loves you cherishes you never leaves you there is happiness 