In [11]:
import numpy as np
import pandas as pd
import os
import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import random
import shutil

In [5]:
zip_filename = "/content/archive (3).zip"

extract_dir = "/content/"

os.makedirs(extract_dir, exist_ok=True)

shutil.unpack_archive(zip_filename, extract_dir)


In [8]:
story_path = "/content/sherlock/sherlock"

def read_all_stories(story_path):
    txt = []
    for _, _, files in os.walk(story_path):
        for file in files:
            with open(os.path.join(story_path,file)) as f:
                for line in f:
                    line = line.strip()
                    if line=='----------':
                      break
                    if line!='':
                      txt.append(line)
    return txt

stories = read_all_stories(story_path)
print("number of lines = ", len(stories))

number of lines =  215021


In [12]:
nltk.download('punkt')
def clean_txt(txt):
    cleaned_txt = []
    for line in txt:
        line = line.lower()
        line = re.sub(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=-\\]", "", line)
        tokens = word_tokenize(line)
        words = [word for word in tokens if word.isalpha()]
        cleaned_txt+=words
    return cleaned_txt

cleaned_stories = clean_txt(stories)
print("number of words = ", len(cleaned_stories))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


number of words =  2332247


In [13]:
def make_markov_model(cleaned_stories, n_gram=2):
    markov_model = {}
    for i in range(len(cleaned_stories)-n_gram-1):
        curr_state, next_state = "", ""
        for j in range(n_gram):
            curr_state += cleaned_stories[i+j] + " "
            next_state += cleaned_stories[i+j+n_gram] + " "
        curr_state = curr_state[:-1]
        next_state = next_state[:-1]
        if curr_state not in markov_model:
            markov_model[curr_state] = {}
            markov_model[curr_state][next_state] = 1
        else:
            if next_state in markov_model[curr_state]:
                markov_model[curr_state][next_state] += 1
            else:
                markov_model[curr_state][next_state] = 1

    # calculating transition probabilities
    for curr_state, transition in markov_model.items():
        total = sum(transition.values())
        for state, count in transition.items():
            markov_model[curr_state][state] = count/total

    return markov_model

In [14]:
markov_model = make_markov_model(cleaned_stories)
print("number of states = ", len(markov_model.keys()))
print("All possible transitions from 'the game' state: \n")
print(markov_model['the game'])

number of states =  208716
All possible transitions from 'the game' state: 

{'was whist': 0.036036036036036036, 'is afoot': 0.036036036036036036, 'worth it': 0.02702702702702703, 'you are': 0.02702702702702703, 'was in': 0.02702702702702703, 'is hardly': 0.02702702702702703, 'would have': 0.036036036036036036, 'is up': 0.06306306306306306, 'is and': 0.036036036036036036, 'in their': 0.036036036036036036, 'was up': 0.09009009009009009, 'in that': 0.036036036036036036, 'the lack': 0.036036036036036036, 'for all': 0.06306306306306306, 'may wander': 0.02702702702702703, 'now a': 0.02702702702702703, 'my own': 0.02702702702702703, 'at any': 0.02702702702702703, 'mr holmes': 0.02702702702702703, 'ay whats': 0.02702702702702703, 'my friend': 0.02702702702702703, 'fairly by': 0.02702702702702703, 'is not': 0.02702702702702703, 'was not': 0.02702702702702703, 'was afoot': 0.036036036036036036, 'for the': 0.036036036036036036, 'your letter': 0.02702702702702703, 'i am': 0.02702702702702703, 'no

In [15]:
def generate_story(markov_model, limit=100, start='my god'):
    n = 0
    curr_state = start
    next_state = None
    story = ""
    story+=curr_state+" "
    while n<limit:
        next_state = random.choices(list(markov_model[curr_state].keys()),
                                    list(markov_model[curr_state].values()))

        curr_state = next_state[0]
        story+=curr_state+" "
        n+=1
    return story


In [16]:
for i in range(20):
    print(str(i)+". ", generate_story(markov_model, start="dear holmes", limit=8))

0.  dear holmes i ejaculated well really he cried holmes smiled at the upper end until they came in 
1.  dear holmes what do you mean the little hampshire station we secured a ramshackle trap and in a 
2.  dear holmes i thought that i am arrested it may have nothing to do with dogs when i 
3.  dear holmes if i can not help thinking that your hat mr baker yes sir that sir charles 
4.  dear holmes i fear that you step right out of there with his blood upon my mind and 
5.  dear holmes what do you propose to investigate that let me tell me everything then said he by 
6.  dear holmes i exclaimed and then he cried with a most amazing power of sustained vindictiveness which he 
7.  dear holmes i exclaimed oh the mystery he answered coming back with a better time my dear watson 
8.  dear holmes he has not climbed nothing would annoy brother bartholomew more than any of those letters the 
9.  dear holmes i fear that it was a scourge which inflicted the injuries his circle of light thrown 
10. 