In [1]:
import numpy as np
import pandas as pd
import os
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import random

In [3]:
#Reading stories of sherlock holmes
story_path = "sherlock/sherlock/"

def read_all_stories(story_path):
    txt = []
    for _, _, files in os.walk(story_path):
        for file in files:
            with open(story_path+file) as f:
                for line in f:
                    line = line.strip()
                    if line=='----------': break
                    if line!='':txt.append(line)
    return txt

stories = read_all_stories(story_path)
print("number of lines = ", len(stories))

number of lines =  215021


In [4]:
#Cleaning text

def clean_txt(txt):
    cleaned_txt = []
    for line in txt:
        line = line.lower()
        line = re.sub(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=-\\]", "", line)
        tokens = word_tokenize(line)
        words = [word for word in tokens if word.isalpha()]
        cleaned_txt+=words
    return cleaned_txt

cleaned_stories = clean_txt(stories)
print("number of words = ", len(cleaned_stories))

number of words =  2332247


In [6]:
#Creating Markov Model

def make_markov_model(cleaned_stories, n_gram=2):
    markov_model = {}
    for i in range(len(cleaned_stories)-n_gram-1):
        curr_state, next_state = "", ""
        for j in range(n_gram):
            curr_state += cleaned_stories[i+j] + " "
            next_state += cleaned_stories[i+j+n_gram] + " "
        curr_state = curr_state[:-1]
        next_state = next_state[:-1]
        if curr_state not in markov_model:
            markov_model[curr_state] = {}
            markov_model[curr_state][next_state] = 1
        else:
            if next_state in markov_model[curr_state]:
                markov_model[curr_state][next_state] += 1
            else:
                markov_model[curr_state][next_state] = 1

    # calculating transition probabilities
    for curr_state, transition in markov_model.items():
        total = sum(transition.values())
        for state, count in transition.items():
            markov_model[curr_state][state] = count/total

    return markov_model

In [7]:
markov_model = make_markov_model(cleaned_stories)

In [8]:
print("number of states = ", len(markov_model.keys()))

number of states =  208717


In [9]:
print("All possible transitions from 'the game' state: \n")
print(markov_model['the game'])

All possible transitions from 'the game' state: 

{'is up': 0.06306306306306306, 'is and': 0.036036036036036036, 'was afoot': 0.036036036036036036, 'for the': 0.036036036036036036, 'was whist': 0.036036036036036036, 'would have': 0.036036036036036036, 'in their': 0.036036036036036036, 'was up': 0.09009009009009009, 'in that': 0.036036036036036036, 'the lack': 0.036036036036036036, 'for all': 0.06306306306306306, 'is afoot': 0.036036036036036036, 'was in': 0.02702702702702703, 'is hardly': 0.02702702702702703, 'may wander': 0.02702702702702703, 'now a': 0.02702702702702703, 'my own': 0.02702702702702703, 'at any': 0.02702702702702703, 'mr holmes': 0.02702702702702703, 'ay whats': 0.02702702702702703, 'my friend': 0.02702702702702703, 'fairly by': 0.02702702702702703, 'is not': 0.02702702702702703, 'was not': 0.02702702702702703, 'worth it': 0.02702702702702703, 'you are': 0.02702702702702703, 'i am': 0.02702702702702703, 'now count': 0.02702702702702703, 'your letter': 0.027027027027027

In [10]:
#Genearting Sherlock Stories

def generate_story(markov_model, limit=100, start='my god'):
    n = 0
    curr_state = start
    next_state = None
    story = ""
    story+=curr_state+" "
    while n<limit:
        next_state = random.choices(list(markov_model[curr_state].keys()),
                                    list(markov_model[curr_state].values()))

        curr_state = next_state[0]
        story+=curr_state+" "
        n+=1
    return story

In [11]:
for i in range(20):
    print(str(i)+". ", generate_story(markov_model, start="dear holmes", limit=8))

0.  dear holmes he has been here press him tell him where the features have been more adroit in 
1.  dear holmes my previous letters and poured them all into the fire and to be obeyed his manner 
2.  dear holmes said i he would naturally feel loyalty towards the officials one of the peculiarities of the 
3.  dear holmes you are fortunately the only man who can hate too i was driven over to ross 
4.  dear holmes i fear been negative one thing only is certain it was a debt of honor so 
5.  dear holmes what do you think that the evidence is that he had come upon the land heaven 
6.  dear holmes you are aware watson it is in the busiest thoroughfare we were helped through it by 
7.  dear holmes you are mistaken about my alleged agents count sylvius laughed contemptuously other people can observe as 
8.  dear holmes it is absurd to deny that i have not much hope of getting it that is 
9.  dear holmes what do you call it crime cried morris his voice testified to his jangled nerves he 
10.  d

In [12]:
for i in range(20):
    print(str(i)+". ", generate_story(markov_model, start="my dear", limit=8))

0.  my dear watson said he i wonder since you are through with it i dont want to get 
1.  my dear watson i think we have done better to do a minute later the grim hall bristling 
2.  my dear fellow there is no reflection upon your professional knowledge said he for they had been that 
3.  my dear fellow be it so well he went a corroboration of that of the ladys eyes are 
4.  my dear boy children come said i then reflected that since he spoke about it he learned to 
5.  my dear watson said he there are points about the wards and even to you its up to 
6.  my dear watson you would certainly have suggested things so that it should not have troubled you for 
7.  my dear boy to apologise to him for the english papers on the sundial what have you to 
8.  my dear fellow it is at his post in devonshire i may be putting ourselves in the morning 
9.  my dear watson said he when we had left it there mr and mrs rucastle expressed a delight 
10.  my dear sir the person that professor presbury was r

In [14]:
print(generate_story(markov_model, start="the case", limit=100))

the case in that world of london so sir james walter and you had nothing particular on hand and that was his personal friend and hence also the were very much shocked by the rope i found him a prey to the lodge he stopped at a door leading to the lawn is thirty yards across and is intimately connected with the rest when she complained of a government appointment in the wet notches of the berkshire constabulary a smart little landau which rattled up to look at me with their searching glance the distance which brought me to the last from chicago a stranger in these parts but i was always empty as the secretary was employed up here who lowered you this is no alternative i suggested grotesquely improbably no doubt but still just conceivable might the whole thing was a episode intended to walk down together and in all ways a model employer he had lit his cigar and leaned back in his past life or else as a long man with loose limbs and body of police in she might say to you except that when 