**Using markov Chain + Nlp to generate Sherlock holmes stories**
Language model that generates text similar to given data set

In [17]:
import numpy as np
import pandas as pd
import os
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import random
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/sherlock-holmes-stories/sherlock/blue.txt
/kaggle/input/sherlock-holmes-stories/sherlock/silv.txt
/kaggle/input/sherlock-holmes-stories/sherlock/prio.txt
/kaggle/input/sherlock-holmes-stories/sherlock/vall.txt
/kaggle/input/sherlock-holmes-stories/sherlock/norw.txt
/kaggle/input/sherlock-holmes-stories/sherlock/sign.txt
/kaggle/input/sherlock-holmes-stories/sherlock/bery.txt
/kaggle/input/sherlock-holmes-stories/sherlock/cnus.txt
/kaggle/input/sherlock-holmes-stories/sherlock/blac.txt
/kaggle/input/sherlock-holmes-stories/sherlock/engr.txt
/kaggle/input/sherlock-holmes-stories/sherlock/spec.txt
/kaggle/input/sherlock-holmes-stories/sherlock/last.txt
/kaggle/input/sherlock-holmes-stories/sherlock/stoc.txt
/kaggle/input/sherlock-holmes-stories/sherlock/nava.txt
/kaggle/input/sherlock-holmes-stories/sherlock/abbe.txt
/kaggle/input/sherlock-holmes-stories/sherlock/suss.txt
/kaggle/input/sherlock-holmes-stories/sherlock/miss.txt
/kaggle/input/sherlock-holmes-stories/sherlock/l

**Reads every text file and appends each line of text file into the array**

In [13]:
story_path = "/kaggle/input/sherlock-holmes-stories/sherlock/sherlock/"

def read_all_stories(story_path):
    txt = []
    for _, _, files in os.walk(story_path):
        for file in files:
            with open(story_path+file) as f:
                for line in f:
                    line = line.strip()
                    if line=='----------': break
                    if line!='':txt.append(line)
    return txt
        
stories = read_all_stories(story_path)
print("number of lines = ", len(stories))

number of lines =  215021


In [14]:
def clean_txt(txt):
    cleaned_txt = []
    for line in txt:
        line = line.lower()
        line = re.sub(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=-\\]", "", line)
        tokens = word_tokenize(line)  # splitting the line into each word
        words = [word for word in tokens if word.isalpha()]
        cleaned_txt+=words
    return cleaned_txt

cleaned_stories = clean_txt(stories)
print("number of words = ", len(cleaned_stories))

number of words =  2332247


In [15]:
def make_markov_model(cleaned_stories, n_gram=2):
    markov_model = {}
    for i in range(len(cleaned_stories)-n_gram-1):
        curr_state, next_state = "", ""
        for j in range(n_gram):
            curr_state += cleaned_stories[i+j] + " "
            next_state += cleaned_stories[i+j+n_gram] + " "
        curr_state = curr_state[:-1]
        next_state = next_state[:-1]
        if curr_state not in markov_model:
            markov_model[curr_state] = {}
            markov_model[curr_state][next_state] = 1
        else:
            if next_state in markov_model[curr_state]:
                markov_model[curr_state][next_state] += 1
            else:
                markov_model[curr_state][next_state] = 1
    
    # calculating transition probabilities
    for curr_state, transition in markov_model.items():
        total = sum(transition.values())
        for state, count in transition.items():
            markov_model[curr_state][state] = count/total 
    return markov_model

In [18]:
markov_model = make_markov_model(cleaned_stories)

In [19]:
print("number of states = ", len(markov_model.keys()))

number of states =  208716


In [20]:
print("All possible transitions from 'the game' state: \n")
print(markov_model['the game'])

All possible transitions from 'the game' state: 

{'my own': 0.02702702702702703, 'at any': 0.02702702702702703, 'mr holmes': 0.02702702702702703, 'ay whats': 0.02702702702702703, 'my friend': 0.02702702702702703, 'fairly by': 0.02702702702702703, 'is not': 0.02702702702702703, 'was not': 0.02702702702702703, 'is hardly': 0.02702702702702703, 'was in': 0.02702702702702703, 'would have': 0.036036036036036036, 'is up': 0.06306306306306306, 'is and': 0.036036036036036036, 'in their': 0.036036036036036036, 'was whist': 0.036036036036036036, 'was up': 0.09009009009009009, 'in that': 0.036036036036036036, 'the lack': 0.036036036036036036, 'for all': 0.06306306306306306, 'is afoot': 0.036036036036036036, 'may wander': 0.02702702702702703, 'now a': 0.02702702702702703, 'was afoot': 0.036036036036036036, 'for the': 0.036036036036036036, 'worth it': 0.02702702702702703, 'you are': 0.02702702702702703, 'i am': 0.02702702702702703, 'now count': 0.02702702702702703, 'your letter': 0.027027027027027

In [21]:
def generate_story(markov_model, limit=100, start='my god'):
    n = 0
    curr_state = start
    next_state = None
    story = ""
    story+=curr_state+" "
    while n<limit:
        next_state = random.choices(list(markov_model[curr_state].keys()),
                                    list(markov_model[curr_state].values()))
        
        curr_state = next_state[0]
        story+=curr_state+" "
        n+=1
    return story

In [22]:
for i in range(20):
    print(str(i)+". ", generate_story(markov_model, start="dear holmes", limit=8))

0.  dear holmes i fear your grace write that day twenty or thirty i have time his name is 
1.  dear holmes you are ready we will start now not from the edge of it it was a 
2.  dear holmes am i such a very natural that he had did the boy having so affectionate a 
3.  dear holmes my previous letters and papers upon the table was set in a reminiscent voice it was 
4.  dear holmes i ejaculated my dear fellow pray come in the appearance of our inquiry immensely as a 
5.  dear holmes said i shall jot down the corridor and a little thought the turn which it has 
6.  dear holmes that i should intrude if i would go for what they will have followers said holmes 
7.  dear holmes i thought you were in the bedroom last night once the bridge was up i guess 
8.  dear holmes said i and the little wasted figure at first as they thought taken vengeance both upon 
9.  dear holmes that i could not say that a good deal mr holmes i fear there is one 
10.  dear holmes oh yes said he when first i took the an

In [None]:
for i in range(20):
    print(str(i)+". ", generate_story(markov_model, start="my dear", limit=8))

In [None]:
print(generate_story(markov_model, start="the case", limit=100))
