In [2]:
# Install NLTK (tokenization) and the Kaggle CLI (dataset download).
# `%pip` installs into the environment of the running kernel, which `!pip`
# does not guarantee.
%pip install nltk kaggle



## Configuring the Kaggle API credentials (kaggle.json)

In [3]:
# Configure the location of the kaggle.json file. The leading `!` makes the
# notebook run each line as a shell command. kaggle.json must already be in
# the current working directory.
# Create the config directory if it does not exist yet.
!mkdir  -p ~/.kaggle
# Copy kaggle.json into the config directory.
!cp kaggle.json ~/.kaggle/
# Restrict the file to read/write for its owner only.
!chmod 600 ~/.kaggle/kaggle.json

## Downloading the dataset directly from Kaggle

In [10]:
# Download the dataset archive with the Kaggle CLI; `-d` names the dataset
# as <owner>/<dataset-slug>.
!kaggle datasets download -d idevji1/sherlock-holmes-stories

Dataset URL: https://www.kaggle.com/datasets/idevji1/sherlock-holmes-stories
License(s): CC0-1.0
Downloading sherlock-holmes-stories.zip to /content
 91% 9.00M/9.93M [00:00<00:00, 25.6MB/s]
100% 9.93M/9.93M [00:00<00:00, 28.7MB/s]


## Unzipping Dataset

In [11]:
from zipfile import ZipFile

# Path of the downloaded archive (despite the variable name, this is the
# zip file itself, not extracted data).
unzipped_text_data = "/content/sherlock-holmes-stories.zip"

# Bind the archive to `archive` rather than `zip`: names assigned in a cell
# stay in the kernel namespace, so `zip` would shadow the builtin in every
# later cell.
with ZipFile(unzipped_text_data, 'r') as archive:
  # Extract everything into the current working directory.
  archive.extractall()
  print('done')

done


## Importing Dependencies

In [12]:
import pandas as pd
import numpy as np
import os
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import random
import nltk

## Text pre-processing

In [13]:
# Directory that read_all_stories walks; presumably where the archive was
# extracted to -- confirm against the unzip step.
story_path = "/content/sherlock/"

def read_all_stories(story_path):
    """Collect every non-empty line of every file found under ``story_path``.

    The directory tree is walked recursively. Each file is read line by line:
    surrounding whitespace is stripped, blank lines are skipped, and reading
    of a file stops at the first line equal to ``'-----------'``.

    Args:
        story_path: Root directory containing the story text files.

    Returns:
        list[str]: The stripped, non-empty lines of all files. Directories
        and files are visited in sorted name order, so the result is the
        same on every run.
    """
    txt = []
    for root, dirs, files in os.walk(story_path):
        # Sorting `dirs` in place fixes the order in which os.walk descends;
        # otherwise the order is whatever the file system returns.
        dirs.sort()
        for file in sorted(files):
            # Join with the directory currently being visited, not the
            # top-level path, so files in sub-directories can be opened too.
            with open(os.path.join(root, file)) as f:
                for line in f:
                    line = line.strip()
                    # Separator line: ignore the remainder of this file.
                    if line == '-----------':
                        break
                    if line != '':
                        txt.append(line)
    return txt

# Load the whole corpus as one flat list of text lines and report its size.
stories = read_all_stories(story_path)
print("number of lines =", len(stories))


number of lines = 431326


In [14]:
# word_tokenize needs the Punkt tokenizer data. Older NLTK releases load it
# from the 'punkt' package; recent releases (3.9 and later) look for
# 'punkt_tab' instead. NLTK is installed without a version pin, so fetch
# both to work with whichever version is present.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [15]:
def clean_txt(txt):
  """Turn text lines into one flat list of lowercase, letters-only words.

  Every line is lowercased, has the characters matched by the punctuation
  regex deleted, and is split with ``word_tokenize``. Only tokens for which
  ``str.isalpha()`` is true are kept.

  Args:
    txt: Iterable of text lines.

  Returns:
    list[str]: All kept words, in order of appearance.
  """
  cleaned_txt = []
  for raw_line in txt:
    # NOTE(review): inside the character class, `=-\\` is a range from '='
    # to '\' (it also covers '>', '?', '@', 'A'-'Z' and '['), so a literal
    # '-' is NOT deleted here. Hyphenated tokens are instead dropped by the
    # isalpha() filter below. Confirm this is intended before changing it.
    stripped = re.sub(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=-\\]", "", raw_line.lower())
    cleaned_txt.extend(tok for tok in word_tokenize(stripped) if tok.isalpha())
  return cleaned_txt

# Flatten the corpus into a single list of cleaned words and report its size.
cleaned_stories = clean_txt(stories)
print("number of words = ", len(cleaned_stories))

number of words =  4675490


## Markov chain model from scratch

In [16]:
def make_markov_model(cleaned_stories, n_gram=2):
    """Build an n-gram Markov chain from a sequence of words.

    A state is ``n_gram`` consecutive words joined by single spaces. For each
    position in ``cleaned_stories``, the state starting there is linked to
    the state formed by the ``n_gram`` words that immediately follow it.

    Args:
        cleaned_stories: Sequence (e.g. list) of word strings.
        n_gram: Number of words per state; must be at least 1.

    Returns:
        dict[str, dict[str, float]]: Maps every state to a dict of its
        successor states and their transition probabilities (the observed
        counts divided by the total count for that state). Empty when the
        input has fewer than ``2 * n_gram`` words.

    Raises:
        ValueError: If ``n_gram`` is smaller than 1.
    """
    if n_gram < 1:
        raise ValueError("n_gram must be at least 1")

    markov_model = {}
    # A start index i needs words i .. i + 2*n_gram - 1 to exist, so the last
    # usable i is len - 2*n_gram. For n_gram=2 this equals the previous bound
    # (len - n_gram - 1). For larger n_gram the previous bound ran past the
    # end of the list, and for n_gram=1 it skipped the final transition.
    for i in range(len(cleaned_stories) - 2 * n_gram + 1):
        curr_state = " ".join(cleaned_stories[i:i + n_gram])
        next_state = " ".join(cleaned_stories[i + n_gram:i + 2 * n_gram])
        transitions = markov_model.setdefault(curr_state, {})
        transitions[next_state] = transitions.get(next_state, 0) + 1

    # Convert raw counts into transition probabilities, state by state.
    for transitions in markov_model.values():
        total = sum(transitions.values())
        for state, count in transitions.items():
            transitions[state] = count / total

    return markov_model

In [17]:
# Build the model with the default n_gram=2, i.e. two-word states.
markov_model = make_markov_model(cleaned_stories)

In [18]:
# Every key of the model is one distinct state.
print("number of states = ", len(markov_model.keys()))

number of states =  208801


In [22]:
# Inspect a single state: the successors of 'the game' and the probability
# of moving to each of them.
print("All possible transitions from a state \n")
print(markov_model['the game'])

All possible transitions from a state 

{'was in': 0.02702702702702703, 'is hardly': 0.02702702702702703, 'would have': 0.036036036036036036, 'is up': 0.06306306306306306, 'is and': 0.036036036036036036, 'in their': 0.036036036036036036, 'was whist': 0.036036036036036036, 'was up': 0.09009009009009009, 'in that': 0.036036036036036036, 'the lack': 0.036036036036036036, 'for all': 0.06306306306306306, 'is afoot': 0.036036036036036036, 'may wander': 0.02702702702702703, 'now a': 0.02702702702702703, 'my own': 0.02702702702702703, 'at any': 0.02702702702702703, 'mr holmes': 0.02702702702702703, 'ay whats': 0.02702702702702703, 'my friend': 0.02702702702702703, 'fairly by': 0.02702702702702703, 'is not': 0.02702702702702703, 'was not': 0.02702702702702703, 'was afoot': 0.036036036036036036, 'for the': 0.036036036036036036, 'worth it': 0.02702702702702703, 'you are': 0.02702702702702703, 'i am': 0.02702702702702703, 'now count': 0.02702702702702703, 'your letter': 0.02702702702702703}


## Text generation function

In [23]:
def generate_text(markov_model, limit=100, start='my god'):
    """Generate text by walking the Markov chain from a start state.

    At each step the next state is drawn at random from the successors of
    the current state, weighted by their transition probabilities.

    Args:
        markov_model: Mapping of state -> {successor state: probability}.
        limit: Maximum number of transitions to take.
        start: State to begin from.

    Returns:
        str: The start state followed by every generated state, each
        followed by a single space (so the result ends with a space). The
        walk ends early if it reaches a state with no recorded successors.

    Raises:
        KeyError: If a transition is requested (``limit`` > 0) and ``start``
            is not a state of ``markov_model``.
    """
    if limit > 0 and start not in markov_model:
        raise KeyError(f"start state {start!r} is not in the Markov model")

    curr_state = start
    pieces = [curr_state]
    n = 0
    while n < limit:
        transitions = markov_model.get(curr_state)
        if not transitions:
            # Dead end (for example the last n-gram of the corpus): stop here
            # and return what has been generated instead of raising.
            break
        curr_state = random.choices(list(transitions.keys()),
                                    list(transitions.values()))[0]
        pieces.append(curr_state)
        n += 1
    # Trailing space kept so the output format is unchanged.
    return " ".join(pieces) + " "

In [27]:
# Print 20 numbered samples of 8 transitions each, all starting from "dear holmes".
for sample_no in range(20):
    sample = generate_text(markov_model, start="dear holmes", limit=8)
    print(str(sample_no)+". ", sample)

0.  dear holmes he has not entirely a wasted one my wife had some inkling of his property into 
1.  dear holmes am i addressing dr watson i am all this dr barnicot is an enthusiastic admirer of 
2.  dear holmes and tell him his features were peaky and sallow and his little game what did she 
3.  dear holmes oh yes no doubt that you have brought you something fresh inspector macdonald had been staring 
4.  dear holmes what do you make of that room absolutely nothing the landlady drew an envelope and examining 
5.  dear holmes if i should return tingling with anger i do not know but once inside the door 
6.  dear holmes if i could get a divorce from my brother this morning in company with a smart 
7.  dear holmes it is the only person who is a serious case against the son of sir charles 
8.  dear holmes oh yes i sent it from the front room while the elder to whose mercy i 
9.  dear holmes what do you think a man at the morass and again on july there was a 
10.  dear holmes said i but the

In [28]:
# Print 20 numbered samples of 8 transitions each, all starting from "my dear".
for sample_no in range(20):
    sample = generate_text(markov_model, start="my dear", limit=8)
    print(str(sample_no)+". ", sample)

0.  my dear watson said he laughing im an honest man though not marked under that name well you 
1.  my dear watson said she for three days last week he hurled him back mcginty released his hold 
2.  my dear holmes i fear there is some guilty secret which appeared from the steep roof there sprang 
3.  my dear fellow and i he had filled out his hand and a bundle of clothing comprising a 
4.  my dear watson that at two oclock yesterday afternoon a fortnight or so later i found myself it 
5.  my dear mr mac but how are we to give to the man that was more pleased than 
6.  my dear chap i had seen you say that no one not miss harrison sitting there in the 
7.  my dear watson was asking for your advice it is friday man your wife asked you jones i 
8.  my dear watson said he now let me talk about david that one word shall they have from 
9.  my dear fellow for a german very thin very wrinkled bent with age an opium pipe dangling down 
10.  my dear watson that the lady absolutely refused to ma

In [29]:
# Print 20 numbered samples of 8 transitions each, all starting from "i would".
for sample_no in range(20):
    sample = generate_text(markov_model, start="i would", limit=8)
    print(str(sample_no)+". ", sample)

0.  i would have endured imprisonment ay even execution rather than in this there is laura lyons by representing 
1.  i would spend my life hiking round the neck of the excited spaniel he had risen from his 
2.  i would go away afterwards and each will supplement the efforts of the police are hurrying up the 
3.  i would say nothing to him yet that is precisely for that purpose youve got to the end 
4.  i would respond to such a plight before me its a fine lad a staunch lad nothing would 
5.  i would suggest for example that a presentation would be more trying to the rocks on the sea 
6.  i would suggest that mr cubitts body may now be found in the room with an energy and 
7.  i would not tell it if its any use in my resisting and that is what i would 
8.  i would be at st saviours near kings cross and we were going on within the last hours 
9.  i would part with them for as i say left these jewels to pay some accounts i was 
10.  i would have endured imprisonment ay even execution rat

In [34]:
# Print 20 numbered samples of 8 transitions each, all starting from "come on".
for sample_no in range(20):
    sample = generate_text(markov_model, start="come on", limit=8)
    print(str(sample_no)+". ", sample)

0.  come on something later which will make england ring you will get food and so trace it to 
1.  come on mcmurdo and scanlan would put the powder above it and it is so lonely up there 
2.  come on suddenly in the middle of that prairie a fairly safe ground do we not perhaps you 
3.  come on my boy these were the main gate and share the watch with my punjaubees they were 
4.  come on he rushed into his pocket most likely never noticing that a corner and pointed up the 
5.  come on the line of his chair like an old black frock coat which with his own eyes 
6.  come on a visit indeed you seem to him that he would pass london bridge there is a 
7.  come on mcmurdo and scanlan strolled on with an ivory miniature and the artist takes in his own 
8.  come on watson we shall order you a completely new idea of the truth it was the work 
9.  come on a visit of a black villain and caught in a half gloom far away we could 
10.  come on he rushed at one time why whatever is emotional is opposed t

In [30]:
# Generate one longer passage: up to 100 transitions starting from the state "the case".
print(generate_text(markov_model, start="the case", limit=100))

the case precisely so i ought to make up for me in my dreams look out of my scientific methods of the police no no my dear sir if you are to wait here a moment i feared so much as you will find the facts themselves have often been so slight an obstacle why then was he that mr evans there is no use you may have gone out of the wedding it missed him he spoke and he galloped several miles before i fainted when it felt the hand of the man that he has disappeared and although also he is not worth your while to me the letter ill tell you so i have had nothing to do with much less than two miles to the north of oporto the proceedings and if browner had occasion to raise it the box and the trees were standing out in vivid relief upon the skin which on consideration of some few words of congratulation and then sat downcast with my head there was a sharp tap at the door of the manor house in consultation with his hand on a weapon which will be of national importance of the matter until 
