In [1]:
import spacy
import numpy as np
import re

# Load spaCy's small English pipeline once for the whole notebook: nGrams() uses it to
# tokenize raw strings, and the final cell uses it to split generated text into sentences.
nlp = spacy.load('en_core_web_sm')

In [2]:
def nGrams(doc, n, isTokenized):
    """Return every contiguous n-gram of ``doc`` as a list of lowercase token lists.

    Parameters
    ----------
    doc : str or list of str
        Raw text (when ``isTokenized`` is False) or an already tokenized
        sequence of strings (when ``isTokenized`` is True).
    n : int
        Number of tokens per n-gram; must be at least 1.
    isTokenized : bool
        When False, ``doc`` is first tokenized with the notebook-level spaCy
        pipeline ``nlp``.

    Returns
    -------
    list of list of str
        The n-grams in document order. The list is empty when the document
        has fewer than ``n`` tokens (including an empty document).

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError('n must be a positive integer, got %r' % (n,))

    if not isTokenized:
        doc = [token.text for token in nlp(doc)]

    # Lowercase everything, then split on any run of whitespace. split() (rather
    # than split(' ')) never yields empty-string tokens, so an empty document
    # gives no tokens and whitespace-only tokens (e.g. produced for repeated
    # spaces in the raw text) are dropped instead of polluting the n-grams.
    tokens = ' '.join(doc).lower().split()
    return [tokens[i:i + n] for i in range(len(tokens) - n + 1)]

In [3]:
# Demo: build 4-grams from a raw sentence. isTokenized=False, so nGrams() tokenizes
# the string with spaCy before lowercasing and windowing it.
doc = 'How old are you today or can you tell me something about yourself'
# Alternative pre-tokenized input (would be passed with isTokenized=True):
# doc = ['How', 'old', 'are', 'you', 'today']
n = 4
grams = nGrams(doc, n, False)
grams

[['how', 'old', 'are', 'you'],
 ['old', 'are', 'you', 'today'],
 ['are', 'you', 'today', 'or'],
 ['you', 'today', 'or', 'can'],
 ['today', 'or', 'can', 'you'],
 ['or', 'can', 'you', 'tell'],
 ['can', 'you', 'tell', 'me'],
 ['you', 'tell', 'me', 'something'],
 ['tell', 'me', 'something', 'about'],
 ['me', 'something', 'about', 'yourself']]

In [4]:
from collections import defaultdict

In [5]:
def buildModel():
    """Create an empty n-gram model.

    Returns
    -------
    defaultdict
        Two-level mapping ``{context: {next_word: count}}``. Looking up an
        unseen context creates a fresh inner ``defaultdict``; looking up an
        unseen word inside a context yields the integer 0.
    """
    # Imported locally so this cell does not depend on another import cell.
    from functools import partial

    # Built-in factories instead of nested lambdas: the behaviour is the same
    # (missing counts start at 0), but the resulting object can also be
    # serialized with the standard-library pickle, not only with dill.
    return defaultdict(partial(defaultdict, int))

In [6]:
def updateCount(nGram, model):
    """Record one observation of ``nGram`` in ``model``.

    The first n-1 tokens, as a tuple, form the context key; the last token is
    the word whose count under that context is increased by one. ``model`` is
    modified in place and is also returned for convenience.
    """
    context_key, next_word = tuple(nGram[:-1]), nGram[-1]
    followers = model[context_key]  # e.g. {next_word: count} for this context
    followers[next_word] = followers[next_word] + 1
    return model

In [7]:
def computeProbability(model):
    """Normalize follower counts into conditional probabilities, in place.

    For every context, each follower's count is divided by the sum of all
    counts under that context, so the values of each inner mapping sum to 1.

    Parameters
    ----------
    model : mapping
        ``{context: {next_word: count}}`` as filled by ``updateCount``.

    Returns
    -------
    mapping
        The same ``model`` object, now holding probabilities.

    Notes
    -----
    The counts are overwritten, so this is meant to be called once after all
    counting is finished; counts added afterwards would be mixed with
    probabilities.
    """
    for followers in model.values():
        total = sum(followers.values())
        if not total:
            # Nothing was observed for this context (empty or all-zero entry,
            # e.g. one created merely by reading from a defaultdict); skip it
            # rather than dividing by zero.
            continue
        for word in followers:
            followers[word] /= total
    return model

In [8]:
# !pip install dill  # extension of pickle
import dill

In [74]:
def saveModel(model, fileName):
    """Serialize ``model`` with dill to ``'../Models/' + fileName``.

    The path is relative to the current working directory. The file is opened
    in binary write mode, so an existing file of that name is overwritten.
    """
    destination = '../Models/' + fileName
    with open(destination, 'wb') as handle:
        dill.dump(model, handle)

def loadModel(fileName):
    """Load and return a dill-serialized model from ``'../Models/' + fileName``.

    The path is relative to the current working directory.

    Note: like pickle, dill can execute arbitrary code while deserializing;
    only load model files that come from a trusted source.
    """
    source = '../Models/' + fileName
    with open(source, 'rb') as handle:
        return dill.load(handle)

In [31]:
# Two tiny toy corpora (one sentence per string) used below to check that counts
# from separate batches accumulate into the same model.
d1 = ['My name is Shatin', 'What is your name', 'And what is this']
d2 = ['My name is Arosh', 'What is your age']

In [32]:
# Trigram setup: each context key is the previous n-1 = 2 words.
n = 3
model = buildModel()

In [33]:
# Accumulate n-gram counts from the first corpus; each raw sentence is tokenized
# inside nGrams() (isTokenized=False).
for doc in d1:
    for nGram in nGrams(doc, n, False):
        model = updateCount(nGram, model)

In [34]:
# Add the second corpus' counts on top of the same model (incremental update).
for doc in d2:
    for nGram in nGrams(doc, n, False):
        model = updateCount(nGram, model)

In [35]:
# Convert the accumulated counts into probabilities. The division happens in place,
# so re-running the counting cells after this one would add raw counts onto
# probabilities; rebuild the model first if more data needs to be counted.
model = computeProbability(model)

In [36]:
# Inspect the normalized model: {context tuple: {next word: probability}}.
model

defaultdict(<function __main__.buildModel.<locals>.<lambda>()>,
            {('my',
              'name'): defaultdict(<function __main__.buildModel.<locals>.<lambda>.<locals>.<lambda>()>,
                         {'is': 1.0}),
             ('name',
              'is'): defaultdict(<function __main__.buildModel.<locals>.<lambda>.<locals>.<lambda>()>,
                         {'shatin': 0.5, 'arosh': 0.5}),
             ('what',
              'is'): defaultdict(<function __main__.buildModel.<locals>.<lambda>.<locals>.<lambda>()>,
                         {'your': 0.6666666666666666,
                          'this': 0.3333333333333333}),
             ('is',
              'your'): defaultdict(<function __main__.buildModel.<locals>.<lambda>.<locals>.<lambda>()>,
                         {'name': 0.5, 'age': 0.5}),
             ('and',
              'what'): defaultdict(<function __main__.buildModel.<locals>.<lambda>.<locals>.<lambda>()>,
                         {'is': 1.0})})

In [19]:
# Persist the toy model with dill to ../Models/test_model.pkl.
# NOTE(review): the later loading cell reads 'nGram_model.pkl', which is a different
# file from the one written here.
saveModel(model, 'test_model.pkl')

In [71]:
# Predict the single most likely next word for a fixed context using the toy model.
nextWord = ''
text = ['what', 'is']

# The context is the last n-1 words. For n == 1 it is the empty tuple; a plain
# text[-n+1:] slice would return the whole list in that case.
context = tuple(text[-(n - 1):]) if n > 1 else ()

# .get() instead of model[context]: indexing a defaultdict with an unseen context
# would silently insert an empty entry into the model.
candidates = model.get(context, {})
nextWords = list(candidates.keys())
probs = list(candidates.values())

# Sampling alternative to the greedy choice below. The probabilities must be passed
# by keyword: the third positional argument of np.random.choice is `replace`.
#     nextWord = np.random.choice(nextWords, p=probs)

if nextWords:
    nextWord = nextWords[np.argmax(probs)]  # greedy: highest-probability follower
    text.append(nextWord)
else:
    print('No Word Found')
print(nextWord)

your


In [76]:
# NOTE(review): 'nGram_model.pkl' is not written anywhere in the visible notebook
# (only 'test_model.pkl' is saved above) - presumably a model trained elsewhere;
# confirm it exists in ../Models/ before running this cell.
loaded_model = loadModel('nGram_model.pkl')

In [77]:
def sampleText(model, startingText=['after','that'], maxLength=100, nGramSize=3):
    """Greedily extend ``startingText`` with the most probable next word.

    At each step the last ``nGramSize - 1`` words are used as the context and
    the follower with the highest probability is appended (ties go to the
    first follower in the mapping's order). Generation stops when the text
    holds ``maxLength`` words or when the context has no known follower.

    Parameters
    ----------
    model : mapping
        ``{context tuple: {next_word: probability}}``.
    startingText : sequence of str
        Seed words. The sequence is copied, so neither the caller's list nor
        the default value is modified.
    maxLength : int
        Upper bound on the number of words in the result; a seed that is
        already longer is returned unchanged.
    nGramSize : int
        Order of the model; must be at least 1.

    Returns
    -------
    str
        The generated words joined by single spaces.

    Raises
    ------
    ValueError
        If ``nGramSize`` is smaller than 1.
    """
    if nGramSize < 1:
        raise ValueError('nGramSize must be a positive integer, got %r' % (nGramSize,))

    # Work on a copy: appending to the argument itself would grow the caller's
    # list and, worse, the shared default list across calls.
    text = list(startingText)
    contextSize = nGramSize - 1

    while len(text) < maxLength:
        # For a unigram model the context is empty; text[-0:] would wrongly
        # return the whole text, so that case is handled explicitly.
        context = tuple(text[-contextSize:]) if contextSize else ()

        # .get() avoids inserting empty entries into a defaultdict model when
        # the context has never been seen.
        followers = model.get(context)
        if not followers:
            break

        text.append(max(followers, key=followers.get))

    return ' '.join(text)

In [78]:
# Generate text greedily from the loaded model, then print only the first sentence
# that spaCy detects in the generated string.
for s in nlp(sampleText(loaded_model, ['what','are'])).sents:
    print(s)
    break

what are the only way to get the 
                                                                                            
