In [1]:
import spacy
import numpy as np
import re

nlp = spacy.load('en_core_web_sm')

In [2]:
def nGrams(doc, n, isTokenized):
    if not isTokenized:
        doc = [token.text for token in nlp(doc)]

    doc = ' '.join(doc).lower().split(' ') #convert all to lowercase
    grams = [doc[i:i+n] for i in range(len(doc)-n+1)]
    # print (doc)
    return grams

In [3]:
doc = 'How old are you today or can you tell me something about yourself'
# doc = ['How', 'old', 'are', 'you', 'today']
n = 4
grams = nGrams(doc, n, False)
grams

[['how', 'old', 'are', 'you'],
 ['old', 'are', 'you', 'today'],
 ['are', 'you', 'today', 'or'],
 ['you', 'today', 'or', 'can'],
 ['today', 'or', 'can', 'you'],
 ['or', 'can', 'you', 'tell'],
 ['can', 'you', 'tell', 'me'],
 ['you', 'tell', 'me', 'something'],
 ['tell', 'me', 'something', 'about'],
 ['me', 'something', 'about', 'yourself']]

In [4]:
from collections import defaultdict

In [5]:
def buildModel():
    model = defaultdict(lambda: defaultdict(lambda: 0)) #eg. {x: {y: 0}}
    return model

In [6]:
def updateCount(nGram, model):
    w_1_to_n_minus_1 = tuple(nGram[:-1])
    w_n = nGram[-1]
    model[w_1_to_n_minus_1][w_n] += 1 #eg. {w_1_to_n_minus_1: {w_n: 1}}
    return model

In [7]:
def computeProbability(model):
    for w_1_to_n_minus_1 in model:
        totalCount = float(sum(model[w_1_to_n_minus_1].values()))
        for w_n in model[w_1_to_n_minus_1]:
            model[w_1_to_n_minus_1][w_n] /= totalCount
    return model

In [8]:
# !pip install dill #extention of pickle
import dill

In [28]:
def saveModel(model, fileName):
    with open('../Models/'+fileName, 'wb') as f:
        dill.dump(model, f)

def loadModel(fileName):
    with open(fileName, 'rb') as f:
        model = dill.load(f)
    return model

In [22]:
d1 = ['My name is Shatin', 'What is your name']
d2 = ['My name is Arosh', 'What is your age']

In [23]:
n = 3
model = buildModel()

In [24]:
for doc in d1:
    for nGram in nGrams(doc, n, False):
        model = updateCount(nGram, model)

In [25]:
for doc in d2:
    for nGram in nGrams(doc, n, False):
        model = updateCount(nGram, model)

In [26]:
model = computeProbability(model)

In [33]:
saveModel(model, 'test_model.pkl')

In [16]:
model

defaultdict(<function __main__.buildModel.<locals>.<lambda>()>,
            {('my',
              'name'): defaultdict(<function __main__.buildModel.<locals>.<lambda>.<locals>.<lambda>()>,
                         {'is': 1.0}),
             ('name',
              'is'): defaultdict(<function __main__.buildModel.<locals>.<lambda>.<locals>.<lambda>()>,
                         {'shatin': 0.5, 'arosh': 0.5}),
             ('what',
              'is'): defaultdict(<function __main__.buildModel.<locals>.<lambda>.<locals>.<lambda>()>,
                         {'your': 1.0}),
             ('is',
              'your'): defaultdict(<function __main__.buildModel.<locals>.<lambda>.<locals>.<lambda>()>,
                         {'name': 0.5, 'age': 0.5})})

In [17]:
text = ['what','is']
nextWord = list(model[tuple(text[-n+1:])].keys())
probs = list(model[tuple(text[-n+1:])].values())

In [18]:
nextWord

['your']