In [54]:
import numpy as np
import pandas as pd
import re

import nltk
from nltk.book import *

In [55]:
def Save_Text(Data, Path):
    with open(Path, 'w') as f:
        f.write(Data)

def Clean_text(Data):
    Data = re.sub(r'[^a-zA-Z]', ' ', Data)
    Data = re.sub(r'\s+', ' ', Data)
    return Data

def Tokenize(Data):
    return Data.split()
                        
def Read_Docs(Docs_List):
    Data = ''
    for doc in Docs_List:
        for word in doc:
            Data += word + ' '
    Data = Data.lower()
    Save_Text(Data, './Data.txt')
    Data = Clean_text(Data)
    Data = Tokenize(Data)
    return Data

Data = Read_Docs([text1, text2, text3, text4, text5, text6, text7, text8, text9])
Data[:10]

['moby',
 'dick',
 'by',
 'herman',
 'melville',
 'etymology',
 'supplied',
 'by',
 'a',
 'late']

In [56]:
class Shannon_Game():
    def __init__(self, Data, ngram):
        self.ngram = ngram
        self.Data = Data
        self.__Create_Vocabulary()
        
    def __Create_Vocabulary(self):
        '''
        Function to create vocabulary of n-grams
        '''
        self.Vocabulary = {}
        for i in range(len(self.Data) - self.ngram):
            gram = ' '.join(self.Data[i:i+self.ngram])
            if gram not in self.Vocabulary.keys():
                self.Vocabulary[gram] = 1
            else:
                self.Vocabulary[gram] += 1
        self.Vocabulary_Size = len(self.Vocabulary)
        for key in self.Vocabulary.keys():
            self.Vocabulary[key] /= self.Vocabulary_Size

    def __Extrinsic_Evaluation(self,Example):
        '''
        Function to calculate perplexity of a sentence given completed sentence.
        Range of Perplexity is [1, inf). Target is to minimize perplexity.
        '''
        NGrams = Example.split()
        Num_Tokens = len(NGrams)
        NGrams = [' '.join(NGrams[i:i+self.ngram]) for i in range(Num_Tokens - self.ngram + 1)]

        Probability = 0
        for gram in NGrams:
            Probability += np.log10(self.Vocabulary[gram])
        Probability = np.exp(Probability)
        Perplexity = (Probability)**(-1.0/Num_Tokens)
        return Perplexity

    def Lets_Play(self,Example):
        '''
        Function to predict next word given a sentence
        '''
        PreviousWords = Example.split()[-self.ngram+1:]
        PreviousWords = ' '.join(PreviousWords)

        Next_Word_Candidates = {}
        for key in self.Vocabulary.keys():
            if key.startswith(PreviousWords):
                Next_Word_Candidates[key] = self.Vocabulary[key]
        Next_Word_Candidates = sorted(Next_Word_Candidates.items(), key=lambda x: x[1], reverse=True)
        NextWord = Next_Word_Candidates[0][0].split()[-1]

        Perplexity = self.__Extrinsic_Evaluation(Example + ' ' + NextWord)
        return NextWord, Perplexity

In [57]:
Khel = Shannon_Game(Data, 3)

Sample_Examples = ['the wounded whale to', 'floundered in the', 'ship on the ocean without being']
for Example in Sample_Examples:
    NextWord,Perplexity = Khel.Lets_Play(Example)
    print(Example, NextWord, 'with perplexity ->', Perplexity)

the wounded whale to be with perplexity -> 28.181690550287886
floundered in the world with perplexity -> 10.601340485946048
ship on the ocean without being struck with perplexity -> 54.970312495941684
