In [45]:
import re
import numpy as np
import pandas as pd
from tqdm import tqdm

# Loading the data
with open('big.txt','r') as fd:
    lines = fd.readlines()
    words = []
    for line in lines:
        words+= re.findall('\w+',line.lower())

### 1. Finding the pairs

In [46]:
def get_pairs(words,n):
    n = n+1
    data = []
    for i in range(len(words)-n): # to avoid index out of range exception
        data.append(' '.join(words[i:i+n]))
    return data

### 2. Finding occurrence Probabilities

In [47]:
def get_prob_dist(data):
    # Converting the whole data into a numpy array
    a = np.array(data)
    
    pair,count = np.unique(a,return_counts = True)
    unique_pairs = list(set(data))

    prob_dist = [] # Returns input pair, occurrence, output
    for i in range(len(unique_pairs)):
        prob_dist.append([unique_pairs[i],' '.join(unique_pairs[i].split(' ')[:-1]),unique_pairs[i].split(' ')[-1],count[i]])

    return prob_dist

In [82]:
data = get_pairs(words,4)
prob_dist = get_prob_dist(data)

### 3. Predicting the words

In [84]:
df = pd.DataFrame(prob_dist,columns = ['seq','inp','out','freq'])
df.head()

def predict(word):
    if len(df[df['inp'] == word]):
        df_ = df[df['inp'] == word]
        return df_.sort_values(by='freq').head()['out'].values
    else:
        print('Seq is not present')
predict('this is a beautiful')

array(['country'], dtype=object)

#### 3.1) Prediction for one word

In [85]:
predict('this is a beautiful')

array(['country'], dtype=object)

#### 3.2) Prediction with auto sequencing

In [113]:
def pred_seq(seq,n):
    output = []
    output.append(seq)

    for i in range(n):
        pred = predict(seq)
        seq = ' '.join(seq.split(' ')[1:]) + ' '+ pred[0]
        output.append(pred[0])

    return ' '.join(output)
    
pred_seq('of the united states',50)

'of the united states by charles a beard and mary r beard this ebook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever you may copy it give it away or re use it under the terms of the bet addressing himself particularly to anatole and pierre'

In [115]:
data = get_pairs(words,10)
prob_dist = get_prob_dist(data)

df = pd.DataFrame(prob_dist, columns = ['seq','inp','out','freq'])

In [116]:
df.sort_values(by='freq')

Unnamed: 0,seq,inp,out,freq
0,despised them not because of his own intellect...,despised them not because of his own intellect...,knowledge,1
739198,whistle but the sudden glare flashing into my ...,whistle but the sudden glare flashing into my ...,made,1
739199,weather in some cases the tumour is diminished...,weather in some cases the tumour is diminished...,the,1
739200,thinking he could have been received in such a...,thinking he could have been received in such a...,only,1
739201,simplicity of his heart had ordered that they ...,simplicity of his heart had ordered that they ...,with,1
...,...,...,...,...
52299,along wrapped in her cloak she was only a coup...,along wrapped in her cloak she was only a couple,of,4
1019720,natasha they were fond of asking one another t...,natasha they were fond of asking one another t...,i,4
195683,with light mane and tail and when he rode it no,with light mane and tail and when he rode it,no,6
1102740,those around her she recovered from her mental...,those around her she recovered from her mental...,as,6
