In [15]:
import re
import numpy as np
import pandas as pd
from tqdm import tqdm

### 1. Finding the pairs

In [16]:
# Read the corpus and split it into lowercase word tokens.
# NOTE(review): no explicit encoding is passed, so the platform default is used — confirm it matches big.txt.
with open('big.txt','r') as fd:
    lines = fd.readlines()
    words = []
    for line in lines:
        # Raw string: '\w' inside a plain literal is an invalid escape sequence
        # that newer Python versions warn about.
        words.extend(re.findall(r'\w+', line.lower()))

#### Make pairs of 2 consecutive words for all words in 'words'
['the','project']

['project','gutenberg']

['gutenberg','ebook']

In [17]:
def get_pairs(words):
    """Build the list of neighbouring-word pairs found in ``words``.

    Each item is glued to the item that directly follows it with a single
    space, so a sequence of n items produces n - 1 strings (and none at all
    when there are fewer than two items).
    """
    # Zipping the sequence with itself shifted by one walks every adjacent
    # pair and stops before running past the end.
    return [' '.join(duo) for duo in zip(words, words[1:])]

# Space-joined pairs of neighbouring tokens, in the order they occur in `words`.
data = get_pairs(words)

In [18]:
len(data)

1115584

### 2. Finding occurrence probabilities (estimated from raw pair counts)

In [19]:
data[:10]

['the project',
 'project gutenberg',
 'gutenberg ebook',
 'ebook of',
 'of the',
 'the adventures',
 'adventures of',
 'of sherlock',
 'sherlock holmes',
 'holmes by']

In [20]:
# Collapse the pair list into its distinct values and how often each one occurs.
a = np.array(data)
pair, count = np.unique(a, return_counts=True)

print('Total Pairs:', len(data))
unique_pairs = list(set(data))
print('Unique Pairs:', len(pair))

print('-' * 50)

# One row per distinct pair: [pair text, number of occurrences, last word of the pair].
prob_dist = [
    [text, n_seen, text.split(' ')[-1]]
    for text, n_seen in zip(pair, count)
]

print(len(prob_dist))
print('-' * 50)

Total Pairs: 1115584
Unique Pairs: 390694
--------------------------------------------------
390694
--------------------------------------------------


In [21]:
pair[:10]

array(['0 05', '0 25', '0 45', '0 5', '0 6', '0 7', '0 9', '0 i', '00 99',
       '00 went'], dtype='<U30')

In [22]:
count[:10]

array([1, 1, 1, 1, 4, 1, 1, 1, 2, 1])

### 3. Predicting the words

In [23]:
df = pd.DataFrame(prob_dist,columns = ['pair','freq','out'])
# Remove all the pairs having frequency less than 5.
# Boolean indexing returns a new frame rather than modifying df, so the
# result has to be assigned back; as a bare expression the filter was
# computed and thrown away, leaving every pair in the table.
df = df[df['freq'] >= 5].reset_index(drop=True)
df.head()

def predict(word, top_n=5, table=None):
    """Return the most frequent words that follow ``word`` in the pair table.

    Parameters
    ----------
    word : str
        First word of the pairs to look up.
    top_n : int, optional
        Maximum number of suggestions to return. Defaults to 5, which is the
        size the function always returned before this parameter existed.
    table : pandas.DataFrame, optional
        Frame with 'pair', 'freq' and 'out' columns. When omitted, the
        notebook-level ``df`` is used.

    Returns
    -------
    list
        The 'out' values of the matching rows ordered by descending 'freq';
        an empty list when no pair starts with ``word``.
    """
    if table is None:
        table = df

    # Compare the first token of every pair in one vectorized pass instead of
    # walking the whole table row by row in Python on every call.
    first_words = table['pair'].str.split(' ', n=1).str[0]
    matches = table.loc[first_words == word]

    ranked = matches.sort_values(by='freq', ascending=False)
    return ranked.head(top_n)['out'].tolist()

In [24]:
predict('the')

['same', 'french', 'first', 'old', 'emperor']

In [25]:
# Greedy generation: start from a seed word and repeatedly move on to the
# top-ranked successor returned by predict().
word = 'the'
for _ in range(10):
    pred = predict(word)
    if not pred:
        # No pair starts with this word, so pred[0] would raise IndexError.
        break
    word = pred[0]
    print(word,end=' ')

same time to the same time to the same time 

In [26]:
# Interactive generation: show the suggestions for the current word and let
# the user pick which one to continue with.
word = 'this'
preds = [word]
for _ in range(10):
    pred = predict(word)
    if not pred:
        # Nothing follows this word; stop and show what was built so far.
        break
    print(pred)
    # Ask again on a non-numeric or out-of-range entry instead of crashing
    # and losing the sentence collected so far.
    while True:
        try:
            word = pred[int(input('Enter the index:'))]
            break
        except (ValueError, IndexError):
            print('Please enter the position of one of the listed words.')
    preds.append(word)
print('-'*30)
print(' '.join(preds))
print('-'*30)

['is', 'was', 'way', 'and', 'time']
Enter the index:2
['of', 'to', 'and', 'the', 'in']
Enter the index:1
['the', 'be', 'him', 'his', 'a']
Enter the index:3
['eyes', 'head', 'own', 'face', 'wife']
Enter the index:0
['and', 'were', 'with', 'fixed', 'at']
Enter the index:2
['the', 'a', 'his', 'her', 'him']
Enter the index:1
['man', 'few', 'long', 'little', 'very']
Enter the index:2
['time', 'as', 'been', 'and', 'bones']
Enter the index:3
['the', 'a', 'in', 'that', 'he']
Enter the index:4
['had', 'was', 'said', 'is', 'would']
Enter the index:0
------------------------------
this way to his eyes with a long and he had
------------------------------
