In [1]:
import nltk 
from nltk.tokenize import word_tokenize


In [2]:
# reading corpus 
# note you can use any corpuses of your choice

#writing all gutenberg corpus in one file
with open("result.txt", "w") as f:
    for filename in nltk.corpus.gutenberg.fileids():
        f.write(nltk.corpus.gutenberg.raw(filename))
        
        
# reading data file
with open("result.txt", "r") as f:
    data = f.read()


In [3]:
data[0:50]

'[Emma by Jane Austen 1816]\n\nVOLUME I\n\nCHAPTER I\n\n\n'

In [4]:
#approximate number of words
len(data.split())

2135242

In [5]:
#extract sents

# Split by newline character
sentences = data.split('\n')

# Remove leading and trailing spaces
sentences = [s.strip() for s in sentences]

# Drop Empty Sentences
sentences = [s for s in sentences if len(s) > 0]

sentences[0:5]

['[Emma by Jane Austen 1816]',
 'VOLUME I',
 'CHAPTER I',
 'Emma Woodhouse, handsome, clever, and rich, with a comfortable home',
 'and happy disposition, seemed to unite some of the best blessings']

In [6]:
# tokenized sentences
tokenized = []

# Iterate through sentences
for sentence in sentences:

    # Convert to lowercase
    sentence = sentence.lower()

    # Convert to a list of words
    token = word_tokenize(sentence)

    # Append to list
    tokenized.append(token)
    
    
tokenized[0:3]

[['[', 'emma', 'by', 'jane', 'austen', '1816', ']'],
 ['volume', 'i'],
 ['chapter', 'i']]

In [7]:
# building tri_dictionary dictionary
# tri_dic contains the frequency for every three words in each sentence

tri_dic = {}

for token_sent in tokenized:
    
    m = len(token_sent) -2
    
    for i in range(m):

        tri = token_sent[i:i+3]
        
        tri = f'{tri[0]}_{tri[1]}_{tri[2]}'
        
        if tri in tri_dic.keys():
            tri_dic[tri] += 1

        else:
            tri_dic[tri] = 1

        

In [8]:
tri_dic


{'[_emma_by': 1,
 'emma_by_jane': 1,
 'by_jane_austen': 3,
 'jane_austen_1816': 1,
 'austen_1816_]': 1,
 'emma_woodhouse_,': 4,
 'woodhouse_,_handsome': 1,
 ',_handsome_,': 5,
 'handsome_,_clever': 1,
 ',_clever_,': 1,
 'clever_,_and': 3,
 ',_and_rich': 4,
 'and_rich_,': 1,
 'rich_,_with': 1,
 ',_with_a': 376,
 'with_a_comfortable': 1,
 'a_comfortable_home': 2,
 'and_happy_disposition': 1,
 'happy_disposition_,': 1,
 'disposition_,_seemed': 1,
 ',_seemed_to': 18,
 'seemed_to_unite': 1,
 'to_unite_some': 1,
 'unite_some_of': 1,
 'some_of_the': 106,
 'of_the_best': 23,
 'the_best_blessings': 2,
 'of_existence_;': 2,
 'existence_;_and': 1,
 ';_and_had': 17,
 'and_had_lived': 3,
 'had_lived_nearly': 1,
 'lived_nearly_twenty-one': 1,
 'nearly_twenty-one_years': 1,
 'twenty-one_years_in': 1,
 'years_in_the': 10,
 'in_the_world': 233,
 'with_very_little': 4,
 'very_little_to': 5,
 'little_to_distress': 2,
 'to_distress_or': 1,
 'distress_or_vex': 1,
 'or_vex_her': 1,
 'vex_her_.': 2,
 'she_wa

In [9]:
len(tri_dic.keys())

1234365

In [10]:
# building bi_dictionary dictionary
# bi_dic contains the frequency for every two words in each sentence

bi_dic = {}

for token_sent in tokenized:
    
    m = len(token_sent) -1
    
    for i in range(m):

        bi = token_sent[i:i+2]

        
        bi = f'{bi[0]}_{bi[1]}'
        
        if bi in bi_dic.keys():
            bi_dic[bi] += 1

        else:
            bi_dic[bi] = 1


In [11]:
bi_dic

{'[_emma': 1,
 'emma_by': 1,
 'by_jane': 4,
 'jane_austen': 3,
 'austen_1816': 1,
 '1816_]': 1,
 'volume_i': 1,
 'chapter_i': 9,
 'emma_woodhouse': 4,
 'woodhouse_,': 118,
 ',_handsome': 8,
 'handsome_,': 30,
 ',_clever': 3,
 'clever_,': 17,
 ',_and': 38019,
 'and_rich': 9,
 'rich_,': 36,
 ',_with': 1905,
 'with_a': 1711,
 'a_comfortable': 17,
 'comfortable_home': 3,
 'and_happy': 32,
 'happy_disposition': 1,
 'disposition_,': 20,
 ',_seemed': 76,
 'seemed_to': 386,
 'to_unite': 4,
 'unite_some': 1,
 'some_of': 265,
 'of_the': 18177,
 'the_best': 268,
 'best_blessings': 2,
 'of_existence': 5,
 'existence_;': 5,
 ';_and': 6705,
 'and_had': 220,
 'had_lived': 16,
 'lived_nearly': 1,
 'nearly_twenty-one': 1,
 'twenty-one_years': 1,
 'years_in': 47,
 'in_the': 9764,
 'the_world': 813,
 'with_very': 15,
 'very_little': 86,
 'little_to': 22,
 'to_distress': 5,
 'distress_or': 1,
 'or_vex': 1,
 'vex_her': 3,
 'her_.': 500,
 'she_was': 1051,
 'was_the': 868,
 'the_youngest': 24,
 'youngest_of'

In [12]:
# building tri-gram dictionary model 
#trigram_dic contains all possible third words and theirs probability based on the frequency of the two words before.


trigram_dic = {}

for key in tri_dic.keys():
    
    k = key.split('_')
    
    b = f"{k[0]}_{k[1]}"

    try:
        prob = tri_dic[key]/bi_dic[b]
    except Exception as e:
#         print(e)
        pass
        
    
    if b in trigram_dic.keys():
        trigram_dic[b][k[2]] = prob
        
        
    else:
        trigram_dic[b] = {k[2]:prob}

        
    

In [13]:
len(trigram_dic.keys())

486844

In [14]:
trigram_dic


{'[_emma': {'by': 1.0},
 'emma_by': {'jane': 1.0},
 'by_jane': {'austen': 0.75, 'herself': 0.25},
 'jane_austen': {'1816': 0.3333333333333333,
  '1818': 0.3333333333333333,
  '1811': 0.3333333333333333},
 'austen_1816': {']': 1.0},
 'emma_woodhouse': {',': 1.0},
 'woodhouse_,': {'handsome': 0.00847457627118644,
  'if': 0.00847457627118644,
  'and': 0.0423728813559322,
  'understanding': 0.00847457627118644,
  'full': 0.00847457627118644,
  'he': 0.01694915254237288,
  'is': 0.00847457627118644,
  'to': 0.01694915254237288,
  'do': 0.025423728813559324,
  'tell': 0.00847457627118644,
  'as': 0.025423728813559324,
  'who': 0.059322033898305086,
  "''": 0.06779661016949153,
  'what': 0.03389830508474576,
  'that': 0.01694915254237288,
  'shaking': 0.00847457627118644,
  'always': 0.01694915254237288,
  'a': 0.025423728813559324,
  'his': 0.00847457627118644,
  'so': 0.01694915254237288,
  'i': 0.059322033898305086,
  'you': 0.059322033898305086,
  'there': 0.00847457627118644,
  'ours': 0

In [15]:
#testing if it works
s = input('Enter text:')
s = s.split()

q = s[-2]+'_'+s[-1]

try:
    l = trigram_dic[q]
    l = sorted(l.items(), key=lambda item: item[1],reverse=True)
    print(l[0:5])
    
except Exception:
    print([])

Enter text:this is
[('the', 0.3515509601181684), ('a', 0.09748892171344166), ('not', 0.0413589364844904), ('my', 0.03840472673559823), ('all', 0.019202363367799114)]


In [16]:
# saving the trigram dictionary 
import pickle
with open('trigram_dictionary.pkl', 'wb') as f:
    pickle.dump(trigram_dic, f)
