---
# Input

In [1]:
# Basic Libraries
import bs4 as bs
import urllib.request
import re

import nltk
from nltk import bigrams,trigrams
from collections import defaultdict

In [2]:
raw_html = urllib.request.urlopen('https://en.wikipedia.org/wiki/Tennis')
raw_html = raw_html.read()

article_html = bs.BeautifulSoup(raw_html, 'lxml')
article_paragraphs = article_html.find_all('p')
article_text = ''

for para in article_paragraphs:
    article_text += para.text

article_text = article_text.lower()
article_text = re.sub(r'[^A-Za-z. ]', '', article_text)
article_text



---
# Preprocessing

In [3]:
tokenized_sent = nltk.word_tokenize(article_text)
tokenized_sent

['tennis',
 'is',
 'a',
 'racket',
 'sport',
 'that',
 'can',
 'be',
 'played',
 'individually',
 'against',
 'a',
 'single',
 'opponent',
 'singles',
 'or',
 'between',
 'two',
 'teams',
 'of',
 'two',
 'players',
 'each',
 'doubles',
 '.',
 'each',
 'player',
 'uses',
 'a',
 'tennis',
 'racket',
 'that',
 'is',
 'strung',
 'with',
 'cord',
 'to',
 'strike',
 'a',
 'hollow',
 'rubber',
 'ball',
 'covered',
 'with',
 'felt',
 'over',
 'or',
 'around',
 'a',
 'net',
 'and',
 'into',
 'the',
 'opponents',
 'court',
 '.',
 'the',
 'object',
 'of',
 'the',
 'game',
 'is',
 'to',
 'maneuver',
 'the',
 'ball',
 'in',
 'such',
 'a',
 'way',
 'that',
 'the',
 'opponent',
 'is',
 'not',
 'able',
 'to',
 'play',
 'a',
 'valid',
 'return',
 '.',
 'the',
 'player',
 'who',
 'is',
 'unable',
 'to',
 'return',
 'the',
 'ball',
 'will',
 'not',
 'gain',
 'a',
 'point',
 'while',
 'the',
 'opposite',
 'player',
 'will.tennis',
 'is',
 'an',
 'olympic',
 'sport',
 'and',
 'is',
 'played',
 'at',
 'all'

---
# Generating N-Gram Model

---
# 2-Gram

In [4]:
# Create a placeholder for model
model_2_gram = defaultdict(lambda: defaultdict(lambda: 0))

# Count frequency of co-occurance  
for w1, w2 in bigrams(tokenized_sent):
    model_2_gram[(w1)][w2] += 1

# Let's transform the counts to probabilities
for w1 in model_2_gram:
    total_count = float(sum(model_2_gram[w1].values()))
    for w2 in model_2_gram[w1]:
        model_2_gram[w1][w2] /= total_count

In [5]:
model_2_gram

defaultdict(<function __main__.<lambda>()>,
            {'tennis': defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
                         {'is': 0.031007751937984496,
                          'racket': 0.015503875968992248,
                          'originated': 0.007751937984496124,
                          '.': 0.08527131782945736,
                          'referred': 0.007751937984496124,
                          'not': 0.007751937984496124,
                          'have': 0.007751937984496124,
                          'has': 0.007751937984496124,
                          'and': 0.023255813953488372,
                          'courts': 0.007751937984496124,
                          'outdoors': 0.007751937984496124,
                          'player': 0.046511627906976744,
                          'from': 0.007751937984496124,
                          'declined': 0.007751937984496124,
                          'most': 0.007751937984496124,
                

---
# 3-Gram

In [6]:
# Create a placeholder for model
model_3_gram = defaultdict(lambda: defaultdict(lambda: 0))

# Count frequency of co-occurance  
for w1, w2, w3 in trigrams(tokenized_sent, pad_right=True, pad_left=True):
    model_3_gram[(w1, w2)][w3] += 1

# Let's transform the counts to probabilities
for w1_w2 in model_3_gram:
    total_count = float(sum(model_3_gram[w1_w2].values()))
    for w3 in model_3_gram[w1_w2]:
        model_3_gram[w1_w2][w3] /= total_count

In [7]:
model_3_gram

defaultdict(<function __main__.<lambda>()>,
            {(None,
              None): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>, {'tennis': 1.0}),
             (None,
              'tennis'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>, {'is': 1.0}),
             ('tennis',
              'is'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>, {'a': 0.5,
                          'played': 0.25,
                          'the': 0.25}),
             ('is',
              'a'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>, {'racket': 0.07142857142857142,
                          'let': 0.07142857142857142,
                          'serve': 0.07142857142857142,
                          'player': 0.07142857142857142,
                          'relevant': 0.07142857142857142,
                          'very': 0.07142857142857142,
                          'diagonal': 0.07142857142857142,
                          'way': 0.0714

---
# Provide the input

In [18]:
text_input = input('Enter the Text (Only 2 Words are Required) :')
text_tokenized = text_input.split(sep = ' ')

Enter the Text (Only 2 Words are Required) :cricket is


In [19]:
token_length = len(text_tokenized)

In [20]:
values_3_gram = model_3_gram.get((text_tokenized[token_length-2], text_tokenized[token_length-1]), '<NA>')
values_3_gram

'<NA>'

In [21]:
if(values_3_gram == '<NA>'):
    pred_ngrams = dict(model_2_gram[text_tokenized[token_length-1]])
else:
    pred_ngrams = dict(model_3_gram[text_tokenized[token_length-2],text_tokenized[token_length - 1]])

In [22]:
pred_ngrams

{'a': 0.07909604519774012,
 'strung': 0.005649717514124294,
 'to': 0.01694915254237288,
 'not': 0.03389830508474576,
 'unable': 0.011299435028248588,
 'an': 0.01694915254237288,
 'played': 0.04519774011299435,
 'also': 0.03954802259887006,
 'historys': 0.005649717514124294,
 'now': 0.01694915254237288,
 'believed': 0.005649717514124294,
 'where': 0.005649717514124294,
 'acknowledged': 0.005649717514124294,
 'due': 0.005649717514124294,
 'feet': 0.011299435028248588,
 'required': 0.022598870056497175,
 'stretched': 0.005649717514124294,
 'held': 0.011299435028248588,
 'unusual': 0.005649717514124294,
 'used': 0.011299435028248588,
 'referred': 0.011299435028248588,
 'called': 0.03954802259887006,
 'out': 0.011299435028248588,
 'often': 0.011299435028248588,
 'designated': 0.005649717514124294,
 'the': 0.0903954802259887,
 'decided': 0.005649717514124294,
 'ready': 0.005649717514124294,
 'void': 0.005649717514124294,
 'hit': 0.01694915254237288,
 'in': 0.01694915254237288,
 'considered':

In [24]:
import random
# Generating a random number
rand_no = random.uniform(0, 1)
print('rand_no',rand_no)

import numpy as np
position = np.where(np.array(list(pred_ngrams.values())).cumsum() > rand_no)[0][0]
print('position',position)

rand_no 0.36483716388469767
position 20


In [25]:
list(pred_ngrams.keys())[position]

'referred'

---
# Suggested Next Word

In [26]:
print('Next Suggested Word is :', list(pred_ngrams.keys())[position])

Next Suggested Word is : referred
