<a href="https://colab.research.google.com/github/SarthakJangade/AI_Model_Age-Gender_Detector/blob/main/Next_word_recommender_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Steps to build the next word recommender system

1. Loading and exploring the dataset
2. Creating N-grams of the dialogue
3. Building the N-gram Language Model
4. Predicting the next word using N-gram Language Model

## 1. Loading and exploring the dataset

In [None]:
#loading the required libraries
import pickle
import random
import re
from collections import Counter, defaultdict

import numpy as np
import pandas as pd
from tqdm import tqdm

In [None]:
# mounting the drive so the dataset stored on Google Drive is reachable
# under /content/drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# open the pickled dialogue corpus and read it into memory
# NOTE(security): pickle.load can execute arbitrary code while deserialising,
# so only point DIALOGS_PATH at a file you created yourself or otherwise trust.
DIALOGS_PATH = "/content/drive/MyDrive/NLP/dialogs_dataset"
with open(DIALOGS_PATH, "rb") as f:
  dialogs = pickle.load(f)

In [None]:
# numbers of text sequences
len(dialogs)

64776

In [None]:
# show 10 random dialogs; a dedicated seeded generator keeps the preview
# identical across runs without touching the global random state
random.Random(42).sample(dialogs, 10)

['Thank you so much',
 " You're going to order from the one on Broad Street, right?",
 "I'm going to keep it simple",
 'I prefer prince Street Pizza place',
 'okay ill take a specality pizza instead',
 ' how soon can I pick up the order',
 "Hey Jane, I'd like to see Glass tonight",
 'That works for me',
 'I want an Iced White Chocolate Mocha',
 ' How is the line coming? I feel bad for sending you now']

In [None]:
# text cleaning
# drop every character that is not a letter, an apostrophe or a space,
# then lowercase what is left
disallowed_chars = re.compile("[^a-zA-Z' ]")
dialogs_clean = [disallowed_chars.sub("", utterance).lower() for utterance in dialogs]

In [None]:
# show 10 random cleaned dialogs; a dedicated seeded generator keeps the
# preview identical across runs without touching the global random state
random.Random(42).sample(dialogs_clean, 10)

['is there anything sooner than that i need my car fixed as soon as possible',
 'nope that sounds perfect',
 'i am at the west village apartments',
 'jeez thats nuts',
 "so no scallops tonight ok that's the reason i want to go to that particular restaurant",
 'okay that works for me',
 'yes we are',
 'cocoa beach florida location',
 ' ho wmany do you usually put in',
 ' do you know any local upscale restaurants near me']

In [None]:
# creating the vocabulary
# get list of all the words
all_words = " ".join(dialogs_clean).split()

# word -> number of occurrences in the corpus.
# Counter tallies in first-seen order, so converting it back to a plain dict
# gives the same keys, counts and ordering as a manual "if word in dict" loop.
words_dict = dict(Counter(all_words))

In [None]:
# word dictionary: preview of the first 20 entries only, because the full
# vocabulary has one entry per distinct word and would flood the output
dict(list(words_dict.items())[:20])

{'hi': 1472,
 "i'm": 2042,
 'looking': 490,
 'to': 14000,
 'book': 1218,
 'a': 13380,
 'table': 662,
 'for': 7709,
 'korean': 15,
 'fod': 1,
 'somewhere': 119,
 'in': 4433,
 'southern': 6,
 'nyc': 16,
 'maybe': 233,
 'the': 15406,
 'east': 46,
 'village': 21,
 'we': 1355,
 "don't": 786,
 'want': 3408,
 'sit': 134,
 'at': 2851,
 'bar': 197,
 'but': 976,
 'anywhere': 42,
 'else': 335,
 'is': 6936,
 'fine': 1589,
 'what': 3400,
 'times': 334,
 'are': 2175,
 'available': 878,
 'yikes': 6,
 "can't": 173,
 'do': 3571,
 'those': 305,
 'let': 575,
 'me': 5843,
 'check': 533,
 'great': 2262,
 "let's": 860,
 'that': 8048,
 'no': 4073,
 "that's": 2710,
 'it': 7737,
 'just': 2464,
 'i': 19654,
 'would': 4047,
 'like': 5275,
 'see': 1537,
 'if': 1533,
 'movie': 1097,
 'men': 23,
 'playing': 116,
 'here': 368,
 'yes': 4644,
 'and': 6534,
 'friend': 302,
 'so': 2224,
 'two': 1292,
 'tickets': 1505,
 'please': 4036,
 'time': 1297,
 'moving': 5,
 'today': 818,
 'about': 1511,
 'oh': 954,
 'can': 5653,


In [None]:
# prepare a dataframe with one (word, count) row per vocabulary entry,
# ordered from the rarest word to the most frequent one, and renumber the
# rows 0..n-1 so that head()/tail() read naturally
words_df = (
    pd.DataFrame({'word': list(words_dict.keys()), 'count': list(words_dict.values())})
    .sort_values(by=['count'])
    .reset_index(drop=True)
)

In [None]:
# words with least frequency
words_df.head()

Unnamed: 0,word,count
0,uppermiddle,1
1,shoots,1
2,geesh,1
3,andrea,1
4,precice,1


In [None]:
# words with highest frequency
words_df.tail()

Unnamed: 0,word,count
11142,you,11909
11143,a,13380
11144,to,14000
11145,the,15406
11146,i,19654


In [None]:
# vocabulary size
len(words_df)

11147

## 2. Creating N-grams of the dialogue

In [None]:
# dataframe holding one cleaned sentence per row
dataset = pd.DataFrame({'Sentences': dialogs_clean})

# first 20 cleaned sentences
dataset.head(20)

Unnamed: 0,Sentences
0,hi i'm looking to book a table for korean fod
1,somewhere in southern nyc maybe the east village
2,we don't want to sit at the bar but anywhere ...
3,what times are available
4,yikes we can't do those times
5,let me check
6,great let's book that
7,no that's it just book
8,hi i would like to see if the movie what men w...
9,yes for me and a friend so two tickets please


In [None]:
# using .split() to get tokens from the sentence
dataset['Sentences'][0].split()

['hi', "i'm", 'looking', 'to', 'book', 'a', 'table', 'for', 'korean', 'fod']

In [None]:
# function to create unigrams
def create_unigram(sentence):
    """Return the unigrams of ``sentence`` as a list of one-word lists.

    The sentence is tokenised with ``str.split()`` (any run of whitespace is a
    separator), and every token is wrapped in its own list so the result has
    the same nested shape as the bigram/trigram helpers.
    An empty or whitespace-only sentence yields an empty list.
    """
    return [[word] for word in sentence.split()]

In [None]:
# function to create bigrams
def create_bigram(sentence):
    """Return every pair of adjacent tokens in ``sentence`` as two-word lists.

    The sentence is tokenised with ``str.split()``. A sentence with n tokens
    produces n-1 bigrams; fewer than two tokens produces an empty list.
    """
    words = sentence.split()
    # pairing the token list with itself shifted by one walks all adjacent pairs
    return [list(pair) for pair in zip(words, words[1:])]

In [None]:
# function to create trigrams
def create_trigram(sentence):
    """Return every run of three consecutive tokens in ``sentence``.

    The sentence is tokenised with ``str.split()``. A sentence with n tokens
    produces n-2 trigrams (each a three-word list); fewer than three tokens
    produces an empty list.
    """
    words = sentence.split()
    # zip stops at the shortest input, i.e. after the last full window of three
    return [list(triple) for triple in zip(words, words[1:], words[2:])]

In [None]:
# creating unigrams for all the sentences in the dataset:
# apply the unigram helper to every sentence, in row order
final_unigram = [create_unigram(sentence) for sentence in dataset['Sentences']]

# adding the unigrams as a separate column in the dataset
dataset['unigram'] = final_unigram

In [None]:
# creating bigrams for all the sentences in the dataset:
# apply the bigram helper to every sentence, in row order
final_bigram = [create_bigram(sentence) for sentence in dataset['Sentences']]

# adding the bigrams as a separate column in the dataset
dataset['bigram'] = final_bigram

In [None]:
# creating trigrams for all the sentences in the dataset:
# apply the trigram helper to every sentence, in row order
final_trigram = [create_trigram(sentence) for sentence in dataset['Sentences']]

# adding the trigrams as a separate column in the dataset
dataset['trigram'] = final_trigram

In [None]:
# first 20 rows of the dataset
dataset.head(20)

Unnamed: 0,Sentences,unigram,bigram,trigram
0,hi i'm looking to book a table for korean fod,"[[hi], [i'm], [looking], [to], [book], [a], [t...","[[hi, i'm], [i'm, looking], [looking, to], [to...","[[hi, i'm, looking], [i'm, looking, to], [look..."
1,somewhere in southern nyc maybe the east village,"[[somewhere], [in], [southern], [nyc], [maybe]...","[[somewhere, in], [in, southern], [southern, n...","[[somewhere, in, southern], [in, southern, nyc..."
2,we don't want to sit at the bar but anywhere ...,"[[we], [don't], [want], [to], [sit], [at], [th...","[[we, don't], [don't, want], [want, to], [to, ...","[[we, don't, want], [don't, want, to], [want, ..."
3,what times are available,"[[what], [times], [are], [available]]","[[what, times], [times, are], [are, available]]","[[what, times, are], [times, are, available]]"
4,yikes we can't do those times,"[[yikes], [we], [can't], [do], [those], [times]]","[[yikes, we], [we, can't], [can't, do], [do, t...","[[yikes, we, can't], [we, can't, do], [can't, ..."
5,let me check,"[[let], [me], [check]]","[[let, me], [me, check]]","[[let, me, check]]"
6,great let's book that,"[[great], [let's], [book], [that]]","[[great, let's], [let's, book], [book, that]]","[[great, let's, book], [let's, book, that]]"
7,no that's it just book,"[[no], [that's], [it], [just], [book]]","[[no, that's], [that's, it], [it, just], [just...","[[no, that's, it], [that's, it, just], [it, ju..."
8,hi i would like to see if the movie what men w...,"[[hi], [i], [would], [like], [to], [see], [if]...","[[hi, i], [i, would], [would, like], [like, to...","[[hi, i, would], [i, would, like], [would, lik..."
9,yes for me and a friend so two tickets please,"[[yes], [for], [me], [and], [a], [friend], [so...","[[yes, for], [for, me], [me, and], [and, a], [...","[[yes, for, me], [for, me, and], [me, and, a],..."


In [None]:
# sample sentence
dataset['Sentences'][0]

"hi i'm looking to book a table for korean fod"

In [None]:
# unigram of the sentence
dataset['unigram'][0]

[['hi'],
 ["i'm"],
 ['looking'],
 ['to'],
 ['book'],
 ['a'],
 ['table'],
 ['for'],
 ['korean'],
 ['fod']]

In [None]:
# bigram of the sentence
dataset['bigram'][0]

[['hi', "i'm"],
 ["i'm", 'looking'],
 ['looking', 'to'],
 ['to', 'book'],
 ['book', 'a'],
 ['a', 'table'],
 ['table', 'for'],
 ['for', 'korean'],
 ['korean', 'fod']]

In [None]:
# trigram of the sentence
dataset['trigram'][0]

[['hi', "i'm", 'looking'],
 ["i'm", 'looking', 'to'],
 ['looking', 'to', 'book'],
 ['to', 'book', 'a'],
 ['book', 'a', 'table'],
 ['a', 'table', 'for'],
 ['table', 'for', 'korean'],
 ['for', 'korean', 'fod']]

## 3. Building the N-gram Language Model

In [None]:
# defining the N-gram model
# model[(w1, w2)][w3] -> number of times w3 followed the word pair (w1, w2).
# defaultdict(Counter) creates an empty tally the first time a pair is seen,
# and needs no lambdas, so the model stays picklable.
model = defaultdict(Counter)

# Count frequency of co-occurrence, reusing the trigrams already stored in
# the dataset instead of tokenising every sentence a second time
for sentence_trigrams in dataset['trigram']:
    # for each trigram of the sentence
    for w1, w2, w3 in sentence_trigrams:
        # count the occurrence of word 3, given word 1 and word 2
        model[(w1, w2)][w3] += 1

In [None]:
# defined model: preview of the first 5 (w1, w2) contexts only, because
# printing the whole model floods the output
{context: dict(next_words) for context, next_words in list(model.items())[:5]}

defaultdict(<function __main__.<lambda>()>,
            {('hi',
              "i'm"): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>, {'looking': 31,
                          'in': 5,
                          'driving': 5,
                          'the': 1,
                          'trying': 5,
                          'really': 1,
                          'thinking': 3,
                          'going': 3,
                          'running': 2,
                          'interested': 2,
                          'craving': 1,
                          'on': 1,
                          'kind': 1,
                          'wondering': 2,
                          'calling': 3,
                          'hungry': 2,
                          'having': 2,
                          'currently': 1,
                          'wanting': 1}),
             ("i'm",
              'looking'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>, {'to': 51,
         

## 4. Predicting the next word using N-gram Language Model

In [None]:
# predict the next word
dict(model["to", "book"])

{'a': 186,
 'an': 107,
 'at': 9,
 'for': 5,
 'me': 16,
 'some': 13,
 'your': 3,
 'it': 9,
 'the': 11,
 'reservations': 6,
 'this': 3,
 'tickets': 6,
 'two': 6,
 'flight': 1,
 'with': 2,
 'reservation': 1,
 'shared': 1,
 'that': 3,
 'movie': 4,
 'there': 3,
 'four': 1,
 'any': 1,
 'my': 2,
 'through': 1,
 'us': 1,
 'and': 1,
 'anything': 1,
 'anymore': 1,
 "patty's": 1,
 'later': 1,
 'something': 1,
 'uberxl': 1,
 'dinner': 1}

In [None]:
# another example
dict(model["my", "name"])

{'is': 45,
 'which': 2,
 'scott': 1,
 'to': 1,
 'please': 4,
 'gina': 1,
 'hard': 1,
 'and': 16,
 'scolar': 1,
 'stanley': 1,
 'carter': 1,
 'karl': 1,
 'brittany': 1,
 'or': 1,
 'for': 1,
 'instead': 1,
 'alex': 1,
 'jonathan': 1,
 'jan': 1,
 'mathius': 1,
 'tom': 1,
 'right': 1,
 'rob': 1,
 'on': 1}

In [None]:
# another example
dict(model["how", "are"])

{'you': 62, 'we': 1, 'they': 2, 'my': 1, 'ya': 1, 'the': 2}

In [None]:
# another example
dict(model["good", "to"])

{'know': 20,
 'me': 56,
 'go': 10,
 'hear': 6,
 'the': 1,
 'you': 2,
 'drive': 1,
 'watch': 1,
 'eat': 1,
 'pass': 1,
 'bring': 1}

## Probabilistic Output

In [None]:
# creating the unigram list: word -> number of occurrences in the corpus.
# Every entry of dataset['unigram'] is a list of one-word lists, so word[0]
# is the token itself. Counter tallies in first-seen order, so the resulting
# dict matches what a manual "if word in dict" loop would build.
unigram_dict = dict(
    Counter(
        word[0]
        for sentence_unigrams in tqdm(dataset['unigram'])
        for word in sentence_unigrams
    )
)

100%|██████████| 64776/64776 [00:00<00:00, 101766.02it/s]


In [None]:
# unigram list: preview of the first 20 entries only, because the full
# dictionary has one entry per distinct word and would flood the output
dict(list(unigram_dict.items())[:20])


{'hi': 1472,
 "i'm": 2042,
 'looking': 490,
 'to': 14000,
 'book': 1218,
 'a': 13380,
 'table': 662,
 'for': 7709,
 'korean': 15,
 'fod': 1,
 'somewhere': 119,
 'in': 4433,
 'southern': 6,
 'nyc': 16,
 'maybe': 233,
 'the': 15406,
 'east': 46,
 'village': 21,
 'we': 1355,
 "don't": 786,
 'want': 3408,
 'sit': 134,
 'at': 2851,
 'bar': 197,
 'but': 976,
 'anywhere': 42,
 'else': 335,
 'is': 6936,
 'fine': 1589,
 'what': 3400,
 'times': 334,
 'are': 2175,
 'available': 878,
 'yikes': 6,
 "can't": 173,
 'do': 3571,
 'those': 305,
 'let': 575,
 'me': 5843,
 'check': 533,
 'great': 2262,
 "let's": 860,
 'that': 8048,
 'no': 4073,
 "that's": 2710,
 'it': 7737,
 'just': 2464,
 'i': 19654,
 'would': 4047,
 'like': 5275,
 'see': 1537,
 'if': 1533,
 'movie': 1097,
 'men': 23,
 'playing': 116,
 'here': 368,
 'yes': 4644,
 'and': 6534,
 'friend': 302,
 'so': 2224,
 'two': 1292,
 'tickets': 1505,
 'please': 4036,
 'time': 1297,
 'moving': 5,
 'today': 818,
 'about': 1511,
 'oh': 954,
 'can': 5653,


In [None]:
# find the overall frequency of words in the corpus
counts = Counter(unigram_dict)
# show the 20 most frequent words rather than dumping the whole vocabulary
counts.most_common(20)

Counter({'hi': 1472,
         "i'm": 2042,
         'looking': 490,
         'to': 14000,
         'book': 1218,
         'a': 13380,
         'table': 662,
         'for': 7709,
         'korean': 15,
         'fod': 1,
         'somewhere': 119,
         'in': 4433,
         'southern': 6,
         'nyc': 16,
         'maybe': 233,
         'the': 15406,
         'east': 46,
         'village': 21,
         'we': 1355,
         "don't": 786,
         'want': 3408,
         'sit': 134,
         'at': 2851,
         'bar': 197,
         'but': 976,
         'anywhere': 42,
         'else': 335,
         'is': 6936,
         'fine': 1589,
         'what': 3400,
         'times': 334,
         'are': 2175,
         'available': 878,
         'yikes': 6,
         "can't": 173,
         'do': 3571,
         'those': 305,
         'let': 575,
         'me': 5843,
         'check': 533,
         'great': 2262,
         "let's": 860,
         'that': 8048,
         'no': 4073,
         "that'

In [None]:
# vocabulary size: the number of distinct words (keys of unigram_dict),
# not the total number of word occurrences in the corpus
total_count = len(unigram_dict)
total_count


11147

In [None]:
# relative frequencies of each word: occurrences of the word divided by the
# total number of tokens in the corpus (the sum of all word counts).
# Dividing by the vocabulary size instead produces values above 1 for common
# words, which cannot be frequencies.
# The values are recomputed from the raw counts in unigram_dict, so
# re-running this cell does not divide an already-normalised value again.
total_tokens = float(sum(unigram_dict.values()))
for word, occurrences in unigram_dict.items():
    counts[word] = occurrences / total_tokens

counts

Counter({'hi': 0.132053467300619,
         "i'm": 0.18318830178523368,
         'looking': 0.043958015609581055,
         'to': 1.2559433031308873,
         'book': 0.10926706737238719,
         'a': 1.2003229568493765,
         'table': 0.059388176190903384,
         'for': 0.6915762088454293,
         'korean': 0.0013456535390688076,
         'fod': 8.971023593792052e-05,
         'somewhere': 0.010675518076612541,
         'in': 0.3976854759128017,
         'southern': 0.0005382614156275231,
         'nyc': 0.0014353637750067283,
         'maybe': 0.02090248497353548,
         'the': 1.3820758948596035,
         'east': 0.004126670853144344,
         'village': 0.0018839149546963309,
         'we': 0.1215573696958823,
         "don't": 0.07051224544720552,
         'want': 0.3057324840764331,
         'sit': 0.012021171615681349,
         'at': 0.2557638826590114,
         'bar': 0.017672916479770342,
         'but': 0.08755719027541042,
         'anywhere': 0.0037678299093926618,
 

In [None]:
# Let's transform the counts to probabilities:
# P(w3 | w1, w2) = count(w1, w2, w3) / sum of counts of every word seen after (w1, w2)
# A dedicated name is used for the per-context total so this cell does not
# overwrite the notebook-level `total_count` variable.
for next_word_counts in model.values():
    context_total = float(sum(next_word_counts.values()))
    # only existing keys are reassigned, so the dict size is stable while iterating
    for w3 in next_word_counts:
        next_word_counts[w3] /= context_total

In [None]:
# predict the next word
dict(model["to", "book"])

{'a': 0.4547677261613695,
 'an': 0.26161369193154055,
 'at': 0.02200488997555014,
 'for': 0.012224938875305633,
 'me': 0.03911980440097802,
 'some': 0.03178484107579465,
 'your': 0.00733496332518338,
 'it': 0.02200488997555014,
 'the': 0.02689486552567239,
 'reservations': 0.01466992665036676,
 'this': 0.00733496332518338,
 'tickets': 0.01466992665036676,
 'two': 0.01466992665036676,
 'flight': 0.0024449877750611264,
 'with': 0.004889975550122253,
 'reservation': 0.0024449877750611264,
 'shared': 0.0024449877750611264,
 'that': 0.00733496332518338,
 'movie': 0.009779951100244506,
 'there': 0.00733496332518338,
 'four': 0.0024449877750611264,
 'any': 0.0024449877750611264,
 'my': 0.004889975550122253,
 'through': 0.0024449877750611264,
 'us': 0.0024449877750611264,
 'and': 0.0024449877750611264,
 'anything': 0.0024449877750611264,
 'anymore': 0.0024449877750611264,
 "patty's": 0.0024449877750611264,
 'later': 0.0024449877750611264,
 'something': 0.0024449877750611264,
 'uberxl': 0.00244

In [None]:
# another example
dict(model["how", "are"])

{'you': 0.8985507246376813,
 'we': 0.014492753623188408,
 'they': 0.028985507246376815,
 'my': 0.014492753623188408,
 'ya': 0.014492753623188408,
 'the': 0.028985507246376815}

In [None]:
# another example
dict(model["good", "to"])

{'know': 0.2,
 'me': 0.56,
 'go': 0.1,
 'hear': 0.06,
 'the': 0.01,
 'you': 0.02,
 'drive': 0.01,
 'watch': 0.01,
 'eat': 0.01,
 'pass': 0.01,
 'bring': 0.01}