In [16]:
import math
import random
import numpy as np
import pandas as pd
import nltk
#nltk.download('punkt')
#nltk.data.path.append('.')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rvi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [17]:
with open("./Data/en_US.twitter.txt", "r",encoding="utf8") as f:
    tw_data = f.read()
print("Data type:", type(tw_data))
print("Number of letters:", len(tw_data))
print("First 300 letters of the data")
print("-------")
display(tw_data[0:300])
print("-------")

print("Last 300 letters of the data")
print("-------")
display(tw_data[-300:])
print("-------")

Data type: <class 'str'>
Number of letters: 3335477
First 300 letters of the data
-------


"How are you? Btw thanks for the RT. You gonna be in DC anytime soon? Love to see you. Been way, way too long.\nWhen you meet someone special... you'll know. Your heart will beat more rapidly and you'll smile for no reason.\nthey've decided its more fun if I don't.\nSo Tired D; Played Lazer Tag & Ran A "

-------
Last 300 letters of the data
-------


"ust had one a few weeks back....hopefully we will be back soon! wish you the best yo\nColombia is with an 'o'...“: We now ship to 4 countries in South America (fist pump). Please welcome Columbia to the Stunner Family”\n#GutsiestMovesYouCanMake Giving a cat a bath.\nCoffee after 5 was a TERRIBLE idea.\n"

-------


### Preprocessing Data

In [18]:
def tokenize_data(data: str):
    '''The function removes the /n from the text file and tokenixes the sentences
    Returns : list of lists (eg: [['I', 'Love','cats'], ['I', 'hate', 'dogs']])
    '''
    sentences=data.split('/n')
    sentences = [s.strip() for s in sentences]
    sentences = [s for s in sentences if len(s) > 0]  ## ['I Love cats',I hate'dogs']]

    tokenized_sentences = []
    for sentence in sentences: 
        sentence = sentence.lower()
        tokenized = nltk.word_tokenize(sentence)
        tokenized_sentences.append(tokenized)
    
    return tokenized_sentences

In [19]:
def count_words(tokenized_sentences: list):
    '''
    tokenized_sentences: a list of lists with token of each sentences
    returns: dict for count of each word
    '''
    word_counts={}
    for sentence in tokenized_sentences:
        for word in sentence:
            word_counts[word]=word_counts.get(word, 0)+1
    return word_counts

## Test the above function
test_data=[['I', 'Love','cats'], ['I', 'hate', 'dogs'],['I', 'Love', 'this', 'song']]
count_words(test_data)

{'I': 3, 'Love': 2, 'cats': 1, 'hate': 1, 'dogs': 1, 'this': 1, 'song': 1}

In [20]:
def create_vocab(tokenized_sentences, min_freq=2):
    '''
    tokenized_sentences: a list of lists with token of each sentences
    min_freq: (default=2) In vocabulary oly those words are to be kept whose freq of occurance in equal of greater than min_freq

    returns: A list , Our vocabulary
    '''
    vocab=[]
    word_counts=count_words(tokenized_sentences)
    for word, count in word_counts.items():
        if count>=min_freq:
            vocab.append(word)
    return vocab

## Test the above function
test_data=[['i', 'Love','cats'], ['cats', 'hate', 'dogs'],['i', 'Love', 'this', 'song'],
           ['this','song', 'is','my','sould' ]]
create_vocab(test_data)

['i', 'Love', 'cats', 'this', 'song']

In [21]:
def remove_out_of_vocab_words(tokenized_sentences: list, vocabulary:list,unknown_token='<unk>'):
    '''
    it removes the out of vocabulary words from the tokenized sentences

    Returns: A list of list i.e tokenized corpus from which out of vocabulary words sentences are removed
    '''
    vocabulary=set(vocabulary) ### for speed
    replaced_tokenized_sentences=[]

    for sentence in tokenized_sentences:
        replace_sentences=[]

        for token in sentence:
            if token in vocabulary:
                replace_sentences.append(token)
            else:
                replace_sentences.append(unknown_token)
        replaced_tokenized_sentences.append(replace_sentences)

    return replaced_tokenized_sentences

## Testing Above Function
test_token_scentence=[['i', 'Love','cats'], ['cats', 'hate', 'dogs'],['i', 'Love', 'this', 'song'],
                      ['this','song', 'is','my','sould' ]]
vocab=create_vocab(test_token_scentence)
print(vocab)
remove_out_of_vocab_words(test_token_scentence,vocab)


['i', 'Love', 'cats', 'this', 'song']


[['i', 'Love', 'cats'],
 ['cats', '<unk>', '<unk>'],
 ['i', 'Love', 'this', 'song'],
 ['this', 'song', '<unk>', '<unk>', '<unk>']]

### Creating N-Gram Model

In [22]:
def create_n_counts(data: list,n: int, start_token='<s>', end_token='<e>'):
    '''
    Following function create a dictionnary for n-gram

    data: Tonkenized sentences from which unknown words are removed
    n: N in N-gram
    Start_token:
    end_token:

    Returns : A dictionary of n-gram counts
    '''
    n_grams_count={}
    for sentence in data:
        sentence=[start_token]*n+sentence+[end_token]
        sentence=tuple(sentence)

        for i in range(0, len(sentence)-n+1):
            new_gram=sentence[i:i+n]
            n_grams_count[new_gram]=n_grams_count.get(new_gram,0)+1
    return n_grams_count

## Test function
test_token=[['they','all','hate','cats'], ['cats', 'hate', 'dogs'],['i', 'Love', 'this', 'song'],
             ['cats','hate','everything' ]]

create_n_counts(test_token, 2)


{('<s>', '<s>'): 4,
 ('<s>', 'they'): 1,
 ('they', 'all'): 1,
 ('all', 'hate'): 1,
 ('hate', 'cats'): 1,
 ('cats', '<e>'): 1,
 ('<s>', 'cats'): 2,
 ('cats', 'hate'): 2,
 ('hate', 'dogs'): 1,
 ('dogs', '<e>'): 1,
 ('<s>', 'i'): 1,
 ('i', 'Love'): 1,
 ('Love', 'this'): 1,
 ('this', 'song'): 1,
 ('song', '<e>'): 1,
 ('hate', 'everything'): 1,
 ('everything', '<e>'): 1}

In [23]:
def estimate_probability(word:str, previous_n_gram:list, n_gram_counts:dict, n_plus1_counts:dict, vocab_size:int, smoothing_parameter=1):
    '''
    Given a word and a previous n-gram, it computes the probability that given word comes after n-gram
    Returns: int
    '''
    n_plus1_gram=previous_n_gram+[word]
    n_plus1_gram=tuple(n_plus1_gram)
    previous_n_gram=tuple(previous_n_gram)
    
    previous_n_gram_count=n_gram_counts.get(previous_n_gram,0)
    denominator=previous_n_gram_count+(vocab_size*smoothing_parameter)

    current_n_gram_count=n_plus1_counts.get(n_plus1_gram,0)
    numerator=current_n_gram_count+smoothing_parameter
    prob=numerator/denominator
    return prob

### TEST ABOVE FUNCTION
test_token=[['they','all','hate','cats'], ['cats', 'hate', 'dogs'],['i', 'Love', 'this', 'song'],
             ['cats','hate','everything' ]]
unigram=create_n_counts(test_token, 1)
bigram=create_n_counts(test_token, 2)
vocab_size=len(create_vocab(test_token, min_freq=2))
estimate_probability('hate',['hate'],unigram, bigram,vocab_size) ## OUTPUT SHOULD BE 0.2 if k=1

0.2

In [24]:
def estimate_all_probabilites(previous_n_gram: list,n_gram_counts:dict, n_plus1_gram_counts:dict, vocabulary:list,start_token='<s>', end_token='<e>',smoothing_parameter=1):
    '''
    The function estimate_probability estimates the probability for a given word and a previous_n_gram, this fuction calculates the 
    probability of all words in our vocabulary for a given previous_n_gram
    '''
    vocabulary=vocabulary+[start_token]+[end_token]
    vocabulary=tuple(vocabulary)
    vocab_size=len(vocabulary)
    probability_dict={}

    for word in vocabulary:
        probability_dict[word]=estimate_probability(word, previous_n_gram,n_gram_counts,n_plus1_gram_counts,vocab_size,smoothing_parameter=smoothing_parameter)
    
    return probability_dict

## Test above function
test_token=[['they','all','hate','cats'], ['cats', 'hate', 'dogs'],['i', 'Love', 'this', 'song'],
             ['cats','hate','everything' ]]
unigram=create_n_counts(test_token, 1)
bigram=create_n_counts(test_token, 2)
vocab=create_vocab(test_token, min_freq=2)
estimate_all_probabilites(['hate'],unigram, bigram,vocab)

{'hate': 0.14285714285714285,
 'cats': 0.2857142857142857,
 '<s>': 0.14285714285714285,
 '<e>': 0.14285714285714285}

### Suggest a word

In [25]:
def suggest_a_word(previous_token,n_gram_counts, n_plus1_gram_counts, vocabulary, end_token='<e>', start_token='<s>',smoothing_parameter=1,start_with=None):
    '''
    starts_with is an parameter that you could specify that your next word starts with said letter
    Returns : a tuple (word, probability)
    '''
    n=len(list(n_gram_counts.keys())[0]) ## previous token could be larger than n of N_gram, thus finding the value of N
    previous_token=previous_token[-n:]

    probabilities=estimate_all_probabilites(previous_token, n_gram_counts, n_plus1_gram_counts,vocabulary,start_token=start_token,
                                            end_token=end_token,smoothing_parameter=smoothing_parameter)
    suggested_word=None
    max_prob=0

    for word, prob in probabilities.items():
        if start_with!=None:
            if not word.startswith(start_with):
                continue
        if prob>max_prob:
            max_prob=prob
            suggested_word=word
    return suggested_word,max_prob

In [26]:
test_token=[['they','all','hate','cats'], ['cats', 'hate', 'dogs'],['i', 'Love', 'this', 'song'],
             ['cats','hate','everything' ]]
unigram=create_n_counts(test_token, 1)
bigram=create_n_counts(test_token, 2)
vocab=create_vocab(test_token, min_freq=2)
suggest_a_word(['hate'],unigram, bigram,vocab)

('cats', 0.2857142857142857)

### Now create n_grams from twitter data set and output words

In [32]:
## preprocessing twitter data
tokenized_tw_data=tokenize_data(tw_data)
tw_vocab=create_vocab(tokenized_tw_data, min_freq=2)
tw_processed_data=remove_out_of_vocab_words(tokenized_tw_data,tw_vocab)
#######################################

## Creating N-Grams from twitter data ##
tw_unigram=create_n_counts(tw_processed_data,1)
tw_bigram=create_n_counts(tw_processed_data,2)
tw_trigram=create_n_counts(tw_processed_data,3)
tw_quadgram=create_n_counts(tw_processed_data,4)
#########################################
previous_token=['how','to','make']

unigram_suggestion=suggest_a_word(previous_token,tw_unigram, tw_bigram, tw_vocab)
bigram_suggestion=suggest_a_word(previous_token,tw_bigram, tw_trigram, tw_vocab)
trigram_suggestion=suggest_a_word(previous_token, tw_trigram, tw_quadgram,tw_vocab)

print(f'Next word using unigram: {unigram_suggestion[0]} , with probability of {unigram_suggestion[1]}')
print(f'Next word using bigram: {bigram_suggestion[0]} , with probability of {bigram_suggestion[1]} ',)
print(f'Next word using trigram: {trigram_suggestion[0]} , with probability of {trigram_suggestion[1]} ')


Next word using unigram: it , with probability of 0.008687853819853148
Next word using bigram: a , with probability of 0.002569043031470777 
Next word using trigram: a , with probability of 0.000354588972282962 


### Experimenting New trings

test_token=[['dogs', 'love', 'this','song'],['i', 'Love', 'this', 'song','and'],['that','song','makes', 'my','day'],
             ['cats','hate','everything' ],['this','song', 'is', 'crazy'],['this','song','makes','me', 'dance'],['this','song','makes','me','emotional']]

unigram=create_n_counts(test_token, 1)
bigram=create_n_counts(test_token, 2)
trigram=create_n_counts(test_token, 3)
quadgram=create_n_counts(test_token, 4)
fivgram=create_n_counts(test_token, 5)
hexagram=create_n_counts(test_token, 6)
mapping={1:unigram,
2:bigram,
3:trigram,
4:quadgram,
5:fivgram,
6:hexagram
}

vocab=create_vocab(test_token, min_freq=2)
word=None
word_list=[]
prob_list=[]
previous_token=['cold']
count=0
while word!='<e>' and count<10:
    if len(previous_token)==6:
        previous_token=previous_token[-5:]
    initial_gram=mapping[len(previous_token)]
    nex_gram=mapping[len(previous_token)+1]
    word,prob=suggest_a_word(previous_token, bigram,trigram,vocab)
    word_list.append(word)
    prob_list.append(prob)
    if previous_token[-1]==word:
        break
    previous_token+=[word]
    print(previous_token)
    count+=1
    #print(word)

    
    