In [5]:
import spacy
from datasets import load_dataset
import math

In [6]:
# Shared spaCy pipeline; the functions below call it to split text into tokens
# and read each token's `is_alpha` flag and `lemma_`.
nlp = spacy.load("en_core_web_sm")
# Training split of the raw WikiText-2 dataset; rows are read as text[i]['text'].
text = load_dataset('wikitext', 'wikitext-2-raw-v1', split="train")

Reusing dataset wikitext (C:\Users\nil34\.cache\huggingface\datasets\wikitext\wikitext-2-raw-v1\1.0.0\a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)


In [7]:
def train_unigram_model():
    """Build a unigram log-probability table from the global ``text`` dataset.

    Every non-empty row of ``text`` is run through the global ``nlp`` pipeline;
    only alphabetic tokens are kept and they are counted by lemma.

    Returns:
        dict mapping each lemma to ``math.log(count / total_counted_tokens)``
        (natural logarithm). Empty when no alphabetic token was seen.
    """
    lemma_counts = {}

    for row_idx in range(len(text)):
        line = text[row_idx]['text']
        if line == "":
            # Blank rows carry no tokens; skip the parser entirely.
            continue
        for tok in nlp(line):
            if not tok.is_alpha:
                continue
            word = tok.lemma_
            lemma_counts[word] = lemma_counts.get(word, 0) + 1

    # Every counted token contributed exactly one increment above.
    n_tokens = sum(lemma_counts.values())

    return {word: math.log(freq / n_tokens) for word, freq in lemma_counts.items()}

In [8]:
# Lemma -> natural-log probability table built from the loaded dataset.
unigram_model = train_unigram_model()

In [10]:
def train_bigram_model():
    """Build a bigram log-probability table from the global ``text`` dataset.

    Every non-empty row of ``text`` is run through the global ``nlp`` pipeline.
    The lemmas of its alphabetic tokens, prefixed with the sentinel ``"START"``,
    are paired with their successors and counted.

    Returns:
        dict mapping ``(first_lemma, second_lemma)`` to
        ``math.log(count(first, second) / count(first as the first element of
        any bigram))`` (natural logarithm).
    """
    bigram_counts = {}
    first_word_counts = {}

    # The row index and the bigram loop use distinct names; the original reused
    # `i` for both, shadowing the outer index inside the inner loop.
    for row_idx in range(len(text)):
        line = text[row_idx]['text']
        if line == "":
            continue

        lemmas = ["START"]
        lemmas.extend(token.lemma_ for token in nlp(line) if token.is_alpha)

        # zip pairs each lemma with its successor: (l0, l1), (l1, l2), ...
        for first_word, second_word in zip(lemmas, lemmas[1:]):
            bigram_key = (first_word, second_word)
            bigram_counts[bigram_key] = bigram_counts.get(bigram_key, 0) + 1
            first_word_counts[first_word] = first_word_counts.get(first_word, 0) + 1

    return {
        bigram_key: math.log(count / first_word_counts[bigram_key[0]])
        for bigram_key, count in bigram_counts.items()
    }

In [11]:
# (first lemma, second lemma) -> natural-log conditional probability table
# built from the loaded dataset.
bigram_model = train_bigram_model()

In [105]:
def get_the_next_word_for_the_sentence(bigram_model,last_word_of_sentence):
    """Pick the highest-scoring continuation of ``last_word_of_sentence``.

    Args:
        bigram_model: mapping whose keys are indexable pairs
            ``(first, second)`` and whose values are comparable scores.
        last_word_of_sentence: value compared against the first element of
            every key.

    Returns:
        The second element of the best-scoring matching key. On ties the key
        seen first wins. Returns ``""`` when no key matches or no matching
        score exceeds negative infinity.
    """
    best_word = ""
    best_score = float("-inf")

    for pair, score in bigram_model.items():
        if pair[0] != last_word_of_sentence:
            continue
        # Strict comparison keeps the earliest entry among equal scores.
        if best_score < score:
            best_word, best_score = pair[1], score

    return best_word

The next word of the sentence "I have a house in" is: the


In [98]:
def compute_probability_of_a_sentence_using_bigram(sentence: str, bigram_model: dict):
    """Score ``sentence`` by summing the bigram model's values.

    The sentence is parsed with the global ``nlp`` pipeline. The lemmas of its
    alphabetic tokens, prefixed with the ``"START"`` sentinel, are paired with
    their successors and each pair is looked up in ``bigram_model``.

    Returns:
        The sum of the looked-up values (log-probabilities when the model was
        built by ``train_bigram_model``), ``float("-inf")`` as soon as a pair
        is missing from the model, or ``0`` when there are no alphabetic tokens.
    """
    words = ["START"] + [tok.lemma_ for tok in nlp(sentence) if tok.is_alpha]

    log_prob = 0
    for pair in zip(words, words[1:]):
        if pair not in bigram_model:
            # An unseen bigram has zero probability under this model.
            return float("-inf")
        log_prob += bigram_model[pair]

    return log_prob

In [117]:
def calculate_perplexity_using_bigram(corpus,bigram_model):
    """Compute the per-token perplexity of ``corpus`` under the bigram model.

    Args:
        corpus: iterable of sentence strings.
        bigram_model: model passed through to
            ``compute_probability_of_a_sentence_using_bigram``.

    Returns:
        ``exp(-(sum of sentence scores) / (number of alphabetic tokens))``.
        ``float("inf")`` when any sentence scores ``-inf`` or when the
        exponent is too large to represent as a float.

    Raises:
        ZeroDivisionError: if the corpus contains no alphabetic tokens, since
            the per-token average is undefined.
    """
    log_prob_sum = 0
    token_count = 0
    for line in corpus:
        log_prob_sum += compute_probability_of_a_sentence_using_bigram(line,bigram_model)
        # Only the count is needed, so the Token objects are not retained.
        token_count += sum(1 for token in nlp(line) if token.is_alpha)

    if token_count == 0:
        # Same exception type as the plain division would raise, with context.
        raise ZeroDivisionError(
            "corpus contains no alphabetic tokens; perplexity is undefined"
        )

    average_log_prob = log_prob_sum / token_count

    try:
        return math.exp(-average_log_prob)
    except OverflowError:
        # math.exp raises for large finite arguments; treat it like the -inf
        # case, which already evaluates to inf.
        return float("inf")

In [19]:
def compute_probability_of_a_sentence_using_unigram(sentence: str, unigram_model: dict):
    """Score ``sentence`` by summing the unigram model's values.

    The sentence is parsed with the global ``nlp`` pipeline and the lemma of
    every alphabetic token is looked up in ``unigram_model``.

    Returns:
        The sum of the looked-up values (log-probabilities when the model was
        built by ``train_unigram_model``), ``float("-inf")`` as soon as a lemma
        is missing from the model, or ``0`` when there are no alphabetic tokens.
    """
    words = [tok.lemma_ for tok in nlp(sentence) if tok.is_alpha]

    log_prob = 0
    for word in words:
        if word not in unigram_model:
            # An unseen lemma has zero probability under this model.
            return float("-inf")
        log_prob += unigram_model[word]

    return log_prob

In [85]:
def calculate_interpolated_probability(sentence,bigram_model,unigram_model,weight_bigram,weight_unigram):
    """Score ``sentence`` with a linear mix of the bigram and unigram models.

    The sentence is parsed with the global ``nlp`` pipeline. The lemmas of its
    alphabetic tokens are prefixed with the ``"START"`` sentinel. For each
    lemma the mixed probability is
    ``weight_bigram * exp(bigram_model[(prev, lemma)]) +
    weight_unigram * exp(unigram_model[lemma])``, where a missing entry
    contributes 0.

    Args:
        sentence: text to score.
        bigram_model: mapping ``(prev_lemma, lemma) -> log-probability``.
        unigram_model: mapping ``lemma -> log-probability``.
        weight_bigram: multiplier for the bigram probability.
        weight_unigram: multiplier for the unigram probability.

    Returns:
        The sum of the natural logs of the mixed probabilities, ``0`` when the
        sentence has no alphabetic tokens, or ``float("-inf")`` when some lemma
        gets a mixed probability of exactly 0 (unknown to both models).
    """
    lemmas = ["START"]
    lemmas.extend(token.lemma_ for token in nlp(sentence) if token.is_alpha)

    result = 0

    for previous, current in zip(lemmas, lemmas[1:]):
        key_in_bigram = (previous, current)

        mixed = 0
        if current in unigram_model:
            mixed += math.exp(unigram_model[current]) * weight_unigram
        if key_in_bigram in bigram_model:
            mixed += math.exp(bigram_model[key_in_bigram]) * weight_bigram

        if mixed == 0:
            # math.log(0) raises ValueError; report zero probability as -inf,
            # as the plain unigram and bigram scorers do for unseen events.
            return float("-inf")

        result += math.log(mixed)

    return result

In [90]:
def calculate_perplexity_using_interpolated_model(corpus,bigram_model,unigram_model,weight_bigram,weight_unigram):
    """Compute the per-token perplexity of ``corpus`` under the interpolated model.

    Args:
        corpus: iterable of sentence strings.
        bigram_model, unigram_model, weight_bigram, weight_unigram: passed
            through unchanged to ``calculate_interpolated_probability``.

    Returns:
        ``exp(-(sum of sentence scores) / (number of alphabetic tokens))``.
        ``float("inf")`` when any sentence scores ``-inf`` or when the
        exponent is too large to represent as a float.

    Raises:
        ZeroDivisionError: if the corpus contains no alphabetic tokens, since
            the per-token average is undefined.
    """
    log_prob_sum = 0
    token_count = 0
    for line in corpus:
        log_prob_sum += calculate_interpolated_probability(line,bigram_model,unigram_model,weight_bigram,weight_unigram)
        # Only the count is needed, so the Token objects are not retained.
        token_count += sum(1 for token in nlp(line) if token.is_alpha)

    if token_count == 0:
        # Same exception type as the plain division would raise, with context.
        raise ZeroDivisionError(
            "corpus contains no alphabetic tokens; perplexity is undefined"
        )

    average_log_prob = log_prob_sum / token_count

    try:
        return math.exp(-average_log_prob)
    except OverflowError:
        # math.exp raises for large finite arguments; treat it like the -inf
        # case, which already evaluates to inf.
        return float("inf")

269.81031430478953

In [122]:
# Demo cell: exercises the bigram model and the interpolated model on two
# sample sentences and prints the scores and perplexities.
first_sentence = "Brad Pitt was born in Oklahoma"
second_sentence = "The actor was born in USA"
# Built from the variables above so the corpus cannot drift from the sentences.
corpus = [first_sentence, second_sentence]

# The next-word lookup is keyed on the last word of the prompt.
prompt = "I have a house in"
last_word_of_prompt = prompt.split()[-1]

# Interpolation weights handed to the mixed bigram/unigram model.
weight_bigram = 2/3
weight_unigram = 1/3

print(f'The prediction of the next word for the sentence "{prompt}" is: {get_the_next_word_for_the_sentence(bigram_model, last_word_of_prompt)}')
print(f"The probability for the sentence {first_sentence} is: {compute_probability_of_a_sentence_using_bigram(first_sentence, bigram_model)}")
print(f"The probability for the sentence {second_sentence} is: {compute_probability_of_a_sentence_using_bigram(second_sentence, bigram_model)}")
print(f"The perplexity of both the sentences is: {calculate_perplexity_using_bigram(corpus, bigram_model)}")
print(f"The probability of the first sentence using the new model is: {calculate_interpolated_probability(first_sentence, bigram_model, unigram_model, weight_bigram, weight_unigram)}")
print(f"The probability of the second sentence using the new model is: {calculate_interpolated_probability(second_sentence, bigram_model, unigram_model, weight_bigram, weight_unigram)}")
print(f"The perplexity of both the sentences with the new model is: {calculate_perplexity_using_interpolated_model(corpus, bigram_model, unigram_model, weight_bigram, weight_unigram)}")

The next word of the sentence "I have a house in" is: the
The probability for the sentence Brad Pitt was born in Oklahoma is: -inf
The probability for the sentence The actor was born in USA is: -29.686567347483418
The perplexity of both the sentences is: inf
The probability of the first sentence using the new model is: -36.176302610738425
The probability of the second sentence using the new model is: -30.996327459140225
The perplexity of both the sentences with the new model is: 269.81031430478953
