# CS6320: Assignment 1

#### Group - 32

#### Authors:
 - Utkarsh Farkya - uxf220000
 - Sai Nikhil Vaddhi - sxv210095

# Importing modules for data preprocessing and storage

In [1]:
import spacy
import pandas as pd
import numpy as np
import urllib
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download("wordnet")
import string
from collections import defaultdict, Counter
import math

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


#### Intializing lemmatizer and punctuations constants

In [2]:
punctuations = string.punctuation
wordnet_lemmatizer = WordNetLemmatizer()

## Retreiving training and validation data from the URLs

In [3]:
# Dataset URLs
TRAINING_SET_URL = "https://gitlab.com/utkarshfr/nlp-fall2023/-/raw/main/A1-dataset/train.txt"
VALIDATION_SET_URL = "https://gitlab.com/utkarshfr/nlp-fall2023/-/raw/main/A1-dataset/val.txt"

# Get raw training data from the URL
raw_training_data = urllib.request.urlopen(TRAINING_SET_URL).read().decode()
raw_training_data_sentences = raw_training_data.split("\n")
raw_training_data_sentences.pop()

# Get test data from the URL
raw_validation_data = urllib.request.urlopen(VALIDATION_SET_URL).read().decode()
raw_validation_data_sentences = raw_validation_data.split("\n")

raw_sample_data = "the students like the assignment"

In [44]:
print("---RAW TRAINING DATA---")
raw_training_data[:1000]

---RAW TRAINING DATA---


'I booked two rooms four months in advance at the Talbott . We were placed on the top floor next to the elevators , which are used all night long . When speaking to the front desk , I was told that they were simply honoring my request for an upper floor , which I had requested for a better view . I am looking at a brick wall , and getting no sleep . He also told me that they had received complaints before from guests on the 16th floor , and were aware of the noise problem . Why then did they place us on this floor when the hotel is not totally booked ? A request for an upper floor does not constitute placing someone on the TOP floor and using that request to justify this . If you decide to stay here , request a room on a lower floor and away from the elevator ! I spoke at length when booking my two rooms about my preferences . This is simply poor treatment of a guest whom they believed would not complain .\nI LOVED this hotel . The room was so chic and trendy , the bed was comfortable 

In [45]:
print("---RAW VALIDATION DATA---")
raw_validation_data[:1000]

---RAW VALIDATION DATA---


"I stayed for four nights while attending a conference . The hotel is in a great spot - easy walk to Michigan Ave shopping or Rush St. , but just off the busy streets . The room I had was spacious , and very well-appointed . The staff was friendly , and the fitness center , while not huge , was well-equipped and clean . I 've stayed at a number of hotels in Chicago , and this one is my favorite . Internet was n't free , but at $ 10 for 24 hours is cheaper than most business hotels , and it worked very well .\nwe love the location and proximity to everything . The staff was very friendly and courteous . They were so nice to our 2.5 year old boy . got his backpack full of goodies the moment we arrived . We got free wifi and morning drinks by signing up for select guest program . Ca n't beat that ! the only minor issue is the elevator . we have to take 2 separate elevator trips to get to our room . It got a little annoying when we were going in and out often . Otherwise , it was a great s

## Preprocessing training and validation data

In [6]:
# Even though tokenized, contains words like 've, 'd which should be removed during preprocessing

def tokenize_data(data):
    sentence = word_tokenize(data)
    # Adding <start> and <stop> token at the beginning and end of the sentence
    sentence.insert(0, "<start>")
    sentence.append("<stop>")
    return sentence

def preprocess_sentence(data):
    data = tokenize_data(data)
    # removing punctuations, words starting with punctuations, and stopwords from the corpus
    data = [token.lower() for token in data if (token =="<start>" or token == "<stop>") or  (token not in punctuations and token[0] not in punctuations and token not in stopwords.words("english"))]
    # lemmatizing the words to get their root word, in most cases does not change th meaning the sentence
    # helps in focusing on important words
    data = [wordnet_lemmatizer.lemmatize(token) for token in data]
    return data

def preprocess_corpus(data, process_sentences = False):
    if process_sentences:
        data = [preprocess_sentence(e) for e in data]
        return data
    return preprocess_sentence(data)

def flatten_sentences(data):
    return [word for sentence in data for word in sentence]

train_data_sentences = preprocess_corpus(raw_training_data_sentences, True)
validation_data_sentences = preprocess_corpus(raw_validation_data_sentences, True)
train_data = flatten_sentences(train_data_sentences)
validation_data = flatten_sentences(validation_data_sentences)

In [7]:
print("---PREPROCESSED TRAINING DATA---")
print(train_data)

---PREPROCESSED TRAINING DATA---


In [8]:
print("---PREPROCESSED VALIDATION DATA---")
print(validation_data)

---PREPROCESSED VALIDATION DATA---


## Unigram Model Implementation

In [10]:
class Unigram:
    def __init__(self, data, smoothing_technique = "k-smooth", k = 0):
        self.data = data
        self.word_count  = defaultdict(int)
        self.probabilities = defaultdict(float)
        self.total_word_count = 0
        self.smoothing_technique = smoothing_technique
        self.unique_words = set()
        self.k = k

    def count_words_from_corpus(self):
        self.word_count = defaultdict(int)
        self.total_word_count = 0
        for e in self.data:
            self.word_count[e] += 1
            self.total_word_count += 1

    def calculate_probabilties(self):
        self.probabilities = defaultdict(float, {key: value / self.total_word_count for key, value in self.word_count.items()})

    def train(self):
        self.count_words_from_corpus()
        self.calculate_probabilties()
        self.unique_words = set(self.word_count.keys())

    def smooth(self):
        # Updating probabilities of the word already tracked by the model
        if self.smoothing_technique == "k-smooth":
            for e in self.word_count:
                self.probabilities[e] = (self.word_count[e] + self.k) / (self.total_word_count + (self.k * len(self.unique_words)))

        elif self.smoothing_technique == "laplace":
            for e in self.word_count:
                self.probabilities[e] = (self.word_count[e] + 1) / (self.total_word_count + len(self.unique_words))

    def handle_zero_probability(self, token):
        if self.smoothing_technique == "k-smooth":
            return (self.word_count[token] + self.k) / (self.total_word_count + (self.k * len(self.unique_words)))

        elif self.smoothing_technique == "laplace":
            return (self.word_count[token] + 1) / (self.total_word_count + len(self.unique_words))

        elif self.smoothing_technique == "unk":
            return self.word_count["<unk>"] / self.total_word_count

    def handle_unknown(self, n = 1):
        unk_words = [key for key, val in self.word_count.items() if val <= n]
        self.data = ["<unk>" if e in unk_words else e for e in self.data]
        self.train()

    def perplexity(self, data):
        total_log_val = 0
        for word in data:
            if word in self.unique_words:
                total_log_val += math.log2(self.probabilities[word])
            else:
                total_log_val += self.handle_zero_probability(word)

        total_log_val /= len(data)
        return 2 ** -total_log_val

## Bigram Model Implementation

In [11]:
class Bigram:
    def __init__(self, data, smoothing_technique = "k-smooth", k = 0):
        self.data = data
        self.word_count = defaultdict(int)
        self.bigram_word_count = defaultdict(int)
        self.probabilties = defaultdict(float)
        self.total_word_count = 0
        self.smoothing_technique = smoothing_technique
        self.k = k
        self.unique_words = set()
        self.unique_bigram_words = set()

    def count_words_from_corpus(self):
        self.word_count.clear()
        self.total_word_count = 0
        self.bigram_word_count.clear()

        for sentence in self.data:
            for word in sentence:
                self.word_count[word] += 1
                self.total_word_count += 1

        for sentence in self.data:
            for bigram in zip(sentence, sentence[1:]):
                if bigram == ("they", "wo"):
                    print(True)
                self.bigram_word_count[bigram] += 1

    def calculate_probabilties(self):
        self.probabilties = defaultdict(float, {key: self.bigram_word_count[(key[0], key[1])] / self.word_count[key[0]] for key, value in self.bigram_word_count.items()})

    def train(self):
        self.count_words_from_corpus()
        self.calculate_probabilties()
        self.unique_words = set(self.word_count.keys())
        self.unique_bigram_words = set(self.bigram_word_count.keys())

    def handle_zero_probability(self, token):
        if self.smoothing_technique == "k-smooth":
            return (self.bigram_word_count[token] + self.k) / (self.word_count[token[0]] + (self.k * len(self.unique_words)))

        elif self.smoothing_technique == "laplace":
            return (self.bigram_word_count[token] + 1) / (self.word_count[token[0]] + len(self.unique_words))

        elif self.smoothing_technique == "unk":
            return self.bigram_word_count[("<unk>", "<unk>")] / self.word_count["<unk>"]

    def handle_unknown(self, n = 1):
        unk_words = [key for key, val in self.word_count.items() if val <= n]
        self.data = [["<unk>" if word in unk_words else word for word in sentence] for sentence in self.data]
        self.train()

    def smooth(self):
        if self.smoothing_technique == "k-smooth":
            for e in self.bigram_word_count:
                self.probabilties[e] = (self.bigram_word_count[e] + self.k) / (self.word_count[e[0]] + (self.k * len(self.unique_words)))
        elif self.smoothing_technique == "laplace":
            for e in self.bigram_word_count:
                self.probabilties[e] = (self.bigram_word_count[e] + 1) / (self.word_count[e[0]] + len(self.unique_words))

    def perplexity(self, data):
        total_log_prob = 0.0
        for sentence in data:
            for e in zip(sentence, sentence[1:]):
                if e in self.unique_bigram_words:
                    total_log_prob += math.log2(self.probabilties[e])
                else:
                    total_log_prob += self.handle_zero_probability(e)
        total_log_prob = total_log_prob / sum([len(sentence) for sentence in data])
        return 2 ** -total_log_prob

## Perplexity of Unigrams on training set
- Unsmoothed Unigram
- \<unk> word handled Unigram
- Laplace Smoothed Unigram
- Add_k_smoothed Unigram
    - k = 0.5
    - k = 0.1
    - k = 0.05
    - k = 0.01


In [12]:
# Unsmoothed Unigram on training set
unigram_train = Unigram(train_data)
unigram_train.train()
print("Perplexity of Unsmoothed Unigram model on training set: ",unigram_train.perplexity(train_data))

Perplexity of Unsmoothed Unigram model on training set:  1069.5292498671135


In [13]:
# Unknown words handled Unigram on training set
unigram_unk_train = Unigram(train_data, "unk")
unigram_unk_train.train()
unigram_unk_train.handle_unknown(n = 1)
print("Perplexity of Unknown words handled Unigram model on training set: ",unigram_unk_train.perplexity(train_data))

Perplexity of Unknown words handled Unigram model on training set:  542.7438202518388


In [14]:
# Laplace smoothed Unigram on training set
unigram_laplace_train = Unigram(train_data, "laplace")
unigram_laplace_train.train()
unigram_laplace_train.smooth()
print("Perplexity of Laplace smoothed Unigram model on training set: ",unigram_laplace_train.perplexity(train_data))

Perplexity of Laplace smoothed Unigram model on training set:  1089.4927440212678


In [15]:
# Add-k-smoothed Unigram on training set with k = 0.5
unigram_k_smooth_train = Unigram(train_data, "k-smooth", 0.5)
unigram_k_smooth_train.train()
unigram_k_smooth_train.smooth()
print("Perplexity of Add-k-smoothed Unigram model on training set with k = 0.5: ",unigram_k_smooth_train.perplexity(train_data))

Perplexity of Add-k-smoothed Unigram model on training set with k = 0.5:  1075.7920268617872


In [16]:
# Add-k-smoothed Unigram on training set with k = 0.1
unigram_k_smooth_train_2 = Unigram(train_data, "k-smooth", 0.1)
unigram_k_smooth_train_2.train()
unigram_k_smooth_train_2.smooth()
print("Perplexity of Add-k-smoothed Unigram model on training set with k = 0.1: ",unigram_k_smooth_train_2.perplexity(train_data))

Perplexity of Add-k-smoothed Unigram model on training set with k = 0.1:  1069.8447365637267


In [17]:
# Add-k-smoothed Unigram on training set with k = 0.05
unigram_k_smooth_train_3 = Unigram(train_data, "k-smooth", 0.05)
unigram_k_smooth_train_3.train()
unigram_k_smooth_train_3.smooth()
print("Perplexity of Add-k-smoothed Unigram model on training set with k = 0.05: ",unigram_k_smooth_train_3.perplexity(train_data))

Perplexity of Add-k-smoothed Unigram model on training set with k = 0.05:  1069.6107982808858


In [18]:
# Add-k-smoothed Unigram on training set with k = 0.01
unigram_k_smooth_train_4 = Unigram(train_data, "k-smooth", 0.01)
unigram_k_smooth_train_4.train()
unigram_k_smooth_train_4.smooth()
print("Perplexity of Add-k-smoothed Unigram model on training set with k = 0.01: ",unigram_k_smooth_train_4.perplexity(train_data))

Perplexity of Add-k-smoothed Unigram model on training set with k = 0.01:  1069.5326031837844


## Perplexity of Bigrams on training set
- Unsmoothed Bigram
- \<unk> word handled Bigram
- Laplace Smoothed Bigram
- Add_k_smoothed Bigram
    - k = 0.5
    - k = 0.1
    - k = 0.05
    - k = 0.01


In [19]:
# Unsmoothed Bigram on training set
bigram_train = Bigram(train_data_sentences)
bigram_train.train()
print("Perplexity of Unsmoothed Bigram model on training set: ",bigram_train.perplexity(train_data_sentences))

Perplexity of Unsmoothed Bigram model on training set:  20.699652185270793


In [20]:
# Unknown words handled Bigram on training set
bigram_unk_train = Bigram(train_data_sentences, "unk")
bigram_unk_train.train()
bigram_unk_train.handle_unknown(n = 1)
print("Perplexity of Unknown words handled Bigram model on training set: ", bigram_unk_train.perplexity(train_data_sentences))

Perplexity of Unknown words handled Bigram model on training set:  16.78079536741834


In [21]:
# Laplace smoothed Bigram on training set
bigram_laplace_train = Bigram(train_data_sentences, "laplace")
bigram_laplace_train.train()
bigram_laplace_train.smooth()
print("Perplexity of Laplace smoothed Bigram model on training set: ", bigram_laplace_train.perplexity(train_data_sentences))

Perplexity of Laplace smoothed Bigram model on training set:  1667.735448912925


In [22]:
# Add-k-smoothed Bigram on training set with k = 0.5
bigram_add_k_train = Bigram(train_data_sentences, "k-smooth", 0.5)
bigram_add_k_train.train()
bigram_add_k_train.smooth()
print("Perplexity of Add-k-smoothed Bigram model on training set with k = 0.5: ", bigram_add_k_train.perplexity(train_data_sentences))

Perplexity of Add-k-smoothed Bigram model on training set with k = 0.5:  1074.6482403803805


In [23]:
# Add-k-smoothed Bigram on training set with k = 0.1
bigram_add_k_train_2 = Bigram(train_data_sentences, "k-smooth", 0.1)
bigram_add_k_train_2.train()
bigram_add_k_train_2.smooth()
print("Perplexity of Add-k-smoothed Bigram model on training set with k = 0.1: ", bigram_add_k_train_2.perplexity(train_data_sentences))

Perplexity of Add-k-smoothed Bigram model on training set with k = 0.1:  324.37033322826727


In [24]:
# Add-k-smoothed Bigram on training set with k = 0.05
bigram_add_k_train_3 = Bigram(train_data_sentences, "k-smooth", 0.05)
bigram_add_k_train_3.train()
bigram_add_k_train_3.smooth()
print("Perplexity of Add-k-smoothed Bigram model on training set with k = 0.05: ", bigram_add_k_train_3.perplexity(train_data_sentences))

Perplexity of Add-k-smoothed Bigram model on training set with k = 0.05:  193.7987065845242


In [25]:
# Add-k-smoothed Bigram on training set with k = 0.01
bigram_add_k_train_4 = Bigram(train_data_sentences, "k-smooth", 0.01)
bigram_add_k_train_4.train()
bigram_add_k_train_4.smooth()
print("Perplexity of Add-k-smoothed Bigram model on training set with k = 0.01:", bigram_add_k_train_4.perplexity(train_data_sentences))

Perplexity of Add-k-smoothed Bigram model on training set with k = 0.01: 69.65575369691636


## Perplexity of Unigrams on validation set
- \<unk> word handled Unigram
- Laplace Smoothed Unigram
- Add_k_smoothed Unigram
    - k = 0.5
    - k = 0.1
    - k = 0.05
    - k = 0.01


In [26]:
# Unknown words handled Unigram on training set
unigram_unk_val = Unigram(train_data, "unk")
unigram_unk_val.train()
unigram_unk_val.handle_unknown(n = 1)
print("Perplexity of Unknown words handled Unigram model on validation set: ",unigram_unk_val.perplexity(validation_data))

Perplexity of Unknown words handled Unigram model on validation set:  396.5196287422726


In [27]:
# Laplace smoothed Unigram on training set
unigram_laplace_val = Unigram(train_data, "laplace")
unigram_laplace_val.train()
unigram_laplace_val.smooth()
print("Perplexity of Laplace smoothed Unigram model on validation set: ",unigram_laplace_val.perplexity(validation_data))

Perplexity of Laplace smoothed Unigram model on validation set:  601.440045631427


In [28]:
# Add-k-smoothed Unigram on training set with k = 0.5
unigram_k_smooth_val = Unigram(train_data, "k-smooth", 0.5)
unigram_k_smooth_val.train()
unigram_k_smooth_val.smooth()
print("Perplexity of Add-k-smoothed Unigram model on validation set with k = 0.5: ",unigram_k_smooth_val.perplexity(validation_data))

Perplexity of Add-k-smoothed Unigram model on validation set with k = 0.5:  588.4087698679492


In [29]:
# Add-k-smoothed Unigram on training set with k = 0.1
unigram_k_smooth_val_2 = Unigram(train_data, "k-smooth", 0.1)
unigram_k_smooth_val_2.train()
unigram_k_smooth_val_2.smooth()
print("Perplexity of Add-k-smoothed Unigram model on validation set with k = 0.1: ",unigram_k_smooth_val_2.perplexity(validation_data))

Perplexity of Add-k-smoothed Unigram model on validation set with k = 0.1:  579.3317189118854


In [30]:
# Add-k-smoothed Unigram on training set with k = 0.05
unigram_k_smooth_val_3 = Unigram(train_data, "k-smooth", 0.05)
unigram_k_smooth_val_3.train()
unigram_k_smooth_val_3.smooth()
print("Perplexity of Add-k-smoothed Unigram model on validation set with k = 0.05: ",unigram_k_smooth_val_3.perplexity(validation_data))

Perplexity of Add-k-smoothed Unigram model on validation set with k = 0.05:  578.3382924081207


In [31]:
# Add-k-smoothed Unigram on training set with k = 0.01
unigram_k_smooth_val_4 = Unigram(train_data, "k-smooth", 0.01)
unigram_k_smooth_val_4.train()
unigram_k_smooth_val_4.smooth()
print("Perplexity of Add-k-smoothed Unigram model on validation set with k = 0.01: ",unigram_k_smooth_val_4.perplexity(validation_data))

Perplexity of Add-k-smoothed Unigram model on validation set with k = 0.01:  577.5740554105223


## Perplexity of Bigrams on validation set
- \<unk> word handled Bigram
- Laplace Smoothed Bigram
- Add_k_smoothed Bigram
    - k = 0.5
    - k = 0.1
    - k = 0.05
    - k = 0.01


In [32]:
# Unknown words handled Bigram on validation set
bigram_unk_val = Bigram(train_data_sentences, "unk")
bigram_unk_val.train()
bigram_unk_val.handle_unknown(n = 1)
print("Perplexity of Unknown words handled Bigram model on validation set: ", bigram_unk_val.perplexity(validation_data_sentences))

Perplexity of Unknown words handled Bigram model on validation set:  3.51489086085866


In [33]:
# Laplace smoothed Bigram on validation set
bigram_laplace_val = Bigram(train_data_sentences, "laplace")
bigram_laplace_val.train()
bigram_laplace_val.smooth()
print("Perplexity of Unknown words handled Bigram model on validation set: ", bigram_laplace_val.perplexity(validation_data_sentences))

Perplexity of Unknown words handled Bigram model on validation set:  14.613967735058832


In [34]:
# Add-k-smoothed Bigram on validation set with k = 0.5
bigram_add_k_val = Bigram(train_data_sentences, "k-smooth", 0.5)
bigram_add_k_val.train()
bigram_add_k_val.smooth()
print("Perplexity of Add-k-smoothed Bigram model on validation set with k = 0.5: ", bigram_add_k_val.perplexity(validation_data_sentences))

Perplexity of Add-k-smoothed Bigram model on validation set with k = 0.5:  12.032963366017078


In [35]:
# Add-k-smoothed Bigram on validation set with k = 0.1
bigram_add_k_val_2 = Bigram(train_data_sentences, "k-smooth", 0.1)
bigram_add_k_val_2.train()
bigram_add_k_val_2.smooth()
print("Perplexity of Add-k-smoothed Bigram model on validation set with k = 0.1: ", bigram_add_k_val_2.perplexity(validation_data_sentences))

Perplexity of Add-k-smoothed Bigram model on validation set with k = 0.1:  7.605518902893464


In [38]:
# Add-k-smoothed Bigram on validation set with k = 0.05
bigram_add_k_train_3 = Bigram(train_data_sentences, "k-smooth", 0.05)
bigram_add_k_train_3.train()
bigram_add_k_train_3.smooth()
print("Perplexity of Add-k-smoothed Bigram model on validation set with k = 0.05: ", bigram_add_k_train_3.perplexity(validation_data_sentences))

Perplexity of Add-k-smoothed Bigram model on validation set with k = 0.05:  6.39367695892385


In [39]:
# Add-k-smoothed Bigram on validation set with k = 0.01
bigram_add_k_val_4 = Bigram(train_data_sentences, "k-smooth", 0.01)
bigram_add_k_val_4.train()
bigram_add_k_val_4.smooth()
print("Perplexity of Add-k-smoothed Bigram model on validation set with k = 0.01: ", bigram_add_k_val_4.perplexity(validation_data_sentences))

Perplexity of Add-k-smoothed Bigram model on validation set with k = 0.01:  4.731323620901083
