In [None]:
# LOADING

In [None]:
# Mount Google Drive; the train and validation files are read from /content/drive below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#reading the dataset
def read_corpus(file_path, encoding='latin-1'):
    """Read a text file and return its lines.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. Passing it explicitly
            keeps the result independent of the platform's default encoding.
            latin-1 maps every byte to a character, so decoding never fails,
            but a file that is really UTF-8 would have its non-ASCII
            characters mis-decoded -- TODO confirm the dataset's encoding.

    Returns:
        list[str]: One string per line, with the trailing newline kept.
    """
    # NOTE(review): read_corpus is defined again in the following cell; one of
    # the two cells should be removed so there is a single definition.
    with open(file_path, 'r', encoding=encoding) as file:
        # readlines() returns a list where each line is an element in the list.
        return file.readlines()

# Load both splits as lists of raw lines (readlines() keeps the newline characters).
# NOTE(review): the next cell repeats these two loads after redefining read_corpus.
train_data = read_corpus("/content/drive/MyDrive/A1_DATASET/train.txt")
validation_data = read_corpus("/content/drive/MyDrive/A1_DATASET/val.txt")


In [None]:
#reading the dataset
def read_corpus(file_path, encoding='latin-1'):
    """Read a text file and return its lines.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to latin-1,
            which maps every byte to a character and therefore never raises a
            decode error; a file that is really UTF-8 would have its non-ASCII
            characters mis-decoded -- TODO confirm the dataset's encoding.

    Returns:
        list[str]: One string per line, with the trailing newline kept.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        # readlines() returns a list where each line is an element in the list.
        return file.readlines()

# Load both splits as lists of raw lines (readlines() keeps the newline characters).
train_data = read_corpus("/content/drive/MyDrive/A1_DATASET/train.txt")
validation_data = read_corpus("/content/drive/MyDrive/A1_DATASET/val.txt")

In [None]:
# Print the first 5 records of each corpus as a sanity check that the dataset was read correctly.
print(train_data[:5])
print(validation_data[:5])

['I booked two rooms four months in advance at the Talbott . We were placed on the top floor next to the elevators , which are used all night long . When speaking to the front desk , I was told that they were simply honoring my request for an upper floor , which I had requested for a better view . I am looking at a brick wall , and getting no sleep . He also told me that they had received complaints before from guests on the 16th floor , and were aware of the noise problem . Why then did they place us on this floor when the hotel is not totally booked ? A request for an upper floor does not constitute placing someone on the TOP floor and using that request to justify this . If you decide to stay here , request a room on a lower floor and away from the elevator ! I spoke at length when booking my two rooms about my preferences . This is simply poor treatment of a guest whom they believed would not complain .\n', "I LOVED this hotel . The room was so chic and trendy , the bed was comfort

In [None]:
#these are all libraries necesary for preprocessing: stop word removal and lemmatization.
import re
import string
!pip install nltk
import nltk



# PRE-PROCESSING

In [None]:
#function for removing stop words which will be evoked in "preprocess_and_tokenize" function
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus and keep the English entries in a set,
# which remove_stopwords uses for membership tests.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def remove_stopwords(tokens):
    """Return the tokens that are not in the module-level ``stop_words`` set.

    The relative order of the surviving tokens is preserved.
    """
    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        kept.append(token)
    return kept


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
#function for lemmatization which will be evoked in "preprocess_and_tokenize" function
from nltk.stem import WordNetLemmatizer

# Shared lemmatizer instance used by lemmatize(), followed by the WordNet corpus download.
lemmatizer = WordNetLemmatizer()
nltk.download('wordnet')

def lemmatize(tokens):
    """Return the lemma of every token, in order.

    Each token is passed to the module-level ``lemmatizer`` with no
    part-of-speech argument, so the lemmatizer's default setting applies.
    """
    return list(map(lemmatizer.lemmatize, tokens))


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
#includes all pre processing steps and also calls function for stop words removal and lemmatization
def preprocess_and_tokenize(reviews):
    """Turn raw review strings into lists of cleaned tokens.

    For every review, in this order: lowercase the text, delete every
    character in ``string.punctuation``, split on whitespace, drop stop
    words via ``remove_stopwords`` and lemmatize via ``lemmatize``.

    Args:
        reviews: Iterable of review strings.

    Returns:
        list[list[str]]: One token list per review, in input order.
    """
    # Translation table that deletes punctuation characters (nothing is substituted).
    strip_punctuation = str.maketrans("", "", string.punctuation)

    def clean(review):
        words = review.lower().translate(strip_punctuation).split()
        return lemmatize(remove_stopwords(words))

    return [clean(review) for review in reviews]

In [None]:
# Preprocess the training corpus and print the first 5 token lists to check the pipeline.
tokenized_reviews_train = preprocess_and_tokenize(train_data)

print(tokenized_reviews_train[:5])


[['booked', 'two', 'room', 'four', 'month', 'advance', 'talbott', 'placed', 'top', 'floor', 'next', 'elevator', 'used', 'night', 'long', 'speaking', 'front', 'desk', 'told', 'simply', 'honoring', 'request', 'upper', 'floor', 'requested', 'better', 'view', 'looking', 'brick', 'wall', 'getting', 'sleep', 'also', 'told', 'received', 'complaint', 'guest', '16th', 'floor', 'aware', 'noise', 'problem', 'place', 'u', 'floor', 'hotel', 'totally', 'booked', 'request', 'upper', 'floor', 'constitute', 'placing', 'someone', 'top', 'floor', 'using', 'request', 'justify', 'decide', 'stay', 'request', 'room', 'lower', 'floor', 'away', 'elevator', 'spoke', 'length', 'booking', 'two', 'room', 'preference', 'simply', 'poor', 'treatment', 'guest', 'believed', 'would', 'complain'], ['loved', 'hotel', 'room', 'chic', 'trendy', 'bed', 'comfortable', 'great', 'slipper', 'robe', 'love', 'keihl', 'bath', 'product', 'bathroom', 'went', 'birthday', 'weekend', 'card', 'plate', 'pastry', 'waiting', 'room', 'got', 

In [None]:
# Report corpus sizes. Each element is one line of its input file, and the second
# figure is for the validation split even though the label says "Testing".
print("Training data has ",len(train_data)," sentences")
print("Testing data has ",len(validation_data)," sentences")

Training data has  512  sentences
Testing data has  70  sentences


# Unigram and Bigram Probabilities

## Unigrams

In [None]:
 #this function is for finding unigrams
from collections import defaultdict

def unigram_counts(tokenized_reviews):
    """Count how often each token occurs across all reviews.

    Args:
        tokenized_reviews: Iterable of token lists.

    Returns:
        tuple: ``(counts, total)`` where ``counts`` is a ``defaultdict(int)``
        mapping token -> number of occurrences and ``total`` is the number of
        tokens seen overall.
    """
    counts = defaultdict(int)
    for token in (tok for review in tokenized_reviews for tok in review):
        counts[token] += 1
    # Every token incremented exactly one entry, so the counts add up to the total.
    return counts, sum(counts.values())

# Unigram counts and total token count of the preprocessed training corpus.
unigram_count, total_tokens = unigram_counts(tokenized_reviews_train)


In [None]:
#using above function, calculated the probability by dividing by total number of tokens
def unigram_probabilities(unigram_count, total_tokens):
    """Convert unigram counts into relative frequencies.

    Args:
        unigram_count: Mapping word -> count.
        total_tokens: Number every count is divided by.

    Returns:
        dict: word -> ``count / total_tokens``.
    """
    probs = {}
    for word, count in unigram_count.items():
        probs[word] = count / total_tokens
    return probs


# Unsmoothed (relative-frequency) unigram probabilities of the training corpus.
unigram_probs = unigram_probabilities(unigram_count, total_tokens)


In [None]:
# Print the full unigram count and probability dictionaries to eyeball the values.
# NOTE(review): this prints every vocabulary entry; a small sample would be easier to read.
print(unigram_count)
print(unigram_probs)



In [None]:
import pandas as pd

# Convert unigram_count to a DataFrame with one row per distinct word
unigram_df = pd.DataFrame(unigram_count.items(), columns=['Unigram', 'Count'])

# Sort the DataFrame by count in descending order
sorted_unigram_df = unigram_df.sort_values(by='Count', ascending=False)

print("Sorted Unigram DataFrame:")
# The sum of all counts is the number of tokens left after preprocessing.
print("Total words in the corpus after cleaning: ", sorted_unigram_df['Count'].sum() )
# Last expression of the cell, so the notebook renders the sorted frame.
sorted_unigram_df

Sorted Unigram DataFrame:
Total words in the corpus after cleaning:  39589


Unnamed: 0,Unigram,Count
41,hotel,1143
2,room,1130
49,stay,416
66,great,353
111,nt,336
...,...,...
3377,tap,1
3378,soaked,1
3379,reflective,1
3381,waived,1


In [None]:
# Show the ten most frequent unigrams.
sorted_unigram_df.head(10)

Unnamed: 0,Unigram,Count
41,hotel,1143
2,room,1130
49,stay,416
66,great,353
111,nt,336
90,chicago,328
59,would,327
13,night,279
188,staff,268
97,service,264


In [None]:
# Running total of counts, from the most frequent word downwards
# (sorted_unigram_df is already sorted by descending count).
sorted_unigram_df['Cumulative Count'] = sorted_unigram_df['Count'].cumsum()
# Total number of word occurrences
total_count = sorted_unigram_df['Count'].sum()

# Cumulative share of all occurrences, in percent
sorted_unigram_df['Cumulative Percentage'] = (sorted_unigram_df['Cumulative Count'] / total_count) * 100

# Keep the most frequent words whose cumulative share stays at or below 80%
threshold_80 = sorted_unigram_df[sorted_unigram_df['Cumulative Percentage'] <= 80]
num_words_80 = len(threshold_80)

print(f"Number of words contributing to 80% of total occurrences: {num_words_80} out of {sorted_unigram_df.shape[0]}")

# The 80-20 rule roughly holds: about 20% of the distinct words account for 80% of all occurrences

Number of words contributing to 80% of total occurrences: 1059 out of 5537


## Bigrams

In [None]:
#did similar function for bigrams
def bigram_counts(tokenized_reviews):
    """Count adjacent token pairs within each review.

    Pairs never span two reviews; a review with fewer than two tokens
    contributes nothing.

    Args:
        tokenized_reviews: Iterable of token lists.

    Returns:
        defaultdict(int): ``(first, second)`` tuple -> number of occurrences.
    """
    counts = defaultdict(int)
    for tokens in tokenized_reviews:
        # Pair every token with its right-hand neighbour.
        for pair in zip(tokens, tokens[1:]):
            counts[pair] += 1
    return counts

# Bigram counts of the preprocessed training corpus (used by the cells below).
bigram_count = bigram_counts(tokenized_reviews_train)

In [None]:
#probability of bigrams was also found the same way as done for unigrams
def bigram_probabilities(bigram_count, unigram_count):
    """Estimate bigram probabilities as count(bigram) / count(first word).

    Args:
        bigram_count: Mapping ``(first, second)`` -> count.
        unigram_count: Mapping word -> count; looked up with the first word
            of every bigram.

    Returns:
        dict: bigram -> ``count / unigram_count[first]``.
    """
    probs = {}
    for bigram, count in bigram_count.items():
        history = bigram[0]
        probs[bigram] = count / unigram_count[history]
    return probs

# Unsmoothed bigram probabilities of the training corpus.
bigram_probs = bigram_probabilities(bigram_count, unigram_count)

In [None]:
# Print the full bigram count and probability dictionaries for inspection.
# NOTE(review): this prints every distinct bigram; a small sample would be easier to read.
print(bigram_count)
print(bigram_probs)



### Bigram Data Analysis

In [None]:
# Convert bigram_count to a DataFrame with one row per distinct bigram
bigram_df = pd.DataFrame(bigram_count.items(), columns=['Bigram', 'Count'])

# Sort the DataFrame by count in descending order
sorted_bigram_df = bigram_df.sort_values(by='Count', ascending=False)

print("Sorted Bigram DataFrame:")
print(sorted_bigram_df)

Sorted Bigram DataFrame:
                  Bigram  Count
16         (front, desk)    102
2294     (room, service)     63
76         (hotel, room)     41
224      (michigan, ave)     36
1581       (stay, hotel)     34
...                  ...    ...
11551  (love, chocolate)      1
11550        (got, love)      1
11549       (mmmhh, got)      1
11548   (putting, mmmhh)      1
30735  (advertise, room)      1

[30736 rows x 2 columns]


In [None]:
# Show the 15 most frequent bigrams.
sorted_bigram_df.head(15)

Unnamed: 0,Bigram,Count
16,"(front, desk)",102
2294,"(room, service)",63
76,"(hotel, room)",41
224,"(michigan, ave)",36
1581,"(stay, hotel)",34
1444,"(could, nt)",33
80,"(bed, comfortable)",32
112,"(recommend, hotel)",32
1935,"(great, location)",31
2148,"(room, clean)",30


In [None]:
# Running total of counts, from the most frequent bigram downwards
# (sorted_bigram_df is already sorted by descending count).
sorted_bigram_df['Cumulative Count'] = sorted_bigram_df['Count'].cumsum()
# Total number of bigram occurrences
total_count_bigram = sorted_bigram_df['Count'].sum()

# Cumulative share of all bigram occurrences, in percent
sorted_bigram_df['Cumulative Percentage'] = (sorted_bigram_df['Cumulative Count'] / total_count_bigram) * 100

# Keep the most frequent bigrams whose cumulative share stays at or below 40%
threshold_40 = sorted_bigram_df[sorted_bigram_df['Cumulative Percentage'] <= 40]
num_words_40 = len(threshold_40)

# NOTE(review): the message says "words", but the rows counted here are bigrams.
print(f"Number of words contributing to 40% of total occurrences: {num_words_40} out of {sorted_bigram_df.shape[0]}")

# In the recorded run, 7289 of 30736 distinct bigrams (about 24%) account for 40% of all occurrences

Number of words contributing to 40% of total occurrences: 7289 out of 30736


# Smoothing

In [None]:
# Preprocess the held-out (validation) reviews with the same pipeline as the training data

processed_test_reviews = preprocess_and_tokenize(validation_data)



## Unknown Words

In [None]:
from collections import Counter
# Step 1: Count every word of the preprocessed validation reviews
unigram_counts_test = Counter(word for review in processed_test_reviews for word in review)
print("Total unique words in unigram: ",len(unigram_counts_test),"\n")


# Step 2: Convert to a DataFrame with one row per distinct validation word
unigram_df_test = pd.DataFrame(unigram_counts_test.items(), columns=['Unigram', 'Count'])

print("Total word occurences in unigram: ",unigram_df_test['Count'].sum(),"\n")

# Step 3: Add Probability Column
def get_probability(word, unigram_counts):
    """Look up ``word`` in ``unigram_counts``; return 0 when it is absent.

    Args:
        word: Key to look up.
        unigram_counts: Mapping word -> value. Despite the name, the notebook
            passes the unigram probability dictionary here.

    Returns:
        The stored value, or 0 for a word that is not a key of the mapping.
    """
    if word in unigram_counts:
        return unigram_counts[word]
    return 0

# Look up every validation word in unigram_probs; get_probability supplies 0 for missing words.
unigram_df_test['Probability'] = unigram_df_test['Unigram'].apply(lambda word: get_probability(word, unigram_probs))

print("Test Unigram DataFrame with Probabilities:")
# Last expression of the cell: render the frame, most frequent validation words first.
unigram_df_test.sort_values(by='Count',ascending=False)

Total unique words in unigram:  1544 

Total word occurences in unigram:  4407 

Test Unigram DataFrame with Probabilities:


Unnamed: 0,Unigram,Count,Probability
17,room,133,0.028543
5,hotel,107,0.028872
79,stay,53,0.010508
6,great,50,0.008917
2,night,47,0.007047
...,...,...,...
899,ended,1,0.000303
900,20th,1,0.000051
901,nicer,1,0.000278
902,yesterday,1,0.000025


In [None]:
# Rows whose 'Probability' is exactly 0, shown with the most frequent first.
news_words = unigram_df_test[unigram_df_test['Probability'] == 0]
news_words.sort_values(by='Count',ascending=False)

Unnamed: 0,Unigram,Count,Probability
863,whistle,3,0.0
1177,hail,3,0.0
1461,hit,2,0.0
704,250,2,0.0
413,gibson,2,0.0
...,...,...,...
857,350night,1,0.0
864,summon,1,0.0
868,615,1,0.0
869,630,1,0.0


In [None]:
# Display the zero-probability rows in their original order.
news_words

Unnamed: 0,Unigram,Count,Probability
25,wellequipped,1,0.0
81,travelwarning,1,0.0
96,consists,1,0.0
99,bisquit,1,0.0
101,moreover,1,0.0
...,...,...,...
1532,unplugged,1,0.0
1537,41709,1,0.0
1538,42009,1,0.0
1539,chechecked,1,0.0


In [None]:
# Type-level share: distinct validation words with probability 0, relative to
# the number of distinct validation words (not the total word count).
print("New words unique count is :",news_words.shape[0],"\nThats a percentage of the unique word count : ",news_words.shape[0]/unigram_df_test.shape[0]*100)
# Token-level share: occurrences of those words relative to all validation tokens.
print("New words count is :",news_words['Count'].sum(),"\nThats a percentage of the total word count : ",news_words['Count'].sum()/unigram_df_test['Count'].sum()*100)

New words unique count is : 272 
Thats a percentage of the total word count :  17.616580310880828
New words count is : 284 
Thats a percentage of the total word count :  6.444293169956887


The most frequent new word occurs only 3 times.
About 17% of the distinct validation words are new, but they make up only about 6% of all word occurrences.

Unnamed: 0,Unigram,Count
0,booked,86
1,two,128
2,room,1130
3,four,20
4,month,15
...,...,...
5532,stirrer,1
5533,10yo,1
5534,yahoo,1
5535,guarantee,1


## Smoothing techniques

In [None]:
# Vocabulary size: number of distinct words in the training counts.
vocab_size = len(unigram_count)

# Total number of training tokens (sum of all unigram counts).
train_counts = unigram_df['Count'].sum()

# Function to apply smoothing
def apply_smoothing(df, unigram_count, vocab_size, k):
    """Add an add-k smoothed unigram probability column to ``df``.

    Every word in ``df['Unigram']`` gets
    ``(count(word) + k) / (N + k * vocab_size)``, where ``count`` comes from
    ``unigram_count`` (0 for a word that is not a key) and ``N`` is the sum
    of all values in ``unigram_count``.

    Args:
        df: DataFrame with a 'Unigram' column. It is modified in place.
        unigram_count: Mapping word -> count.
        vocab_size: Vocabulary size used in the denominator.
        k: Smoothing constant added to every count.

    Returns:
        pd.DataFrame: The same ``df`` object, with the new
        'Smoothed Probability' column.
    """
    # N is derived from the counts that were passed in, so the result depends
    # only on the arguments and not on a notebook-level variable.
    total_train_count = sum(unigram_count.values())
    denominator = total_train_count + k * vocab_size
    df['Smoothed Probability'] = df['Unigram'].apply(
        lambda word: (unigram_count.get(word, 0) + k) / denominator
    )
    return df


# Apply smoothing for different values of k and keep one result frame per k
k_values = [1, 0.5, 2, 5, 20]
smoothed_dfs = {}

for k in k_values:
  # A copy is passed so the column added by apply_smoothing does not touch unigram_df_test.
  smoothed_df = apply_smoothing(unigram_df_test.copy(), unigram_count, vocab_size, k)
  smoothed_dfs[k] = smoothed_df
  print(f"\nSmoothed probabilities with k={k}:")
  print(smoothed_df[['Unigram', 'Smoothed Probability']])


Smoothed probabilities with k=1:
          Unigram  Smoothed Probability
0          stayed              0.005163
1            four              0.000465
2           night              0.006205
3       attending              0.000155
4      conference              0.000953
...           ...                   ...
1539   chechecked              0.000022
1540          915              0.000022
1541           15              0.000510
1542       marked              0.000044
1543  improvement              0.000044

[1544 rows x 2 columns]

Smoothed probabilities with k=0.5:
          Unigram  Smoothed Probability
0          stayed              0.005489
1            four              0.000484
2           night              0.006599
3       attending              0.000153
4      conference              0.001003
...           ...                   ...
1539   chechecked              0.000012
1540          915              0.000012
1541           15              0.000531
1542       marked        

In [None]:
# NOTE(review): disabled draft of add-one (Laplace) bigram smoothing. While it stays
# commented out, no visible cell defines `laplace_smoothing`.
# def laplace_smoothing(bigram_count, unigram_count, vocabulary_size):
#   bigram_probs = {}
#   for bigram, count in bigram_count.items():
#       bigram_probs[bigram] = (count + 1) / (unigram_count[bigram[0]] + vocabulary_size)
#   return bigram_probs

In [None]:
# NOTE(review): disabled draft of add-k bigram smoothing. While it stays
# commented out, no visible cell defines `add_k_smoothing`.
# def add_k_smoothing(bigram_count, unigram_count, vocabulary_size, k):
#   bigram_probs = {}
#   for bigram, count in bigram_count.items():
#       bigram_probs[bigram] = (count + k) / (unigram_count[bigram[0]] + k * vocabulary_size)
#   return bigram_probs

In [None]:
# Smoothed bigram probabilities for every bigram seen in training. This cell
# has to run because the bigram perplexity cell reads `bigram_probs_laplace`
# and `bigram_probs_add_k`. Only observed bigrams get an entry; whoever
# consumes these dictionaries must supply a value for unseen bigrams.
vocabulary_size = len(unigram_count)  # number of distinct training words

# Laplace (add-one) smoothing: (count(w1, w2) + 1) / (count(w1) + V)
bigram_probs_laplace = {
    bigram: (count + 1) / (unigram_count[bigram[0]] + vocabulary_size)
    for bigram, count in bigram_count.items()
}

# Add-k smoothing with k=0.5: (count(w1, w2) + k) / (count(w1) + k * V)
k = 0.5
bigram_probs_add_k = {
    bigram: (count + k) / (unigram_count[bigram[0]] + k * vocabulary_size)
    for bigram, count in bigram_count.items()
}

# Perplexity

## Unigrams Perplexity

In [None]:
# Display the frame that was smoothed with k=0.5.
smoothed_dfs[0.5]

Unnamed: 0,Unigram,Count,Probability,Smoothed Probability
0,stayed,31,0.005860,0.005489
1,four,2,0.000505,0.000484
2,night,47,0.007047,0.006599
3,attending,2,0.000152,0.000153
4,conference,6,0.001061,0.001003
...,...,...,...,...
1539,chechecked,1,0.000000,0.000012
1540,915,1,0.000000,0.000012
1541,15,1,0.000556,0.000531
1542,marked,1,0.000025,0.000035


In [None]:
# Convert the smoothed DataFrame to a dictionary for quick lookup.
# NOTE(review): the original line read from an undefined `smoothed` and from a
# 'Smoothed Probability' column that `unigram_df` does not have. The k=0.5 frame
# is assumed here because it is the one displayed just above -- confirm.
# This rebinds `unigram_probs`, replacing the unsmoothed training probabilities.
unigram_probs = dict(zip(smoothed_dfs[0.5]['Unigram'], smoothed_dfs[0.5]['Smoothed Probability']))

In [None]:
def dataframe_to_dict(df: pd.DataFrame, key_col: str, value_col: str) -> dict:
    """Build a lookup dictionary from two columns of a DataFrame.

    Rows are paired positionally; when a key occurs more than once, the value
    from the last such row wins.

    Args:
        df (pd.DataFrame): Frame holding both columns.
        key_col (str): Column whose entries become the dictionary keys.
        value_col (str): Column whose entries become the dictionary values.

    Returns:
        dict: Mapping of ``df[key_col]`` entries to ``df[value_col]`` entries.
    """
    keys = df[key_col]
    values = df[value_col]
    return {key: value for key, value in zip(keys, values)}




In [None]:
import numpy as np

In [None]:
def calculate_sentence_perplexity(sentence, unigram_probs, unk_prob=None):
    """Compute the unigram perplexity of one tokenized sentence.

    Perplexity is ``exp(-(1/N) * sum(log P(word)))`` over the N tokens.

    Args:
        sentence: Sequence of tokens.
        unigram_probs: Mapping word -> probability.
        unk_prob: Probability used for a word that is missing from
            ``unigram_probs``. With the default ``None`` a missing word is
            reported as an error instead of reaching ``np.log(None)``.

    Returns:
        float: The perplexity, or ``float('inf')`` for an empty sentence
        (there is nothing to average over).

    Raises:
        KeyError: If a word has no probability and ``unk_prob`` is ``None``.
    """
    n_tokens = len(sentence)
    if n_tokens == 0:
        # Avoid dividing by zero below.
        return float('inf')

    log_prob_sum = 0.0
    for word in sentence:
        prob = unigram_probs.get(word, unk_prob)
        if prob is None:
            raise KeyError(f"no probability for word {word!r} and no unk_prob given")
        log_prob_sum += np.log(prob)
    return np.exp(-log_prob_sum / n_tokens)

In [None]:
# For every smoothing constant: build the word -> smoothed probability lookup, then
# report the mean of the per-review perplexities (an average over reviews, not a
# single perplexity computed over all tokens at once).
for k in k_values:
  # Rebinds the notebook-level `unigram_probs` on every iteration.
  unigram_probs = dataframe_to_dict(smoothed_dfs[k], 'Unigram', 'Smoothed Probability')
  total_perplexity = 0
  for sen in processed_test_reviews:
    total_perplexity += calculate_sentence_perplexity(sen, unigram_probs)
  average_perplexity = total_perplexity / len(processed_test_reviews)
  print("For k", k, " value the perplexity value of the unigram model is:",(average_perplexity))

For k 1  value the perplexity value of the unigram model is: 1381.437857527925
For k 0.5  value the perplexity value of the unigram model is: 1419.1540664629013
For k 2  value the perplexity value of the unigram model is: 1384.569639707693
For k 5  value the perplexity value of the unigram model is: 1501.827942539453
For k 20  value the perplexity value of the unigram model is: 2094.6618309216406


## Bigrams perplexity

In [None]:
import math

def calculate_perplexity(validation_sentences, bigram_probs, unseen_prob=1e-6):
    """Calculate bigram perplexity over a set of tokenized sentences.

    Perplexity is ``exp(-(1/M) * sum(log P(bigram)))`` where M is the number
    of bigrams across all sentences.

    Args:
        validation_sentences: Iterable of token lists.
        bigram_probs: Mapping ``(previous, current)`` -> probability.
        unseen_prob: Probability used for a bigram that is missing from
            ``bigram_probs``. This is a flat floor, not a smoothed estimate.

    Returns:
        float: The perplexity; ``float('inf')`` when there are no bigrams at
        all or when any bigram has probability 0.
    """
    log_prob_sum = 0.0
    total_bigrams = 0

    for sentence in validation_sentences:
        # zip stops at the shorter argument, so a sentence with fewer than
        # two tokens yields no pairs and is skipped implicitly.
        for bigram in zip(sentence, sentence[1:]):
            prob = bigram_probs.get(bigram, unseen_prob)
            if prob == 0:
                # math.log(0) would raise; a zero-probability bigram makes
                # the perplexity infinite.
                return float('inf')
            log_prob_sum += math.log(prob)
            total_bigrams += 1

    if total_bigrams == 0:
        return float('inf')

    return math.exp(-log_prob_sum / total_bigrams)


# NOTE(review): these calls previously passed `tokenized_reviews`, which no visible
# cell defines. The preprocessed validation reviews are assumed to be the intended
# evaluation set -- confirm, and expect the reported numbers to change on re-run.

# Compute perplexity using Laplace smoothed bigram probabilities
perplexity_laplace = calculate_perplexity(processed_test_reviews, bigram_probs_laplace)
print(f"Perplexity (Laplace smoothing): {perplexity_laplace}")

# Compute perplexity using add-k smoothed bigram probabilities (k=0.5)
perplexity_add_k = calculate_perplexity(processed_test_reviews, bigram_probs_add_k)
print(f"Perplexity (Add-k smoothing): {perplexity_add_k}")


Perplexity (Laplace smoothing): 2045.6420283101
Perplexity (Add-k smoothing): 1319.880257800446
