In [None]:
## IMPORT AND LOAD CORPORA ##

# Import NumPy (Numerical Python) package with alias np
import numpy as np
# Counter subclass: used to count hashable objects
# -- for ngram counts
from collections import Counter
# islice: iterator function for help with print statements
from itertools import islice

# Function: load corpus txt file and tokenize
def load_corpus(file_path):
    with open(file_path, 'r') as text_file:
        lines = text_file.readlines()
    # tokenize: remove excess whitespace, all lower-case, split by whitespace
    corpus = [line.strip().lower().split() for line in lines]
    return corpus

# Load corpora
train_corpus = load_corpus('train.txt')
val_corpus = load_corpus('val.txt')

# (OPTIONAL) Show preview of the split data
print("Train Corpus Sample:", train_corpus[:1])  # Show first line
print("Validation Corpus Sample:", val_corpus[:1])

Train Corpus Sample: [['i', 'booked', 'two', 'rooms', 'four', 'months', 'in', 'advance', 'at', 'the', 'talbott', '.', 'we', 'were', 'placed', 'on', 'the', 'top', 'floor', 'next', 'to', 'the', 'elevators', ',', 'which', 'are', 'used', 'all', 'night', 'long', '.', 'when', 'speaking', 'to', 'the', 'front', 'desk', ',', 'i', 'was', 'told', 'that', 'they', 'were', 'simply', 'honoring', 'my', 'request', 'for', 'an', 'upper', 'floor', ',', 'which', 'i', 'had', 'requested', 'for', 'a', 'better', 'view', '.', 'i', 'am', 'looking', 'at', 'a', 'brick', 'wall', ',', 'and', 'getting', 'no', 'sleep', '.', 'he', 'also', 'told', 'me', 'that', 'they', 'had', 'received', 'complaints', 'before', 'from', 'guests', 'on', 'the', '16th', 'floor', ',', 'and', 'were', 'aware', 'of', 'the', 'noise', 'problem', '.', 'why', 'then', 'did', 'they', 'place', 'us', 'on', 'this', 'floor', 'when', 'the', 'hotel', 'is', 'not', 'totally', 'booked', '?', 'a', 'request', 'for', 'an', 'upper', 'floor', 'does', 'not', 'const

In [None]:
## UNKNOWN WORD HANDLING ##

# Replace words below a threshold with <UNK>

# Step 1: Count word frequencies
word_counts = Counter(word for sentence in train_corpus for word in sentence)

# Step 2: Set a threshold
# -- if a word appears less than 2 times in the training corpus, replace it
# -- otherwise, add it to the vocabulary list
threshold = 2
vocab = {word for word, count in word_counts.items() if count > threshold}

# Replace rare words with `<UNK>` in a single line using list comprehension
train_corpus_modified = [[word if word in vocab else "<UNK>" for word in sentence] for sentence in train_corpus]

# Output results
print("Original Corpus Sample:", train_corpus[:1])
print("Modified Corpus Sample:", train_corpus_modified[:1])


Original Corpus Sample: [['i', 'booked', 'two', 'rooms', 'four', 'months', 'in', 'advance', 'at', 'the', 'talbott', '.', 'we', 'were', 'placed', 'on', 'the', 'top', 'floor', 'next', 'to', 'the', 'elevators', ',', 'which', 'are', 'used', 'all', 'night', 'long', '.', 'when', 'speaking', 'to', 'the', 'front', 'desk', ',', 'i', 'was', 'told', 'that', 'they', 'were', 'simply', 'honoring', 'my', 'request', 'for', 'an', 'upper', 'floor', ',', 'which', 'i', 'had', 'requested', 'for', 'a', 'better', 'view', '.', 'i', 'am', 'looking', 'at', 'a', 'brick', 'wall', ',', 'and', 'getting', 'no', 'sleep', '.', 'he', 'also', 'told', 'me', 'that', 'they', 'had', 'received', 'complaints', 'before', 'from', 'guests', 'on', 'the', '16th', 'floor', ',', 'and', 'were', 'aware', 'of', 'the', 'noise', 'problem', '.', 'why', 'then', 'did', 'they', 'place', 'us', 'on', 'this', 'floor', 'when', 'the', 'hotel', 'is', 'not', 'totally', 'booked', '?', 'a', 'request', 'for', 'an', 'upper', 'floor', 'does', 'not', 'co

In [None]:
## COMPUTE UNIGRAM COUNT AND PROBABILITY ##

# Flattens all lines into a single line
line_in_train_corpus = [word for line in train_corpus_modified for word in line]

# Calculate frequency for each unigram in the training corpus (after <UNK> replacement)
unigram_freq = Counter(line_in_train_corpus)
# Sum word frequencies to get total words in the corpus
total_words = sum(unigram_freq.values())

# Calculate probabilites for unigrams (after <UNK> replacement)
unigram_probs = {word: count / total_words for word, count in unigram_freq.items()}

# Print first 10 unigram counts and probabilities
print("First 10 Unigram Counts and Probabilities:")
for word, count in islice(unigram_freq.items(), 10):
    print(f"{word}: {count}, {unigram_probs.get(word, 0):.4f}")

First 10 Unigram Counts and Probabilities:
i: 1706, 0.0190
booked: 86, 0.0010
two: 128, 0.0014
rooms: 201, 0.0022
four: 20, 0.0002
months: 8, 0.0001
in: 1259, 0.0140
advance: 7, 0.0001
at: 745, 0.0083
the: 5292, 0.0590


In [None]:
## COMPUTE BIGRAM COUNT AND PROBABILITY ##

# Flatten all lines into a single line
line_in_train_corpus = [word for line in train_corpus_modified for word in line]

# Calculate count of bigrams in the training corpus (after <UNK> replacement)
bigram_counts = Counter()

for i in range(len(line_in_train_corpus) - 1):  # Loop through words in the line
    bigram = (line_in_train_corpus[i], line_in_train_corpus[i + 1])  # Create bigram tuple
    bigram_counts[bigram] += 1  # Update count

# Calculate probabilities for bigrams (after <UNK> replacement)
bigram_probs = {bigram: count / unigram_freq.get(bigram[0], 1) for bigram, count in bigram_counts.items()} #a little confused on this formula, autocomplete code helped me

# Print first 10 bigram counts and probabilities
print("First 10 Bigram Counts and Probabilites:")
for bigram, count in islice(bigram_counts.items(), 10):
    print(f"{bigram}: {count}, {bigram_probs.get(bigram, 0):.4f}")


First 10 Bigram Counts and Probabilites:
('i', 'booked'): 21, 0.0123
('booked', 'two'): 1, 0.0116
('two', 'rooms'): 3, 0.0234
('rooms', 'four'): 1, 0.0050
('four', 'months'): 1, 0.0500
('months', 'in'): 2, 0.2500
('in', 'advance'): 7, 0.0056
('advance', 'at'): 1, 0.1429
('at', 'the'): 332, 0.4456
('the', 'talbott'): 26, 0.0049


In [None]:
## Implement at least two smoothing techniques (e.g., Laplace, Add-k smoothing) ##

In [None]:
## LAPLACE SMOOTHING: UNIGRAM PROBABILITIES ##

V = len(unigram_freq)  # Vocab size

# Apply Laplace smoothing to all unigram counts
unigram_probs_smoothed = {word: (count + 1) / (sum(unigram_freq.values()) + V) for word, count in unigram_freq.items()}

# Print unigram probabilities after adding 1 to all counts for Laplace smoothing
print("Smoothed Unigram Probabilities:")
for word, prob in islice(unigram_probs_smoothed.items(),10):
    print(f"{word}: {prob:.4f}")

Smoothed Unigram Probabilities:
i: 0.0186
booked: 0.0009
two: 0.0014
rooms: 0.0022
four: 0.0002
months: 0.0001
in: 0.0137
advance: 0.0001
at: 0.0081
the: 0.0576


In [None]:
## LAPLACE SMOOTHING: BIGRAM PROBABILIES ##

# Apply Laplace smoothing to all bigram counts
bigram_probs_smoothed = {bigram: (count + 1) / (unigram_freq.get(bigram[0], 0) + V) for bigram, count in bigram_counts.items()}

# Print bigram probability after added 1 to all counts for Laplace smoothing
print("Smoothed Bigram Probabilities:")
for bigram, prob in islice(bigram_probs_smoothed.items(), 10):
    print(f"{bigram}: {prob:.4f}")

Smoothed Bigram Probabilities:
('i', 'booked'): 0.0056
('booked', 'two'): 0.0009
('two', 'rooms'): 0.0017
('rooms', 'four'): 0.0008
('four', 'months'): 0.0009
('months', 'in'): 0.0013
('in', 'advance'): 0.0023
('advance', 'at'): 0.0009
('at', 'the'): 0.1113
('the', 'talbott'): 0.0036


In [None]:
## K-SMOOTHING: UNIGRAM - multiple values (a-d) ##

ka = 0.01
kb = 0.1
kc = 0.3
kd = 5

# Apply K-Smoothing values ka -> kd
unigram_probs_addka = {word: (count + ka) / (sum(unigram_freq.values()) + ka * V) for word, count in unigram_freq.items()}
unigram_probs_addkb = {word: (count + kb) / (sum(unigram_freq.values()) + kb * V) for word, count in unigram_freq.items()}
unigram_probs_addkc = {word: (count + kc) / (sum(unigram_freq.values()) + kc * V) for word, count in unigram_freq.items()}
unigram_probs_addkd = {word: (count + kd) / (sum(unigram_freq.values()) + kd * V) for word, count in unigram_freq.items()}

print("Add-k Smoothed Unigram Probabilities:")
print("K = 0.01:")
for word, prob in islice(unigram_probs_addka.items(), 10):
    print(f"{word}: {prob:.4f}")
print("K = 0.1:")
for word, prob in islice(unigram_probs_addkb.items(), 10):
    print(f"{word}: {prob:.4f}")
print("K = 0.3:")
for word, prob in islice(unigram_probs_addkc.items(), 10):
    print(f"{word}: {prob:.4f}")
print("K = 5:")
for word, prob in islice(unigram_probs_addkd.items(), 10):
    print(f"{word}: {prob:.4f}")

Add-k Smoothed Unigram Probabilities:
K = 0.01:
i: 0.0190
booked: 0.0010
two: 0.0014
rooms: 0.0022
four: 0.0002
months: 0.0001
in: 0.0140
advance: 0.0001
at: 0.0083
the: 0.0590
K = 0.1:
i: 0.0190
booked: 0.0010
two: 0.0014
rooms: 0.0022
four: 0.0002
months: 0.0001
in: 0.0140
advance: 0.0001
at: 0.0083
the: 0.0589
K = 0.3:
i: 0.0189
booked: 0.0010
two: 0.0014
rooms: 0.0022
four: 0.0002
months: 0.0001
in: 0.0139
advance: 0.0001
at: 0.0082
the: 0.0586
K = 5:
i: 0.0170
booked: 0.0009
two: 0.0013
rooms: 0.0020
four: 0.0002
months: 0.0001
in: 0.0125
advance: 0.0001
at: 0.0074
the: 0.0525


In [None]:
## K-SMOOTHING: BIGRAM - multiple values (a-d) ##

# Apply K-Smoothing values ka -> kd
bigram_probs_addka = {bigram: (count + ka) / (unigram_freq.get(bigram[0], 0) + ka * V) for bigram, count in bigram_counts.items()}
bigram_probs_addkb = {bigram: (count + kb) / (unigram_freq.get(bigram[0], 0) + kb * V) for bigram, count in bigram_counts.items()}
bigram_probs_addkc = {bigram: (count + kc) / (unigram_freq.get(bigram[0], 0) + kc * V) for bigram, count in bigram_counts.items()}
bigram_probs_addkd = {bigram: (count + kd) / (unigram_freq.get(bigram[0], 0) + kd * V) for bigram, count in bigram_counts.items()}

print("Add-k Smoothed Bigram Probabilities:")
print("K = 0.01:")
for bigram, prob in islice(bigram_probs_addka.items(), 10):
    print(f"{bigram}: {prob:.4f}")
print("K = 0.1:")
for bigram, prob in islice(bigram_probs_addkb.items(), 10):
    print(f"{bigram}: {prob:.4f}")
print("K = 0.3:")
for bigram, prob in islice(bigram_probs_addkc.items(), 10):
    print(f"{bigram}: {prob:.4f}")
print("K = 5:")
for bigram, prob in islice(bigram_probs_addkd.items(), 10):
    print(f"{bigram}: {prob:.4f}")

Add-k Smoothed Bigram Probabilities:
K = 0.01:
('i', 'booked'): 0.0122
('booked', 'two'): 0.0093
('two', 'rooms'): 0.0200
('rooms', 'four'): 0.0045
('four', 'months'): 0.0238
('months', 'in'): 0.0660
('in', 'advance'): 0.0055
('advance', 'at'): 0.0343
('at', 'the'): 0.4326
('the', 'talbott'): 0.0049
K = 0.1:
('i', 'booked'): 0.0109
('booked', 'two'): 0.0035
('two', 'rooms'): 0.0088
('rooms', 'four'): 0.0026
('four', 'months'): 0.0045
('months', 'in'): 0.0090
('in', 'advance'): 0.0048
('advance', 'at'): 0.0047
('at', 'the'): 0.3425
('the', 'talbott'): 0.0047
K = 0.3:
('i', 'booked'): 0.0089
('booked', 'two'): 0.0017
('two', 'rooms'): 0.0041
('rooms', 'four'): 0.0015
('four', 'months'): 0.0019
('months', 'in'): 0.0034
('in', 'advance'): 0.0038
('advance', 'at'): 0.0019
('at', 'the'): 0.2342
('the', 'talbott'): 0.0044
K = 5:
('i', 'booked'): 0.0020
('booked', 'two'): 0.0005
('two', 'rooms'): 0.0007
('rooms', 'four'): 0.0005
('four', 'months'): 0.0005
('months', 'in'): 0.0006
('in', 'advan

In [None]:
## CALCULATE PERPLEXITY ##

# Function: to calulate perplexity
# -- corpus: tokenized validation set
# -- ngram_probs: previously calculated
# -- n set to 1 for unigram and 2 for bigram
# -- unk_prob set to 1e-10 so that log(0) = -∞ doesn't break the code
def calculate_perplexity(corpus, ngram_probs, n, unk_prob=1e-10):

    #initialize variables
    total_log_prob = 0
    word_count = 0

    for sentence in corpus:
        if n == 1: # unigram
            for word in sentence:
                prob = ngram_probs.get(word, ngram_probs.get("<UNK>", unk_prob))  # Use <UNK> prob if unigram is unknown
                total_log_prob += np.log(prob)
                word_count += 1
        elif n == 2: # bigram
            for i in range(1, len(sentence)):  # Start from the second word
                bigram = (sentence[i - 1], sentence[i])
                prob = ngram_probs.get(bigram, ngram_probs.get(("<UNK>", "<UNK>"), unk_prob))  # Use <UNK> prob if bigram is unknown
                total_log_prob += np.log(prob)
                word_count += 1

    return np.exp(-total_log_prob / word_count)  # Perplexity formula

# Compute perplexity for different models with the validation corpus
unigram_perplexity = calculate_perplexity(val_corpus, unigram_probs, n=1)
bigram_perplexity = calculate_perplexity(val_corpus, bigram_probs, n=2)
unigram_perplexity_smoothed = calculate_perplexity(val_corpus, unigram_probs_smoothed, n=1)
bigram_perplexity_smoothed = calculate_perplexity(val_corpus, bigram_probs_smoothed, n=2)
unigram_perplexity_addka = calculate_perplexity(val_corpus, unigram_probs_addka, n=1)
unigram_perplexity_addkb = calculate_perplexity(val_corpus, unigram_probs_addkb, n=1)
unigram_perplexity_addkc = calculate_perplexity(val_corpus, unigram_probs_addkc, n=1)
unigram_perplexity_addkd = calculate_perplexity(val_corpus, unigram_probs_addkd, n=1)
bigram_perplexity_addka = calculate_perplexity(val_corpus, bigram_probs_addka, n=2)
bigram_perplexity_addkb = calculate_perplexity(val_corpus, bigram_probs_addkb, n=2)
bigram_perplexity_addkc = calculate_perplexity(val_corpus, bigram_probs_addkc, n=2)
bigram_perplexity_addkd = calculate_perplexity(val_corpus, bigram_probs_addkd, n=2)

# Print the perplexities
print(f"Unsmoothed Unigram Perplexity: {unigram_perplexity:.4f}")
print(f"Laplace Smoothed Unigram Perplexity: {unigram_perplexity_smoothed:.4f}")
print(f"Add-k: K = 0.01 Smoothed Unigram Perplexity: {unigram_perplexity_addka:.4f}")
print(f"Add-k: K = 0.1 Smoothed Unigram Perplexity: {unigram_perplexity_addkb:.4f}")
print(f"Add-k: K = 0.3 Smoothed Unigram Perplexity: {unigram_perplexity_addkc:.4f}")
print(f"Add-k: K = 5 Smoothed Unigram Perplexity: {unigram_perplexity_addkd:.4f}\n")

print(f"Unsmoothed Bigram Perplexity: {bigram_perplexity:.4f}")
print(f"Laplace Smoothed Bigram Perplexity: {bigram_perplexity_smoothed:.4f}")
print(f"Add-k: K = 0.01 Smoothed Bigram Perplexity: {bigram_perplexity_addka:.4f}")
print(f"Add-k: K = 0.1 Smoothed Bigram Perplexity: {bigram_perplexity_addkb:.4f}")
print(f"Add-k: K = 0.3 Smoothed Bigram Perplexity: {bigram_perplexity_addkc:.4f}")
print(f"Add-k: K = 5 Smoothed Bigram Perplexity: {bigram_perplexity_addkd:.4f}")

Unsmoothed Unigram Perplexity: 254.7311
Laplace Smoothed Unigram Perplexity: 255.7838
Add-k: K = 0.01 Smoothed Unigram Perplexity: 254.7387
Add-k: K = 0.1 Smoothed Unigram Perplexity: 254.8103
Add-k: K = 0.3 Smoothed Unigram Perplexity: 254.9883
Add-k: K = 5 Smoothed Unigram Perplexity: 263.1593

Unsmoothed Bigram Perplexity: 23.1438
Laplace Smoothed Bigram Perplexity: 95.3924
Add-k: K = 0.01 Smoothed Bigram Perplexity: 26.5051
Add-k: K = 0.1 Smoothed Bigram Perplexity: 39.6675
Add-k: K = 0.3 Smoothed Bigram Perplexity: 56.4531
Add-k: K = 5 Smoothed Bigram Perplexity: 232.4393
