# A - Dataset and Preprocessing

In [6]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import time

In [7]:
start = time.time()
# Download necessary resources for nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/krishna/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/krishna/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
# Reading the training and testing dataset
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")

# Shape of the datasets
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

# Randomly sampling 100 rows for validation set
val_df = train_df.sample(n=100, random_state=71)
train_df = train_df.drop(val_df.index)

Train shape: (13879, 2)
Test shape: (100, 2)


In [9]:
# Text Preprocessing Function
def preprocess_text(text):
    text = text.encode("ascii", "ignore").decode() # Remove non-ASCII characters
    text = re.sub(r'[^\w\s]', '', text) # Remove punctuations
    text = text.lower() # Convert to lowercase
    words = text.split() # Tokenization
    stop_words = set(stopwords.words('english')) # Remove stopwords

    words = [word for word in words if word not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]

    # Reconstruct sentence
    return " ".join(words)

In [10]:
# Applying preprocessing
train_df["text"] = train_df["text"].astype(str).apply(preprocess_text)
val_df["text"] = val_df["text"].astype(str).apply(preprocess_text)
test_df["text"] = test_df["text"].astype(str).apply(preprocess_text)

In [11]:
# Save preprocessed datasets
train_df.to_csv("train_preprocessed.csv", index=False)
val_df.to_csv("val_preprocessed.csv", index=False)
test_df.to_csv("test_preprocessed.csv", index=False)

print("Preprocessing complete. Files saved!")

end = time.time()
print("Time taken for section A: ", end-start)

Preprocessing complete. Files saved!
Time taken for section A:  44.0560188293457


# B - Estimation Using Maximum Likelihood

In [12]:
import json
import pandas as pd
from collections import defaultdict

In [13]:
start = time.time()

# Load preprocessed training dataset
train_df = pd.read_csv("train_preprocessed.csv")
train_sentences = [sentence.split() for sentence in train_df["text"]]

In [14]:
# Define minimum document frequency threshold (1% of total articles)
num_articles = len(train_sentences)
min_doc_freq = max(1, int(0.01 * num_articles))  # At least 1% of articles

In [15]:
# Function to count n-grams and track article frequency
def count_ngrams(sentences, n):
    counts = defaultdict(int)
    doc_freq = defaultdict(int)

    for _, sentence in enumerate(sentences):
        if not sentence:  # Skip empty sentences
            continue

        # Add start and end tokens
        tokens = ["<s>"] * (n - 1) + sentence + ["</s>"]
        seen_ngrams = set()

        for i in range(len(tokens) - n + 1):
            ngram = tuple(tokens[i:i + n])
            counts[ngram] += 1
            seen_ngrams.add(ngram)

        # Update document frequency count
        for ngram in seen_ngrams:
            doc_freq[ngram] += 1

    # Filter n-grams appearing in at least 1% of articles
    filtered_counts = {
        ngram: count for ngram, count in counts.items() if doc_freq[ngram] >= min_doc_freq
    }
    total_filtered_ngrams = sum(filtered_counts.values())

    return filtered_counts, total_filtered_ngrams

In [16]:
# Compute unigram, bigram, and trigram counts with filtering
unigram_counts, total_unigrams = count_ngrams(train_sentences, 1)
bigram_counts, total_bigrams = count_ngrams(train_sentences, 2)
trigram_counts, total_trigrams = count_ngrams(train_sentences, 3)

In [17]:
# Function to convert counts to probabilities with Laplace smoothing
def compute_probabilities(counts, total, vocab_size, smoothing=1):
    probs = {}
    for ngram, count in counts.items():
        probs[str(ngram)] = (count + smoothing) / (total + vocab_size * smoothing)
    return probs

In [18]:
# Compute vocabulary sizes for unigram, bigram, and trigram models
vocab_size_unigram = len(unigram_counts)
vocab_size_bigram = len(bigram_counts)
vocab_size_trigram = len(trigram_counts)

In [19]:
# Compute probabilities with smoothing
unigram_probs = compute_probabilities(unigram_counts, total_unigrams, vocab_size_unigram)
bigram_probs = compute_probabilities(bigram_counts, total_bigrams, vocab_size_bigram)
trigram_probs = compute_probabilities(trigram_counts, total_trigrams, vocab_size_trigram)

In [20]:
# Save filtered n-gram probabilities to JSON files
with open("unigram_probs.json", "w") as f:
    json.dump(unigram_probs, f, indent=4)

with open("bigram_probs.json", "w") as f:
    json.dump(bigram_probs, f, indent=4)

with open("trigram_probs.json", "w") as f:
    json.dump(trigram_probs, f, indent=4)

print("Filtered n-gram probability files generated successfully!")


end = time.time()
print("Time taken for section B: ", end-start)

Filtered n-gram probability files generated successfully!
Time taken for section B:  52.05514216423035


# C - Evaluating an n-Gram Model using Perplexity

In [21]:
import numpy as np

In [None]:
start = time.time()

# Load n-gram probability models
with open("unigram_probs.json", "r") as f:
    unigram_probs = json.load(f)

with open("bigram_probs.json", "r") as f:
    bigram_probs = json.load(f)

with open("trigram_probs.json", "r") as f:
    trigram_probs = json.load(f)

In [23]:
# Load test dataset
test_df = pd.read_csv("test_preprocessed.csv")

In [24]:
# Convert test text into a list of tokenized sentences
test_sentences = [sentence.split() for sentence in test_df["text"]]

In [None]:
# Compute perplexity for a sentence using n-gram probabilities
def compute_perplexity(ngrams_probs, sentence, n):
    if n == 1:
        ngram_list = [(word,) for word in sentence]  # Convert to unigram format
    else:
        ngram_list = [tuple(sentence[i:i+n]) for i in range(len(sentence)-n+1)]

    # Compute log probabilities
    log_probs = [np.log(ngrams_probs.get(str(ngram), 1e-10)) for ngram in ngram_list]
    log_prob_sum = np.sum(log_probs)

    # Compute perplexity
    total_ngrams = len(ngram_list)
    perplexity = np.exp(-log_prob_sum / total_ngrams) if total_ngrams > 0 else float('inf')
    return perplexity

In [26]:
# Compute perplexities for each model
unigram_perplexities = [compute_perplexity(unigram_probs, sentence, 1) for sentence in test_sentences]
bigram_perplexities = [compute_perplexity(bigram_probs, sentence, 2) for sentence in test_sentences]
trigram_perplexities = [compute_perplexity(trigram_probs, sentence, 3) for sentence in test_sentences]

In [27]:
# Compute overall average perplexity for each model
avg_unigram_perplexity = np.mean(unigram_perplexities)
avg_bigram_perplexity = np.mean(bigram_perplexities)
avg_trigram_perplexity = np.mean(trigram_perplexities)

In [28]:
# Print results
print(f"Average Unigram Perplexity: {avg_unigram_perplexity:.4f}")
print(f"Average Bigram Perplexity: {avg_bigram_perplexity:.4f}")
print(f"Average Trigram Perplexity: {avg_trigram_perplexity:.4f}")

end = time.time()
print("Time taken for section C: ", end-start)

Average Unigram Perplexity: 85023.8164
Average Bigram Perplexity: 1643368727.4968
Average Trigram Perplexity: 6495576203.9299
Time taken for section C:  0.4392430782318115


# D - Interpolation Model


In [29]:
start = time.time()

# Load validation dataset
val_df = pd.read_csv("val_preprocessed.csv")
val_sentences = [sentence.split() for sentence in val_df["text"]]

# Load test dataset
test_df = pd.read_csv("test_preprocessed.csv")
test_sentences = [sentence.split() for sentence in test_df["text"]]

In [30]:
# Computes probability using interpolation smoothing 
def interpolate_prob(wi, history, lambdas):
    unigram_prob = unigram_probs.get(str((wi,)), 1e-10)  # Use default prob if missing
    bigram_prob = bigram_probs.get(str(tuple(history[-1:] + [wi])), unigram_prob)
    trigram_prob = trigram_probs.get(str(tuple(history[-2:] + [wi])), bigram_prob)

    return lambdas[0] * unigram_prob + lambdas[1] * bigram_prob + lambdas[2] * trigram_prob

In [31]:
# Compute perplexity for sentences using interpolated probabilities
def compute_interpolated_perplexity(sentences, lambdas):
    log_prob_sum = 0
    total_words = 0

    for sentence in sentences:
        for i in range(len(sentence)):
            wi = sentence[i]
            history = sentence[max(0, i - 2):i]  # Use up to last two words as history
            prob = interpolate_prob(wi, history, lambdas)

            log_prob_sum += np.log(max(prob, 1e-10))  # Prevent log(0) issues
            total_words += 1

    return np.exp(-log_prob_sum / total_words) if total_words > 0 else float('inf')

In [32]:
# Optimize lambda values using the validation set
best_perplexity = float('inf')
best_lambdas = None

for lambda1 in np.arange(0.1, 0.9, 0.1):
    for lambda2 in np.arange(0.1, 1 - lambda1, 0.1):
        lambda3 = 1 - (lambda1 + lambda2)
        if lambda3 <= 0:
            continue
        lambdas = [lambda1, lambda2, lambda3]
        perplexity = compute_interpolated_perplexity(val_sentences, lambdas)

        if perplexity < best_perplexity:
            best_perplexity = perplexity
            best_lambdas = lambdas

print(f"Optimal λ values: λ1={best_lambdas[0]:.2f}, λ2={best_lambdas[1]:.2f}, λ3={best_lambdas[2]:.2f}")
print(f"Validation Perplexity: {best_perplexity:.4f}")

Optimal λ values: λ1=0.80, λ2=0.10, λ3=0.10
Validation Perplexity: 39725.1950


In [33]:
# Evaluate on the test set using the best lambda values
test_perplexity = compute_interpolated_perplexity(test_sentences, best_lambdas)
print(f"Test Perplexity using Interpolation Model: {test_perplexity:.4f}")

end = time.time()
print("Time taken for section D: ", end-start)

Test Perplexity using Interpolation Model: 35222.4560
Time taken for section D:  10.085712194442749
