In [1]:
#pip install pandas

In [2]:
import pandas as pd
import nltk
from collections import Counter,defaultdict
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import math
import random
import re
from itertools import product
import time

In [3]:
# Download necessary NLTK data
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
#print("NLTK setup completed!")

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/krishna/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/krishna/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/krishna/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
train_path = 'data/train.csv'
test_path = 'data/test.csv' 

In [5]:
# Loading training and testing dataset
def load_dataset(train_path, test_path):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)
    train_corpus = train_data['text'].tolist()
    test_corpus = test_data['text'].tolist()
    return train_data,test_data,train_corpus,test_corpus

In [6]:
train_data,test_data,train_corpus,test_corpus=load_dataset(train_path, test_path)

print(len(train_data))
print(len(test_data))

13879
100


In [40]:
def display_dataset_summaries(train_data,test_data):
    print("train Data:")
    print(train_data.head())
    print(f"Shape: {train_data.shape}")
    print("\ntest Data:")
    print(test_data.head())
    print(f"Shape:{test_data.shape}")

In [41]:
#display_dataset_summaries(train_data, test_data)

In [42]:
#preprocessing
def preprocess_text(text):
    if isinstance(text, list):  
        text = ' '.join(text)
    text =re.sub(r'[^\x00-\x7F]+', ' ', text)       #remove non-ASCII
    text =re.sub(r'[^\w\s]', ' ', text)             #remove punctuation
    tokens = word_tokenize(text.lower())            #tokenize and lowercase
    stop_words = set(stopwords.words('english'))    #stopwords removal
    lemmatizer = WordNetLemmatizer()                #lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    tokens = [word for word in tokens if word not in stop_words]
    return tokens

In [43]:
#spliting validation set from training set
def split_validation_set(train_corpus, validation_size=100):
    random.shuffle(train_corpus)
    validation_corpus =train_corpus[:validation_size]
    train_corpus=train_corpus[validation_size:]
    return train_corpus, validation_corpus

In [44]:
#display validation set
train_corpus,validation_corpus=split_validation_set(train_corpus)
print("Validation Corpus Sample:")
print(validation_corpus[:5])

Validation Corpus Sample:
['Montenegro (; , , , ; ) is a country in Southeastern Europe. It is located on the Adriatic Sea and is a part of the Balkans, sharing borders with Serbia to the northeast, Bosnia and Herzegovina to the north and west, Kosovo to the east, Albania to the southeast, the Adriatic Sea and Croatia to the southwest, and maritime boundary with Italy. Podgorica, the capital and largest city, covers 10.4% of Montenegro\'s territory of , and is home to roughly 30% of its total population of 621,000.\n\nDuring the Early Medieval period, three principalities were located on the territory of modern-day Montenegro: Duklja, roughly corresponding to the southern half; Travunia, the west; and Rascia proper, the north. The Principality of Zeta emerged in the 14th and 15th centuries. From the late 14th century to the late 18th century, large parts of southern Montenegro were ruled by the Venetian Republic and incorporated into Venetian Albania. The name Montenegro was first used

In [45]:
#preprocess data
start_time = time.time()
train_corpus=[preprocess_text(text) for text in train_corpus]
test_corpus=[preprocess_text(text) for text in test_corpus]
validation_corpus =[preprocess_text(text) for text in validation_corpus]
preprocessing_time = time.time() - start_time

In [46]:
print(f"Preprocessing Time: {preprocessing_time:.2f} seconds")

Preprocessing Time: 109.54 seconds


B:ESTIMATION USING MLE

In [48]:
#generate n-gram models
def generate_ngrams(tokens, n):
    return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

In [49]:
# #Count n-grams in corpus
# def count_ngrams(corpus, n):
#     ngram_counts = Counter()
#     for tokens in corpus:
#         #tokens = preprocess_text(text)  # Tokenize the text
#         ngrams = generate_ngrams(tokens, n)
#         ngram_counts.update(ngrams)
#     return ngram_counts

In [50]:
# Count n-grams in corpus with 1% filtering
def count_ngrams_with_filter(corpus, n, min_doc_percentage):
    ngram_counts = Counter()
    doc_counts = defaultdict(int)
    num_docs = len(corpus)
    for tokens in corpus:
        ngrams = set(generate_ngrams(tokens, n))  # Using set to count unique n-grams per document
        for ngram in ngrams:
            doc_counts[ngram] += 1
    
    # Filter n-grams that appear in at least min_doc_percentage(1%) of documents
    min_doc_threshold = (min_doc_percentage / 100) * num_docs
    for ngram, doc_count in doc_counts.items():
        if doc_count >= min_doc_threshold:
            ngram_counts[ngram] = doc_count
    return ngram_counts

In [51]:
# Count n-grams with 1% filtering
start_time = time.time()
min_doc_percentage = 1           
unigram_counts = count_ngrams_with_filter(train_corpus, 1, min_doc_percentage)
bigram_counts = count_ngrams_with_filter(train_corpus, 2, min_doc_percentage)
trigram_counts = count_ngrams_with_filter(train_corpus, 3, min_doc_percentage)
ngram_counting_time = time.time() - start_time

In [52]:
print(f"N-gram Counting Time: {ngram_counting_time:.2f} seconds")

N-gram Counting Time: 29.99 seconds


In [53]:
#debugging:Checking sizes of n-gram models
print("Unigram Count:", len(unigram_counts))
print("Bigram Count:", len(bigram_counts))
print("Trigram Count:", len(trigram_counts))

Unigram Count: 9322
Bigram Count: 7739
Trigram Count: 1326


In [54]:
#saveing n-gram models to CSV
def save_ngram_model_to_csv(ngram_counts, file_path):
    df =pd.DataFrame(ngram_counts.items(),columns=['ngram','count'])
    df.to_csv(file_path,index=False)

In [55]:
save_ngram_model_to_csv(unigram_counts,'unigram_model.csv')
save_ngram_model_to_csv(bigram_counts,'bigram_model.csv')
save_ngram_model_to_csv(trigram_counts,'trigram_model.csv')

In [56]:
#probabilities with Laplace smoothing
def compute_probabilities(ngram_counts,lower_order_counts,vocab_size, k=5):
    probabilities = {}
    for ngram, count in ngram_counts.items():
        prefix = ngram[:-1]
        prefix_count = lower_order_counts.get(prefix, 0) if lower_order_counts else sum(ngram_counts.values())
        probabilities[ngram] = (count + k) / (prefix_count + k * vocab_size)
    return probabilities

In [57]:
#vocabulary size
vocab_size = len(set(word for tokens in train_corpus for word in tokens))

In [58]:
# Compute probabilities
unigram_probs = compute_probabilities(unigram_counts, {}, vocab_size, k=1)
bigram_probs = compute_probabilities(bigram_counts, unigram_counts, vocab_size, k=1)
trigram_probs = compute_probabilities(trigram_counts, bigram_counts, vocab_size, k=1)

C:EVALUATING USING PERPLEXITY

In [None]:
# Calculate perplexity
def calculate_perplexity(test_corpus, n, probabilities, vocab_size, k=1):
    total_log_prob = 0
    total_words = 0
    for tokens in test_corpus:
        #tokens = preprocess_text(text)
        ngrams = generate_ngrams(tokens, n)
        for ngram in ngrams:
            prob = probabilities.get(ngram, k / vocab_size)
            total_log_prob += math.log(prob, 2)
        total_words += len(ngrams)
    return 2 ** (-total_log_prob / total_words)

In [60]:
start_time = time.time()
unigram_perplexity=calculate_perplexity(test_corpus, 1, unigram_probs, vocab_size, k=1)
bigram_perplexity= calculate_perplexity(test_corpus, 2, bigram_probs, vocab_size, k=1)
trigram_perplexity=calculate_perplexity(test_corpus, 3, trigram_probs, vocab_size, k=1)
perplexity_calc_time = time.time() - start_time

In [61]:
print(f"Perplexity Calculation Time: {perplexity_calc_time:.2f} seconds")

Perplexity Calculation Time: 0.11 seconds


In [62]:
print(f"Unigram Perplexity:{unigram_perplexity}")
print(f"Bigram Perplexity:{bigram_perplexity}")
print(f"Trigram Perplexity:{trigram_perplexity}")

Unigram Perplexity:6422.015347734478
Bigram Perplexity:129065.73676239813
Trigram Perplexity:314517.2483936922


D:INTERPOLATION MODEL

In [63]:
#interpolated probabilities
def compute_interpolated_prob(unigram_probs,bigram_probs,trigram_probs, ngram,lambdas):
    lambda1, lambda2, lambda3 = lambdas
    unigram_prob =unigram_probs.get((ngram[-1],),0)
    bigram_prob =bigram_probs.get(ngram[-2:],0)
    trigram_prob =trigram_probs.get(ngram, 0)
    return lambda1*unigram_prob+lambda2*bigram_prob+lambda3*trigram_prob

In [64]:
#interpolation perplexity
def calculate_interpolation_perplexity(test_corpus,unigram_probs,bigram_probs,trigram_probs,vocab_size,lambdas,k=1):
    total_log_prob = 0
    total_words = 0
    lambda1, lambda2, lambda3 = lambdas
    for tokens in test_corpus:
        #tokens = preprocess_text(text)
        trigrams = generate_ngrams(tokens, 3)
        for trigram in trigrams:
            unigram =(trigram[-1],)
            bigram =trigram[1:]
            trigram_prob =trigram_probs.get(trigram, k / vocab_size)
            bigram_prob =bigram_probs.get(bigram, k / vocab_size)
            unigram_prob =unigram_probs.get(unigram, k / vocab_size)

            prob = lambda3 *trigram_prob+lambda2*bigram_prob+lambda1*unigram_prob
            total_log_prob += math.log(prob, 2)
        total_words += len(trigrams)
    return 2 ** (-total_log_prob / total_words)

In [65]:
# # Define smoothing parameter and vocab size
# k = 1  # Adjust smoothing parameter as needed

# for tokens in validation_corpus[:5]:  # Test on a few sentences
#     for trigram in generate_ngrams(tokens, 3):
#         unigram = (trigram[-1],)
#         bigram = trigram[1:]
        
#         trigram_prob = trigram_probs.get(trigram, k / vocab_size)
#         bigram_prob = bigram_probs.get(bigram, k / vocab_size)
#         unigram_prob = unigram_probs.get(unigram, k / vocab_size)
#         print(f"Trigram: {trigram}, P(Tri): {trigram_prob}, P(Bi): {bigram_prob}, P(Uni): {unigram_prob}")

In [66]:
#grid search to find the best lambda values
def grid_search_lambdas(test_corpus,unigram_probs ,bigram_probs ,trigram_probs ,vocab_size ,k=1):
    best_perplexity = float('inf')
    best_lambdas = None
    lambda_values = [0.1 * i for i in range(1, 10)] 

    for lambdas in product(lambda_values, repeat=3):
        if sum(lambdas) == 1:                  
            perplexity =calculate_interpolation_perplexity(
                test_corpus,unigram_probs,bigram_probs,trigram_probs,vocab_size,lambdas,k=k
            )
            if perplexity < best_perplexity:
                best_perplexity =perplexity
                best_lambdas =lambdas
    return best_lambdas,best_perplexity
optimal_lambdas, optimal_perplexity =grid_search_lambdas(
    test_corpus,unigram_probs, bigram_probs,trigram_probs,vocab_size,k=1)
print(f"Optimal Lambdas:{optimal_lambdas}")
print(f"Optimal Interpolation Perplexity:{optimal_perplexity}")

Optimal Lambdas:(0.8, 0.1, 0.1)
Optimal Interpolation Perplexity:7222.871744277145
