In [1]:
from typing import List, Tuple, Dict
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter

# Ensure necessary NLTK tokenizer models are available
nltk.download("punkt")

class Tokenizer:
    def __init__(self, tokenize_type: str = "basic", lowercase: bool = False):
        self.lowercase = lowercase
        self.type = tokenize_type
        self.vocab = []  # Empty vocabulary list

    def basicTokenize(self, string: str) -> List[str]:
        # Tokenizes input string by splitting on whitespace
        ### BEGIN SOLUTION
        return string.split(' ')
        ### END SOLUTION

    def nltkTokenize(self, string: str) -> List[str]:
        # Tokenizes input string using NLTK's word tokenizer
        ### BEGIN SOLUTION
        return word_tokenize(string)
        ### END SOLUTION

    def tokenize(self, string: str) -> List[str]:
        # Tokenizes string and updates vocabulary with unique words
        if self.lowercase:
            string = string.lower()
        tokens = self.basicTokenize(string) if self.type == "basic" else self.nltkTokenize(string)
        self.vocab += [w for w in set(tokens) if w not in self.vocab]
        return tokens

    def countTopWords(self, words: List[str], k: int) -> List[Tuple[str, int]]:
        # Returns the top k most common words
        ### END SOLUTION
        return Counter(words).most_common(k)
        ### END SOLUTION

class BiGramLanguageModel:
    def __init__(self, vocab: List[str], smoothing: str = None, smoothing_param: float = None):
        self.vocab = vocab
        self.token_to_idx = {word: i for i, word in enumerate(vocab)}
        self.smoothing = smoothing
        self.smoothing_param = smoothing_param
        self.bi_counts = None
        self.bi_prob = None
        assert smoothing is None or smoothing_param is not None, "Smoothing parameters must be set correctly."

    def computeBigramProb(self):
        # Computes bigram probabilities without smoothing
        self.bi_prob = self.bi_counts.copy()
        for i in range(len(self.bi_prob)):
            cnt = np.sum(self.bi_prob[i])
            if cnt > 0:
                self.bi_prob[i] /= cnt

    def computeBigramProbAddAlpha(self, alpha: float = 0.001):
        # Computes bigram probabilities using add-alpha smoothing
        ### BEGIN SOLUTION
        self.bi_prob = self.bi_counts.copy()
        for i in range(len(self.bi_prob)):
            cnt = np.sum(self.bi_prob[i])
            self.bi_prob[i] =(self.bi_prob[i] + alpha) / (cnt +  len(self.vocab)*alpha) 
        
        
        ### END SOLUTION

    def train(self, corpus: List[str]):
        # Trains the model on the given corpus
        self.bi_counts = np.zeros((len(self.vocab), len(self.vocab)), dtype=float)
        corpus_indices = [self.token_to_idx[w] for w in corpus]
        for i in range(len(corpus_indices) - 1):
            self.bi_counts[corpus_indices[i]][corpus_indices[i + 1]] += 1
        if self.smoothing == "addAlpha":
            self.computeBigramProbAddAlpha(self.smoothing_param)
        else:
            self.computeBigramProb()

    def test(self, corpus: List[str]) -> float:
        # Calculates and returns the perplexity of the model on the given corpus
        logprob = 0.0
        corpus_indices = [self.token_to_idx[w] for w in corpus]
        for i in range(len(corpus_indices) - 1):
            logprob += np.log(self.bi_prob[corpus_indices[i], corpus_indices[i + 1]])
        logprob /= len(corpus_indices) - 1
        return np.exp(-logprob)

def readCorpus(filename: str, tokenizer: Tokenizer) -> List[str]:
    # Reads and tokenizes the corpus from a file
    ### BEGIN SOLUTION
    with open(filename,'r') as f:
        text = f.read()
    tokens = tokenizer.tokenize(text)
    return tokens
    ### END SOLUTION

def runLanguageModel(train_corpus: List[str], val_corpus: List[str], tokenizer: Tokenizer, smoothing_type: str = None, smoothing_param: float = 0.0) -> Dict[str, float]:
    # Trains and tests the language model, returning key metrics
    lm = BiGramLanguageModel(tokenizer.vocab, smoothing=smoothing_type, smoothing_param=smoothing_param)
    lm.train(train_corpus)
    return {"train_ppl": lm.test(train_corpus), "val_ppl": lm.test(val_corpus)}

[nltk_data] Downloading package punkt to C:\Users\TIEN
[nltk_data]     DOAN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Initialize tokenizers with basic and NLTK options, both set to lowercase.
basic_tokenizer = Tokenizer(tokenize_type='basic', lowercase=True)
nltk_tokenizer = Tokenizer(tokenize_type='nltk', lowercase=True)

In [3]:
# Read and tokenize the training and validation corpora using the basic tokenizer.
train_corpus = readCorpus('./data/train.txt', basic_tokenizer)
val_corpus = readCorpus('./data/val.txt', basic_tokenizer)

# Example of using the NLTK tokenizer for comparison (unused in final results).
train_corpus_nltk = readCorpus('./data/train.txt', nltk_tokenizer)
val_corpus_nltk = readCorpus('./data/val.txt', nltk_tokenizer)

In [4]:
# Get top 10 frequent words and counts from train_corpus with basic_tokenizer.
basic_tokenizer.countTopWords(train_corpus, k=10)

[('unk', 61019),
 ('the', 45302),
 ('of', 25379),
 ('and', 18067),
 ('to', 16515),
 ('a', 14371),
 ('in', 14231),
 ('is', 7466),
 ('that', 6484),
 ('for', 6434)]

In [5]:
# Get top 10 frequent words and counts from train_corpus_nltk with nltk_tokenizer.
nltk_tokenizer.countTopWords(train_corpus_nltk, k=10)

[('unk', 61019),
 ('the', 45885),
 ('of', 25427),
 (',', 23570),
 ('and', 18346),
 ('.', 17532),
 ('to', 16606),
 ('a', 14721),
 ('in', 14358),
 ('is', 7702)]

In [6]:
# Run the language model with the basic tokenizer and without smoothing.
runLanguageModel(train_corpus, val_corpus,
                 tokenizer=basic_tokenizer)

  logprob += np.log(self.bi_prob[corpus_indices[i], corpus_indices[i + 1]])


{'train_ppl': 69.87841585436585, 'val_ppl': inf}

In [7]:
# Run the language model with the nltk tokenizer and without smoothing.
runLanguageModel(train_corpus_nltk, val_corpus_nltk,
                 tokenizer=nltk_tokenizer)

  logprob += np.log(self.bi_prob[corpus_indices[i], corpus_indices[i + 1]])


{'train_ppl': 69.68967282823243, 'val_ppl': inf}

In [8]:
# Run the language model with the basic tokenizer and with smoothing.
runLanguageModel(train_corpus, val_corpus,
                 tokenizer=basic_tokenizer, smoothing_type='addAlpha', smoothing_param=10e-5)

  logprob += np.log(self.bi_prob[corpus_indices[i], corpus_indices[i + 1]])


{'train_ppl': 74.15014818058779, 'val_ppl': inf}

In [9]:
# Run the language model with the nltk tokenizer and with smoothing.
runLanguageModel(train_corpus_nltk, val_corpus_nltk,
                 tokenizer=nltk_tokenizer, smoothing_type='addAlpha', smoothing_param=10e-5)

  logprob += np.log(self.bi_prob[corpus_indices[i], corpus_indices[i + 1]])


{'train_ppl': 71.60377294949103, 'val_ppl': inf}