# NLP Preprocessing and Modeling Pipeline
This notebook consolidates the preprocessing pipeline and three language models: Bigram, Trigram, and Neural LSTM.


## 1. Imports and Setup


In [1]:
import re
import html
import string
from typing import List, Optional, Union
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import math
import random
import time
import pandas as pd
from collections import defaultdict, Counter
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
import numpy as np

# Ensure necessary NLTK data is available
try:
    nltk.data.find('tokenizers/punkt')
    nltk.data.find('corpora/stopwords')
    nltk.data.find('corpora/wordnet')
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    print("Downloading necessary NLTK resources...")
    nltk.download('punkt')
    nltk.download('stopwords')
    nltk.download('wordnet')
    nltk.download('omw-1.4')
    nltk.download('punkt_tab')



Downloading necessary NLTK resources...


[nltk_data] Error loading punkt: <urlopen error [WinError 10054] An
[nltk_data]     existing connection was forcibly closed by the remote
[nltk_data]     host>
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vogle\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vogle\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\vogle\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\vogle\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## 2. Text Preprocessor
The `TextPreprocessor` class handles cleaning and tokenization.


In [2]:
class TextPreprocessor:
    """
    A robust and professional preprocessing pipeline for NLP tasks.
    Designed to handle IMDB movie reviews for Classical, Neural, and Transformer models.
    """

    def __init__(self, 
                 remove_html: bool = True,
                 lowercase: bool = True,
                 remove_punctuation: bool = False,
                 remove_stopwords: bool = False,
                 lemmatize: bool = False,
                 expand_contractions: bool = True):
        """
        Initialize the pipeline with specific configuration flags.
        
        Args:
            remove_html (bool): Strip HTML tags (e.g., <br />). Default True.
            lowercase (bool): Convert text to lowercase. Default True.
            remove_punctuation (bool): Remove punctuation characters.
            remove_stopwords (bool): Remove standard English stopwords.
            lemmatize (bool): Apply WordNet lemmatization.
            expand_contractions (bool): Expand "isn't" to "is not".
        """
        self.remove_html = remove_html
        self.lowercase = lowercase
        self.remove_punctuation = remove_punctuation
        self.remove_stopwords = remove_stopwords
        self.lemmatize = lemmatize
        self.expand_contractions = expand_contractions

        # Pre-load resources to optimize runtime
        self.stop_words = set(stopwords.words('english'))
        self.stop_words.remove("not")
        self.lemmatizer = WordNetLemmatizer()
        
        # Simple contraction map for expansion
        self.contractions_dict = {
            "isn't": "is not", "aren't": "are not", "wasn't": "was not", "weren't": "were not",
            "haven't": "have not", "hasn't": "has not", "hadn't": "had not", "won't": "will not",
            "wouldn't": "would not", "don't": "do not", "doesn't": "does not", "didn't": "did not",
            "can't": "cannot", "couldn't": "could not", "shouldn't": "should not", "mightn't": "might not",
            "mustn't": "must not", "i'm": "i am", "you're": "you are", "he's": "he is", "she's": "she is",
            "it's": "it is", "we're": "we are", "they're": "they are", "i've": "i have", "you've": "you have",
            "we've": "we have", "they've": "they have", "i'll": "i will", "you'll": "you will",
            "he'll": "he will", "she'll": "she will", "we'll": "we will", "they'll": "they will"
        }
        self.contractions_re = re.compile('(%s)' % '|'.join(self.contractions_dict.keys()))

    def _clean_html(self, text: str) -> str:
        """Removes HTML tags and unescapes HTML entities."""
        text = html.unescape(text)
        # Regex for HTML tags
        clean = re.compile('<.*?>')
        return re.sub(clean, ' ', text)

    def _expand_contractions(self, text: str) -> str:
        """Expands common English contractions."""
        def replace(match):
            return self.contractions_dict[match.group(0)]
        return self.contractions_re.sub(replace, text)

    def _remove_punct(self, text: str) -> str:
        """
        Removes punctuation by replacing it with spaces.
        This prevents 'word,word' from becoming 'wordword'.
        """
        # Replace punctuation with a space
        return re.sub(f'[{re.escape(string.punctuation)}]', ' ', text)

    def process_text(self, text: str) -> Union[str, List[str]]:
        """
        Main execution method. Applies enabled steps in the logical order.
        
        Returns:
            str: If the final output is a joined string.
            List[str]: If the processing flow ends in tokenization without re-joining.
        """
        if not isinstance(text, str) or not text:
            return ""

        # 1. Cleaning
        if self.remove_html:
            text = self._clean_html(text)
        
        # 2. Lowercasing
        if self.lowercase:
            text = text.lower()
            
        # 3. Expansion (must be after lowercasing for simple dict matching)
        if self.expand_contractions:
            text = self._expand_contractions(text)

        # 4. Punctuation Removal
        if self.remove_punctuation:
            text = self._remove_punct(text)

        # 5. Tokenization
        # We always tokenize to perform word-level operations (stopword/lemma)
        tokens = word_tokenize(text)

        # 6. Stopword Removal
        if self.remove_stopwords:
            tokens = [w for w in tokens if w not in self.stop_words]

        # 7. Lemmatization
        if self.lemmatize:
            tokens = [self.lemmatizer.lemmatize(w) for w in tokens]

        # Return list of tokens or join back to string depending on downstream need.
        # For this pipeline, we generally return the list of tokens for Classical models,
        # but for compatibility, we will join them back into a clean string 
        # because Tokenizers for Transformers/LSTMs often expect string input 
        # and do their own internal splitting.
        
        return " ".join(tokens)

# --- Usage Example / Demonstration ---


## 3. Data Loading
Loading the IMDB dataset.


In [3]:

def load_data(path='IMDB Dataset.csv'):
    try:
        df = pd.read_csv(path)
        print("Dataset loaded successfully.")
        return df['review'].tolist()
    except FileNotFoundError:
        print(f"{path} not found. Trying small dataset...")
        try:
             df = pd.read_csv('IMDB Dataset_small.csv')
             print("IMDB Dataset_small.csv loaded.")
             return df['review'].tolist()
        except FileNotFoundError:
             print("No dataset found. Using dummy data.")
             return ["The movie was terrible.", "I loved the movie."]

reviews = load_data()
print(f"Loaded {len(reviews)} reviews.")



Dataset loaded successfully.
Loaded 50000 reviews.


## 4. Bigram Language Model


In [12]:
class BigramLanguageModel:
    def __init__(self, alpha=0.01):
        """
        Initialize the Bigram Model.
        
        Args:
            alpha (float): The smoothing parameter for Laplace smoothing. 
                           Default is 0.01.
        """
        self.alpha = alpha
        self.bigram_counts = defaultdict(lambda: defaultdict(int))
        self.unigram_counts = defaultdict(int)
        self.vocab = set()
        self.vocab_size = 0
        self.total_bigrams = 0
        self.total_unigrams = 0
        
        self.lambda1 = 0.3 # Unigram
        self.lambda2 = 0.7 # Bigram
        
    def train(self, corpus):
        """
        Trains the model on a corpus of tokenized sentences.
        Uses the full vocabulary (no <UNK> thresholding).
        """
        print("Training model on full vocabulary...")
        for sentence in corpus:
            # Update vocabulary and unigram counts
            for word in sentence:
                self.vocab.add(word)
                self.unigram_counts[word] += 1
                self.total_unigrams += 1
            
            # Update bigram counts
            for i in range(len(sentence) - 1):
                w_curr = sentence[i]
                w_next = sentence[i+1]
                self.bigram_counts[w_curr][w_next] += 1
                self.total_bigrams += 1
                
        self.vocab_size = len(self.vocab)
        print(f"Training complete. Vocab size: {self.vocab_size}")

    def get_probability(self, prev_word, word):
        """
        Calculates the interpolated probability P(word | prev_word).
        P = L2 * P(word|prev) + L1 * P(word)
        """
        # 1. Bigram Probability
        bigram_count = self.bigram_counts[prev_word][word]
        unigram_count_prev = self.unigram_counts[prev_word]
        
        p_bi_num = bigram_count + self.alpha
        p_bi_den = unigram_count_prev + (self.alpha * self.vocab_size)
        p_bi = p_bi_num / p_bi_den
        
        # 2. Unigram Probability
        unigram_count_word = self.unigram_counts[word]
        p_uni_num = unigram_count_word + self.alpha
        p_uni_den = self.total_unigrams + (self.alpha * self.vocab_size)
        p_uni = p_uni_num / p_uni_den
        
        return (self.lambda2 * p_bi) + (self.lambda1 * p_uni)

    def calculate_perplexity(self, test_corpus):
        """
        Calculates the perplexity of the model on a test corpus.
        """
        log_prob_sum = 0
        N = 0
        
        for sentence in test_corpus:
            for i in range(len(sentence) - 1):
                w_curr = sentence[i]
                w_next = sentence[i+1]
                
                # We do not replace with <UNK>. If a word is unknown,
                # get_probability handles it via smoothing.
                prob = self.get_probability(w_curr, w_next)
                
                log_prob_sum += math.log2(prob)
                N += 1
        
        if N == 0: return float('inf')
        
        avg_log_prob = -log_prob_sum / N
        perplexity = 2 ** avg_log_prob
        return perplexity

    def generate_sentence(self, max_length=20):
        """
        Generates a random sentence.
        """
        current_word = "<s>"
        sentence = [current_word]
        
        for _ in range(max_length):
            if current_word == "</s>":
                break
                
            # If current_word was never seen in training (e.g. from a user prompt),
            # unigram_count is 0. We fallback to uniform distribution or break.
            # Here we sample from the whole vocab if unknown, or just observed followers if known.
            
            possible_next = self.bigram_counts[current_word]
            
            if not possible_next:
                # Dead end or unknown word. 
                # Ideally: Sample uniformly from V (or weighted by unigrams).
                # For efficiency/simplicity here: break or pick random.
                break 

            candidates = list(possible_next.keys())
            counts = list(possible_next.values())
            
            next_word = random.choices(candidates, weights=counts, k=1)[0]
            
            sentence.append(next_word)
            current_word = next_word
            
        return " ".join(sentence)

    def autocomplete(self, prompt, preprocessor, max_length=20):
        """
        Completes a given text prompt.
        """
        cleaned_prompt = preprocessor.process_text(prompt)
        tokens = cleaned_prompt.split()
        
        if not tokens:
            current_word = "<s>"
        else:
            current_word = tokens[-1]
            
        # Warning: If current_word is not in self.vocab, generation will stop immediately
        # because bigram_counts[current_word] will be empty.
        
        generated_tokens = []
        for _ in range(max_length):
            if current_word == "</s>":
                break
            
            possible_next = self.bigram_counts[current_word]
            
            if not possible_next:
                break
                
            candidates = list(possible_next.keys())
            counts = list(possible_next.values())
            
            next_word = random.choices(candidates, weights=counts, k=1)[0]
            
            generated_tokens.append(next_word)
            current_word = next_word
            
        return prompt + " " + " ".join(generated_tokens)

    def autocomplete(self, prompt, preprocessor, max_length=20):
        """
        Completes a given text prompt using the trained model.
        """
        # Preprocess the prompt to get the last token
        cleaned_prompt = preprocessor.process_text(prompt)
        tokens = cleaned_prompt.split()
        
        if not tokens:
            current_word = "<s>"
        else:
            current_word = tokens[-1]
            
        # Handle OOV for the seed word
        if current_word not in self.vocab:
            # Optionally print a warning or fallback
            current_word = "<UNK>"
            
        # Generate continuation
        generated_tokens = []
        for _ in range(max_length):
            if current_word == "</s>":
                break
            
            possible_next = self.bigram_counts[current_word]
            
            if not possible_next:
                # If we hit a dead end (should be rare with smoothing context, but possible if UNK), replace
                current_word = "<UNK>"
                possible_next = self.bigram_counts[current_word]

            if not possible_next:
                break
                
            candidates = list(possible_next.keys())
            counts = list(possible_next.values())
            
            next_word = random.choices(candidates, weights=counts, k=1)[0]
            
            generated_tokens.append(next_word)
            current_word = next_word
            
        return prompt + " " + " ".join(generated_tokens)

# --- Example Usage with IMDB Data ---

def dummy_preprocessor(text):
    """
    Placeholder for your existing pipeline.
    Ensures <s> and </s> are added and text is tokenized.
    """
    # Simple tokenization for demonstration
    tokens = text.lower().strip().split()
    return ['<s>'] + tokens + ['</s>']

def main():
    # 1. Load Data
    try:
        # Assuming the CSV is in the same directory
        # Using the column names from your screenshot: 'review', 'sentiment'
        df = pd.read_csv('IMDB Dataset.csv')
        print("Dataset loaded successfully.")
        
        # Taking a subset for demonstration speed
        reviews = df['review'].tolist() 
        
    except FileNotFoundError:
        print("IMDB Dataset.csv not found. Using dummy data.")
        reviews = ["The movie was terrible.", "I loved the movie."]

    # 2. Apply Preprocessing (Less aggressive for Language Modeling)
    preprocessor = TextPreprocessor(
        remove_html=True,
        lowercase=True,
        remove_punctuation=False, # Keep punctuation for structure
        remove_stopwords=False,   # Keep stopwords for grammar
        lemmatize=True,          # Keep original word forms
        expand_contractions=True)
        
    tokenized_corpus = []
    print("Preprocessing texts...")
    for r in reviews:
        # Preprocessor returns a single string of space-separated tokens
        cleaned_text = preprocessor.process_text(r)
        # Split into list of tokens
        tokens = cleaned_text.split()
        # Add sentence boundaries
        tokens = ['<s>'] + tokens + ['</s>']
        tokenized_corpus.append(tokens)

    # 3. Split Train/Test
    split_idx = int(len(tokenized_corpus) * 0.8)
    train_data = tokenized_corpus[:split_idx]
    test_data = tokenized_corpus[split_idx:]

    # 4. Initialize and Train
    model = BigramLanguageModel(alpha=0.01) # Reduced alpha
    
    start_time = time.time()
    model.train(train_data)
    end_time = time.time()
    print(f"Time to build model: {end_time - start_time:.4f} seconds")

    # 5. Generate Text
    print("\n--- Generated Reviews ---")
    for _ in range(3):
        print(f"- {model.generate_sentence()}")

    # 6. Evaluate Perplexity
    print("\n--- Evaluation ---")
    pp = model.calculate_perplexity(test_data)
    print(f"Model Perplexity on Test Set: {pp:.2f}")
    
    # 7. Autocomplete Demo
    print("\n--- Autocomplete Demo ---")
    prompts = [
        "The movie was",
        "I really liked",
        "The acting is",
        "This film is a complete"
    ]
    for p in prompts:
        completed = model.autocomplete(p, preprocessor)
        print(f"Prompt: '{p}'\nResult: {completed}\n")


### Train and Evaluate Bigram Model


In [13]:

# Configuration
preprocessor_classic = TextPreprocessor(
    remove_html=True,
    lowercase=True,
    remove_punctuation=True,
    remove_stopwords=False,
    lemmatize=True,
    expand_contractions=True
)

tokenized_corpus_bi = []
print("Preprocessing for Bigram Model...")
# Using a subset for speed in notebook if needed, or full
for r in reviews[:5000]: # Limit for demo speed
    cleaned_text = preprocessor_classic.process_text(r)
    tokens = cleaned_text.split()
    tokens = ['<s>'] + tokens + ['</s>']
    tokenized_corpus_bi.append(tokens)

# Split
split_idx = int(len(tokenized_corpus_bi) * 0.8)
train_bi = tokenized_corpus_bi[:split_idx]
test_bi = tokenized_corpus_bi[split_idx:]

# Train
model_bi = BigramLanguageModel(alpha=0.01)
start_time = time.time()
model_bi.train(train_bi)
print(f"Time to train Bigram: {time.time() - start_time:.4f} seconds")

# Evaluate
pp_bi = model_bi.calculate_perplexity(test_bi)
print(f"Bigram Perplexity: {pp_bi:.2f}")

# Generate
print("- Generated: " + model_bi.generate_sentence())



Preprocessing for Bigram Model...
Training model on full vocabulary...
Training complete. Vocab size: 31602
Time to train Bigram: 0.4484 seconds
Bigram Perplexity: 471.94
- Generated: <s> this is an independent film that these two song written here and humanist he ha touched in mideval france contest


## 5. Trigram Language Model


In [14]:
class TrigramLanguageModel:
    def __init__(self, alpha=0.01):
        """
        Initialize the Trigram Model.
        
        Args:
            alpha (float): The smoothing parameter for Laplace smoothing. 
                           Default is 0.01.
        """
        self.alpha = alpha
        # trigram_counts: count of (w1, w2, w3) aka given w1, w2, what is w3?
        # Structure: dict[(w1, w2)] -> dict[w3] -> count
        self.trigram_counts = defaultdict(lambda: defaultdict(int))
        
        # bigram_counts: count of (w1, w2) as a history.
        # Structure: dict[(w1, w2)] -> count
        self.bigram_counts = defaultdict(int)
        self.unigram_counts = defaultdict(int)
        
        self.vocab = set()
        self.vocab_size = 0
        self.total_trigrams = 0
        self.total_unigrams = 0
        
        # Interpolation weights
        self.lambda1 = 0.1 # Unigram
        self.lambda2 = 0.3 # Bigram
        self.lambda3 = 0.6 # Trigram
        
    def train(self, corpus):
        """
        Trains the model on a corpus of tokenized sentences.
        Uses the full vocabulary (no <UNK> thresholding).
        """
        print("Training model on full vocabulary...")
        for sentence in corpus:
            # Update vocabulary and unigram counts
            for word in sentence:
                self.vocab.add(word)
                self.unigram_counts[word] += 1
                self.total_unigrams += 1
            
            # Update bigram counts (for backoff)
            for i in range(len(sentence) - 1):
                self.bigram_counts[(sentence[i], sentence[i+1])] += 1

            # Update trigram counts
            # Sentence is expected to be padded like ['<s>', '<s>', 'w1', ..., 'wn', '</s>']
            for i in range(len(sentence) - 2):
                w_1 = sentence[i]
                w_2 = sentence[i+1]
                w_3 = sentence[i+2]
                
                self.trigram_counts[(w_1, w_2)][w_3] += 1
                self.total_trigrams += 1
                
        self.vocab_size = len(self.vocab)
        print(f"Training complete. Vocab size: {self.vocab_size}")

    def get_probability(self, w_1, w_2, w_3):
        """
        Calculates the interpolated probability P(w_3 | w_1, w_2).
        P = L3 * P(w3|w1,w2) + L2 * P(w3|w2) + L1 * P(w3)
        """
        # 1. Trigram Probability
        trigram_count = self.trigram_counts[(w_1, w_2)][w_3]
        bigram_context_count = self.bigram_counts[(w_1, w_2)]
        
        p_tri_num = trigram_count + self.alpha
        p_tri_den = bigram_context_count + (self.alpha * self.vocab_size)
        p_tri = p_tri_num / p_tri_den
        
        # 2. Bigram Probability (Backoff)
        bigram_count = self.bigram_counts[(w_2, w_3)]
        unigram_context_count = self.unigram_counts[w_2]
        
        p_bi_num = bigram_count + self.alpha
        p_bi_den = unigram_context_count + (self.alpha * self.vocab_size)
        p_bi = p_bi_num / p_bi_den
        
        # 3. Unigram Probability
        unigram_count = self.unigram_counts[w_3]
        p_uni_num = unigram_count + self.alpha
        p_uni_den = self.total_unigrams + (self.alpha * self.vocab_size)
        p_uni = p_uni_num / p_uni_den
        
        return (self.lambda3 * p_tri) + (self.lambda2 * p_bi) + (self.lambda1 * p_uni)

    def calculate_perplexity(self, test_corpus):
        """
        Calculates the perplexity of the model on a test corpus.
        """
        log_prob_sum = 0
        N = 0
        
        for sentence in test_corpus:
            for i in range(len(sentence) - 2):
                w_1 = sentence[i]
                w_2 = sentence[i+1]
                w_3 = sentence[i+2]
                
                prob = self.get_probability(w_1, w_2, w_3)
                
                log_prob_sum += math.log2(prob)
                N += 1
        
        if N == 0: return float('inf')
        
        avg_log_prob = -log_prob_sum / N
        perplexity = 2 ** avg_log_prob
        return perplexity

    def generate_sentence(self, max_length=20):
        """
        Generates a random sentence.
        """
        # Start with two padding tokens
        current_w1 = "<s>"
        current_w2 = "<s>"
        sentence = [current_w1, current_w2]
        
        for _ in range(max_length):
            # If we generated the end token, stop
            if current_w2 == "</s>":
                break
                
            possible_next = self.trigram_counts[(current_w1, current_w2)]
            
            if not possible_next:
                # If unknown history, we can't progress. 
                break 

            candidates = list(possible_next.keys())
            counts = list(possible_next.values())
            
            next_word = random.choices(candidates, weights=counts, k=1)[0]
            
            sentence.append(next_word)
            current_w1 = current_w2
            current_w2 = next_word
            
        # Return joined sentence, removing start tokens
        # Typically we don't show <s> <s>
        # The list has ['<s>', '<s>', 'word1', ... '</s>' maybe]
        # We can strip the first two <s>
        return " ".join(sentence[2:])

    def autocomplete(self, prompt, preprocessor, max_length=20):
        """
        Completes a given text prompt using the trained model.
        """
        cleaned_prompt = preprocessor.process_text(prompt)
        tokens = cleaned_prompt.split()
        
        # Determine context words (need 2)
        if len(tokens) >= 2:
            current_w1 = tokens[-2]
            current_w2 = tokens[-1]
        elif len(tokens) == 1:
            current_w1 = "<s>"
            current_w2 = tokens[-1]
        else:
            current_w1 = "<s>"
            current_w2 = "<s>"
            
        # Handle OOV - simplistic approach, similar to bigram fallbacks could be added, 
        # but here we rely on smoothing or break if empty.
        
        generated_tokens = []
        for _ in range(max_length):
            if current_w2 == "</s>":
                break
            
            possible_next = self.trigram_counts[(current_w1, current_w2)]
            
            if not possible_next:
                # If we dead end, we could maybe try fallback to bigram?
                # But for strict trigram implementation request:
                break
                
            candidates = list(possible_next.keys())
            counts = list(possible_next.values())
            
            next_word = random.choices(candidates, weights=counts, k=1)[0]
            
            generated_tokens.append(next_word)
            current_w1 = current_w2
            current_w2 = next_word
            
        return prompt + " " + " ".join(generated_tokens)


# --- Example Usage with IMDB Data ---

def main():
    # 1. Load Data
    try:
        df = pd.read_csv('IMDB Dataset.csv')
        print("Dataset loaded successfully.")
        reviews = df['review'].tolist() 
    except FileNotFoundError:
        print("IMDB Dataset.csv not found. Using dummy data or trying small.")
        try:
             df = pd.read_csv('IMDB Dataset_small.csv')
             print("IMDB Dataset_small.csv loaded.")
             reviews = df['review'].tolist()
        except FileNotFoundError:
             reviews = ["The movie was terrible.", "I loved the movie."]

    # 2. Apply Preprocessing
    preprocessor = TextPreprocessor(
        remove_html=True,
        lowercase=True,
        remove_punctuation=False,
        remove_stopwords=False,
        lemmatize=False,
        expand_contractions=True)
        
    tokenized_corpus = []
    print("Preprocessing texts...")
    for r in reviews:
        cleaned_text = preprocessor.process_text(r)
        tokens = cleaned_text.split()
        # Trigram needs two start tokens to have context for the first real word
        tokens = ['<s>', '<s>'] + tokens + ['</s>']
        tokenized_corpus.append(tokens)

    # 3. Split Train/Test
    split_idx = int(len(tokenized_corpus) * 0.8)
    train_data = tokenized_corpus[:split_idx]
    test_data = tokenized_corpus[split_idx:]

    # 4. Initialize and Train
    model = TrigramLanguageModel(alpha=0.01)
    
    start_time = time.time()
    model.train(train_data)
    end_time = time.time()
    print(f"Time to build model: {end_time - start_time:.4f} seconds")

    # 5. Generate Text
    print("\n--- Generated Reviews ---")
    for _ in range(3):
        # We might generate '</s>' at the end, which generate_sentence returns.
        print(f"- {model.generate_sentence()}")

    # 6. Evaluate Perplexity
    print("\n--- Evaluation ---")
    pp = model.calculate_perplexity(test_data)
    print(f"Model Perplexity on Test Set: {pp:.2f}")
    
    # 7. Autocomplete Demo
    print("\n--- Autocomplete Demo ---")
    prompts = [
        "The movie was",
        "I really liked",
        "The acting is",
        "This film is a complete"
    ]
    for p in prompts:
        completed = model.autocomplete(p, preprocessor)
        print(f"Prompt: '{p}'\nResult: {completed}\n")


### Train and Evaluate Trigram Model


In [15]:

# Reuse preprocessor but Trigram needs different start tokens usually? 
# The script logic: "tokens = ['<s>', '<s>'] + tokens + ['</s>']"

tokenized_corpus_tri = []
print("Preprocessing for Trigram Model...")
for r in reviews[:5000]: 
    cleaned_text = preprocessor_classic.process_text(r)
    tokens = cleaned_text.split()
    tokens = ['<s>', '<s>'] + tokens + ['</s>']
    tokenized_corpus_tri.append(tokens)

# Split
split_idx = int(len(tokenized_corpus_tri) * 0.8)
train_tri = tokenized_corpus_tri[:split_idx]
test_tri = tokenized_corpus_tri[split_idx:]

# Train
model_tri = TrigramLanguageModel(alpha=0.01)
start_time = time.time()
model_tri.train(train_tri)
print(f"Time to train Trigram: {time.time() - start_time:.4f} seconds")

# Evaluate
pp_tri = model_tri.calculate_perplexity(test_tri)
print(f"Trigram Perplexity: {pp_tri:.2f}")

# Generate
print("- Generated: " + model_tri.generate_sentence())



Preprocessing for Trigram Model...
Training model on full vocabulary...
Training complete. Vocab size: 31602
Time to train Trigram: 1.6140 seconds
Trigram Perplexity: 670.17
- Generated: i think this should forever be a good a usual loved omar epps ice cube and jennifer 8 showcased an


## 6. Neural Language Model (LSTM)


### Configuration


In [8]:

EMBED_DIM = 64
HIDDEN_DIM = 512
BATCH_SIZE = 64
LEARNING_RATE = 0.005
NUM_EPOCHS = 3 # Reduced for notebook execution speed
MAX_SEQ_LEN = 100



In [9]:
class Vocabulary:
    def __init__(self, token_to_idx=None):
        if token_to_idx:
            self.token_to_idx = token_to_idx
        else:
            self.token_to_idx = {"<PAD>": 0, "<UNK>": 1, "<s>": 2, "</s>": 3}
        self.idx_to_token = {v: k for k, v in self.token_to_idx.items()}
        
    def build_vocab(self, sentences, min_freq=2):
        print("Building vocabulary...")
        all_tokens = [token for sent in sentences for token in sent]
        counts = Counter(all_tokens)
        
        for token, count in counts.items():
            if count >= min_freq and token not in self.token_to_idx:
                self.token_to_idx[token] = len(self.token_to_idx)
                
        self.idx_to_token = {v: k for k, v in self.token_to_idx.items()}
        print(f"Vocabulary size: {len(self.token_to_idx)}")
        
    def __len__(self):
        return len(self.token_to_idx)
    
    def stoi(self, token):
        return self.token_to_idx.get(token, self.token_to_idx["<UNK>"])
        
    def itos(self, idx):
        return self.idx_to_token.get(idx, "<UNK>")

class IMDBDataset(Dataset):
    def __init__(self, sentences, vocab):
        self.sentences = sentences
        self.vocab = vocab
        
    def __len__(self):
        return len(self.sentences)
        
    def __getitem__(self, idx):
        tokenized_sent = self.sentences[idx]
        # Numericalize
        indexed = [self.vocab.stoi(t) for t in tokenized_sent]
        return torch.tensor(indexed, dtype=torch.long)

def collate_fn(batch):
    """
    Custom collate function to handle variable length sentences via padding.
    """
    # batch is a list of tensors
    # Sort by length (descending) for pack_padded_sequence
    batch.sort(key=lambda x: len(x), reverse=True)
    
    # Separate source and target
    # Source: <s> w1 w2 ... wn
    # Target: w1 w2 ... wn </s>
    # Actually, our sentences in 'sentences' list usually have <s> and </s> already.
    # So we just take :-1 as input and 1: as target.
    
    inputs = [item[:-1] for item in batch]
    targets = [item[1:] for item in batch]
    
    lengths = torch.tensor([len(x) for x in inputs], dtype=torch.long)
    
    # Pad sequences
    # padding_value=0 is <PAD>
    inputs_padded = pad_sequence(inputs, batch_first=True, padding_value=0)
    targets_padded = pad_sequence(targets, batch_first=True, padding_value=0)
    
    return inputs_padded, targets_padded, lengths

class NeuralLM(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super(NeuralLM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)
        
    def forward(self, x, lengths=None, hidden=None):
        # x: (batch, seq_len)
        embed = self.embedding(x) # (batch, seq_len, embed_dim)
        
        if lengths is not None:
            # Pack
            packed_embed = pack_padded_sequence(embed, lengths.cpu(), batch_first=True, enforce_sorted=True)
            packed_out, hidden = self.lstm(packed_embed, hidden)
            # Unpack
            output, _ = pad_packed_sequence(packed_out, batch_first=True)
        else:
            # No packing (e.g. inference)
            output, hidden = self.lstm(embed, hidden)
            
        # output: (batch, seq_len, hidden_dim) (padded where needed)
        
        logits = self.fc(output) # (batch, seq_len, vocab_size)
        return logits, hidden

def generate_text(model, vocab, start_prompt="The movie", max_len=20, device='cpu', temperature=1.0):
    model.eval()
    preprocessor = TextPreprocessor(lowercase=True)
    tokens = preprocessor.process_text(start_prompt).split()
    
    current_idx = [vocab.stoi(t) for t in tokens]
    # Add start token if not present logic? 
    # The model trained on <s>... so prompt should ideally start with something logical.
    # If we feed "The movie", it's mid-sentence-ish.
    
    input_seq = torch.tensor(current_idx, dtype=torch.long).unsqueeze(0).to(device) # (1, seq_len)
    
    generated = list(tokens)
    
    hidden = None
    
    with torch.no_grad():
        for _ in range(max_len):
            logits, hidden = model(input_seq, hidden=hidden)
            
            # Get last time step
            last_logits = logits[0, -1, :]
            
            # Apply temperature
            if temperature != 1.0:
                last_logits = last_logits / temperature
                
            probs = torch.softmax(last_logits, dim=0)
            
            # Sample
            next_token_idx = torch.multinomial(probs, 1).item()
            next_token = vocab.itos(next_token_idx)
            
            if next_token == "</s>":
                break
                
            generated.append(next_token)
            
            # Next input is the single token we just generated (feeding back one by one)
            # Or we could feed the whole sequence, but feeding 1 is efficient IF we keep hidden state.
            input_seq = torch.tensor([[next_token_idx]], dtype=torch.long).to(device)
            
    return " ".join(generated)


### Train Neural Model


In [11]:

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Preprocess for Neural
preprocessor_neural = TextPreprocessor(
    remove_html=True,
    lowercase=True,
    expand_contractions=True,
    remove_punctuation=False
)

print("Preprocessing for Neural Model...")
tokenized_sentences_neural = []
for r in reviews[:2000]: # Smaller subset for Neural training in notebook
    txt = preprocessor_neural.process_text(r)
    toks = txt.split()
    toks = ['<s>'] + toks[:MAX_SEQ_LEN] + ['</s>']
    tokenized_sentences_neural.append(toks)

# Vocab
vocab = Vocabulary()
vocab.build_vocab(tokenized_sentences_neural, min_freq=2)

# Dataset
dataset = IMDBDataset(tokenized_sentences_neural, vocab)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

# Model
model_neural = NeuralLM(len(vocab), EMBED_DIM, HIDDEN_DIM).to(device)
optimizer = optim.Adam(model_neural.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index=0)

# Loop
model_neural.train()
for epoch in range(NUM_EPOCHS):
    total_loss = 0
    for inputs, targets, lengths in dataloader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        logits, _ = model_neural(inputs, lengths)
        loss = criterion(logits.view(-1, len(vocab)), targets.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Loss: {total_loss/len(dataloader):.4f}")

# Generate
print("Generated via Neural:")
print(generate_text(model_neural, vocab, "The movie was", device=device))



Using device: cuda
Preprocessing for Neural Model...
Building vocabulary...
Vocabulary size: 7619
Epoch 1/3, Loss: 6.4789
Epoch 2/3, Loss: 5.6727
Epoch 3/3, Loss: 5.2935
Generated via Neural:
the movie was meant to death .
