<a href="https://colab.research.google.com/github/ShovalBenjer/Natural_Language_Proccessing_NLP_Projects/blob/main/LSTM_NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Install dependencies as needed:
!pip install kagglehub[pandas-datasets] torch torchvision torchaudio plotnine tqdm scikit-learn
import kagglehub
from kagglehub import KaggleDatasetAdapter

import pandas as pd
import numpy as np
import string
import os
import re

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence

from tqdm.auto import tqdm # For progress bars

# For plotting
from plotnine import ggplot, aes, geom_line, labs, theme_minimal

# For reproducibility
import random
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [3]:
# --- 1. Load Data ---
print("Loading dataset...")
# Let's pick one of the comment files. If file_path is empty, it loads CommentsApril2017.csv by default.
# For a more robust solution, you might want to load and concatenate multiple files.
# For this example, we'll use the default or specify one for clarity.
# file_path = "CommentsApril2017.csv" # Example specific file
file_path = "" # Let KaggleHub pick the default for simplicity

try:
    df = kagglehub.load_dataset(
      KaggleDatasetAdapter.PANDAS,
      "aashita/nyt-comments",
      file_path=file_path,
    )
    print("Dataset loaded successfully.")
    print("First 5 records:", df.head())
    # Select the comment text column
    # Based on typical Kaggle datasets, it might be 'commentBody' or similar
    # Let's inspect columns if 'commentBody' isn't present
    if 'commentBody' in df.columns:
        corpus_raw = df['commentBody'].dropna().astype(str).tolist()
    elif 'commentText' in df.columns:
        corpus_raw = df['commentText'].dropna().astype(str).tolist()
    else:
        # Fallback: try to find a text-like column
        text_cols = [col for col in df.columns if df[col].dtype == 'object']
        if not text_cols:
            raise ValueError("No suitable text column found in the dataset.")
        print(f"Warning: 'commentBody' or 'commentText' not found. Using first object column: {text_cols[0]}")
        corpus_raw = df[text_cols[0]].dropna().astype(str).tolist()

except Exception as e:
    print(f"Error loading dataset: {e}")
    print("Using dummy data for demonstration purposes.")
    corpus_raw = [
        "This is the first comment about politics.",
        "I agree with the previous sentiment.",
        "Another comment regarding the current events.",
        "Let's talk about technology and AI.",
        "The future of AI is fascinating and scary."
    ]

# For faster demonstration, let's use a smaller subset
MAX_SAMPLES = 5000  # Adjust as needed for your hardware
if len(corpus_raw) > MAX_SAMPLES:
    print(f"Using a subset of {MAX_SAMPLES} comments for faster processing.")
    corpus_raw = random.sample(corpus_raw, MAX_SAMPLES)
else:
    print(f"Using all {len(corpus_raw)} available comments.")

Loading dataset...
Error loading dataset: load_dataset() got an unexpected keyword argument 'file_path'
Using dummy data for demonstration purposes.
Using all 5 available comments.


In [4]:
# --- 2. Preprocess Data ---
print("\nPreprocessing data...")

def clean_text(txt):
    txt = txt.lower()
    # Remove URLs
    txt = re.sub(r'http\S+|www\S+|https\S+', '', txt, flags=re.MULTILINE)
    # Remove user @ references and #hashtags
    txt = re.sub(r'\@\w+|\#','', txt)
    # Remove punctuation
    txt = "".join(v for v in txt if v not in string.punctuation.replace("'", "")) # Keep apostrophes for now
    txt = txt.replace("’", "'") # Normalize apostrophes
    # Remove numbers (optional, depends on whether you want numbers in your LM)
    # txt = re.sub(r'\d+', '', txt)
    # Remove emojis (a basic way, might need a more comprehensive library for full coverage)
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    txt = emoji_pattern.sub(r'', txt)
    txt = txt.encode("utf8").decode("ascii", 'ignore') # Handle residual non-ascii
    txt = re.sub(r'\s+', ' ', txt).strip() # Remove extra whitespace
    return txt

corpus_cleaned = [clean_text(text) for text in tqdm(corpus_raw, desc="Cleaning text") if clean_text(text)]
corpus_cleaned = [text for text in corpus_cleaned if len(text.split()) > 2] # Keep sentences with at least 3 words

print(f"Sample cleaned comment: {corpus_cleaned[0] if corpus_cleaned else 'N/A'}")

# Tokenization and Vocabulary
class Tokenizer:
    def __init__(self, oov_token="<unk>"):
        self.word_to_idx = {}
        self.idx_to_word = {}
        self.oov_token = oov_token
        self.add_word(oov_token) # Add OOV token

    def fit_on_texts(self, texts):
        word_counts = {}
        for text in texts:
            for word in text.split():
                word_counts[word] = word_counts.get(word, 0) + 1

        # Sort words by frequency for consistent indexing (optional but good practice)
        sorted_words = sorted(word_counts.keys(), key=lambda x: word_counts[x], reverse=True)

        for word in sorted_words:
            self.add_word(word)

    def add_word(self, word):
        if word not in self.word_to_idx:
            idx = len(self.word_to_idx)
            self.word_to_idx[word] = idx
            self.idx_to_word[idx] = word

    def texts_to_sequences(self, texts):
        sequences = []
        oov_idx = self.word_to_idx[self.oov_token]
        for text in texts:
            seq = [self.word_to_idx.get(word, oov_idx) for word in text.split()]
            sequences.append(seq)
        return sequences

    def sequences_to_texts(self, sequences):
        texts = []
        for seq in sequences:
            words = [self.idx_to_word.get(idx, self.oov_token) for idx in seq]
            texts.append(" ".join(words))
        return texts

    @property
    def vocab_size(self):
        return len(self.word_to_idx)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus_cleaned)
total_words = tokenizer.vocab_size
print(f"Total unique words in vocabulary: {total_words}")

# Generate N-gram sequences
input_sequences = []
for line in tqdm(corpus_cleaned, desc="Generating n-gram sequences"):
    token_list = tokenizer.texts_to_sequences([line])[0]
    if not token_list: continue # Skip empty lines after tokenization
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

if not input_sequences:
    raise ValueError("No input sequences generated. Check corpus cleaning and tokenization, or increase MAX_SAMPLES.")

print(f"Number of n-gram sequences: {len(input_sequences)}")
max_sequence_len = max([len(x) for x in input_sequences])
print(f"Max sequence length: {max_sequence_len}")

# Pad sequences and create predictors/labels
# Convert to PyTorch tensors directly
predictors_list = []
labels_list = []

for seq in input_sequences:
    predictors_list.append(torch.tensor(seq[:-1], dtype=torch.long))
    labels_list.append(torch.tensor(seq[-1], dtype=torch.long))

# Pad predictor sequences
# The input to LSTM expects (seq_len, batch, input_size) or (batch, seq_len, input_size) if batch_first=True
# Embedding layer will take care of converting token indices to vectors.
# We need to pad the sequences of token indices.
# `pad_sequence` expects a list of Tensors and pads them to the length of the longest Tensor.
# It pads with 0 by default, which is fine if 0 is not a valid token index or if we use padding_idx in Embedding.
# Our tokenizer starts indexing from 0, so 0 is a valid token (<unk>).
# We can add a <pad> token or use `padding_value` if necessary. Let's assume 0 for <unk> is okay for now.
# Or, better, make tokenizer reserve 0 for padding. Let's adjust tokenizer slightly.

class TokenizerWithPadding(Tokenizer):
    def __init__(self, oov_token="<unk>", pad_token="<pad>"):
        super().__init__(oov_token)
        self.pad_token = pad_token
        # Ensure pad token is 0, and OOV is 1, then actual words
        self.word_to_idx = {pad_token: 0, oov_token: 1}
        self.idx_to_word = {0: pad_token, 1: oov_token}
        self.pad_idx = 0
        self.oov_idx = 1
        self._next_idx = 2 # Next available index for new words

    def add_word(self, word):
        if word not in self.word_to_idx:
            self.word_to_idx[word] = self._next_idx
            self.idx_to_word[self._next_idx] = word
            self._next_idx += 1

    @property
    def vocab_size(self):
        return self._next_idx # Correct vocab size calculation

# Re-tokenize with padding-aware tokenizer
tokenizer = TokenizerWithPadding()
tokenizer.fit_on_texts(corpus_cleaned)
total_words = tokenizer.vocab_size # This is actually vocab_size + 1 if we count padding
print(f"Total unique words in vocabulary (incl. <pad>, <unk>): {total_words}")

input_sequences = []
for line in tqdm(corpus_cleaned, desc="Re-generating n-gram sequences"):
    token_list = tokenizer.texts_to_sequences([line])[0]
    # Filter out sequences that are only <unk> or too short
    if not token_list or all(t == tokenizer.oov_idx for t in token_list): continue
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        # Ensure n_gram_sequence has at least one non-pad, non-unk token before the target
        # This means the predictor part (n_gram_sequence[:-1]) should not be empty
        if len(n_gram_sequence[:-1]) > 0:
            input_sequences.append(n_gram_sequence)


if not input_sequences:
    raise ValueError("No input sequences generated after re-tokenization. Check data or sampling.")

max_sequence_len_model_input = max([len(x[:-1]) for x in input_sequences if len(x[:-1]) > 0])
if max_sequence_len_model_input == 0:
    raise ValueError("Max sequence length for predictors is 0. All n-grams might be too short.")
print(f"Max predictor sequence length: {max_sequence_len_model_input}")


predictors_list = []
labels_list = []
for seq in input_sequences:
    predictor_part = seq[:-1]
    label_part = seq[-1]
    if len(predictor_part) > 0: # Ensure predictor is not empty
        predictors_list.append(torch.tensor(predictor_part, dtype=torch.long))
        labels_list.append(torch.tensor(label_part, dtype=torch.long))

# Pad predictor sequences
# `batch_first=True` means pad_sequence will output (batch_size, max_len)
padded_predictors = pad_sequence(predictors_list, batch_first=True, padding_value=tokenizer.pad_idx)
labels_tensor = torch.stack(labels_list) # Stack labels into a single tensor

print(f"Shape of padded predictors: {padded_predictors.shape}")
print(f"Shape of labels: {labels_tensor.shape}")


Preprocessing data...


Cleaning text:   0%|          | 0/5 [00:00<?, ?it/s]

Sample cleaned comment: this is the first comment about politics
Total unique words in vocabulary: 26


Generating n-gram sequences:   0%|          | 0/5 [00:00<?, ?it/s]

Number of n-gram sequences: 28
Max sequence length: 8


AttributeError: 'TokenizerWithPadding' object has no attribute '_next_idx'

In [None]:
# --- 3. Create PyTorch Dataset and DataLoader ---
class CommentDataset(Dataset):
    def __init__(self, predictors, labels):
        self.predictors = predictors
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.predictors[idx], self.labels[idx]

dataset = CommentDataset(padded_predictors, labels_tensor)

# Split into training and validation (optional, but good practice)
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
if val_size == 0 and train_size > 0 : # handle tiny datasets for testing
    train_dataset = dataset
    val_dataset = dataset # use train as val if val_size is 0
else:
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])


BATCH_SIZE = 128 # Can be tuned
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
if val_size > 0:
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
else: # if val_size is 0, val_loader can be None or point to train_loader for simplicity in eval loop
    val_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False)


# --- 4. Define LSTM Model in PyTorch ---
class LanguageModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, dropout_rate, padding_idx):
        super(LanguageModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers,
                            batch_first=True, dropout=dropout_rate if num_layers > 1 else 0)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        # x shape: (batch_size, seq_len)
        embedded = self.embedding(x)
        # embedded shape: (batch_size, seq_len, embedding_dim)

        lstm_out, _ = self.lstm(embedded)
        # lstm_out shape: (batch_size, seq_len, hidden_dim)

        # We only want the output from the last time step
        last_lstm_out = lstm_out[:, -1, :]
        # last_lstm_out shape: (batch_size, hidden_dim)

        out = self.dropout(last_lstm_out)
        out = self.fc(out)
        # out shape: (batch_size, vocab_size)
        return out

# Hyperparameters
EMBEDDING_DIM = 100
HIDDEN_DIM = 150
NUM_LAYERS = 2 # Using 2 LSTM layers
DROPOUT_RATE = 0.2
LEARNING_RATE = 0.001 # Adam's default is 0.001
EPOCHS = 20 # Start with a smaller number for quick testing

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model = LanguageModel(
    vocab_size=total_words,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    num_layers=NUM_LAYERS,
    dropout_rate=DROPOUT_RATE,
    padding_idx=tokenizer.pad_idx
).to(device)

print(model)
# Sanity check one batch
try:
    sample_x, sample_y = next(iter(train_loader))
    sample_x, sample_y = sample_x.to(device), sample_y.to(device)
    output = model(sample_x)
    print("Sample output shape:", output.shape) # Expected: (BATCH_SIZE, total_words)
    print("Sample target shape:", sample_y.shape) # Expected: (BATCH_SIZE)
except StopIteration:
    print("Train loader is empty. Cannot perform sanity check.")
    # This can happen if MAX_SAMPLES is too small or data cleaning is too aggressive.

# --- 5. Training Loop ---
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_idx) # Ignore padding in loss calculation
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

history = {'epoch': [], 'train_loss': [], 'val_loss': [], 'perplexity': []}

print("\nStarting training...")
if not train_loader:
    print("Skipping training as no data is available in train_loader.")
else:
    for epoch in range(EPOCHS):
        model.train()
        epoch_train_loss = 0

        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS} [Training]", leave=False)
        for batch_predictors, batch_labels in progress_bar:
            batch_predictors, batch_labels = batch_predictors.to(device), batch_labels.to(device)

            optimizer.zero_grad()
            outputs = model(batch_predictors)
            loss = criterion(outputs, batch_labels)
            loss.backward()
            optimizer.step()

            epoch_train_loss += loss.item()
            progress_bar.set_postfix(loss=loss.item())

        avg_train_loss = epoch_train_loss / len(train_loader)
        history['epoch'].append(epoch + 1)
        history['train_loss'].append(avg_train_loss)

        # Validation
        model.eval()
        epoch_val_loss = 0
        if val_loader and len(val_loader) > 0: # Check if val_loader is not empty
            with torch.no_grad():
                progress_bar_val = tqdm(val_loader, desc=f"Epoch {epoch+1}/{EPOCHS} [Validation]", leave=False)
                for batch_predictors, batch_labels in progress_bar_val:
                    batch_predictors, batch_labels = batch_predictors.to(device), batch_labels.to(device)
                    outputs = model(batch_predictors)
                    loss = criterion(outputs, batch_labels)
                    epoch_val_loss += loss.item()
                    progress_bar_val.set_postfix(loss=loss.item())
            avg_val_loss = epoch_val_loss / len(val_loader)
            perplexity = np.exp(avg_val_loss) # Perplexity = e^(avg_cross_entropy_loss)
        else: # Handle case with no validation data
            avg_val_loss = float('nan')
            perplexity = float('nan')

        history['val_loss'].append(avg_val_loss)
        history['perplexity'].append(perplexity)

        print(f"Epoch {epoch+1}/{EPOCHS} - Train Loss: {avg_train_loss:.4f} - Val Loss: {avg_val_loss:.4f} - Perplexity: {perplexity:.2f}")

In [1]:
# --- 6. Text Generation Function ---
def generate_text(seed_text, next_words, model, tokenizer, max_sequence_len_model_input, device, temperature=1.0):
    model.eval()
    generated_text = seed_text

    for _ in range(next_words):
        # Tokenize current text
        token_list = tokenizer.texts_to_sequences([generated_text.lower()])[0]

        # Pad sequence (take only the last `max_sequence_len_model_input` tokens)
        if len(token_list) > max_sequence_len_model_input:
            token_list = token_list[-max_sequence_len_model_input:]

        # Convert to tensor and add batch dimension
        input_tensor = torch.tensor([token_list], dtype=torch.long).to(device)

        with torch.no_grad():
            output = model(input_tensor) # Shape: (1, vocab_size)

        # Apply temperature to logits
        output_dist = output.data.view(-1).div(temperature).exp()

        # Sample from the distribution or take argmax
        # For more diverse generation, use sampling:
        # top_i = torch.multinomial(output_dist, 1)[0]

        # For deterministic (but potentially repetitive) generation, use argmax:
        if temperature == 0.0: # Pure argmax
            predicted_idx = torch.argmax(output, dim=1).item()
        else: # Sampling with temperature
            predicted_idx = torch.multinomial(output_dist, 1)[0].item()


        # Handle OOV or PAD prediction if they occur (less likely with good training)
        if predicted_idx == tokenizer.pad_idx and tokenizer.pad_token != "<pad_is_word>": # Avoid predicting padding
            # Fallback: predict the next most likely non-pad token
            sorted_preds = torch.argsort(output, dim=1, descending=True)
            for idx_val in sorted_preds[0]:
                if idx_val.item() != tokenizer.pad_idx:
                    predicted_idx = idx_val.item()
                    break

        output_word = tokenizer.idx_to_word.get(predicted_idx, tokenizer.oov_token)

        if output_word == tokenizer.oov_token: # Don't append <unk> if it's just a placeholder
            continue # Or break, or try another prediction

        generated_text += " " + output_word

    return generated_text.title() # Capitalize like the Keras example

print("\n--- Text Generation Examples ---")
if not train_loader:
    print("Skipping generation as model was not trained.")
else:
    try:
        seed1 = "the president said"
        seed2 = "new york is"
        seed3 = "climate change will"

        print(f"Seed: '{seed1}'")
        print("Generated (temp=0.7):", generate_text(seed1, 10, model, tokenizer, max_sequence_len_model_input, device, temperature=0.7))
        print("Generated (temp=1.0):", generate_text(seed1, 10, model, tokenizer, max_sequence_len_model_input, device, temperature=1.0))
        print("Generated (argmax):", generate_text(seed1, 10, model, tokenizer, max_sequence_len_model_input, device, temperature=0.0)) # temperature 0 for argmax

        print(f"\nSeed: '{seed2}'")
        print("Generated (temp=0.7):", generate_text(seed2, 10, model, tokenizer, max_sequence_len_model_input, device, temperature=0.7))

        print(f"\nSeed: '{seed3}'")
        print("Generated (temp=0.7):", generate_text(seed3, 10, model, tokenizer, max_sequence_len_model_input, device, temperature=0.7))
    except Exception as e:
        print(f"Error during text generation: {e}")
        print("This might be due to a very small vocabulary or issues with sequence lengths.")


# --- 7. Evaluation (Perplexity is already calculated during training) ---
# Qualitative evaluation is looking at the generated text.
# Quantitative:
# - Perplexity on a test set (calculated as exp(average cross-entropy loss on test set)).
# - BLEU scores if comparing to reference continuations (more for machine translation but adaptable).

print("\n--- Evaluation ---")
print("Perplexity on the validation set is tracked during training.")
print("Final validation perplexity (if available):", f"{history['perplexity'][-1]:.2f}" if history['perplexity'] and not np.isnan(history['perplexity'][-1]) else "N/A")
print("Qualitative evaluation: Inspect the generated text samples above.")


# --- 8. Plotting with plotnine ---
print("\n--- Plotting Training Progress ---")
if history['epoch']: # Check if history has data
    history_df = pd.DataFrame(history)

    # Plot training and validation loss
    loss_plot = (
        ggplot(history_df.melt(id_vars=['epoch'], value_vars=['train_loss', 'val_loss'], var_name='metric', value_name='loss'),
               aes(x='epoch', y='loss', color='metric')) +
        geom_line() +
        labs(title="Training and Validation Loss", x="Epoch", y="Loss") +
        theme_minimal()
    )
    print(loss_plot)

    # Plot perplexity
    if 'perplexity' in history_df.columns and history_df['perplexity'].notna().any():
        perplexity_plot = (
            ggplot(history_df[history_df['perplexity'].notna()], # Filter out NaN perplexity if val_loader was empty
                   aes(x='epoch', y='perplexity')) +
            geom_line(color="blue") +
            labs(title="Validation Perplexity", x="Epoch", y="Perplexity") +
            theme_minimal()
        )
        print(perplexity_plot)
else:
    print("No training history to plot (e.g., training was skipped).")

print("\nModel training and evaluation complete.")
print("To improve: train for more epochs, use more data, tune hyperparameters (embedding_dim, hidden_dim, layers, dropout, learning rate), or try a character-level model for very large vocabularies.")

  df = kagglehub.load_dataset(


ValueError: Unsupported file extension: ''. Supported file extensions are: .csv, .tsv, .json, .jsonl, .xml, .parquet, .feather, .sqlite, .sqlite3, .db, .db3, .s3db, .dl3, .xls, .xlsx, .xlsm, .xlsb, .odf, .ods, .odt