<a href="https://colab.research.google.com/github/Oksana0020/DL-with-PyTorch/blob/main/Lab10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TASK 1: Data Preparation

In [6]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer


In [8]:
from datasets import load_dataset

dataset = load_dataset("imdb")

train_texts = dataset["train"]["text"][:2000]
train_labels = dataset["train"]["label"][:2000]

test_texts = dataset["test"]["text"][:500]
test_labels = dataset["test"]["label"][:500]


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [9]:
# Using a Hugging Face tokenizer for consistency and simplicity
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
max_len = 128 # Reduced max_len for smaller dataset and faster processing

# Text processing function - now uses the Hugging Face tokenizer
def process_text_hf(text, tokenizer, max_len):
    # This will pad and truncate automatically
    encoding = tokenizer(
        text,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    return encoding['input_ids'].squeeze(0) # Remove batch dimension

# Custom Dataset
class SimplifiedIMDBDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.data = []
        for text, label in zip(texts, labels):
            text_tensor = process_text_hf(text, self.tokenizer, self.max_len)
            label_tensor = torch.tensor([float(label)], dtype=torch.float)
            self.data.append((text_tensor, label_tensor))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

# Create datasets
train_dataset = SimplifiedIMDBDataset(train_texts, train_labels, tokenizer, max_len)
test_dataset = SimplifiedIMDBDataset(test_texts, test_labels, tokenizer, max_len)

# Create dataloaders
batch_size = 2 # Reduced batch size for very small dataset
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

# Determine vocab_size and pad_idx from the Hugging Face tokenizer
# Note: For LSTM, we would typically build a custom vocab if not using pretrained embeddings
# For simplification here, we'll use a large enough vocab size and HF's pad_token_id
vocab_size = tokenizer.vocab_size
pad_idx = tokenizer.pad_token_id

print(f"Vocab size (from HF tokenizer): {vocab_size}")
print(f"Pad token ID (from HF tokenizer): {pad_idx}")

Vocab size (from HF tokenizer): 30522
Pad token ID (from HF tokenizer): 0


# TASK 2: LSTM Model (Minor adjustments for new data processing)

In [14]:
import torch.nn as nn
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [15]:

class LSTMTextClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers,
                            bidirectional=bidirectional, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        # text shape: [batch_size, seq_len]
        embedded = self.embedding(text)
        # embedded shape: [batch_size, seq_len, embedding_dim]

        output, (hidden, cell) = self.lstm(embedded)
        # output shape: [batch_size, seq_len, hidden_dim * n_directions]

        if self.lstm.bidirectional:
            # Concatenate the last two hidden states (forward and backward)
            hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        else:
            hidden = hidden[-1,:,:]
        # hidden shape: [batch_size, hidden_dim * n_directions]

        hidden = self.dropout(hidden)
        return self.fc(hidden)

# Initialize LSTM model
embedding_dim = 100
hidden_dim = 128 # Reduced hidden_dim for smaller model
output_dim = 1
n_layers = 2
bidirectional = True
dropout = 0.5

lstm_model = LSTMTextClassifier(
    vocab_size, embedding_dim, hidden_dim, output_dim,
    n_layers, bidirectional, dropout, pad_idx
).to(device)


# TASK 3: LSTM Training

In [17]:
import torch.optim as optim
import time


In [18]:
optimizer_lstm = optim.Adam(lstm_model.parameters(), lr=0.001)
criterion = nn.BCEWithLogitsLoss()

def train_model(model, dataloader, optimizer, criterion, epochs=3): # Reduced epochs for faster demo
    model.train()
    start_time = time.time()

    epoch_losses = []
    for epoch in range(epochs):
        epoch_loss = 0
        for batch_idx, (text, labels) in enumerate(dataloader):
            text, labels = text.to(device), labels.to(device)

            optimizer.zero_grad()
            predictions = model(text)
            loss = criterion(predictions, labels)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

            if batch_idx % 1 == 0: # Print more frequently for tiny dataset
                print(f"Epoch {epoch+1}/{epochs}, Batch {batch_idx}/{len(dataloader)}, Loss: {loss.item():.4f}")

        avg_epoch_loss = epoch_loss / len(dataloader)
        epoch_losses.append(avg_epoch_loss)
        print(f"Epoch {epoch+1}/{epochs}, Average Loss: {avg_epoch_loss:.4f}")

    training_time = time.time() - start_time
    print(f"Training completed in {training_time:.2f} seconds")

    return epoch_losses, training_time

def evaluate_model(model, dataloader):
    model.eval()
    correct = 0
    total = 0
    start_time = time.time()

    with torch.no_grad():
        for text, labels in dataloader:
            text, labels = text.to(device), labels.to(device)
            outputs = model(text)
            predicted = (torch.sigmoid(outputs) > 0.5).float()
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    inference_time = time.time() - start_time
    accuracy = 100 * correct / total

    print(f"Accuracy: {accuracy:.2f}%")
    print(f"Inference time: {inference_time:.2f} seconds")

    return accuracy, inference_time

# Train and evaluate LSTM
print("\n--- Training LSTM Model ---")
lstm_losses, lstm_train_time = train_model(lstm_model, train_dataloader, optimizer_lstm, criterion, epochs=3)
print("\n--- Evaluating LSTM Model ---")
lstm_accuracy, lstm_inference_time = evaluate_model(lstm_model, test_dataloader)




--- Training LSTM Model ---
Epoch 1/3, Batch 0/1000, Loss: 0.6929
Epoch 1/3, Batch 1/1000, Loss: 0.6476
Epoch 1/3, Batch 2/1000, Loss: 0.6210
Epoch 1/3, Batch 3/1000, Loss: 0.5597
Epoch 1/3, Batch 4/1000, Loss: 0.5362
Epoch 1/3, Batch 5/1000, Loss: 0.5236
Epoch 1/3, Batch 6/1000, Loss: 0.4344
Epoch 1/3, Batch 7/1000, Loss: 0.4593
Epoch 1/3, Batch 8/1000, Loss: 0.3176
Epoch 1/3, Batch 9/1000, Loss: 0.3197
Epoch 1/3, Batch 10/1000, Loss: 0.2487
Epoch 1/3, Batch 11/1000, Loss: 0.2003
Epoch 1/3, Batch 12/1000, Loss: 0.1275
Epoch 1/3, Batch 13/1000, Loss: 0.1352
Epoch 1/3, Batch 14/1000, Loss: 0.1230
Epoch 1/3, Batch 15/1000, Loss: 0.0435
Epoch 1/3, Batch 16/1000, Loss: 0.0203
Epoch 1/3, Batch 17/1000, Loss: 0.0086
Epoch 1/3, Batch 18/1000, Loss: 0.0109
Epoch 1/3, Batch 19/1000, Loss: 0.0044
Epoch 1/3, Batch 20/1000, Loss: 0.0033
Epoch 1/3, Batch 21/1000, Loss: 0.0032
Epoch 1/3, Batch 22/1000, Loss: 0.0020
Epoch 1/3, Batch 23/1000, Loss: 0.0015
Epoch 1/3, Batch 24/1000, Loss: 0.0014
Epoch 

# TASK 4: Pre-trained Transformer

In [23]:
import torch
import torch.nn as nn
import torch.optim as optim
import time

from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    pipeline
)


In [29]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch, time

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

transformer_model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer_transformer = AutoTokenizer.from_pretrained(transformer_model_name)
model_transformer = AutoModelForSequenceClassification.from_pretrained(transformer_model_name).to(device)

sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=model_transformer,
    tokenizer=tokenizer_transformer,
    device=0 if torch.cuda.is_available() else -1,
    truncation=True,
    max_length=512
)


Loading weights:   0%|          | 0/104 [00:00<?, ?it/s]

In [30]:
print("\n--- Evaluating Pre-trained Transformer Model ---")

def evaluate_transformer(texts, labels, pipeline_model):
    correct = 0
    total = 0
    start_time = time.time()

    for text, true_label in zip(texts, labels):

        # ✅ Force truncation at call-time (important)
        result = pipeline_model(
            text,
            truncation=True,
            max_length=512
        )[0]

        predicted_sentiment = result["label"]

        # Map 'POSITIVE' → 1, 'NEGATIVE' → 0
        predicted_class = 1 if predicted_sentiment == "POSITIVE" else 0

        if predicted_class == true_label:
            correct += 1

        total += 1

    inference_time = time.time() - start_time
    accuracy = 100 * correct / total

    print(f"Transformer Accuracy: {accuracy:.2f}%")
    print(f"Transformer Inference time: {inference_time:.2f} seconds")

    return accuracy, inference_time


# Run evaluation
transformer_accuracy, transformer_inference_time = evaluate_transformer(
    test_texts,
    test_labels,
    sentiment_pipeline
)




--- Evaluating Pre-trained Transformer Model ---
Transformer Accuracy: 90.60%
Transformer Inference time: 228.83 seconds


In [32]:
# Load transformer model (with safe truncation)
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch
import time

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

transformer_model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer_transformer = AutoTokenizer.from_pretrained(transformer_model_name)
model_transformer = AutoModelForSequenceClassification.from_pretrained(transformer_model_name).to(device)

# ✅ Create pipeline (GPU if available)
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=model_transformer,
    tokenizer=tokenizer_transformer,
    device=0 if torch.cuda.is_available() else -1
)

print("\n--- Evaluating Pre-trained Transformer Model ---")

def evaluate_transformer(texts, labels, pipeline_model):
    correct = 0
    total = 0
    start_time = time.time()

    for text, true_label in zip(texts, labels):
        # ✅ Force truncation at call-time to avoid >512 token crash
        result = pipeline_model(text, truncation=True, max_length=512)[0]
        predicted_sentiment = result["label"]

        # Map 'POSITIVE' -> 1, 'NEGATIVE' -> 0
        predicted_class = 1 if predicted_sentiment == "POSITIVE" else 0

        if predicted_class == int(true_label):
            correct += 1
        total += 1

    inference_time = time.time() - start_time
    accuracy = 100 * correct / total if total else 0.0

    print(f"Transformer Accuracy: {accuracy:.2f}%")
    print(f"Transformer Inference time: {inference_time:.2f} seconds")

    return accuracy, inference_time


# Run evaluation
transformer_accuracy, transformer_inference_time = evaluate_transformer(
    test_texts, test_labels, sentiment_pipeline
)


Loading weights:   0%|          | 0/104 [00:00<?, ?it/s]


--- Evaluating Pre-trained Transformer Model ---
Transformer Accuracy: 90.60%
Transformer Inference time: 228.13 seconds


# TASK 5: Model Comparison

In [34]:
import matplotlib.pyplot as plt
import torch

def compare_models():
    # Accuracy comparison
    models = ['LSTM', 'Transformer']
    accuracies = [lstm_accuracy, transformer_accuracy]

    plt.figure(figsize=(10, 5))
    plt.bar(models, accuracies, color=['blue', 'orange'])
    plt.title('Model Accuracy Comparison')
    plt.ylabel('Accuracy (%)')
    plt.ylim(0, 100)
    for i, v in enumerate(accuracies):
        plt.text(i, v + 1, f"{v:.2f}%", ha='center')
    plt.savefig('accuracy_comparison_simplified.png')
    plt.close() # Close plot to prevent display issues in some environments

    # Inference time comparison
    times = [lstm_inference_time, transformer_inference_time]

    plt.figure(figsize=(10, 5))
    plt.bar(models, times, color=['blue', 'orange'])
    plt.title('Inference Time Comparison')
    plt.ylabel('Time (seconds)')
    for i, v in enumerate(times):
        plt.text(i, v + 0.1, f"{v:.2f}s", ha='center')
    plt.savefig('time_comparison_simplified.png')
    plt.close()

    # Print comparison table
    print("\n--- Model Comparison Summary ---")
    print("-" * 60)
    print(f"{'Metric':<20} | {'LSTM':<15} | {'Transformer':<15}")
    print("-" * 60)
    print(f"{'Accuracy':<20} | {lstm_accuracy:<15.2f}% | {transformer_accuracy:<15.2f}%")
    print(f"{'Inference Time':<20} | {lstm_inference_time:<15.2f}s | {transformer_inference_time:<15.2f}s")
    print(f"{'Training Time':<20} | {lstm_train_time:<15.2f}s | {'N/A (pre-trained)':<15}")

    # Parameter count
    lstm_params = sum(p.numel() for p in lstm_model.parameters())
    transformer_params = sum(p.numel() for p in model_transformer.parameters())
    print(f"{'Parameters':<20} | {lstm_params:<15,d} | {transformer_params:<15,d}")
    print("-" * 60)

compare_models()

# Specific example comparison
def compare_specific_examples():
    sample_texts = [
        "This movie was absolutely fantastic! I loved every minute of it.",
        "The film was neither good nor bad, just mediocre overall.",
        "What a terrible waste of time and money. Worst movie ever."
    ]

    print("\n--- Example Predictions ---")
    print("-" * 80)
    print(f"{'Text':<40} | {'LSTM Prediction':<20} | {'Transformer Prediction':<20}")
    print("-" * 80)

    lstm_model.eval()
    for text in sample_texts:
        # LSTM prediction
        processed = process_text_hf(text, tokenizer, max_len).unsqueeze(0).to(device)
        with torch.no_grad():
            lstm_output = torch.sigmoid(lstm_model(processed)).item()
        lstm_sentiment = "Positive" if lstm_output > 0.5 else "Negative"
        lstm_confidence = max(lstm_output, 1 - lstm_output) * 100

        # Transformer prediction using the pipeline
        transformer_result = sentiment_pipeline(text)[0]
        transformer_sentiment = transformer_result['label']
        transformer_confidence = transformer_result['score'] * 100

        print(f"{text[:37] + '...':<40} | {lstm_sentiment} ({lstm_confidence:.1f}%) | {transformer_sentiment} ({transformer_confidence:.1f}%)")

    print("-" * 80)

compare_specific_examples()


--- Model Comparison Summary ---
------------------------------------------------------------
Metric               | LSTM            | Transformer    
------------------------------------------------------------
Accuracy             | 100.00         % | 90.60          %
Inference Time       | 4.48           s | 228.13         s
Training Time        | 244.73         s | N/A (pre-trained)
Parameters           | 3,683,241       | 66,955,010     
------------------------------------------------------------

--- Example Predictions ---
--------------------------------------------------------------------------------
Text                                     | LSTM Prediction      | Transformer Prediction
--------------------------------------------------------------------------------
This movie was absolutely fantastic! ... | Negative (100.0%) | POSITIVE (100.0%)
The film was neither good nor bad, ju... | Negative (100.0%) | NEGATIVE (100.0%)
What a terrible waste of time and mon... | Negati