In [16]:
!pip install -q datasets transformers torch accelerate evaluate
print("Installation complete.")

Installation complete.


In [17]:
!pip install --upgrade datasets fsspec huggingface-hub

Collecting fsspec
  Using cached fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)


In [18]:
!nvidia-smi

Fri Jun 27 17:36:47 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   60C    P0             31W /   70W |    7156MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [19]:
import torch
import time
from datasets import load_dataset
from transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
import numpy as np
import evaluate

In [20]:
class SentimentClassifier:
    def __init__(self, model_name='bert-base-uncased', num_labels=2):
        self.model_name = model_name
        self.tokenizer = BertTokenizer.from_pretrained(self.model_name)
        # We will set num_labels and label mappings after inspecting the dataset
        self.model = BertForSequenceClassification.from_pretrained(self.model_name, num_labels=num_labels)
        self.trainer = None

    @classmethod
    def load_from_checkpoint(cls, checkpoint_path):
        """Load a trained model and tokenizer from a checkpoint directory."""
        print(f"Loading model from {checkpoint_path}")
        instance = cls.__new__(cls) # Create an instance without calling __init__
        instance.model = BertForSequenceClassification.from_pretrained(checkpoint_path)
        instance.tokenizer = BertTokenizer.from_pretrained(checkpoint_path)
        instance.trainer = None
        return instance

    def load_and_tokenize_data(self, dataset_name, train_size, test_size, shuffle_seed=42):
        """
        Loads, shuffles, selects, and tokenizes the data.
        It also inspects the dataset to determine the label mapping.
        """
        print(f"Loading '{dataset_name}' dataset...")
        full_dataset = load_dataset(dataset_name)

        # --- Automatically detect label mapping ---
        label_feature = full_dataset["train"].features["label"]
        self.model.config.id2label = {i: label_feature.names[i] for i in range(label_feature.num_classes)}
        self.model.config.label2id = {v: k for k, v in self.model.config.id2label.items()}
        print(f"Discovered Label Mapping: {self.model.config.id2label}")

        # --- Shuffle and select a subset ---
        print(f"Shuffling dataset with seed {shuffle_seed} and selecting {train_size} train / {test_size} test samples.")
        train_dataset = full_dataset["train"].shuffle(seed=shuffle_seed).select(range(train_size))
        test_dataset = full_dataset["test"].shuffle(seed=shuffle_seed).select(range(test_size))

        def tokenize_fn(examples):
            return self.tokenizer(examples["text"], truncation=True, max_length=512)

        print("Tokenizing datasets...")
        tokenized_train = train_dataset.map(tokenize_fn, batched=True).remove_columns(["text"])
        tokenized_test = test_dataset.map(tokenize_fn, batched=True).remove_columns(["text"])

        return tokenized_train, tokenized_test

    def train(self, output_dir, dataset_name, train_size, test_size, shuffle_seed, training_args_dict):
        """Initializes and runs the Trainer."""
        train_data, eval_data = self.load_and_tokenize_data(
            dataset_name=dataset_name,
            train_size=train_size,
            test_size=test_size,
            shuffle_seed=shuffle_seed
        )

        data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)
        metric = evaluate.load("accuracy")
        def compute_metrics(eval_pred):
            logits, labels = eval_pred
            predictions = np.argmax(logits, axis=-1)
            return metric.compute(predictions=predictions, references=labels)

        training_args = TrainingArguments(
            output_dir=output_dir,
            logging_dir=f"{output_dir}/logs",
            **training_args_dict
        )

        self.trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_data,
            eval_dataset=eval_data,
            tokenizer=self.tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )

        print("Starting training...")
        start_time = time.time()
        self.trainer.train()
        end_time = time.time()
        print(f"Training finished in {end_time - start_time:.2f} seconds.")

        self.trainer.save_model(output_dir)
        print(f"Model saved to {output_dir}")

    def evaluate(self):
        """Evaluates the model on the evaluation set."""
        if not self.trainer:
            raise ValueError("Trainer not initialized. Call train() first.")
        print("Evaluating model...")
        return self.trainer.evaluate()

    def predict(self, texts):
        """Predicts the sentiment for a list of texts."""
        if isinstance(texts, str):
            texts = [texts]

        device = self.model.device
        inputs = self.tokenizer(texts, return_tensors="pt", truncation=True, padding=True).to(device)

        with torch.no_grad():
            outputs = self.model(**inputs)
            probs = torch.softmax(outputs.logits, dim=1)
            predictions_idx = torch.argmax(probs, dim=1)

        # Use the stored label mapping to return human-readable labels
        id2label = self.model.config.id2label
        labels = [id2label[idx.item()] for idx in predictions_idx]

        return labels, probs.tolist()

In [21]:
def run_experiment(config):
    """A wrapper to run a full experiment based on a configuration dictionary."""
    print(f"\n{'='*20} Running Experiment: {config['name']} {'='*20}")

    # Initialize the classifier
    classifier = SentimentClassifier(model_name=config['model_name'])

    # Free up GPU memory before starting
    torch.cuda.empty_cache()

    # Train the model
    classifier.train(
        output_dir=config['output_dir'],
        dataset_name=config['dataset_name'],
        train_size=config['train_size'],
        test_size=config['test_size'],
        shuffle_seed=config['shuffle_seed'],
        training_args_dict=config['training_args']
    )

    # Evaluate the model
    final_eval = classifier.evaluate()
    print(f"Final Evaluation for {config['name']}: {final_eval}")

    # Show memory usage
    if torch.cuda.is_available():
        peak_memory_mb = torch.cuda.max_memory_allocated() / (1024 * 1024)
        print(f"Peak GPU Memory Usage for {config['name']}: {peak_memory_mb:.2f} MB")
        torch.cuda.reset_peak_memory_stats()

    # --- Prediction Example ---
    print("\n--- Prediction Example ---")
    positive_text = "This movie was absolutely fantastic! A must-see."
    negative_text = "It was a complete waste of my time and money."

    pred_labels, _ = classifier.predict(positive_text)
    print(f"Sample Text: '{positive_text}'")
    print(f"Prediction: {pred_labels[0].upper()}")

    pred_labels, _ = classifier.predict(negative_text)
    print(f"Sample Text: '{negative_text}'")
    print(f"Prediction: {pred_labels[0].upper()}")

    # --- Loading from Disk Example ---
    print("\n--- Loading from Disk and Predicting ---")
    loaded_classifier = SentimentClassifier.load_from_checkpoint(config['output_dir'])
    pred_labels, _ = loaded_classifier.predict("I'm not sure how I feel about this film.")
    print(f"Prediction for neutral text: {pred_labels[0].upper()}")

In [22]:
# Experiment 1: Baseline
config_bert_base = {
    "name": "BERT-base-fine-tuned",
    "model_name": "bert-base-uncased",
    "dataset_name": "imdb",
    "output_dir": "/content/bert-base-imdb",
    "train_size": 2000,
    "test_size": 500,
    "shuffle_seed": 42,
    "training_args": {
        "num_train_epochs": 3,
        "per_device_train_batch_size": 8,
        "per_device_eval_batch_size": 8,
        "learning_rate": 2e-5,
        "weight_decay": 0.01,
        "logging_steps": 50,
        "eval_strategy": "epoch",
        "save_strategy": "epoch",
        "load_best_model_at_end": True,
        "report_to": "none", # Disable Weights & Biases logging
    }
}

In [23]:
# Experiment 2: Smaller & Faster DistilBERT model
config_distilbert = {
    "name": "DistilBERT-fine-tuned",
    "model_name": "distilbert-base-uncased",
    "dataset_name": "imdb",
    "output_dir": "/content/distilbert-imdb",
    "train_size": 2000,
    "test_size": 500,
    "shuffle_seed": 42,
    "training_args": {
        "num_train_epochs": 3,
        "per_device_train_batch_size": 16, # Can use larger batch size for smaller model
        "per_device_eval_batch_size": 16,
        "learning_rate": 2e-5,
        "weight_decay": 0.01,
        "logging_steps": 50,
        "eval_strategy": "epoch",
        "save_strategy": "epoch",
        "load_best_model_at_end": True,
        "report_to": "none",
    }
}


In [24]:
run_experiment(config_bert_base)




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading 'imdb' dataset...
Discovered Label Mapping: {0: 'neg', 1: 'pos'}
Shuffling dataset with seed 42 and selecting 2000 train / 500 test samples.
Tokenizing datasets...


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

  self.trainer = Trainer(


Starting training...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2943,0.368473,0.878
2,0.1627,0.480384,0.892
3,0.0982,0.430239,0.908


Training finished in 797.32 seconds.
Model saved to /content/bert-base-imdb
Evaluating model...


Final Evaluation for BERT-base-fine-tuned: {'eval_loss': 0.36847320199012756, 'eval_accuracy': 0.878, 'eval_runtime': 13.7489, 'eval_samples_per_second': 36.367, 'eval_steps_per_second': 4.582, 'epoch': 3.0}
Peak GPU Memory Usage for BERT-base-fine-tuned: 4269.92 MB

--- Prediction Example ---
Sample Text: 'This movie was absolutely fantastic! A must-see.'
Prediction: POS
Sample Text: 'It was a complete waste of my time and money.'
Prediction: NEG

--- Loading from Disk and Predicting ---
Loading model from /content/bert-base-imdb
Prediction for neutral text: NEG


In [25]:
run_experiment(config_distilbert)





The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.
You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['bert.embeddings.LayerNorm.bias', 'bert.embeddings.LayerNorm.weight', 'bert.embeddings.position_embeddings.weight', 'bert.embeddings.token_type_embeddings.weight', 'bert.embeddings.word_embeddings.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.dense.bias', 'bert.encoder.layer.0.attention.

Loading 'imdb' dataset...
Discovered Label Mapping: {0: 'neg', 1: 'pos'}
Shuffling dataset with seed 42 and selecting 2000 train / 500 test samples.
Tokenizing datasets...


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

  self.trainer = Trainer(


Starting training...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.7136,0.693317,0.492
2,0.7041,0.696072,0.508
3,0.6963,0.694729,0.492


Training finished in 768.46 seconds.
Model saved to /content/distilbert-imdb
Evaluating model...


Final Evaluation for DistilBERT-fine-tuned: {'eval_loss': 0.6933168768882751, 'eval_accuracy': 0.492, 'eval_runtime': 14.2601, 'eval_samples_per_second': 35.063, 'eval_steps_per_second': 2.244, 'epoch': 3.0}
Peak GPU Memory Usage for DistilBERT-fine-tuned: 6816.74 MB

--- Prediction Example ---
Sample Text: 'This movie was absolutely fantastic! A must-see.'
Prediction: POS
Sample Text: 'It was a complete waste of my time and money.'
Prediction: POS

--- Loading from Disk and Predicting ---
Loading model from /content/distilbert-imdb
Prediction for neutral text: POS


In [26]:
from datasets import load_dataset

try:
    print("Attempting to load imdb dataset...")
    # Loading a small split to quickly test if the issue is with loading itself
    test_dataset_load = load_dataset("imdb", split="train[:10]")
    print("Dataset loaded successfully.")
    print(test_dataset_load)
except Exception as e:
    print(f"Error loading dataset: {e}")
    import traceback
    traceback.print_exc()

Attempting to load imdb dataset...
Dataset loaded successfully.
Dataset({
    features: ['text', 'label'],
    num_rows: 10
})
