In [1]:
import os

# CRITICAL: Set environment variables before importing any other libraries.
# This prevents deadlocks related to tokenizers and external reporting tools (like W&B).
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"

In [2]:
!pip install --upgrade transformers --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m77.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m561.5/561.5 kB[0m [31m41.3 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datasets 3.6.0 requires fsspec[http]<=2025.3.0,>=2023.1.0, but you have fsspec 2025.5.1 which is incompatible.[0m[31m
[0m

In [3]:
# import transformers
# print(transformers.__version__)

In [4]:
# print(transformers.__file__)

In [5]:
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForSequenceClassification

2025-08-13 05:30:15.044977: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755063015.244244      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755063015.305473      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [6]:
import transformers
print(transformers.__version__)
print(transformers.TrainingArguments)
print(type(transformers.TrainingArguments))

4.55.0
<class 'transformers.training_args.TrainingArguments'>
<class 'type'>


In [7]:
# from transformers import TrainingArguments
# print(TrainingArguments)
# print(type(TrainingArguments))

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load CSV
df = pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")

# Quick peek
print(df.head())
print(df['sentiment'].value_counts())

# Map labels to ints (positive=1, negative=0)
label_map = {'positive': 1, 'negative': 0}
df['label'] = df['sentiment'].map(label_map)

# Subsample for faster runtime: stratified split to keep label distribution balanced
train_df, test_df = train_test_split(df, train_size=5000, test_size=1000, stratify=df['label'], random_state=42)

print(f"Train subset size: {len(train_df)}")
print(f"Test subset size: {len(test_df)}")
print(train_df['label'].value_counts())
print(test_df['label'].value_counts())

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
sentiment
positive    25000
negative    25000
Name: count, dtype: int64
Train subset size: 5000
Test subset size: 1000
label
0    2500
1    2500
Name: count, dtype: int64
label
1    500
0    500
Name: count, dtype: int64


In [9]:
from datasets import Dataset

train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_df.reset_index(drop=True))

print(train_dataset)
print(test_dataset)

Dataset({
    features: ['review', 'sentiment', 'label'],
    num_rows: 5000
})
Dataset({
    features: ['review', 'sentiment', 'label'],
    num_rows: 1000
})


In [10]:
from transformers import AutoTokenizer

model_checkpoint = "bert-base-uncased"  # You can change this later for other models
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_function(example):
    return tokenizer(example['review'], padding="max_length", truncation=True, max_length=512)

train_tokenized = train_dataset.map(tokenize_function, batched=True)
test_tokenized = test_dataset.map(tokenize_function, batched=True)

# Set format for PyTorch
train_tokenized.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_tokenized.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [11]:
# Prepare dataset for the Trainer by renaming the 'label' column to 'labels'
# and removing columns that are no longer needed.
train_tokenized = train_tokenized.rename_column("label", "labels")
test_tokenized = test_tokenized.rename_column("label", "labels")
train_tokenized = train_tokenized.remove_columns(['review', 'sentiment'])
test_tokenized = test_tokenized.remove_columns(['review', 'sentiment'])

# Set the format to PyTorch tensors for the Trainer
train_tokenized.set_format('torch')
test_tokenized.set_format('torch')

print("Processed training dataset features:")
print(train_tokenized)

Processed training dataset features:
Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 5000
})


In [12]:
import torch
import numpy as np
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import f1_score, accuracy_score
import gc

# Free up memory before starting training
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Load the pre-trained model
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Define the function to compute metrics during evaluation
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='weighted')
    return {"accuracy": acc, "f1": f1}

# Define training arguments with settings safe for notebooks
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    learning_rate=2e-5,
    
    # Batch sizes and gradient accumulation
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2, # Effective batch size = 4 * 2 = 8
    
    # Evaluation and logging strategies
    eval_strategy="steps",
    eval_steps=250,      # Evaluate periodically
    logging_steps=100,   # Log training loss periodically
    
    # Key settings to prevent hangs and optimize performance
    fp16=True,                           # Use mixed-precision for faster training on compatible GPUs
    dataloader_num_workers=0,            # CRITICAL: Must be 0 in Kaggle notebooks
    report_to=[],                        # Disables external reporting
    save_strategy="no",                  # Do not save model checkpoints during training
    load_best_model_at_end=False,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized, # Use the cleaned dataset from the previous cell
    eval_dataset=test_tokenized,   # Use the cleaned dataset from the previous cell
    compute_metrics=compute_metrics,
)

# Start training!
print("🚀 Starting training...")
trainer.train()

# Run evaluation on the test set
print("\n✅ Training complete. Running evaluation...")
eval_results = trainer.evaluate()
print(f"\nEvaluation results: {eval_results}")

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


🚀 Starting training...




Step,Training Loss,Validation Loss,Accuracy,F1
250,0.2787,0.244984,0.917,0.916939





✅ Training complete. Running evaluation...



Evaluation results: {'eval_loss': 0.2164091020822525, 'eval_accuracy': 0.919, 'eval_f1': 0.9189934384685159, 'eval_runtime': 18.9399, 'eval_samples_per_second': 52.799, 'eval_steps_per_second': 3.326, 'epoch': 1.0}


In [13]:
# 1. Install and Set Environment
# -------------------------------
!pip install --upgrade transformers datasets accelerate -q

import os
# This prevents deadlocks in Kaggle notebooks
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"

# 2. Imports
# -----------
import pandas as pd
import numpy as np
import torch
import gc
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    pipeline
)

# 3. Load Full Dataset
# --------------------
print("Loading the full IMDB dataset...")
df = pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")

# Map labels to integers
label_map = {'positive': 1, 'negative': 0}
df['label'] = df['sentiment'].map(label_map)

# Split into training and testing sets (full size)
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['label'],
    random_state=42
)

print(f"Full training set size: {len(train_df)}")
print(f"Full test set size: {len(test_df)}")

# Create a smaller subset for quick model comparison
sub_train_df, _ = train_test_split(
    train_df,
    train_size=5000,
    stratify=train_df['label'],
    random_state=42
)
sub_test_df, _ = train_test_split(
    test_df,
    test_size=1000,
    stratify=test_df['label'],
    random_state=42
)

print(f"\nSubset training size for comparison: {len(sub_train_df)}")
print(f"Subset testing size for comparison: {len(sub_test_df)}")

# Convert to Hugging Face Dataset objects
train_dataset_full = Dataset.from_pandas(train_df.reset_index(drop=True))
test_dataset_full = Dataset.from_pandas(test_df.reset_index(drop=True))

train_dataset_sub = Dataset.from_pandas(sub_train_df.reset_index(drop=True))
test_dataset_sub = Dataset.from_pandas(sub_test_df.reset_index(drop=True))

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m494.8/494.8 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.7/374.7 kB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m71.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m55.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m49.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31

In [14]:
# 1. Define Models and Evaluation Function
# ----------------------------------------
model_checkpoints = {
    "DistilBERT": "distilbert-base-uncased",
    "ELECTRA": "google/electra-small-discriminator",
    "DeBERTa-v3": "microsoft/deberta-v3-base", # Substitute for "ModernBERT"
    "ALBERT": "albert-base-v2",               # Substitute for "Ettin"
    "GTE": "thenlper/gte-base"                # As requested. Note: GTE is primarily for embeddings.
}

# This is our custom F1 score function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    f1 = f1_score(labels, predictions, average='weighted')
    return {"f1": f1}

# 2. Loop Through Models for Fine-tuning
# --------------------------------------
results = {}

for name, checkpoint in model_checkpoints.items():
    print(f"--- Starting fine-tuning for {name} ({checkpoint}) ---")
    
    # a. Tokenize Data
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    
    def tokenize_function(examples):
        return tokenizer(examples['review'], padding="max_length", truncation=True, max_length=512)
        
    train_tokenized = train_dataset_sub.map(tokenize_function, batched=True)
    test_tokenized = test_dataset_sub.map(tokenize_function, batched=True)

    # b. Prepare Dataset for Trainer (Corrected Line)
    train_processed = train_tokenized.rename_column("label", "labels").remove_columns(['review', 'sentiment'])
    test_processed = test_tokenized.rename_column("label", "labels").remove_columns(['review', 'sentiment'])
    train_processed.set_format('torch')
    test_processed.set_format('torch')

    # c. Setup Model and Trainer
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
    
    training_args = TrainingArguments(
        output_dir=f'./results_{name}',
        num_train_epochs=1,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,
        learning_rate=2e-5,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        fp16=True,
        dataloader_num_workers=0,
        logging_steps=100,
        report_to=[],
    )
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_processed,
        eval_dataset=test_processed,
        compute_metrics=compute_metrics,
    )
    
    # d. Train and Evaluate
    trainer.train()
    eval_result = trainer.evaluate()
    results[name] = eval_result['eval_f1']
    
    print(f"--- F1 Score for {name}: {eval_result['eval_f1']:.4f} ---")
    
    # e. Clean up memory
    del model, trainer, tokenizer
    gc.collect()
    torch.cuda.empty_cache()

# 3. Find and Display the Best Model
# -----------------------------------
print("\n--- Model Comparison Results ---")
for name, f1 in results.items():
    print(f"{name}: {f1:.4f}")

best_model_name = max(results, key=results.get)
best_model_checkpoint = model_checkpoints[best_model_name]
print(f"\n🏆 Best performing model on the subset: {best_model_name} ({best_model_checkpoint}) with F1 score: {results[best_model_name]:.4f}")

--- Starting fine-tuning for DistilBERT (distilbert-base-uncased) ---


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.2402,0.255422,0.898427




--- F1 Score for DistilBERT: 0.8984 ---
--- Starting fine-tuning for ELECTRA (google/electra-small-discriminator) ---


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/54.2M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/54.2M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,F1
1,0.4902,0.473122,0.834295




--- F1 Score for ELECTRA: 0.8343 ---
--- Starting fine-tuning for DeBERTa-v3 (microsoft/deberta-v3-base) ---


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.1877,0.207711,0.945439




--- F1 Score for DeBERTa-v3: 0.9454 ---
--- Starting fine-tuning for ALBERT (albert-base-v2) ---


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.2497,0.225378,0.916566




--- F1 Score for ALBERT: 0.9166 ---
--- Starting fine-tuning for GTE (thenlper/gte-base) ---


tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/618 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/219M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at thenlper/gte-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.2122,0.205423,0.924111




--- F1 Score for GTE: 0.9241 ---

--- Model Comparison Results ---
DistilBERT: 0.8984
ELECTRA: 0.8343
DeBERTa-v3: 0.9454
ALBERT: 0.9166
GTE: 0.9241

🏆 Best performing model on the subset: DeBERTa-v3 (microsoft/deberta-v3-base) with F1 score: 0.9454


In [None]:
print(f"--- Starting FULL fine-tuning for the best model: {best_model_name} ---")

# 1. Tokenize the FULL dataset
# -----------------------------
tokenizer = AutoTokenizer.from_pretrained(best_model_checkpoint)

def tokenize_function(examples):
    return tokenizer(examples['review'], padding="max_length", truncation=True, max_length=512)

train_tokenized_full = train_dataset_full.map(tokenize_function, batched=True, remove_columns=['review', 'sentiment'])
test_tokenized_full = test_dataset_full.map(tokenize_function, batched=True, remove_columns=['review', 'sentiment'])

# 2. Prepare datasets for the Trainer
# -----------------------------------
train_processed_full = train_tokenized_full.rename_column("label", "labels")
test_processed_full = test_tokenized_full.rename_column("label", "labels")
train_processed_full.set_format('torch')
test_processed_full.set_format('torch')

# 3. Setup Model and Training Arguments for the full run
# ------------------------------------------------------
model = AutoModelForSequenceClassification.from_pretrained(best_model_checkpoint, num_labels=2)

# Adjust arguments for the larger dataset
training_args_full = TrainingArguments(
    output_dir=f'./results_full_{best_model_name}',
    num_train_epochs=1, # One epoch is often enough for large datasets
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    fp16=True,
    dataloader_num_workers=0,
    report_to=[],
)

trainer_full = Trainer(
    model=model,
    args=training_args_full,
    train_dataset=train_processed_full,
    eval_dataset=test_processed_full,
    compute_metrics=compute_metrics,
)

# 4. Train and Evaluate on the full dataset
# -----------------------------------------
trainer_full.train()
final_eval_results = trainer_full.evaluate()

print("\n--- Final Model Evaluation (on full test set) ---")
print(f"Accuracy: {final_eval_results['eval_accuracy']:.4f}")
print(f"F1 Score: {final_eval_results['eval_f1']:.4f}")

# Save the final model and tokenizer
final_model_path = f"./final_model_{best_model_name}"
trainer_full.save_model(final_model_path)
tokenizer.save_pretrained(final_model_path)

In [None]:
print("--- Running Inference on 10 Random Samples ---")

# 1. Load the fine-tuned model using a pipeline
# ----------------------------------------------
# The pipeline automatically handles tokenization, model loading, and output formatting.
final_model_path = f"./final_model_{best_model_name}"
classifier = pipeline("text-classification", model=final_model_path, device=0) # Use 0 for GPU

# 2. Sample 10 reviews from the original test dataframe
# -----------------------------------------------------
sample_reviews = test_df.sample(10, random_state=42)
reviews_list = sample_reviews['review'].tolist()
ground_truth_labels = sample_reviews['sentiment'].tolist()

# 3. Run predictions
# ------------------
predictions = classifier(reviews_list)

# 4. Display results
# ------------------
label_to_sentiment = { "LABEL_1": "positive", "LABEL_0": "negative" }

for i in range(10):
    review = reviews_list[i]
    true_label = ground_truth_labels[i]
    pred_label_str = predictions[i]['label']
    pred_sentiment = label_to_sentiment[pred_label_str]
    pred_score = predictions[i]['score']
    
    print(f"\n--- Review #{i+1} ---")
    print(f"Review: {review[:300]}...") # Print first 300 characters
    print(f"✅ Ground Truth: {true_label}")
    print(f"🤖 Prediction: {pred_sentiment} (Score: {pred_score:.4f})")
    if true_label == pred_sentiment:
        print("Correct! 👍")
    else:
        print("Incorrect. 👎")