In [2]:
# ===============================================
# STEP 1: Install & Import Dependencies
# ===============================================
!pip install transformers datasets scikit-learn torch --quiet

import torch
from datasets import load_dataset
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
import numpy as np
from sklearn.metrics import accuracy_score

print("✅ Libraries loaded!")

# ===============================================
# STEP 2: Load Dataset (IMDb movie reviews)
# ===============================================
dataset = load_dataset("imdb")
print(dataset)

# ===============================================
# STEP 3: Tokenization
# ===============================================
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(2000))   # small subset for speed
test_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

print("✅ Tokenization complete!")

# ===============================================
# STEP 4: Load Model
# ===============================================
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# ===============================================
# STEP 5: Training Arguments (compatible version)
# ===============================================
training_args = TrainingArguments(
    output_dir="./results",
    evaluate_during_training=True,   # old flag instead of evaluation_strategy
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_steps=50
)

# ===============================================
# STEP 6: Metrics Function
# ===============================================
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, preds)}

# ===============================================
# STEP 7: Trainer
# ===============================================
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# ===============================================
# STEP 8: Train the Model
# ===============================================
trainer.train()

# ===============================================
# STEP 9: Evaluate
# ===============================================
results = trainer.evaluate()
print("✅ Evaluation:", results)

# ===============================================
# STEP 10: Inference
# ===============================================
sample = "The movie was amazing and I loved it!"
inputs = tokenizer(sample, return_tensors="pt", truncation=True, padding=True)
outputs = model(**inputs)
prediction = torch.argmax(outputs.logits)
print("✅ Sentiment:", "Positive" if prediction.item() == 1 else "Negative")


✅ Libraries loaded!
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Tokenization complete!


TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluate_during_training'