# TP 4 - BERT (Bidirectional Encoder Representations from Transformers)

L'objectif de ce TP est d'utiliser BERT pour l'analyse de sentiments en affinant (*fine-tuning*) le modèle pré-entraîné `bert-base-uncased` sur le jeu de données IMDB.

In [None]:
# 1. Import packages
!pip install datasets transformers accelerate evaluate

import inspect

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import evaluate
from sklearn.metrics import confusion_matrix, roc_curve, auc, accuracy_score
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# Run-size knobs, kept small so the whole notebook finishes quickly.
num_samples = 500  # -1 switches to the full dataset (much slower)
num_epochs = 2  # demo value; the assignment asks for 10, which takes hours

print("Packages imported.")

In [None]:
# 2. Load dataset
# Fetch the dataset registered under the name 'imdb'.
dataset = load_dataset(path='imdb')

# Load tokenizer
# The tokenizer is taken from the same checkpoint name that the model cell uses.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased',
)

In [None]:
# 3. Explore the data
# to_pandas() is the documented Dataset -> DataFrame conversion; it avoids
# feeding the split through pandas' generic iterable handling.
train_df = dataset['train'].to_pandas()
display(train_df.head())  # rich table rendering instead of a plain-text print

# Explicit figure/axes so the plot does not depend on pyplot's global state.
fig, ax = plt.subplots()
sns.countplot(x='label', data=train_df, ax=ax)
ax.set_title('Class distribution')
# Label meaning follows the inference cell, which reports class 1 as positive.
ax.set_xlabel('Label (0 = negative, 1 = positive)')
ax.set_ylabel('Number of reviews')
plt.show()

In [None]:
# 4. Preprocessing: run the tokenizer over every split
def tokenize_function(examples):
    """Tokenize the "text" entries of a batch with the notebook's tokenizer.

    The tokenizer is asked to truncate over-long inputs and to pad with the
    "max_length" strategy, so every encoded sequence has the same length.

    Args:
        examples: mapping that exposes the raw reviews under the "text" key.

    Returns:
        Whatever the tokenizer call returns for that batch of texts.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded


# batched=True makes map() hand groups of examples to tokenize_function.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [None]:
# 5. Train – test data.
def _shuffle_and_limit(split, limit):
    """Shuffle `split` with a fixed seed and keep at most `limit` rows.

    Args:
        split: dataset split exposing `shuffle`, `select` and `len`.
        limit: maximum number of rows to keep; any negative value (the
            notebook convention is -1) keeps the whole split.

    Returns:
        The shuffled split, truncated to `limit` rows when `limit` is
        non-negative.
    """
    shuffled = split.shuffle(seed=42)
    if limit < 0:
        return shuffled
    # Cap at the split size: select() rejects indices past the end, so an
    # oversized num_samples would otherwise abort the notebook here.
    return shuffled.select(range(min(limit, len(shuffled))))


small_train_dataset = _shuffle_and_limit(tokenized_datasets["train"], num_samples)
small_eval_dataset = _shuffle_and_limit(tokenized_datasets["test"], num_samples)

print(f"Train size: {len(small_train_dataset)}")
print(f"Test size: {len(small_eval_dataset)}")

In [None]:
# 6. Load pre trained model and run the training
# Seed torch before loading: the 2-label classification head is not part of
# the pretrained checkpoint, so its initial weights are drawn at load time.
torch.manual_seed(42)
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# transformers renamed `evaluation_strategy` -> `eval_strategy` and
# `no_cuda` -> `use_cpu`. The install is unpinned, so pick whichever spelling
# the installed TrainingArguments accepts instead of hard-coding one.
_supported_args = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_arg = "eval_strategy" if "eval_strategy" in _supported_args else "evaluation_strategy"
_force_cpu_arg = "use_cpu" if "use_cpu" in _supported_args else "no_cuda"

training_args = TrainingArguments(
    output_dir="test_trainer",
    num_train_epochs=num_epochs,
    **{
        _eval_strategy_arg: "epoch",  # evaluate at the end of every epoch
        _force_cpu_arg: not torch.cuda.is_available(),  # Use GPU if available
    },
)

metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for the Trainer's evaluation loop.

    Args:
        eval_pred: pair that unpacks into `(logits, labels)`.

    Returns:
        The result of the accuracy metric for the arg-max class of each row
        of `logits` against `labels`.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
# 7. Evaluation
predictions = trainer.predict(small_eval_dataset)
preds = np.argmax(predictions.predictions, axis=-1)
# Use the labels returned alongside the predictions rather than re-reading
# the dataset column once per metric.
true_labels = predictions.label_ids

print("Accuracy:", accuracy_score(true_labels, preds))

cm = confusion_matrix(true_labels, preds)
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', ax=ax)
ax.set_title('Confusion Matrix')
# confusion_matrix puts true classes on rows and predicted classes on columns.
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
plt.show()

In [None]:
# 8. Inference on a new sample
text = "This is a fantastic movie. I really enjoyed it."
# A single sentence needs no padding; truncation still caps it at 512 tokens.
inputs = tokenizer(text, truncation=True, max_length=512, return_tensors="pt")

# Move model and inputs to the same device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Switch to inference mode explicitly so dropout is disabled regardless of
# which cells were run before this one.
model.eval()
inputs = {k: v.to(device) for k, v in inputs.items()}

with torch.no_grad():
    logits = model(**inputs).logits

# Arg-max over the class axis (the batch holds exactly one sample).
predicted_class_id = logits.argmax(dim=-1).item()
print(f"Text: {text}")
print(f"Pred: {predicted_class_id} ({'Positive' if predicted_class_id == 1 else 'Negative'})")