# 🧪 Test avec le modèle pré-entraîné tel quel

In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Base DistilBERT checkpoint, used here without any fine-tuning.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Put a two-class classification head on top of the encoder. The head's
# weights are not part of the checkpoint, so they are newly initialised.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

def predict_sentiment(text):
    """Classify ``text`` with the module-level ``model`` and ``tokenizer``.

    Args:
        text: The review to classify.

    Returns:
        "Positif" when the highest-scoring class index is 1, otherwise
        "Négatif".
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        model_output = model(**encoded)
    label_id = model_output.logits.argmax(dim=1).item()
    if label_id == 1:
        return "Positif"
    return "Négatif"

# Hand-written reviews used to probe the classifier.
examples = [
    "This movie was fantastic! I really enjoyed it.",
    "I hated this film, it was so bad.",
    "The plot was interesting, but the acting was terrible.",
    "An absolute masterpiece, one of the best movies I have ever seen.",
    "It was okay, nothing special but not the worst either.",
    "I fell asleep halfway through, it was so boring.",
    "Amazing cinematography and great performances from the cast.",
]

# Print each review followed by its predicted sentiment and a blank line.
for sentence in examples:
    print(f"Texte: {sentence}")
    sentiment = predict_sentiment(sentence)
    print(f"Sentiment prédit: {sentiment}\n")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Texte: This movie was fantastic! I really enjoyed it.
Sentiment prédit: Positif

Texte: I hated this film, it was so bad.
Sentiment prédit: Positif

Texte: The plot was interesting, but the acting was terrible.
Sentiment prédit: Positif

Texte: An absolute masterpiece, one of the best movies I have ever seen.
Sentiment prédit: Positif

Texte: It was okay, nothing special but not the worst either.
Sentiment prédit: Positif

Texte: I fell asleep halfway through, it was so boring.
Sentiment prédit: Positif

Texte: Amazing cinematography and great performances from the cast.
Sentiment prédit: Positif



# 📂 Chargement du jeu de données IMDB

In [2]:
from datasets import load_dataset
import random

# Load the IMDB dataset through the `datasets` library.
dataset = load_dataset("imdb")

# Fixed seed so the shuffled training subset is identical on every run.
SAMPLING_SEED = 42


def _first_n_with_label(split, label, n):
    """Return the first ``n`` examples of ``split`` whose "label" equals ``label``.

    Iteration stops as soon as ``n`` matches have been collected, instead of
    materialising every matching example of the split and slicing afterwards.

    Args:
        split: Iterable of examples, each indexable by "label".
        label: Label value to keep.
        n: Maximum number of examples to return.

    Returns:
        A list with at most ``n`` matching examples, in their original order.
    """
    selected = []
    if n <= 0:
        return selected
    for ex in split:
        if ex["label"] == label:
            selected.append(ex)
            if len(selected) >= n:
                break
    return selected


# Balanced training subset: 250 positive and 250 negative reviews.
positive_samples = _first_n_with_label(dataset["train"], 1, 250)
negative_samples = _first_n_with_label(dataset["train"], 0, 250)
small_train_dataset = positive_samples + negative_samples

# Shuffle so the examples are not grouped by label. A dedicated, seeded
# generator keeps the order reproducible without touching the global
# `random` state.
random.Random(SAMPLING_SEED).shuffle(small_train_dataset)

# Balanced test subset: 50 positive and 50 negative reviews.
positive_test_samples = _first_n_with_label(dataset["test"], 1, 50)
negative_test_samples = _first_n_with_label(dataset["test"], 0, 50)
small_test_dataset = positive_test_samples + negative_test_samples

# Helper to display one training example.
def print_sample(index=0):
    """Print the text and label of ``small_train_dataset[index]``.

    Args:
        index: Position of the example in ``small_train_dataset``
            (defaults to 0).
    """
    print(f"Exemple index {index} :")
    sample = small_train_dataset[index]
    print("Texte :", sample["text"])
    if sample["label"] == 1:
        label_name = "Positif"
    else:
        label_name = "Négatif"
    print("Label :", label_name)

# Show the first example of the training subset.
print_sample(index=0)

Exemple index 0 :
Texte : I found this to be a so-so romance/drama that has a nice ending and a generally nice feel to it. It's not a Hallmark Hall Of Fame-type family film with sleeping-before-marriage considered "normal" behavior but considering it stars Jane Fonda and Robert De Niro, I would have expected a lot rougher movie, at least language-wise. <br /><br />The most memorable part of the film is the portrayal of how difficult it must be to learn how to read and write when you are already an adult. That's the big theme of the movie and it involves some touching scenes but, to be honest, the film isn't that memorable.<br /><br />It's still a fairly mild, nice tale that I would be happy to recommend.
Label : Positif


# 🔧 Pré-traitement des données

In [3]:
from transformers import AutoTokenizer
from datasets import Dataset

# Load the DistilBERT tokenizer.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenize both subsets. Sequences are truncated to 512 tokens but NOT padded
# here: padding at this stage would stretch every example to the longest one
# in the subset. Padding is left to the DataCollatorWithPadding handed to the
# Trainer, which pads each batch only to that batch's longest sequence.
tokenized_train = tokenizer(
    [ex["text"] for ex in small_train_dataset],
    truncation=True,
    max_length=512,
)
tokenized_test = tokenizer(
    [ex["text"] for ex in small_test_dataset],
    truncation=True,
    max_length=512,
)

# Attach the labels under the "labels" key.
tokenized_train["labels"] = [ex["label"] for ex in small_train_dataset]
tokenized_test["labels"] = [ex["label"] for ex in small_test_dataset]

# Convert to Dataset objects.
tokenized_train_dataset = Dataset.from_dict(tokenized_train)
tokenized_test_dataset = Dataset.from_dict(tokenized_test)

# Keep the familiar {"train": ..., "test": ...} layout.
tokenized_datasets = {
    "train": tokenized_train_dataset,
    "test": tokenized_test_dataset,
}

# 🛠️ Préparation des données pour l'entraînement

## Utilisation de DataCollatorWithPadding

In [4]:
from transformers import DataCollatorWithPadding

# Collator that pads the examples of a batch and returns PyTorch tensors.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    return_tensors="pt",
)

# 🤖 Chargement du modèle pré-entraîné

In [5]:
from transformers import AutoModelForSequenceClassification

# Fresh DistilBERT with a two-class classification head, to be fine-tuned.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# ⚙️ Définition des paramètres d'entraînement

In [6]:
from transformers import TrainingArguments, Trainer

def compute_metrics(eval_pred):
    """Compute classification accuracy from the Trainer's evaluation output.

    Without a metric function the Trainer reports only the evaluation loss,
    which says little about how well a classifier performs.

    Args:
        eval_pred: Pair ``(logits, labels)`` as supplied by the Trainer;
            both are expected to be NumPy arrays.

    Returns:
        A dict ``{"accuracy": <fraction of correct predictions>}``.
    """
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return {"accuracy": float((predictions == labels).mean())}


# Training hyper-parameters: evaluate and save a checkpoint at the end of
# each of the 2 epochs.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
)

# Wire the model, the data and the collator into the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the Trainer configured above. As the last expression
# of the cell, the value returned by train() is displayed by the notebook.
trainer.train()

# 📊 Évaluation sur les données de test

In [None]:
# Evaluate on the eval_dataset given to the Trainer and print the returned
# metrics.
results = trainer.evaluate()
print(results)

# 👩‍🔬 Test du modèle sur des exemples personnalisés

In [None]:
import torch

def predict_sentiment(text):
    """Classify ``text`` with the module-level ``model`` and ``tokenizer``.

    The model is switched to evaluation mode and the tokenized inputs are
    moved to the model's device before the forward pass. After training, the
    model may live on an accelerator and may still be in training mode
    (dropout active); the tokenizer always returns CPU tensors.

    Args:
        text: The review to classify.

    Returns:
        "Positif" when the highest-scoring class index is 1, otherwise
        "Négatif".
    """
    # Disable dropout so predictions are deterministic.
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Put every input tensor on the same device as the model's weights to
    # avoid a device-mismatch error.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=1).item()
    return "Positif" if predicted_class == 1 else "Négatif"

# Hand-written reviews used to probe the fine-tuned classifier.
examples = [
    "This movie was fantastic! I really enjoyed it.",
    "I hated this film, it was so bad.",
    "The plot was interesting, but the acting was terrible.",
    "An absolute masterpiece, one of the best movies I have ever seen.",
    "It was okay, nothing special but not the worst either.",
    "I fell asleep halfway through, it was so boring.",
    "Amazing cinematography and great performances from the cast.",
    "The story was confusing and hard to follow.",
    "A complete waste of time, I regret watching it.",
    "I laughed so much! This comedy was hilarious.",
    "The soundtrack was beautiful, but the script was weak.",
    "This horror movie actually scared me, great job!",
    "Way too predictable, I saw every twist coming.",
    "I wouldn't recommend this to anyone.",
    "Surprisingly good! I didn't expect to like it this much.",
    "The ending was disappointing, but the rest was solid.",
    "One of the worst movies of the year.",
    "A fresh and original take on the genre.",
    "The characters felt real and relatable.",
    "It tried too hard to be deep but ended up being pretentious.",
]

# Print each review followed by its predicted sentiment and a blank line.
for sentence in examples:
    print(f"Texte: {sentence}")
    sentiment = predict_sentiment(sentence)
    print(f"Sentiment prédit: {sentiment}\n")