In [1]:
# Transformers (BERT / tokenization / training)
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)

# Datasets Hugging Face
from datasets import Dataset

# PyTorch (backend de BERT)
import torch




In [2]:
# Pipeline de résumé
from transformers import pipeline

# (optionnel mais souvent utile)
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


In [3]:
import pandas as pd
import numpy as np


In [4]:
# ==============================
# Hugging Face cache (Windows-friendly)
# ==============================
import os

# Cache local pour éviter les re-téléchargements et les blocages Windows
HF_CACHE_DIR = os.path.join(os.getcwd(), "hf_cache")
os.makedirs(HF_CACHE_DIR, exist_ok=True)

# Désactive le warning symlinks (souvent bruyant sur Windows)
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

# Force l'utilisation du cache local
os.environ["HF_HOME"] = HF_CACHE_DIR
os.environ["HF_HUB_CACHE"] = HF_CACHE_DIR
os.environ["TRANSFORMERS_CACHE"] = HF_CACHE_DIR


In [5]:
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

df = pd.read_csv("dataset.csv")

# Aperçu des données
df.head()


Unnamed: 0,text,label_encoded
0,drive to 'save' festive holidays efforts are b...,0
1,brown hits back in blair rift row gordon brown...,0
2,holmes is hit by hamstring injury kelly holmes...,1
3,the future in your pocket if you are a geek or...,2
4,o'sullivan could run in worlds sonia o'sulliva...,1


In [7]:
test_df = pd.read_csv("test_bert.csv")

# Harmoniser le nom de la colonne label
if 'label' in test_df.columns and 'label_encoded' not in test_df.columns:
    test_df = test_df.rename(columns={'label': 'label_encoded'})

# Vérif rapide
print('train columns:', df.columns.tolist())
print('test  columns:', test_df.columns.tolist())


train columns: ['text', 'label_encoded']
test  columns: ['text', 'label_encoded']


In [15]:
test_df["label_encoded"].value_counts()

Unnamed: 0_level_0,count
label_encoded,Unnamed: 1_level_1
0,50
1,50
2,50


In [16]:
df["label_encoded"].value_counts()

Unnamed: 0_level_0,count
label_encoded,Unnamed: 1_level_1
0,200
1,200
2,200


### Petit Rappel : Pour les classes 0,1,2 :

 0 ---> Politics

 1 ---> Sports

 2 ---> tech

### Répartition des classes

In [17]:

# Distribution des classes
df['label_encoded'].value_counts()




Unnamed: 0_level_0,count
label_encoded,Unnamed: 1_level_1
0,200
1,200
2,200


In [18]:
label_map = {
    0: "Politics",
    1: "Sports",
    2: "Tech"
}

df["label_name"] = df["label_encoded"].map(label_map)
df["label_name"].value_counts()


Unnamed: 0_level_0,count
label_name,Unnamed: 1_level_1
Politics,200
Sports,200
Tech,200


## TOKENISATION :

On a choisi le modèle RoBERTa-base

In [19]:
from transformers import AutoTokenizer

MODEL_NAME = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=HF_CACHE_DIR)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [20]:
MAX_LENGTH = 256


In [21]:
from datasets import Dataset

# Train: tout dataset.csv (déjà chargé dans df)
train_ds = Dataset.from_pandas(df)

# Test: tout test_bert.csv (chargé dans test_df)
test_ds  = Dataset.from_pandas(test_df)

train_ds, test_ds


(Dataset({
     features: ['text', 'label_encoded', 'label_name'],
     num_rows: 600
 }),
 Dataset({
     features: ['text', 'label_encoded'],
     num_rows: 150
 }))

## Tokenisation :

In [22]:
def tokenize_batch(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH
    )

train_ds = train_ds.map(tokenize_batch, batched=True)
test_ds  = test_ds.map(tokenize_batch, batched=True)

train_ds[0]


Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

{'text': "drive to 'save' festive holidays efforts are being made to 'protect' workers' days off on christmas day and new year's day. support is being sought for a bill which would ensure that large retailers in scotland remain closed on christmas day and 1 january. the usdaw trade union said shop workers should be able to enjoy a break with their families. msp karen whitefield wants to ensure only those whose roles are essential are at work over the festive season. in recent years more stores have been opening on traditional holidays with some starting their endofyear sale on christmas day or new year's day. ms whitefield said i have found members of the public to be very supportive when i have been campaigning on the streets. the early evidence shows quite clearly that the vast majority of people believe that shop workers should be given these two special days to spend with friends and family. usdaw general secretary john hannett added christmas day and new year's day are special day

In [23]:
train_ds = train_ds.rename_column("label_encoded", "labels")
test_ds  = test_ds.rename_column("label_encoded", "labels")

# Garder uniquement ce qui sert au modèle
cols_to_keep = ["input_ids", "attention_mask", "labels"]
train_ds = train_ds.remove_columns([c for c in train_ds.column_names if c not in cols_to_keep])
test_ds  = test_ds.remove_columns([c for c in test_ds.column_names if c not in cols_to_keep])

train_ds, test_ds


(Dataset({
     features: ['labels', 'input_ids', 'attention_mask'],
     num_rows: 600
 }),
 Dataset({
     features: ['labels', 'input_ids', 'attention_mask'],
     num_rows: 150
 }))

### Classification en utilisant BERT

In [24]:
from transformers import AutoModelForSequenceClassification

label_map = {
    0: "Politics",
    1: "Sports",
    2: "Tech"
}
id2label = {k: v for k, v in label_map.items()}
label2id = {v: k for k, v in label_map.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    cache_dir=HF_CACHE_DIR,
    num_labels=3,
    id2label=id2label,
    label2id=label2id
)


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
# ==============================
# Entraînement + évaluation BERT / RoBERTa
# ==============================

from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import numpy as np

# Mapping labels (cohérent avec ton modèle)
id2label = {0: "Politics", 1: "Sports", 2: "Tech"}
label2id = {"Politics": 0, "Sports": 1, "Tech": 2}

# Fonction de métriques
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_macro": f1_score(labels, preds, average="macro")
    }

# Arguments d'entraînement
training_args = TrainingArguments(
    output_dir="roberta_news_clf",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=50,
    report_to="none",
    eval_strategy="epoch",     # ⚠️ important (pas evaluation_strategy)
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro"
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# ===== LANCEMENT =====
trainer.train()

# ===== ÉVALUATION FINALE =====
results = trainer.evaluate()
print("Résultats globaux :", results)

# ===== RAPPORT DÉTAILLÉ =====
pred = trainer.predict(test_ds)
y_true = pred.label_ids
y_pred = np.argmax(pred.predictions, axis=-1)

print("\nClassification report :")
print(classification_report(y_true, y_pred, target_names=["Politics", "Sports", "Tech"]))

print("\nConfusion matrix :")
print(confusion_matrix(y_true, y_pred))


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,No log,0.001487,1.0,1.0
2,0.025400,0.000623,1.0,1.0




Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,No log,0.001487,1.0,1.0
2,0.025400,0.000623,1.0,1.0
3,0.006900,0.000539,1.0,1.0




Résultats globaux : {'eval_loss': 0.0014865838456898928, 'eval_accuracy': 1.0, 'eval_f1_macro': 1.0, 'eval_runtime': 85.3555, 'eval_samples_per_second': 1.757, 'eval_steps_per_second': 0.117, 'epoch': 3.0}





Classification report :
              precision    recall  f1-score   support

    Politics       1.00      1.00      1.00        50
      Sports       1.00      1.00      1.00        50
        Tech       1.00      1.00      1.00        50

    accuracy                           1.00       150
   macro avg       1.00      1.00      1.00       150
weighted avg       1.00      1.00      1.00       150


Confusion matrix :
[[50  0  0]
 [ 0 50  0]
 [ 0  0 50]]
