In [1]:
!pip install -U datasets



In [2]:
from datasets import load_dataset
import pandas as pd

In [4]:
# Load the UCI SMS Spam dataset (sms_spam) from Hugging Face hub
dataset = load_dataset('ucirvine/sms_spam')


# We'll use 4,000 for train, 1,000 for validation
train_ds = dataset['train'].select(range(4000))
val_ds   = dataset['train'].select(range(4000, 5000))

print(train_ds.features)

{'sms': Value('string'), 'label': ClassLabel(names=['ham', 'spam'])}


In [5]:
%pip install --quiet evaluate transformers[sentencepiece]

In [6]:
df_train = pd.DataFrame(train_ds)
df_val = pd.DataFrame(val_ds)
df_train.head()

Unnamed: 0,sms,label
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...\n,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [7]:
from transformers import GPT2Tokenizer


model_name = 'gpt2'
tokenizer  = GPT2Tokenizer.from_pretrained(model_name)

tokenizer.pad_token = tokenizer.eos_token

def tokenize_fn(examples):
    return tokenizer(
        examples["sms"],
        padding="max_length",
        truncation=True,
        max_length=64
    )

train_tok = train_ds.map(tokenize_fn, batched=True)
val_tok   = val_ds.map(tokenize_fn, batched=True)

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [8]:
import torch
from transformers import GPT2ForSequenceClassification

model = GPT2ForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2, #ham or spam
    pad_token_id=tokenizer.eos_token_id
)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
import evaluate
import numpy as np

accuracy  = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall    = evaluate.load("recall")
f1        = evaluate.load("f1")

def compute_metrics(pred):
    logits, labels = pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy":  accuracy.compute(predictions=preds, references=labels)["accuracy"],
        "precision": precision.compute(predictions=preds, references=labels)["precision"],
        "recall":    recall.compute(predictions=preds, references=labels)["recall"],
        "f1":        f1.compute(predictions=preds, references=labels)["f1"]
    }

In [13]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    do_train=True,                 # turn on training
    do_eval=True,                  # turn on evaluation
    eval_steps=500,                # run .evaluate() every 500 steps
    save_steps=500,                # save a checkpoint every 500 steps
    logging_dir="./logs",
    logging_steps=500,
    # log metrics every 500 steps

    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,

    report_to="none",                # disable integrations
    save_total_limit=1,            # only keep last checkpoint
)

Pour le weight_decay je mets 0.01 pour eviter l'overfitting

In [14]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    compute_metrics=compute_metrics,
)
trainer.train()

#Evaluate
metrics = trainer.evaluate()
print(metrics)

Step,Training Loss


{'eval_loss': 0.07180759310722351, 'eval_accuracy': 0.991, 'eval_precision': 0.9924242424242424, 'eval_recall': 0.9424460431654677, 'eval_f1': 0.966789667896679, 'eval_runtime': 4.1701, 'eval_samples_per_second': 239.805, 'eval_steps_per_second': 7.674, 'epoch': 3.0}


In [15]:
import torch

# Quelques SMS à tester
test_sms = [
    "You’ve won a $1,000 Walmart gift card! Click here to claim now.",
    "Act now! Pre-approved loan offer up to $50,000. No credit check!",
    "Winner! You've been chosen for a cash prize. Call 0900-XXX-XXXX now.",
    "Your number was selected! Text ‘WIN’ to 40404 to get your prize."
]

# Prétraitement (tokenisation)
inputs = tokenizer(
    test_sms,
    padding=True,
    truncation=True,
    max_length=64,
    return_tensors="pt"
)

# Envoie sur GPU si dispo
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
inputs = {k: v.to(device) for k, v in inputs.items()}

# Prédictions
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)

# Affichage des résultats
for sms, pred in zip(test_sms, predictions):
    label = "SPAM" if pred.item() == 1 else "HAM"
    print(f"Message : {sms}\n→ Prédiction : {label}\n")


Message : You’ve won a $1,000 Walmart gift card! Click here to claim now.
→ Prédiction : HAM

Message : Act now! Pre-approved loan offer up to $50,000. No credit check!
→ Prédiction : SPAM

Message : Winner! You've been chosen for a cash prize. Call 0900-XXX-XXXX now.
→ Prédiction : SPAM

Message : Your number was selected! Text ‘WIN’ to 40404 to get your prize.
→ Prédiction : SPAM



Bon j'ai refais * 3 mon model et le premier message n'est pas bon