In [1]:
# Fine-tuning AraBERT pour l'Analyse de Sentiments en Darija (Algérien & Marocain)
## 📦 Installation des dépendances

In [2]:
!pip install datasets
!pip install git+https://github.com/aub-mind/arabert@master#egg=arabert
!pip install scikit-learn

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (

In [3]:
## 📚 Import des bibliothèques
!pip install transformers==4.26.0

Collecting transformers==4.26.0
  Downloading transformers-4.26.0-py3-none-any.whl.metadata (100 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/100.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.3/100.3 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.26.0)
  Downloading tokenizers-0.13.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.26.0-py3-none-any.whl (6.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m78.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.13.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m84.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall:

In [4]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from datasets import Dataset
from arabert.preprocess import ArabertPreprocessor

In [5]:
## 🧹 Chargement et prétraitement du dataset

In [6]:
# Chargement CSV
df = pd.read_csv("/content/darija_dataset_fusionne_cleaned.csv")

In [7]:
print(df.columns)

Index(['clean_text', 'final_label', 'source_langue', 'pays'], dtype='object')


In [11]:
from transformers import AutoTokenizer

# Utiliser un modèle existant et public
model_name = "aubmindlab/bert-base-arabertv02"  # 🟢 ce modèle fonctionne

# Charger le tokenizer sans erreur
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Nettoyage simple du texte
df['clean_text'] = df['clean_text'].astype(str).str.strip()


tokenizer_config.json:   0%|          | 0.00/381 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/825k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.64M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [12]:
# Encodage des labels
labels = {label: i for i, label in enumerate(df['final_label'].unique())}
df['final_label'] = df['final_label'].map(labels)


In [13]:
from sklearn.model_selection import train_test_split

train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['clean_text'],
    df['final_label'],
    test_size=0.2,
    stratify=df['final_label'],
    random_state=42
)


In [14]:
# Supprimer les doublons et les tweets trop courts
df = df.drop_duplicates(subset=["clean_text"])
df = df[df['clean_text'].str.split().apply(len) > 4]
df = df.dropna(subset=["clean_text", "final_label"])

In [15]:
# Dataset HuggingFace
train_dataset = Dataset.from_dict({'text': train_texts, 'label': train_labels})
val_dataset = Dataset.from_dict({'text': val_texts, 'label': val_labels})

In [16]:
## 🧠 Tokenisation avec AraBERT

In [18]:
model_name = "aubmindlab/bert-base-arabertv02"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [19]:
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True)

In [20]:
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/13905 [00:00<?, ? examples/s]

Map:   0%|          | 0/3477 [00:00<?, ? examples/s]

In [21]:
## 📦 Modèle et configuration d'entraînement

In [22]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(labels))
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of the model checkpoint at aubmindlab/bert-base-arabertv02 were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-bas

In [23]:
import os
os.environ["WANDB_DISABLED"] = "true"

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,  # réduit pour améliorer la généralisation
    per_device_eval_batch_size=8,
    num_train_epochs=6,  # augmenter les époques
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [24]:
from sklearn.metrics import f1_score

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    return {
        "accuracy": accuracy_score(p.label_ids, preds),
        "f1_weighted": f1_score(p.label_ids, preds, average='weighted')
    }


In [25]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)


In [None]:
## 🚀 Entraînement

In [26]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 13905
  Num Epochs = 6
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 10434
  Number of trainable parameters = 135195651
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Weighted
1,0.748,0.803378,0.622088,0.614982
2,0.6974,0.794795,0.628128,0.622028
3,0.7033,0.78407,0.672994,0.672284
4,0.7092,0.770724,0.704918,0.70536
5,0.5732,0.804154,0.716135,0.715656
6,0.4276,0.872519,0.726201,0.726134


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3477
  Batch size = 8
Saving model checkpoint to ./results/checkpoint-1739
Configuration saved in ./results/checkpoint-1739/config.json
Model weights saved in ./results/checkpoint-1739/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1739/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1739/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3477
  Batch size = 8
Savin

TrainOutput(global_step=10434, training_loss=0.6335161420973138, metrics={'train_runtime': 1574.8496, 'train_samples_per_second': 52.976, 'train_steps_per_second': 6.625, 'total_flos': 2841595619538942.0, 'train_loss': 0.6335161420973138, 'epoch': 6.0})

In [27]:
## 📊 Évaluation

In [28]:
metrics = trainer.evaluate()
print(metrics)

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3477
  Batch size = 8


{'eval_loss': 0.7707238793373108, 'eval_accuracy': 0.7049180327868853, 'eval_f1_weighted': 0.7053599601033936, 'eval_runtime': 14.8234, 'eval_samples_per_second': 234.561, 'eval_steps_per_second': 29.345, 'epoch': 6.0}


In [29]:
# Sauvegarder modèle et tokenizer
model_path = "./arabert-sentiment-model"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

# Créer un zip et proposer le téléchargement
!zip -r arabert_sentiment_model.zip arabert-sentiment-model
from google.colab import files
files.download("arabert_sentiment_model.zip")


Saving model checkpoint to ./arabert-sentiment-model
Configuration saved in ./arabert-sentiment-model/config.json
Model weights saved in ./arabert-sentiment-model/pytorch_model.bin
tokenizer config file saved in ./arabert-sentiment-model/tokenizer_config.json
Special tokens file saved in ./arabert-sentiment-model/special_tokens_map.json
tokenizer config file saved in ./arabert-sentiment-model/tokenizer_config.json
Special tokens file saved in ./arabert-sentiment-model/special_tokens_map.json


  adding: arabert-sentiment-model/ (stored 0%)
  adding: arabert-sentiment-model/config.json (deflated 51%)
  adding: arabert-sentiment-model/pytorch_model.bin (deflated 7%)
  adding: arabert-sentiment-model/training_args.bin (deflated 51%)
  adding: arabert-sentiment-model/vocab.txt (deflated 65%)
  adding: arabert-sentiment-model/special_tokens_map.json (deflated 42%)
  adding: arabert-sentiment-model/tokenizer.json (deflated 74%)
  adding: arabert-sentiment-model/tokenizer_config.json (deflated 42%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [47]:
import numpy as np

# Ton dictionnaire inverse des labels
id2label = {0: 'negative', 1: 'neutral', 2: 'positive'}

text = "علابالي كاين وكابن أعز الناس"
result = classifier(text)[0]

# Convertir manuellement le label si nécessaire
label_id = int(result['label']) if isinstance(result['label'], (int, np.integer, np.int64)) else result['label']
label_name = id2label[label_id] if label_id in id2label else label_id

# Recréer le résultat final
final_result = [{'label': label_name, 'score': result['score']}]
print(final_result)


[{'label': 'positive', 'score': 0.9958439469337463}]


In [48]:
import numpy as np

# Ton dictionnaire inverse des labels
id2label = {0: 'negative', 1: 'neutral', 2: 'positive'}

text ="تفو عليك"
result = classifier(text)[0]

# Convertir manuellement le label si nécessaire
label_id = int(result['label']) if isinstance(result['label'], (int, np.integer, np.int64)) else result['label']
label_name = id2label[label_id] if label_id in id2label else label_id

# Recréer le résultat final
final_result = [{'label': label_name, 'score': result['score']}]
print(final_result)


[{'label': 'negative', 'score': 0.6323014497756958}]


In [57]:
import numpy as np

# Ton dictionnaire inverse des labels
id2label = {0: 'negative', 1: 'neutral', 2: 'positive'}

text = "واش نتي مريضة فراسك"
result = classifier(text)[0]

# Convertir manuellement le label si nécessaire
label_id = int(result['label']) if isinstance(result['label'], (int, np.integer, np.int64)) else result['label']
label_name = id2label[label_id] if label_id in id2label else label_id

# Recréer le résultat final
final_result = [{'label': label_name, 'score': result['score']}]
print(final_result)


[{'label': 'negative', 'score': 0.9905630350112915}]


In [58]:
import numpy as np

# Ton dictionnaire inverse des labels
id2label = {0: 'negative', 1: 'neutral', 2: 'positive'}

text = "ليوما متغديتش"
result = classifier(text)[0]

# Convertir manuellement le label si nécessaire
label_id = int(result['label']) if isinstance(result['label'], (int, np.integer, np.int64)) else result['label']
label_name = id2label[label_id] if label_id in id2label else label_id

# Recréer le résultat final
final_result = [{'label': label_name, 'score': result['score']}]
print(final_result)


[{'label': 'neutral', 'score': 0.7789591550827026}]


In [63]:
import numpy as np

# Ton dictionnaire inverse des labels
id2label = {0: 'negative', 1: 'neutral', 2: 'positive'}

text = "lah insar sidna"
result = classifier(text)[0]

# Convertir manuellement le label si nécessaire
label_id = int(result['label']) if isinstance(result['label'], (int, np.integer, np.int64)) else result['label']
label_name = id2label[label_id] if label_id in id2label else label_id

# Recréer le résultat final
final_result = [{'label': label_name, 'score': result['score']}]
print(final_result)


[{'label': 'positive', 'score': 0.7576006054878235}]


In [69]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch
import numpy as np

# ✅ Charger le modèle et le tokenizer depuis le dossier sauvegardé
model_path = "/content/arabert-sentiment-model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# ✅ Charger la pipeline (GPU si disponible)
device = 0 if torch.cuda.is_available() else -1
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, device=device)

# ✅ Re-mappage des labels (à adapter selon l’ordre dans ton dataset)
# Exemple : labels = {"negative": 0, "neutral": 1, "positive": 2}
id2label = {0: "negative", 1: "neutral", 2: "positive"}

# ✅ Exemple de prédiction
text = "nti 3ziza 3liya"
result = classifier(text)[0]

# ✅ Convertir l'index en label lisible
label_index = int(result['label'].replace("LABEL_", "")) if "LABEL_" in result['label'] else int(result['label'])
result['label'] = id2label[label_index]

# ✅ Afficher le résultat final
print(result)


loading file vocab.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file /content/arabert-sentiment-model/config.json
Model config BertConfig {
  "_name_or_path": "/content/arabert-sentiment-model",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "floa

{'label': 'positive', 'score': 0.6088584661483765}
