# Pipeline Complet — Détection de Langue (Darija CS Detection)

Ce notebook consolide l'ensemble du pipeline :

1. **Téléchargement** des datasets
2. **Nettoyage et normalisation**
3. **Séparation** train/valid/test
4. **Annotation automatique** (silver labels)
5. **Entraînement SVM** (baseline)
6. **Fine-tuning Transformer** (XLM-RoBERTa)
7. **Évaluation** et test de robustesse

> **Classes cibles :** AR_DAR, AR_MSA, EN, FR

## 1. Imports et Configuration

In [None]:
# --- Standard library ---
import os
import re
import sys

# --- Third-party ---
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
)
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.svm import LinearSVC
from torch.utils.data import Dataset as TorchDataset
from tqdm.notebook import tqdm
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
    set_seed,
)

# --- Project-local ---
# The working directory is pushed to the front of sys.path so that the
# `src` package resolves when the notebook is launched from the project root.
sys.path.insert(0, os.getcwd())
from src.normalizer import is_valid_text, normalize
from src.utils import ensure_dir, load_config

# Global configuration and the seed shared by every later cell.
config = load_config()
SEED = config["project"]["seed"]
set_seed(SEED)
print("✓ Imports OK")

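## 2. Téléchargement des Datasets

- **Atlaset** (HuggingFace `atlasia/Atlaset`) — Darija marocain
- **papluca/language-identification** — Français & Anglais
- **NADI 2022** — Arabe standard (MSA) ; ce corpus n'est pas téléchargé par la cellule ci-dessous : il est chargé à l'étape 3 depuis un CSV local (`nadi_texts.csv`) s'il est présent.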

In [None]:
# ── 2.1 Atlaset (Darija) ──
# The corpus is fetched only when no cached parquet file exists yet.
atlaset_path = config["paths"]["data"]["atlaset"]
if os.path.exists(atlaset_path):
    print(f"✓ Atlaset déjà présent ({atlaset_path})")
else:
    print("Téléchargement de Atlaset...")
    df_atlaset = load_dataset("atlasia/Atlaset", split="train").to_pandas()
    # NOTE(review): ensure_dir presumably creates the parent directory of the
    # target file — confirm against src.utils.
    ensure_dir(atlaset_path)
    df_atlaset.to_parquet(atlaset_path, index=False)
    print(f"✓ Atlaset sauvegardé ({len(df_atlaset)} lignes)")

# ── 2.2 FR / EN ──
# Only the French and English rows of every split are kept; their labels are
# upper-cased ("fr" -> "FR", "en" -> "EN").
fren_path = config["paths"]["data"]["fr_en"]
if os.path.exists(fren_path):
    print(f"✓ FR/EN déjà présent ({fren_path})")
else:
    print("Téléchargement du dataset FR/EN...")
    lang_id = load_dataset("papluca/language-identification")
    kept_parts = []
    for split_name in lang_id.keys():
        part = lang_id[split_name].to_pandas()
        is_fr_or_en = part["labels"].isin(["fr", "en"])
        part = part[is_fr_or_en].copy()
        part["labels"] = part["labels"].str.upper()
        kept_parts.append(part)
    df_fren = pd.concat(kept_parts, ignore_index=True).rename(
        columns={"text": "text_raw", "labels": "label_original"}
    )
    ensure_dir(fren_path)
    df_fren.to_parquet(fren_path, index=False)
    print(f"✓ FR/EN sauvegardé ({len(df_fren)} lignes)")

print("✓ Datasets prêts")

## 3. Nettoyage et Normalisation

Étapes de prétraitement :

- Suppression des URLs et des mentions
- Mise en minuscules des caractères latins
- Normalisation de l'arabe (alef, ta marbuta, diacritiques)
- Normalisation de l'arabizi (7→ح, 3→ع, etc.)
- Filtrage des textes invalides

In [None]:
# Load every available source and align it on one shared schema
# (id / text_raw / label_original / source) before cleaning.
keep_cols = ["id", "text_raw", "label_original", "source"]
dfs = []

# Atlaset: optionally down-sampled, every row labelled AR_DAR.
df_a = pd.read_parquet(config["paths"]["data"]["atlaset"])
sample_size = config.get("preprocessing", {}).get("atlaset_sample_size", 100000)
if len(df_a) > sample_size:
    df_a = df_a.sample(n=sample_size, random_state=SEED)
if "text" in df_a.columns:
    df_a = df_a.rename(columns={"text": "text_raw"})
df_a["source"] = "Atlaset"
df_a["label_original"] = "AR_DAR"
df_a["id"] = [f"atlaset_{i}" for i in range(len(df_a))]
dfs.append(df_a[keep_cols])

# FR / EN: the parquet written by the download cell already carries
# text_raw / label_original.
df_fe = pd.read_parquet(config["paths"]["data"]["fr_en"])
df_fe["source"] = "papluca"
df_fe["id"] = [f"fren_{i}" for i in range(len(df_fe))]
dfs.append(df_fe[keep_cols])

# NADI (optional): only used when the local CSV is present.
nadi_csv = os.path.join(config["paths"]["data"]["nadi"], "nadi_texts.csv")
if os.path.exists(nadi_csv):
    df_n = pd.read_csv(nadi_csv)
    if "text" in df_n.columns:
        df_n = df_n.rename(columns={"text": "text_raw"})
    if "label" in df_n.columns:
        df_n = df_n.rename(columns={"label": "label_original"})
    df_n["source"] = "NADI2022"
    df_n["id"] = [f"nadi_{i}" for i in range(len(df_n))]
    dfs.append(df_n[keep_cols])

# Rows without text OR without a label are dropped here. An unlabelled row
# cannot be used for supervised training, and a NaN label would later break
# the stratified split and the sorted() call that builds the label mapping
# (str vs float comparison).
df_all = pd.concat(dfs, ignore_index=True).dropna(
    subset=["text_raw", "label_original"]
)
print(f"Total brut : {len(df_all)} phrases")

# Filter out invalid texts.
df_all = df_all[df_all["text_raw"].apply(is_valid_text)]
print(f"Après filtrage : {len(df_all)} phrases")

# Normalisation: each option is read from the "preprocessing" config section,
# with the fallbacks below applied when a key is absent.
norm_cfg = config.get("preprocessing", {})
df_all["text_norm"] = df_all["text_raw"].apply(lambda t: normalize(
    t,
    lowercase_latin=norm_cfg.get("lowercase_latin", True),
    rm_urls=norm_cfg.get("remove_urls", True),
    rm_mentions=norm_cfg.get("remove_mentions", True),
    rm_hashtags=norm_cfg.get("remove_hashtags", False),
    norm_arabic=norm_cfg.get("normalize_arabic", True),
    norm_arabizi=norm_cfg.get("normalize_arabizi", True),
    keep_emojis=norm_cfg.get("keep_emojis", True),
))

# Texts that became empty after normalisation carry no signal: drop them.
df_all = df_all[df_all["text_norm"].str.len() > 0]
print(f"Après normalisation : {len(df_all)} phrases")
print("Répartition par source :")
print(df_all["source"].value_counts().to_string())

## 4. Séparation Train / Valid / Test

In [None]:
from sklearn.model_selection import train_test_split

# Target proportions of the three partitions, read from the config.
split_cfg = config["data"]["splits"]
train_ratio = split_cfg["train"]
valid_ratio = split_cfg["valid"]
test_ratio = split_cfg["test"]

# Step 1: carve the training set out of the full corpus (stratified by label).
df_train, df_temp = train_test_split(
    df_all,
    test_size=(1 - train_ratio),
    random_state=SEED,
    stratify=df_all["label_original"],
)

# Step 2: share the remainder between validation and test. The validation
# ratio is re-expressed relative to the held-out part only.
relative_valid = valid_ratio / (valid_ratio + test_ratio)
df_valid, df_test = train_test_split(
    df_temp,
    test_size=(1 - relative_valid),
    random_state=SEED,
    stratify=df_temp["label_original"],
)

n_train, n_valid, n_test, n_total = len(df_train), len(df_valid), len(df_test), len(df_all)
print(f"Train : {n_train} | Valid : {n_valid} | Test : {n_test}")
print(f"Ratios : {n_train/n_total:.0%} / {n_valid/n_total:.0%} / {n_test/n_total:.0%}")

## 5. Entraînement du Modèle SVM (Baseline)

Features combinées :

- **TF-IDF word** (n-grams 1-3)
- **TF-IDF char** (n-grams 2-4)
- **Script ratios** (arabe / latin / code-switching)

In [None]:
def compute_script_ratios(texts):
    """Compute per-text script statistics over whitespace-separated tokens.

    For each text three values are produced:

    * arab_ratio  -- share of tokens containing at least one character in the
      Unicode range U+0600..U+06FF;
    * latin_ratio -- share of tokens containing at least one ASCII letter;
    * cs_ratio    -- min(arab_ratio, latin_ratio), non-zero only when both
      scripts occur in the same text.

    A token mixing both scripts counts towards both ratios. A text with no
    tokens yields [0.0, 0.0, 0.0].

    Args:
        texts: iterable of texts; each element is converted with str().

    Returns:
        Float array of shape (n_texts, 3), including (0, 3) for empty input.
    """
    # Explicit escapes instead of literal characters: U+0600 is an invisible
    # format character that is easily lost when the source is copied.
    arab_pat = re.compile(r"[\u0600-\u06FF]+")
    latin_pat = re.compile(r"[a-zA-Z]+")
    features = []
    for text in texts:
        tokens = str(text).split()
        n = len(tokens)
        if n == 0:
            features.append([0.0, 0.0, 0.0])
            continue
        arab_ratio = sum(1 for t in tokens if arab_pat.search(t)) / n
        latin_ratio = sum(1 for t in tokens if latin_pat.search(t)) / n
        features.append([arab_ratio, latin_ratio, min(arab_ratio, latin_ratio)])
    # reshape keeps the result 2-D even when `texts` is empty.
    return np.array(features, dtype=float).reshape(-1, 3)


# Hyper-parameters read from the config.
ngram_range = tuple(config["training"]["svm"]["ngram_range"])
max_features = config["training"]["svm"]["max_features"]
C = config["training"]["svm"]["C"]

# Three feature views are concatenated before the linear SVM: word TF-IDF,
# character TF-IDF (within word boundaries) and the scaled script ratios.
svm_pipeline = Pipeline([
    ("features", FeatureUnion([
        ("tfidf_word", TfidfVectorizer(
            analyzer="word",
            ngram_range=ngram_range,
            max_features=max_features,
            sublinear_tf=True,
        )),
        ("tfidf_char", TfidfVectorizer(
            analyzer="char_wb",
            ngram_range=(2, 4),
            max_features=max_features,
            sublinear_tf=True,
        )),
        ("script_ratios", Pipeline([
            ("compute", FunctionTransformer(compute_script_ratios, validate=False)),
            ("scale", StandardScaler()),
        ])),
    ])),
    ("svm", LinearSVC(
        C=C,
        random_state=SEED,
        dual="auto",
        max_iter=2000,
        class_weight="balanced",
    )),
])

X_train_svm = df_train["text_norm"]
y_train_svm = df_train["label_original"]
print("Entraînement SVM...")
svm_pipeline.fit(X_train_svm, y_train_svm)

# Evaluation on the validation split.
y_pred_valid = svm_pipeline.predict(df_valid["text_norm"])
print(f"Accuracy Valid : {accuracy_score(df_valid['label_original'], y_pred_valid):.2%}")
print(f"Macro F1 Valid : {f1_score(df_valid['label_original'], y_pred_valid, average='macro'):.2%}")
print("" + classification_report(df_valid["label_original"], y_pred_valid))

# Persist the fitted pipeline.
# NOTE(review): the pipeline embeds compute_script_ratios through
# FunctionTransformer; pickle stores plain functions by reference, so the
# function has to be defined in whichever session later calls joblib.load.
svm_path = config["paths"]["models"]["baseline_svm"]
ensure_dir(svm_path)
joblib.dump(svm_pipeline, svm_path)
print(f"✓ SVM sauvegardé : {svm_path}")

In [None]:
# SVM confusion matrix on the validation split.
# Classes are sorted so that rows and columns follow a stable order.
labels = sorted(df_valid["label_original"].unique())
cm = confusion_matrix(df_valid["label_original"], y_pred_valid, labels=labels)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=labels,
    yticklabels=labels,
    ax=ax,
)
ax.set_title("SVM — Matrice de Confusion (Validation)")
ax.set_ylabel("Vrai Label")
ax.set_xlabel("Label Prédit")
fig.tight_layout()
plt.show()

## 6. Fine-tuning XLM-RoBERTa

Modèle : XLM-RoBERTa base (pré-entraîné sur 100 langues)

Hyperparamètres :

- Epochs : 3
- Batch size : 16
- Learning rate : 2e-5
- Weight decay : 0.01
- Early stopping : patience 2
- Métrique : Macro F1

In [None]:
class LangDataset(TorchDataset):
    """Dataset pairing tokenizer encodings with integer class ids."""

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # One entry per tokenizer output field, plus the label as a long tensor.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample


def compute_metrics(eval_pred):
    """Return accuracy and macro-F1 for a (logits, labels) evaluation pair."""
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predicted)
    macro_f1 = f1_score(labels, predicted, average="macro")
    return {"accuracy": accuracy, "macro_f1": macro_f1}


# Label encoding: class names found in the training split, sorted, are mapped
# to consecutive integer ids starting at 0.
all_labels = sorted(df_train["label_original"].unique())
label2id = {}
for position, name in enumerate(all_labels):
    label2id[name] = position
id2label = {position: name for name, position in label2id.items()}
num_labels = len(all_labels)

y_train_t = df_train["label_original"].map(label2id).values
y_valid_t = df_valid["label_original"].map(label2id).values
print(f"Classes : {label2id}")
print(f"Train : {len(y_train_t)} | Valid : {len(y_valid_t)}")

# Tokenisation: both splits are encoded up front with identical settings.
model_name = config["training"]["transformer"]["model_name"]
tokenizer = AutoTokenizer.from_pretrained(model_name)
train_texts = df_train["text_norm"].tolist()
valid_texts = df_valid["text_norm"].tolist()
train_enc = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
valid_enc = tokenizer(valid_texts, truncation=True, padding=True, max_length=128)
train_ds = LangDataset(train_enc, y_train_t)
valid_ds = LangDataset(valid_enc, y_valid_t)
print("✓ Tokenisation terminée")

In [None]:
# Model: sequence-classification head sized to the number of classes, with
# both label mappings stored in the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

# Training setup: epochs, train batch size and learning rate come from the
# config; the remaining values are fixed in this cell.
transformer_cfg = config["training"]["transformer"]
output_dir = config["paths"]["models"]["transformer"]
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=transformer_cfg["epochs"],
    per_device_train_batch_size=transformer_cfg["batch_size"],
    per_device_eval_batch_size=32,
    learning_rate=transformer_cfg["learning_rate"],
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch so that the checkpoint with the
    # best validation macro-F1 can be reloaded when training ends.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    greater_is_better=True,
    logging_steps=100,
    seed=SEED,
    report_to="none",
    # Mixed precision only when a CUDA device is available.
    fp16=torch.cuda.is_available(),
)

early_stopping = EarlyStoppingCallback(early_stopping_patience=2)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=valid_ds,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

print("Début du fine-tuning...")
trainer.train()
print("✓ Entraînement terminé")

# Evaluation on the validation split.
results = trainer.evaluate(valid_ds)
print(f"Valid Accuracy : {results['eval_accuracy']:.2%}")
print(f"Valid Macro F1 : {results['eval_macro_f1']:.2%}")

In [None]:
# Transformer classification report on the validation split.
# NOTE(review): every score in this cell is computed on the validation split,
# which also drives best-checkpoint selection; df_test is not used here.
preds = trainer.predict(valid_ds)
y_pred_t = np.argmax(preds.predictions, axis=-1)

# The full id list is passed explicitly to both metrics. Without `labels=`,
# scikit-learn derives the classes from the values present in y_true/y_pred;
# a class absent from both would make target_names mismatch in the report
# and shrink the matrix, shifting the tick labels onto the wrong rows.
label_ids = list(range(num_labels))
labels_str = [id2label[i] for i in label_ids]
print(classification_report(
    y_valid_t,
    y_pred_t,
    labels=label_ids,
    target_names=labels_str,
))

# Confusion matrix: rows and columns follow label_ids / labels_str.
cm_t = confusion_matrix(y_valid_t, y_pred_t, labels=label_ids)
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm_t,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=labels_str,
    yticklabels=labels_str,
)
plt.title("XLM-RoBERTa — Matrice de Confusion (Validation)")
plt.ylabel("Vrai Label")
plt.xlabel("Label Prédit")
plt.tight_layout()
plt.show()

# Save the model and tokenizer side by side so the directory can be reloaded
# with from_pretrained().
final_path = os.path.join(output_dir, "best")
trainer.save_model(final_path)
tokenizer.save_pretrained(final_path)
print(f"✓ Modèle sauvegardé : {final_path}")

## 7. Test de Robustesse

Test des deux modèles sur des phrases complexes et des cas limites.

In [None]:
# Load the final models from disk.
svm_model = joblib.load(config["paths"]["models"]["baseline_svm"])
trans_tokenizer = AutoTokenizer.from_pretrained(final_path)
trans_model = AutoModelForSequenceClassification.from_pretrained(final_path)
trans_model.eval()

# Inference must see text normalised exactly like the training text. The
# options are therefore read from the same "preprocessing" config section,
# with the same fallbacks, as the cleaning step that built `text_norm`.
_inference_norm_cfg = config.get("preprocessing", {})


def _normalize_for_inference(text):
    """Normalise `text` with the options used to build the training corpus."""
    return normalize(
        text,
        lowercase_latin=_inference_norm_cfg.get("lowercase_latin", True),
        rm_urls=_inference_norm_cfg.get("remove_urls", True),
        rm_mentions=_inference_norm_cfg.get("remove_mentions", True),
        rm_hashtags=_inference_norm_cfg.get("remove_hashtags", False),
        norm_arabic=_inference_norm_cfg.get("normalize_arabic", True),
        norm_arabizi=_inference_norm_cfg.get("normalize_arabizi", True),
        keep_emojis=_inference_norm_cfg.get("keep_emojis", True),
    )


def predict_svm(text):
    """Return the SVM pipeline's predicted label for one raw text."""
    text_n = _normalize_for_inference(text)
    return svm_model.predict([text_n])[0]


def predict_transformer(text):
    """Return (label, confidence) from the fine-tuned transformer.

    The confidence is the softmax probability of the predicted class.
    """
    text_n = _normalize_for_inference(text)
    inputs = trans_tokenizer(text_n, return_tensors="pt", truncation=True, max_length=128)
    with torch.no_grad():
        logits = trans_model(**inputs).logits[0]
    probs = torch.softmax(logits, dim=-1)
    conf, idx = torch.max(probs, dim=-1)
    return trans_model.config.id2label[idx.item()], float(conf)


# Test phrases: (text, expected label).
test_phrases = [
    ("واش نتا لاباس عليك؟", "AR_DAR"),
    ("wach nta labas 3lik?", "AR_DAR"),
    ("ما هي أهم التطورات في الذكاء الاصطناعي؟", "AR_MSA"),
    ("Quels sont les avantages du machine learning?", "FR"),
    ("What are the best NLP practices?", "EN"),
    ("كنبغي نمشي للسوق باش نشري الخضرة", "AR_DAR"),
    ("bghit nmchi l souk bach nchri lkhodra", "AR_DAR"),
    ("يجب علينا تطوير البنية التحتية الرقمية", "AR_MSA"),
    ("L'intelligence artificielle transforme notre quotidien", "FR"),
    ("Deep learning has revolutionized computer vision", "EN"),
]

print(f"{'Phrase':<50} {'Attendu':<10} {'SVM':<10} {'Transformer':<15}")
print("─" * 95)
correct_svm, correct_trans = 0, 0
for phrase, expected in test_phrases:
    svm_pred = predict_svm(phrase)
    trans_pred, conf = predict_transformer(phrase)
    svm_hit = svm_pred == expected
    trans_hit = trans_pred == expected
    s_ok = "✓" if svm_hit else "✗"
    t_ok = "✓" if trans_hit else "✗"
    # int() keeps the counters plain Python ints even when the comparison
    # yields a NumPy boolean.
    correct_svm += int(svm_hit)
    correct_trans += int(trans_hit)
    # Long phrases are truncated so the table columns stay aligned.
    short = phrase[:47] + "..." if len(phrase) > 50 else phrase
    print(f"{short:<50} {expected:<10} {s_ok} {svm_pred:<8} {t_ok} {trans_pred} ({conf:.0%})")
print(f"Score SVM : {correct_svm}/{len(test_phrases)} | Transformer : {correct_trans}/{len(test_phrases)}")

## 8. Résumé

| Modèle | Architecture | Features | Optimiseur |
|---|---|---|---|
| **SVM** | LinearSVC (C=1.0) | TF-IDF word (1-3) + char (2-4) + script ratios | Liblinear |
| **Transformer** | XLM-RoBERTa base | Tokenizer subword | AdamW (lr=2e-5, wd=0.01) |