In [1]:
import pandas as pd
import mlflow
from transformers import AutoTokenizer
import random

import mlflow
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models
from transformers import TFDistilBertModel, DistilBertTokenizer
import numpy as np
import os

import mlflow
import pandas as pd
import numpy as np
from transformers import DistilBertTokenizer
import tensorflow as tf
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [51]:
# Load the evaluation sample from a path relative to the notebook's working
# directory; later cells read its "text" and "target" columns.
df = pd.read_csv("dataset/test_50K.csv")

In [3]:
# MLflow configuration.
# The tracking URI can be overridden with the MLFLOW_TRACKING_URI environment
# variable so the notebook is not tied to one user's absolute path; the
# original local file store remains the default when the variable is unset
# or empty.
mlflow.set_tracking_uri(
    os.environ.get("MLFLOW_TRACKING_URI")
    or "file:///Users/skanderzahi/Desktop/P7/projet/mlruns"
)
# Kept as the last expression so the cell still displays the Experiment repr.
mlflow.set_experiment("p7_air_paradis")

<Experiment: artifact_location='file:///Users/skanderzahi/Desktop/P7/projet/mlruns/379503310426968982', creation_time=1760449044952, experiment_id='379503310426968982', last_update_time=1760449044952, lifecycle_stage='active', name='p7_air_paradis', tags={'mlflow.experimentKind': 'custom_model_development'}>

In [5]:
# Retrieve the best finished run of the experiment, ranked by test_precision
# (highest first); only the top row is requested.
exp = mlflow.get_experiment_by_name("p7_air_paradis")
runs_df = mlflow.search_runs(
    experiment_ids=[exp.experiment_id],
    filter_string="attributes.status = 'FINISHED'",
    max_results=1,
    order_by=["metrics.test_precision DESC"],
)

In [None]:
# Extract the identifiers and settings of the best run found by the search.
if runs_df.empty:
    # Fail with an explicit message instead of the opaque IndexError that
    # .iloc[0] raises on an empty frame.
    raise ValueError("MLflow search returned no run; cannot select a best model.")

# Fetch the row once and read every field from it.
best_run = runs_df.iloc[0]
best_run_id = best_run["run_id"]
best_model_name = best_run["tags.mlflow.runName"]
best_test_precision = best_run["metrics.test_precision"]
# The value is cast to int because it is used as the tokenizer max length.
MAX_LENGTH = int(best_run["params.max_length"])

print(f"Meilleur modèle : {best_model_name}")
print(f"Run ID : {best_run_id}")
print(f"Test Precision : {best_test_precision:.4f}")
print(f"MAX_LENGTH : {MAX_LENGTH}")

Meilleur modèle : DL_DistilBERT_trainable_EPOCHS_3_128_best
Run ID : 14729258203f4ea297b9d998dd3b93ba
Test Precision : 0.8542


In [8]:
# Local directory that receives the tokenizer files
tokenizer_path = "distilbert_tokenizer"

# Instantiate the pretrained DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Write the tokenizer files to the local directory (the returned tuple of
# file paths is the cell output)
tokenizer.save_pretrained(tokenizer_path)

('distilbert_tokenizer/tokenizer_config.json',
 'distilbert_tokenizer/special_tokens_map.json',
 'distilbert_tokenizer/vocab.txt',
 'distilbert_tokenizer/added_tokens.json')

In [10]:
# Attach the locally saved tokenizer directory to the best run's artifacts.
# MlflowClient.log_artifact targets the run by id, so the already finished run
# is not re-opened and terminated again (which mlflow.start_run(run_id=...)
# does), and the call does not fail when another run is active in the kernel.
# The artifact layout is the same as with the fluent mlflow.log_artifact call.
mlflow.tracking.MlflowClient().log_artifact(
    best_run_id, tokenizer_path, artifact_path="tokenizer"
)
print("Tokenizer sauvegardé dans MLflow")

print(f"\nTokenizer sauvegardé dans : {tokenizer_path}")

Tokenizer sauvegardé dans MLflow

Tokenizer sauvegardé dans : distilbert_tokenizer


In [None]:
# LOAD THE MODEL (directly from the locally downloaded artifact path)
client = mlflow.tracking.MlflowClient()
model_path = client.download_artifacts(best_run_id, "model/data/model")
print(f"Modèle téléchargé : {model_path}")

# custom_objects maps the transformers layer name to its class for Keras
# deserialization; TFDistilBertModel comes from the notebook's import cell.
loaded_model = keras.models.load_model(
    model_path,
    custom_objects={"TFDistilBertModel": TFDistilBertModel},
)
print("Modèle chargé avec succès !")

print("\nSummary du modèle :")
loaded_model.summary()

# LOAD THE TOKENIZER
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
print("Tokenizer chargé")


Meilleur modèle : DL_DistilBERT_trainable_EPOCHS_3_128_best
Run ID : 14729258203f4ea297b9d998dd3b93ba
Test Precision : 0.8542
MAX_LENGTH : 128


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]
Downloading artifacts: 100%|██████████| 5/5 [00:01<00:00,  3.14it/s]   


Modèle téléchargé : /var/folders/3g/rdq178bx3690jwjlb35tlzqw0000gp/T/tmpkftvfmls/model/data/model


TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.


Modèle chargé avec succès !

Summary du modèle :
Model: "DistilBERT_Classifier"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 128)]                0         []                            
                                                                                                  
 attention_mask (InputLayer  [(None, 128)]                0         []                            
 )                                                                                                
                                                                                                  
 tf_distil_bert_model (TFDi  TFBaseModelOutput(last_hid   6636288   ['input_ids[0][0]',           
 stilBertModel)              den_state=(None, 128, 768)   0          'attention_mask[0][0]']      
                             

# Prédiction

In [52]:
def predict_sentiment(text, model, tokenizer, max_length=128, threshold=0.5):
    """Classify the sentiment of one text with a tokenizer/model pair.

    Args:
        text: Text to classify; forwarded unchanged to ``tokenizer``.
        model: Object exposing ``predict(inputs, verbose=0)``. It is called
            with ``[input_ids, attention_mask]`` and element ``[0][0]`` of its
            result is read as the score of the "Positif" label.
        tokenizer: Callable returning a mapping with ``"input_ids"`` and
            ``"attention_mask"`` entries.
        max_length: Length forwarded to the tokenizer, which is asked to pad
            to and truncate at that length.
        threshold: Score strictly above which the text is labelled
            "Positif". Defaults to 0.5, the previously hard-coded value.

    Returns:
        dict with the keys ``"text"`` (the input), ``"probabilite"`` (the
        score as a float), ``"prediction"`` ("Positif" or "Négatif") and
        ``"confiance"`` (probability assigned to the returned label, as a
        float).
    """
    # Tokenization
    encoding = tokenizer(
        text,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="np",
    )

    # Inference on the single encoded example
    prediction = model.predict(
        [encoding["input_ids"], encoding["attention_mask"]], verbose=0
    )

    proba = prediction[0][0]
    is_positive = proba > threshold
    label = "Positif" if is_positive else "Négatif"

    # Probability of the label actually returned. With the default threshold
    # this equals max(proba, 1 - proba); with a custom threshold it stays
    # consistent with the reported label.
    confidence = proba if is_positive else 1 - proba

    return {
        "text": text,
        "probabilite": float(proba),
        "prediction": label,
        "confiance": float(confidence),
    }

In [55]:
# SMOKE TEST ON ONE HAND-WRITTEN SENTENCE
test_text = "This movie was absolutely bad!"
result = predict_sentiment(test_text, loaded_model, tokenizer, MAX_LENGTH)

# Assemble the report first, then emit it in a single print call.
report_lines = [
    f"\n Texte : {test_text}",
    f"Prédiction : {result['prediction']}",
    f"Probabilité : {result['probabilite']:.4f}",
    f"Confiance : {result['confiance']:.2%}",
]
print("\n".join(report_lines))


 Texte : This movie was absolutely bad!
Prédiction : Négatif
Probabilité : 0.0063
Confiance : 99.37%


In [56]:
# TEST ON ONE RANDOMLY DRAWN ROW OF THE DATAFRAME

random_idx = np.random.randint(0, len(df))
sample_row = df.iloc[random_idx]
random_text = sample_row["text"]
true_label = sample_row["target"]

true_label_display = "Positif" if true_label == 1 else "Négatif"
print(f"\nIndex : {random_idx}")
print(f"Texte : {random_text[:MAX_LENGTH]}")
print(f"\nLabel réel : {true_label_display}")

# Prediction
result = predict_sentiment(random_text, loaded_model, tokenizer, MAX_LENGTH)

print(f"Prédiction : {result['prediction']}")
print(f"\nProbabilité : {result['probabilite']:.4f}")
print(f"Confiance : {result['confiance']:.2%}")

# Check: the predicted label must agree with the 0/1 target
predicted_positive = result["prediction"] == "Positif"
predicted_negative = result["prediction"] == "Négatif"
is_correct = (predicted_positive and true_label == 1) or (
    predicted_negative and true_label == 0
)

verdict = "PRÉDICTION CORRECTE !" if is_correct else "PRÉDICTION INCORRECTE"
print(f"\n{verdict}")


Index : 4164
Texte : I'm a busy bug lately. tsssssssss! 

Label réel : Négatif
Prédiction : Négatif

Probabilité : 0.0427
Confiance : 95.73%

PRÉDICTION CORRECTE !


In [64]:
# TEST ON SEVERAL RANDOMLY DRAWN EXAMPLES
n_samples = 10
random_indices = np.random.choice(len(df), n_samples, replace=False)

results_list = []

for i, idx in enumerate(random_indices, 1):
    row = df.iloc[idx]
    text = row["text"]
    true_label = row["target"]

    result = predict_sentiment(text, loaded_model, tokenizer, MAX_LENGTH)
    predicted = result["prediction"]

    # A prediction counts as correct only when it matches a 0/1 target.
    is_correct = (predicted == "Positif" and true_label == 1) or (
        predicted == "Négatif" and true_label == 0
    )

    true_label_str = "Positif" if true_label == 1 else "Négatif"
    status = "✅" if is_correct else "❌"

    detail_line = (
        f"   Vrai: {true_label_str:8} | Prédit: {predicted:8} | "
        f"Proba: {result['probabilite']:.3f}"
    )
    print(f"\n{i}. {status} Texte : {text[:MAX_LENGTH]}")
    print(detail_line)

    results_list.append(
        {
            "text": text[:100],
            "true": true_label_str,
            "predicted": predicted,
            "proba": result["probabilite"],
            "correct": is_correct,
        }
    )

# Count the hits from the collected records instead of a running counter.
correct_predictions = sum(1 for entry in results_list if entry["correct"])
accuracy = correct_predictions / n_samples

separator = "=" * 80
print(f"\n{separator}")
print(f" RÉSULTAT : {correct_predictions}/{n_samples} prédictions correctes")
print(f"   Précision sur ces exemples : {accuracy:.1%}")
print(f"{separator}")


1. ✅ Texte : hollyoaks was good, got a headache 
   Vrai: Négatif  | Prédit: Négatif  | Proba: 0.039

2. ❌ Texte : Wenn ich &quot;Have you ever seen the rain&quot; hÃ¶re, vermisse ich Stargate.  http://bit.ly/DS7Wv
   Vrai: Négatif  | Prédit: Positif  | Proba: 0.535

3. ✅ Texte : I have to genius bar my MacBook Pro on saturday. Loud fan sound coming out of left side. 
   Vrai: Négatif  | Prédit: Négatif  | Proba: 0.117

4. ❌ Texte : 3.5 days break :O back to usual life!Some of my friends are becoming prays of fishing in orkut  careful guys,use online account 
   Vrai: Négatif  | Prédit: Positif  | Proba: 0.738

5. ✅ Texte : @SharonMc Great the transplating worked.  I think I'm too heavy handed with the roots when I do it. A lovely memory 
   Vrai: Positif  | Prédit: Positif  | Proba: 0.959

6. ✅ Texte : @amazingphil You should do that!  But, with polka dots. (:
   Vrai: Positif  | Prédit: Positif  | Proba: 0.833

7. ✅ Texte : Yeah, I got new meds, and they make me a little sleepy, slo