In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.layers import TextVectorization, Embedding, SimpleRNN, Dense, LSTM, Flatten
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

from sklearn.model_selection import train_test_split
import gensim.downloader as gensim_downloader
import gensim
import multiprocessing
from mlflow import MlflowClient
import mlflow
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import os
from joblib import Parallel, delayed
from Source.preprocess_data import *  ## import all functions from preprocess_data.py
from Source.postprocess_data import * ## import all functions from postprocess_data.py
from Source.utils import *  ## import all functions from utils.py
import nltk
import optuna

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline

from sklearn.metrics import confusion_matrix 
from nltk.corpus import stopwords  
from nltk.tokenize import TweetTokenizer, WordPunctTokenizer, RegexpTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer, LancasterStemmer, SnowballStemmer
from tqdm import tqdm
tqdm.pandas()
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

nw = multiprocessing.cpu_count()



client = MlflowClient(tracking_uri="http://localhost:8080")
os.environ["TF_KERAS"]='1'
print(tf.__version__)
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
print("GPUs disponibles :", tf.config.list_physical_devices("GPU"))
print("Version TF :", tf.__version__)

  from pkg_resources import parse_version
  from .autonotebook import tqdm as notebook_tqdm


2.10.1
Num GPUs Available:  1
GPUs disponibles : [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Version TF : 2.10.1


In [2]:
df = pd.read_csv('https://s3-eu-west-1.amazonaws.com/static.oc-static.com/prod/courses/files/AI+Engineer/Project+7%C2%A0-+D%C3%A9tectez+les+Bad+Buzz+gr%C3%A2ce+au+Deep+Learning/sentiment140.zip',
                header=None,
                compression='zip',
                encoding='cp1252')

df.columns = ['target', 'ids', 'date', 'flag', 'user', 'text']

sample_df, _ = train_test_split(df, test_size=0.9, random_state=42, stratify=df['target'])
sample_df = sample_df.reset_index(drop=True)
print(f"Sample size: {sample_df.shape[0]} rows")
# On ne garde que les colonnes 'target' et 'text'
sample_df = sample_df[['target', 'text']]
sample_df["target"] = sample_df["target"].apply(lambda x: 0 if x == 0 else 1)
sample_df.to_csv('Data/raw_data.csv', index=False)


Sample size: 160000 rows


# Séparation train/validation

In [3]:
# Data
X_raw = sample_df['text']
y = sample_df['target']
X_train, X_val, y_train, y_val = train_test_split(X_raw, y, test_size=0.2, random_state=42, stratify=y)


In [13]:
type(X_val)

pandas.core.series.Series

# Préparation de l'experience de base (baseline)


## Pré-traitement des dataframes

In [4]:
num_words = 20000
max_len = 10
min_count = 3

X_sentence_train, tokenizer, sentences_train = preprocess_data_embedding(X_raw=X_train, 
                                                        stem_lem_func=PorterStemmer().stem,
                                                        tokenizer=None, 
                                                        stop_words=stopwords.words('english'), 
                                                        min_count=min_count,
                                                        max_len = max_len, 
                                                        num_words=num_words, 
                                                        return_sentences=True) 
X_sentence_val = preprocess_data_embedding(X_raw=X_val, 
                                                        stem_lem_func=PorterStemmer().stem,
                                                        tokenizer=tokenizer, 
                                                        stop_words=stopwords.words('english'), 
                                                        min_count=1, # mincount = 1 car on est sur le jeu de validation
                                                        max_len = max_len, 
                                                        num_words=num_words) 

# Création d'un embedding de base custom pour notre modèle


In [5]:
from gensim.models import Word2Vec
latent_dim = 50
print("Build & train Word2Vec model ...")

w2v_model = Word2Vec(
    sentences=sentences_train, 
    vector_size=latent_dim,  # dimension de l’espace latent
    window=5,         # taille du contexte
    min_count=min_count,      # ignorer les mots trop rares
    workers=4,        # parallélisme CPU
    sg=0,              # 1 = skip-gram, 0 = CBOW
    epochs=100
)


model_vectors = w2v_model.wv
w2v_words = model_vectors.index_to_key
print("Vocabulary size: %i" % len(w2v_words))
print("Word2Vec trained")

found = sum(1 for w in tokenizer.word_index if w in w2v_model.wv)
coverage = found / len(tokenizer.word_index)
print(f"Coverage: {coverage*100:.2f}%")

vectors = np.array([w2v_model.wv[w] for w in tokenizer.word_index if w in w2v_model.wv])
print("Mean norm:", np.mean(np.linalg.norm(vectors, axis=1)))


Build & train Word2Vec model ...
Vocabulary size: 13547
Word2Vec trained
Coverage: 99.99%
Mean norm: 7.0292816


## Création de la matrice d'embedding 

In [6]:
embedding_matrix, vocab_size = build_embedding_matrix(tokenizer=tokenizer,
                                          embedding_model=model_vectors, 
                                          latent_dim=latent_dim
                                          )

Embedding matrix shape: (13549, 50)
Words found in pretrained embeddings: 13547/13549 (99.99%)


## Création du modèle simple avec RNN

In [7]:
model =  build_base_RNN(vocab_size=vocab_size, 
                        latent_dim=latent_dim,
                        input_length=max_len, 
                        embedding_matrix=embedding_matrix,
                        rnn_size = 64)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 10, 50)            677450    
                                                                 
 simple_rnn (SimpleRNN)      (None, 64)                7360      
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 684,875
Trainable params: 7,425
Non-trainable params: 677,450
_________________________________________________________________


## Callbacks pour l'entrainement

In [None]:



checkpoint = ModelCheckpoint("./Models/baselineRNN.h5", monitor='val_loss', verbose=0, save_best_only=True, mode='min')
es = EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=10)
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, verbose=0, min_lr=1e-5)
optimizer = Adam(learning_rate=1e-3)

callbacks_list = [checkpoint, es, lr_scheduler]
model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])

## Entrainement

In [9]:
with tf.device("/GPU:0"):
    history = model.fit(X_sentence_train, y_train, epochs=50, batch_size=64, validation_data=(X_sentence_val,y_val), callbacks=callbacks_list, verbose=0)


## Post-traitement

In [10]:
y_pred_proba = model.predict(X_sentence_val)
y_pred = (y_pred_proba>0.5)


output_dict = postprocess_model_output(y_val, y_pred, y_pred_proba) # voir postprocess_data.py

output_dict



{'Accuracy': 0.73446875,
 'F1_negatif': 0.7334107238101214,
 'F1_positif': 0.7355184113051327,
 'Recall_negatif': 0.7305,
 'Recall_positif': 0.7384375,
 'Precision_negatif': 0.7363447363447363,
 'Precision_positif': 0.7326223104111118,
 'ROC_AUC': 0.815024501953125}

## Liste des hyper-paramètres à optimiser 


- Prétraitement : 
    - Stemming ou Lemmatisation
    - Taille du vocabulaire (num_words)
    - Nombre minimum d'occurences (min_count)

- Embedding : 
    - Word2Vec/FastText/Glove (préentrainés)
    - Word2Vec/FastText (customisés)
    - Dimension latente de l'embedding
- Modèle : 
    - Couche SimpleRNN ou LSTM
    - Dimension de la couche d'entrainement
    - Fine-tuning ou non des embeddings ? 

# Experimentations sur les modèles de RNN classiques 

## Préparation

In [11]:

Stemmer_dict = {'WordNetLemmatizer': WordNetLemmatizer().lemmatize, 
                'PorterStemmer': PorterStemmer().stem, 
                'LancasterStemmer':LancasterStemmer().stem, 
                'SnowballStemmer' : SnowballStemmer("english").stem
}

list(Stemmer_dict.keys())

['WordNetLemmatizer', 'PorterStemmer', 'LancasterStemmer', 'SnowballStemmer']

## Core fonction

In [None]:
#Fonction à optimiser pour optuna

def embedding_eva_pre(trial):
    # Hyperparamètres
    ## Prétraitement
    min_count = trial.suggest_int('min_count',1,10)
    num_words = trial.suggest_int('num_words',5000,50000)
    max_len   = trial.suggest_int('max_len',2,30)
    stemmer   = trial.suggest_categorical('stemmer',list(Stemmer_dict.keys()))
    ## Embedding
    latent_dim = 50
    ## Modèle
    rnn_size = 64
    ## Entrainement
    epochs = 50
    lr = 1e-3
    ## Savepath des poids du modèle
    model_savepath = "./Models/baselineRNN_pre.h5"



    with mlflow.start_run(nested=True):
        mlflow.log_params(params={
            'num_words':num_words,               
            'max_len': max_len,
            'min_count': min_count,
            'stemmer': stemmer, 
            'latent_dim': latent_dim, 
            'rnn_size': rnn_size, 
            'epochs': epochs, 
            'learning_rate': lr 
        })

        # Prétraitement
        X_sentence_train, tokenizer, sentences_train = preprocess_data_embedding(X_raw=X_train, 
                                                        stem_lem_func=Stemmer_dict[stemmer],
                                                        tokenizer=None, 
                                                        stop_words=stopwords.words('english'), 
                                                        min_count=min_count,
                                                        max_len = max_len, 
                                                        num_words=num_words, 
                                                        return_sentences=True) 
        X_sentence_val = preprocess_data_embedding(X_raw=X_val, 
                                                        stem_lem_func=PorterStemmer().stem,
                                                        tokenizer=tokenizer, 
                                                        stop_words=stopwords.words('english'), 
                                                        min_count=1, # mincount = 1 car on est sur le jeu de validation
                                                        max_len = max_len, 
                                                        num_words=num_words) 
        # Embedding(custom)
        
        w2v_model = Word2Vec(
            sentences=sentences_train, 
            vector_size=latent_dim,  # dimension de l’espace latent
            window=5,         # taille du contexte
            min_count=min_count,      # ignorer les mots trop rares
            workers=4,        # parallélisme CPU
            sg=0,              # 1 = skip-gram, 0 = CBOW
            epochs=50
            )


        model_vectors = w2v_model.wv
        w2v_words = model_vectors.index_to_key

        embedding_matrix, vocab_size = build_embedding_matrix(tokenizer=tokenizer,
                                          embedding_model=model_vectors, 
                                          latent_dim=latent_dim
                                          )

        
        # Modèle
        model =  build_base_RNN(vocab_size=vocab_size, 
                        latent_dim=latent_dim,
                        input_length=max_len, 
                        embedding_matrix=embedding_matrix,
                        rnn_size = rnn_size)
        ## Callbacks
        checkpoint = ModelCheckpoint(model_savepath, monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=True, mode='min')
        es = EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=10)
        lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, verbose=0, min_lr=1e-5)
        callbacks_list = [checkpoint, es, lr_scheduler]
        ## Compilation
        optimizer = Adam(learning_rate=lr)
        model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])

        #Entrainement
        with tf.device("/GPU:0"):
            history = model.fit(X_sentence_train, y_train, epochs=epochs, batch_size=64, validation_data=(X_sentence_val,y_val), callbacks=callbacks_list, verbose=0)

        model.load_weights(model_savepath)

        # Prédictions sur le jeu de validation
        y_pred_proba = model.predict(X_sentence_val)
        y_pred = (y_pred_proba>0.5)


        output_dict = postprocess_model_output(y_val, y_pred, y_pred_proba) # voir postprocess_data.py

        # Logging des métriques dans MLflow
        mlflow.log_metrics(output_dict)
        # Matrice de confusion
        cm = confusion_matrix(y_val, y_pred, normalize='pred')
        fig, ax = plt.subplots()
        sns.heatmap(cm, annot=True, fmt=".2f", cmap="Blues", ax=ax, )
        plt.xlabel("Predicted")
        plt.ylabel("True")
        plt.title("Confusion Matrix - Validation Set")
        fig.savefig("confusion_matrix.png")
        plt.close(fig)
        mlflow.log_artifact("confusion_matrix.png")
        #
        fig2 = plot_training_history(history,show=False)
        fig2.savefig("learning_path.png")
        plt.close(fig2)
        mlflow.log_artifact("learning_path.png")

        # Enregistrement du modèle dans MLflow
        mlflow.sklearn.log_model(model, "model")
        acc = output_dict["Accuracy"]
    return acc

## Définition de l'experiment MLFlow/Optuna

In [13]:
# Création de l'étude Optuna et optimisation
print("Starting hyperparameter optimization with Optuna...")
print("Setting up MLflow experiment...")
mlflow.set_experiment("optuna_word_embedding_experiment_preprocessin")
exp_id = mlflow.get_experiment_by_name("optuna_word_embedding_experiment_preprocessin").experiment_id

experiment_description = (
    "Cette experience contient les différents tests pour le modèle RNN simple. "
    "Ici on évalue simplement l'impact des différents prétraitements sur un modèle avec simpleRNN"
)

# Provide searchable tags that define characteristics of the Runs that
# will be in this Experiment
experiment_tags = {
    "project_name": "Sentiment analysis modelling",
    "model_type": "simple-RNN-preprocessing",
    "team": "Ph. Constant",
    "project_quarter": "Q3-2025",
    "mlflow.note.content": experiment_description,
}

for key, value in experiment_tags.items():
    client.set_experiment_tag(exp_id, key, value)

Traceback (most recent call last):
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 366, in search_experiments
    exp = self._get_experiment(exp_id, view_type)
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 464, in _get_experiment
    meta = FileStore._read_yaml(experiment_dir, FileStore.META_DATA_FILE_NAME)
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 1634, in _read_yaml
    return _read_helper(root, file_name, attempts_remaining=retries)
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 1627, in _read_helper
    result = read_yaml(root, file_name)
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\utils\yaml_utils.py", line 107, in read_yaml
    raise MissingConfigException(f"Yaml file '{file_path}' does not exist."

Traceback (most recent call last):
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 366, in search_experiments
    exp = self._get_experiment(exp_id, view_type)
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 464, in _get_experiment
    meta = FileStore._read_yaml(experiment_dir, FileStore.META_DATA_FILE_NAME)
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 1634, in _read_yaml
    return _read_helper(root, file_name, attempts_remaining=retries)
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 1627, in _read_helper
    result = read_yaml(root, file_name)
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\utils\yaml_utils.py", line 107, in read_yaml
    raise MissingConfigException(f"Yaml file '{file_path}' does not exist."

Starting hyperparameter optimization with Optuna...
Setting up MLflow experiment...


## Lancement de l'optimisation

In [None]:
# Lancement de l'optimisation avec Optuna
print("Starting optimization trials...")
with mlflow.start_run(run_name="optuna_word_embedding_experiment_preprocessin"):
    study = optuna.create_study(direction="maximize")
    study.optimize(embedding_eva_pre, n_trials=50)

    mlflow.log_params(study.best_params)
    mlflow.log_metric("best_accuracy", study.best_value)

print("Optimization completed.")

## Extraction meilleur modèle 

In [15]:
client = MlflowClient(tracking_uri="http://localhost:8080")
experiment_id = mlflow.get_experiment_by_name("optuna_word_embedding_experiment_preprocessin").experiment_id
runs = client.search_runs(experiment_id)

# Métrique pour sélectionner le meilleur modèle
metric_to_optimize = "Accuracy" # liste des métriques enregistrées dans postprocess_data.py ou sur l'UI MLflow
best_run = max(runs, key=lambda run: run.data.metrics.get(metric_to_optimize, float('-inf')))
print(f"Best run ID: {best_run.info.run_id} with metrics:")
for key, value in best_run.data.metrics.items():
    print(f"{key}: {value}")
print(f"Best run parameters:")
for key, value in best_run.data.params.items():
    print(f"{key}: {value}")

# Enregistrement du meilleur modèle
best_model_uri = f"runs:/{best_run.info.run_id}/model"
registered_model_name = "simple_rnn_best_pre"
registered_model = mlflow.register_model(best_model_uri, registered_model_name)
# Enregistrement des paramètres sous forme de tags dans le modèle enregistré
for key, value in best_run.data.params.items():
    print(f"Setting tag {key} = {value} in registered model")
    client.set_model_version_tag(
        name=registered_model_name,
        version=str(registered_model.version),
        key=str(key),
        value=str(value))

Traceback (most recent call last):
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 366, in search_experiments
    exp = self._get_experiment(exp_id, view_type)
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 464, in _get_experiment
    meta = FileStore._read_yaml(experiment_dir, FileStore.META_DATA_FILE_NAME)
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 1634, in _read_yaml
    return _read_helper(root, file_name, attempts_remaining=retries)
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 1627, in _read_helper
    result = read_yaml(root, file_name)
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\utils\yaml_utils.py", line 107, in read_yaml
    raise MissingConfigException(f"Yaml file '{file_path}' does not exist."

Best run ID: d9f27d7060494334af49d01586fdb855 with metrics:
Accuracy: 0.75290625
F1_negatif: 0.7471135702178016
F1_positif: 0.7584394953105429
Precision_negatif: 0.7650487980611776
Precision_positif: 0.7418275264447499
Recall_negatif: 0.73
Recall_positif: 0.7758125
ROC_AUC: 0.83355388671875
Best run parameters:
epochs: 50
latent_dim: 50
learning_rate: 0.001
max_len: 27
min_count: 1
num_words: 13006
rnn_size: 64
stemmer: PorterStemmer


Successfully registered model 'simple_rnn_best_pre'.
Created version '1' of model 'simple_rnn_best_pre'.


Setting tag epochs = 50 in registered model
Setting tag latent_dim = 50 in registered model
Setting tag learning_rate = 0.001 in registered model
Setting tag max_len = 27 in registered model
Setting tag min_count = 1 in registered model
Setting tag num_words = 13006 in registered model
Setting tag rnn_size = 64 in registered model
Setting tag stemmer = PorterStemmer in registered model


# Experimentation sur les embeddings (custom)


## Préparation

In [19]:
from gensim.models import FastText, Word2Vec

embedding_dict = {'Word2Vec':Word2Vec, 
                  'FastText':FastText}

## Core fonction

In [27]:
#Fonction à optimiser pour optuna

## Prétraitement
min_count = 5
num_words = 20000
max_len   = 30
stemmer   = 'PorterStemmer'
# Prétraitement
X_sentence_train, tokenizer, sentences_train = preprocess_data_embedding(X_raw=X_train, 
                                                        stem_lem_func=PorterStemmer().stem,
                                                        tokenizer=None, 
                                                        stop_words=stopwords.words('english'), 
                                                        min_count=min_count,
                                                        max_len = max_len, 
                                                        num_words=num_words, 
                                                        return_sentences=True) 
X_sentence_val = preprocess_data_embedding(X_raw=X_val, 
                                                        stem_lem_func=PorterStemmer().stem,
                                                        tokenizer=tokenizer, 
                                                        stop_words=stopwords.words('english'), 
                                                        min_count=1, # mincount = 1 car on est sur le jeu de validation
                                                        max_len = max_len, 
                                                        num_words=num_words) 

def embedding_eval_custom_embed(trial):
    # Hyperparamètres

    ## Embedding
    embedding_model = trial.suggest_categorical('embedding_model',list(embedding_dict.keys()))
    latent_dim = trial.suggest_int('latent_dim', 30, 150)
    window = trial.suggest_int("window", 2, 10)
    sg = trial.suggest_int('sg',0,1)

    ## Modèle
    rnn_size = 64
    ## Entrainement
    epochs = 50
    lr = 1e-3
    ## Savepath des poids du modèle
    model_savepath = "./Models/baselineRNN_pre.h5"



    with mlflow.start_run(nested=True):
        mlflow.log_params(params={
            'num_words':num_words,               
            'max_len': max_len,
            'min_count': min_count,
            'stemmer': stemmer, 
            'latent_dim': latent_dim, 
            'rnn_size': rnn_size, 
            'embedding_model':embedding_model,
            'sg':sg,
            'window':window,
            'epochs': epochs, 
            'learning_rate': lr 
        })


        # Embedding(custom)
        if embedding_model=='Word2Vec':
            embedding_model = Word2Vec(
                sentences=sentences_train, 
                vector_size=latent_dim,  # dimension de l’espace latent
                window=5,         # taille du contexte
                min_count=min_count,      # ignorer les mots trop rares
            workers=4,        # parallélisme CPU
            sg=sg,              # 1 = skip-gram, 0 = CBOW
            epochs=30
            )
        elif embedding_model=='FastText':
            embedding_model = FastText(
                sentences=sentences_train, 
                vector_size=latent_dim, 
                window=5, 
                min_count=min_count,
                workers=4,
                sg=sg,
                epochs=30
                )


        model_vectors = embedding_model.wv
        w2v_words = model_vectors.index_to_key

        embedding_matrix, vocab_size = build_embedding_matrix(tokenizer=tokenizer,
                                          embedding_model=model_vectors, 
                                          latent_dim=latent_dim
                                          )

        
        # Modèle
        model =  build_base_RNN(vocab_size=vocab_size, 
                        latent_dim=latent_dim,
                        input_length=max_len, 
                        embedding_matrix=embedding_matrix,
                        rnn_size = rnn_size)
        ## Callbacks
        checkpoint = ModelCheckpoint(model_savepath, monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=True, mode='min')
        es = EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=10)
        lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, verbose=0, min_lr=1e-5)
        callbacks_list = [checkpoint, es, lr_scheduler]
        ## Compilation
        optimizer = Adam(learning_rate=lr)
        model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])

        #Entrainement
        with tf.device("/GPU:0"):
            history = model.fit(X_sentence_train, y_train, epochs=epochs, batch_size=64, validation_data=(X_sentence_val,y_val), callbacks=callbacks_list, verbose=0)

        model.load_weights(model_savepath)

        # Prédictions sur le jeu de validation
        y_pred_proba = model.predict(X_sentence_val)
        y_pred = (y_pred_proba>0.5)


        output_dict = postprocess_model_output(y_val, y_pred, y_pred_proba) # voir postprocess_data.py

        # Logging des métriques dans MLflow
        mlflow.log_metrics(output_dict)
        # Matrice de confusion
        cm = confusion_matrix(y_val, y_pred, normalize='pred')
        fig, ax = plt.subplots()
        sns.heatmap(cm, annot=True, fmt=".2f", cmap="Blues", ax=ax, )
        plt.xlabel("Predicted")
        plt.ylabel("True")
        plt.title("Confusion Matrix - Validation Set")
        fig.savefig("confusion_matrix.png")
        plt.close(fig)
        mlflow.log_artifact("confusion_matrix.png")
        #
        fig2 = plot_training_history(history,show=False)
        fig2.savefig("learning_path.png")
        plt.close(fig2)
        mlflow.log_artifact("learning_path.png")

        # Enregistrement du modèle dans MLflow
        mlflow.sklearn.log_model(model, "model")
        acc = output_dict["Accuracy"]
    return acc

## Definition de l'experiment MLFlow/Optuna



In [28]:
# Création de l'étude Optuna et optimisation
print("Starting hyperparameter optimization with Optuna...")
print("Setting up MLflow experiment...")
mlflow.set_experiment("optuna_word_embedding_experiment_custom_embedding")
exp_id = mlflow.get_experiment_by_name("optuna_word_embedding_experiment_custom_embedding").experiment_id

experiment_description = (
    "Cette experience contient les différents tests pour le modèle RNN simple. "
    "Ici on évalue l'impact du type d'embedding custom et de la dimension de l'espace latent sur un modèle avec simpleRNN"
)

# Provide searchable tags that define characteristics of the Runs that
# will be in this Experiment
experiment_tags = {
    "project_name": "Sentiment analysis modelling",
    "model_type": "simple-RNN-preprocessing",
    "team": "Ph. Constant",
    "project_quarter": "Q3-2025",
    "mlflow.note.content": experiment_description,
}

for key, value in experiment_tags.items():
    client.set_experiment_tag(exp_id, key, value)

Traceback (most recent call last):
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 366, in search_experiments
    exp = self._get_experiment(exp_id, view_type)
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 464, in _get_experiment
    meta = FileStore._read_yaml(experiment_dir, FileStore.META_DATA_FILE_NAME)
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 1634, in _read_yaml
    return _read_helper(root, file_name, attempts_remaining=retries)
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 1627, in _read_helper
    result = read_yaml(root, file_name)
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\utils\yaml_utils.py", line 107, in read_yaml
    raise MissingConfigException(f"Yaml file '{file_path}' does not exist."

Traceback (most recent call last):
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 366, in search_experiments
    exp = self._get_experiment(exp_id, view_type)
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 464, in _get_experiment
    meta = FileStore._read_yaml(experiment_dir, FileStore.META_DATA_FILE_NAME)
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 1634, in _read_yaml
    return _read_helper(root, file_name, attempts_remaining=retries)
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 1627, in _read_helper
    result = read_yaml(root, file_name)
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\utils\yaml_utils.py", line 107, in read_yaml
    raise MissingConfigException(f"Yaml file '{file_path}' does not exist."

Starting hyperparameter optimization with Optuna...
Setting up MLflow experiment...


## Lancement de l'optimisation

In [29]:
# Lancement de l'optimisation avec Optuna
print("Starting optimization trials...")
with mlflow.start_run(run_name="optuna_word_embedding_experiment_custom_embedding"):
    study = optuna.create_study(direction="maximize")
    study.optimize(embedding_eval_custom_embed, n_trials=30)
    mlflow.log_params(study.best_params)
    mlflow.log_metric("best_accuracy", study.best_value)

print("Optimization completed.")





INFO:tensorflow:Assets written to: ram://b70c20a6-0687-4b41-a51b-c3b2c379e863/assets


INFO:tensorflow:Assets written to: ram://b70c20a6-0687-4b41-a51b-c3b2c379e863/assets
[I 2025-09-14 23:15:42,662] Trial 20 finished with value: 0.75128125 and parameters: {'embedding_model': 'FastText', 'latent_dim': 105, 'window': 4, 'sg': 0}. Best is trial 20 with value: 0.75128125.


Embedding matrix shape: (9539, 107)
Words found in pretrained embeddings: 9538/9539 (99.99%)




INFO:tensorflow:Assets written to: ram://90e3f848-a0c6-4c46-ac94-1d010557c7c6/assets


INFO:tensorflow:Assets written to: ram://90e3f848-a0c6-4c46-ac94-1d010557c7c6/assets
[I 2025-09-14 23:31:15,189] Trial 21 finished with value: 0.748125 and parameters: {'embedding_model': 'FastText', 'latent_dim': 107, 'window': 4, 'sg': 0}. Best is trial 20 with value: 0.75128125.


Embedding matrix shape: (9539, 128)
Words found in pretrained embeddings: 9538/9539 (99.99%)




INFO:tensorflow:Assets written to: ram://f3b62c82-063e-404e-b5e3-5921370a300b/assets


INFO:tensorflow:Assets written to: ram://f3b62c82-063e-404e-b5e3-5921370a300b/assets
[I 2025-09-14 23:49:37,758] Trial 22 finished with value: 0.7520625 and parameters: {'embedding_model': 'FastText', 'latent_dim': 128, 'window': 3, 'sg': 0}. Best is trial 22 with value: 0.7520625.


Embedding matrix shape: (9539, 87)
Words found in pretrained embeddings: 9538/9539 (99.99%)




INFO:tensorflow:Assets written to: ram://5de7e54b-cc75-4177-876e-aeeeb2b61353/assets


INFO:tensorflow:Assets written to: ram://5de7e54b-cc75-4177-876e-aeeeb2b61353/assets
[I 2025-09-15 00:09:37,160] Trial 23 finished with value: 0.75309375 and parameters: {'embedding_model': 'FastText', 'latent_dim': 87, 'window': 5, 'sg': 0}. Best is trial 23 with value: 0.75309375.


Embedding matrix shape: (9539, 82)
Words found in pretrained embeddings: 9538/9539 (99.99%)




INFO:tensorflow:Assets written to: ram://f1fbedb6-6c7c-4daf-a013-69dda45d9a38/assets


INFO:tensorflow:Assets written to: ram://f1fbedb6-6c7c-4daf-a013-69dda45d9a38/assets
[I 2025-09-15 00:34:43,999] Trial 24 finished with value: 0.74728125 and parameters: {'embedding_model': 'FastText', 'latent_dim': 82, 'window': 5, 'sg': 0}. Best is trial 23 with value: 0.75309375.


Embedding matrix shape: (9539, 68)
Words found in pretrained embeddings: 9538/9539 (99.99%)




INFO:tensorflow:Assets written to: ram://0e331421-f9d2-4161-b6ad-03bfa41c4815/assets


INFO:tensorflow:Assets written to: ram://0e331421-f9d2-4161-b6ad-03bfa41c4815/assets
[I 2025-09-15 00:52:46,131] Trial 25 finished with value: 0.746 and parameters: {'embedding_model': 'FastText', 'latent_dim': 68, 'window': 6, 'sg': 0}. Best is trial 23 with value: 0.75309375.


Embedding matrix shape: (9539, 104)
Words found in pretrained embeddings: 9537/9539 (99.98%)




INFO:tensorflow:Assets written to: ram://55c7d27b-03bd-4a85-8351-997dc83de75e/assets


INFO:tensorflow:Assets written to: ram://55c7d27b-03bd-4a85-8351-997dc83de75e/assets
[I 2025-09-15 00:59:40,338] Trial 26 finished with value: 0.7395 and parameters: {'embedding_model': 'Word2Vec', 'latent_dim': 104, 'window': 7, 'sg': 0}. Best is trial 23 with value: 0.75309375.


Embedding matrix shape: (9539, 97)
Words found in pretrained embeddings: 9538/9539 (99.99%)




INFO:tensorflow:Assets written to: ram://883d2e02-1fbc-480a-babd-3e54894ecd9e/assets


INFO:tensorflow:Assets written to: ram://883d2e02-1fbc-480a-babd-3e54894ecd9e/assets
[I 2025-09-15 01:14:51,913] Trial 27 finished with value: 0.750875 and parameters: {'embedding_model': 'FastText', 'latent_dim': 97, 'window': 4, 'sg': 0}. Best is trial 23 with value: 0.75309375.


Embedding matrix shape: (9539, 87)
Words found in pretrained embeddings: 9538/9539 (99.99%)




INFO:tensorflow:Assets written to: ram://16f2a0f0-884f-4643-b782-ce6d806cb19a/assets


INFO:tensorflow:Assets written to: ram://16f2a0f0-884f-4643-b782-ce6d806cb19a/assets
[I 2025-09-15 01:27:43,015] Trial 28 finished with value: 0.6356875 and parameters: {'embedding_model': 'FastText', 'latent_dim': 87, 'window': 5, 'sg': 0}. Best is trial 23 with value: 0.75309375.


Embedding matrix shape: (9539, 112)
Words found in pretrained embeddings: 9537/9539 (99.98%)




INFO:tensorflow:Assets written to: ram://6f8ba309-1b71-443b-8ca1-0ff6e75d953a/assets


INFO:tensorflow:Assets written to: ram://6f8ba309-1b71-443b-8ca1-0ff6e75d953a/assets
[I 2025-09-15 01:45:54,860] Trial 29 finished with value: 0.751 and parameters: {'embedding_model': 'Word2Vec', 'latent_dim': 112, 'window': 7, 'sg': 0}. Best is trial 23 with value: 0.75309375.


Optimization completed.


In [30]:
client = MlflowClient(tracking_uri="http://localhost:8080")
experiment_id = mlflow.get_experiment_by_name("optuna_word_embedding_experiment_custom_embedding").experiment_id
runs = client.search_runs(experiment_id)

# Métrique pour sélectionner le meilleur modèle
metric_to_optimize = "Accuracy" # liste des métriques enregistrées dans postprocess_data.py ou sur l'UI MLflow
best_run = max(runs, key=lambda run: run.data.metrics.get(metric_to_optimize, float('-inf')))
print(f"Best run ID: {best_run.info.run_id} with metrics:")
for key, value in best_run.data.metrics.items():
    print(f"{key}: {value}")
print(f"Best run parameters:")
for key, value in best_run.data.params.items():
    print(f"{key}: {value}")

# Enregistrement du meilleur modèle
best_model_uri = f"runs:/{best_run.info.run_id}/model"
registered_model_name = "simple_rnn_best_custom_embed"
registered_model = mlflow.register_model(best_model_uri, registered_model_name)
# Enregistrement des paramètres sous forme de tags dans le modèle enregistré
for key, value in best_run.data.params.items():
    print(f"Setting tag {key} = {value} in registered model")
    client.set_model_version_tag(
        name=registered_model_name,
        version=str(registered_model.version),
        key=str(key),
        value=str(value))

Traceback (most recent call last):
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 366, in search_experiments
    exp = self._get_experiment(exp_id, view_type)
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 464, in _get_experiment
    meta = FileStore._read_yaml(experiment_dir, FileStore.META_DATA_FILE_NAME)
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 1634, in _read_yaml
    return _read_helper(root, file_name, attempts_remaining=retries)
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 1627, in _read_helper
    result = read_yaml(root, file_name)
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\utils\yaml_utils.py", line 107, in read_yaml
    raise MissingConfigException(f"Yaml file '{file_path}' does not exist."

Best run ID: 4461c64ebbfd43c884ba93c17bb3838a with metrics:
Accuracy: 0.75309375
F1_negatif: 0.7489594255393512
F1_positif: 0.7570941064346542
Precision_negatif: 0.7617139533380728
Precision_positif: 0.7450232952138924
Recall_negatif: 0.736625
Recall_positif: 0.7695625
ROC_AUC: 0.8303465761718751
Best run parameters:
embedding_model: FastText
epochs: 50
latent_dim: 87
learning_rate: 0.001
max_len: 30
min_count: 5
num_words: 20000
rnn_size: 64
sg: 0
stemmer: PorterStemmer
window: 5


Created version '1' of model 'simple_rnn_best_custom_embed'.


Setting tag embedding_model = FastText in registered model
Setting tag epochs = 50 in registered model
Setting tag latent_dim = 87 in registered model
Setting tag learning_rate = 0.001 in registered model
Setting tag max_len = 30 in registered model
Setting tag min_count = 5 in registered model
Setting tag num_words = 20000 in registered model
Setting tag rnn_size = 64 in registered model
Setting tag sg = 0 in registered model
Setting tag stemmer = PorterStemmer in registered model
Setting tag window = 5 in registered model


# Experimentation sur les embeddings (préentrainés)

Les embeddings pré-entrainés ont été entrainés sur un très grand nombre de tweets et prennent donc en compte un très grand nombre de situations. On peut donc se passer de la phase de stemming et garder les stopwords. 


## Préparation

In [4]:
import gensim.downloader as api
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

# Word2Vec Google News
w2v_google = api.load("word2vec-google-news-300")  # KeyedVectors

# FastText wiki-news subwords-300
ft_wiki = api.load("fasttext-wiki-news-subwords-300")

# GloVe Twitter (par exemple 200 dimensions)
glove_tw200 = api.load("glove-twitter-200")
glove_tw100 = api.load("glove-twitter-100")
glove_tw50  = api.load("glove-twitter-50")
glove_tw25  = api.load("glove-twitter-25")



In [57]:
embedding_dict = {'Word2Vec_google':w2v_google, 
                  'FastText_wiki':ft_wiki,
                  'Glove_twitter_200':glove_tw200, 
                  'Glove_twitter_100':glove_tw100, 
                  'Glove_twitter_50':glove_tw50, 
                  'Glove_twitter_25':glove_tw25
                  }

In [59]:
## Prétraitement
min_count = 2
num_words = 20000
max_len   = 30
# Prétraitement
X_sentence_train, tokenizer, sentences_train = preprocess_data_embedding(X_raw=X_train, 
                                                        stem_lem_func=None,
                                                        tokenizer=None, 
                                                        stop_words=None, 
                                                        min_count=min_count,
                                                        max_len = max_len, 
                                                        num_words=num_words, 
                                                        return_sentences=True) 
X_sentence_val = preprocess_data_embedding(X_raw=X_val, 
                                                        stem_lem_func=None,
                                                        tokenizer=tokenizer, 
                                                        stop_words=None, 
                                                        min_count=1, # mincount = 1 car on est sur le jeu de validation
                                                        max_len = max_len, 
                                                        num_words=num_words) 

## Fonction de base 

In [60]:
# Hyperparamètres

## Modèle
rnn_size = 64
## Entrainement
epochs = 50
lr = 1e-3
## Savepath des poids du modèle
model_savepath = "./Models/baselineRNN_pretrained_embed.h5"

In [61]:
def pretrained_embed_experiment(embedding_name):
     with mlflow.start_run():
        latent_dim = embedding_dict[embedding_name].vector_size
        mlflow.log_params(params={
            'num_words':num_words,               
            'max_len': max_len,
            'min_count': min_count,
            'stemmer': 'None', 
            'latent_dim': latent_dim, 
            'rnn_size': rnn_size, 
            'epochs': epochs, 
            'learning_rate': lr,
            'embedding_name':embedding_name 
        })


        model_vectors = embedding_dict[embedding_name]

        embedding_matrix, vocab_size = build_embedding_matrix(tokenizer=tokenizer,
                                          embedding_model=model_vectors, 
                                          latent_dim=latent_dim
                                          )

        # Modèle
        model =  build_base_RNN(vocab_size=vocab_size, 
                        latent_dim=latent_dim,
                        input_length=max_len, 
                        embedding_matrix=embedding_matrix,
                        rnn_size = rnn_size)
        ## Callbacks
        checkpoint = ModelCheckpoint(model_savepath, monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=True, mode='min')
        es = EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=10)
        lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, verbose=0, min_lr=1e-5)
        callbacks_list = [checkpoint, es, lr_scheduler]
        ## Compilation
        optimizer = Adam(learning_rate=lr)
        model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])

        #Entrainement
        with tf.device("/GPU:0"):
            history = model.fit(X_sentence_train, y_train, epochs=epochs, batch_size=64, validation_data=(X_sentence_val,y_val), callbacks=callbacks_list, verbose=0)

        model.load_weights(model_savepath)

                # Prédictions sur le jeu de validation
        y_pred_proba = model.predict(X_sentence_val)
        y_pred = (y_pred_proba>0.5)


        output_dict = postprocess_model_output(y_val, y_pred, y_pred_proba) # voir postprocess_data.py

        # Logging des métriques dans MLflow
        mlflow.log_metrics(output_dict)
        # Matrice de confusion
        cm = confusion_matrix(y_val, y_pred, normalize='pred')
        fig, ax = plt.subplots()
        sns.heatmap(cm, annot=True, fmt=".2f", cmap="Blues", ax=ax, )
        plt.xlabel("Predicted")
        plt.ylabel("True")
        plt.title("Confusion Matrix - Validation Set")
        fig.savefig("confusion_matrix.png")
        plt.close(fig)
        mlflow.log_artifact("confusion_matrix.png")
        #
        fig2 = plot_training_history(history,show=False)
        fig2.savefig("learning_path.png")
        plt.close(fig2)
        mlflow.log_artifact("learning_path.png")

        # Enregistrement du modèle dans MLflow
        mlflow.tensorflow.log_model(model, "model")
        

## Définition de l'experiment dans MLFlow

In [62]:
# Création de l'étude Optuna et optimisation
print("Starting hyperparameter optimization with Optuna...")
print("Setting up MLflow experiment...")
mlflow.set_experiment("word_embedding_experiment_pretrained_embedding")
exp_id = mlflow.get_experiment_by_name("word_embedding_experiment_pretrained_embedding").experiment_id

experiment_description = (
    "Cette experience contient les différents tests pour le modèle RNN simple. "
    "Ici on teste plusieurs embeddings préentrainées sur de larges corpora, beaucoup d'attentes par rapport aux embeddings avec Glove entrainés sur des tweets"
)

# Provide searchable tags that define characteristics of the Runs that
# will be in this Experiment
experiment_tags = {
    "project_name": "Sentiment analysis modelling",
    "model_type": "simple-RNN-pretrained-embeddings",
    "team": "Ph. Constant",
    "project_quarter": "Q3-2025",
    "mlflow.note.content": experiment_description,
}

for key, value in experiment_tags.items():
    client.set_experiment_tag(exp_id, key, value)

Traceback (most recent call last):
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 366, in search_experiments
    exp = self._get_experiment(exp_id, view_type)
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 464, in _get_experiment
    meta = FileStore._read_yaml(experiment_dir, FileStore.META_DATA_FILE_NAME)
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 1634, in _read_yaml
    return _read_helper(root, file_name, attempts_remaining=retries)
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 1627, in _read_helper
    result = read_yaml(root, file_name)
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\utils\yaml_utils.py", line 107, in read_yaml
    raise MissingConfigException(f"Yaml file '{file_path}' does not exist."

Starting hyperparameter optimization with Optuna...
Setting up MLflow experiment...


## Lancement de l'experiment

In [63]:
for embedding_name in list(embedding_dict.keys()):
    
    print(f"Running test with {embedding_name}")
    pretrained_embed_experiment(embedding_name)


Running test with Word2Vec_google
Embedding matrix shape: (24711, 300)
Words found in pretrained embeddings: 20292/24711 (82.12%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpft6gpi5_\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpft6gpi5_\model\data\model\assets


Running test with FastText_wiki
Embedding matrix shape: (24711, 300)
Words found in pretrained embeddings: 21410/24711 (86.64%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpg7xm_dcc\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpg7xm_dcc\model\data\model\assets


Running test with Glove_twitter_200
Embedding matrix shape: (24711, 200)
Words found in pretrained embeddings: 23716/24711 (95.97%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmp3_kfzaqj\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmp3_kfzaqj\model\data\model\assets


Running test with Glove_twitter_100
Embedding matrix shape: (24711, 100)
Words found in pretrained embeddings: 23716/24711 (95.97%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpbuatvcxy\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpbuatvcxy\model\data\model\assets


Running test with Glove_twitter_50
Embedding matrix shape: (24711, 50)
Words found in pretrained embeddings: 23716/24711 (95.97%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpkkv9oxpy\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpkkv9oxpy\model\data\model\assets


Running test with Glove_twitter_25
Embedding matrix shape: (24711, 25)
Words found in pretrained embeddings: 23716/24711 (95.97%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpwkeji3ag\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpwkeji3ag\model\data\model\assets


## Enregistrement du meilleur modèle

In [64]:
client = MlflowClient(tracking_uri="http://localhost:8080")
experiment_id = mlflow.get_experiment_by_name("word_embedding_experiment_pretrained_embedding").experiment_id
runs = client.search_runs(experiment_id)

# Métrique pour sélectionner le meilleur modèle
metric_to_optimize = "Accuracy" # liste des métriques enregistrées dans postprocess_data.py ou sur l'UI MLflow
best_run = max(runs, key=lambda run: run.data.metrics.get(metric_to_optimize, float('-inf')))
print(f"Best run ID: {best_run.info.run_id} with metrics:")
for key, value in best_run.data.metrics.items():
    print(f"{key}: {value}")
print(f"Best run parameters:")
for key, value in best_run.data.params.items():
    print(f"{key}: {value}")

# Enregistrement du meilleur modèle
best_model_uri = f"runs:/{best_run.info.run_id}/model"
registered_model_name = "simple_rnn_best_pre"
registered_model = mlflow.register_model(best_model_uri, registered_model_name)
# Enregistrement des paramètres sous forme de tags dans le modèle enregistré
for key, value in best_run.data.params.items():
    print(f"Setting tag {key} = {value} in registered model")
    client.set_model_version_tag(
        name=registered_model_name,
        version=str(registered_model.version),
        key=str(key),
        value=str(value))

Traceback (most recent call last):
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 366, in search_experiments
    exp = self._get_experiment(exp_id, view_type)
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 464, in _get_experiment
    meta = FileStore._read_yaml(experiment_dir, FileStore.META_DATA_FILE_NAME)
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 1634, in _read_yaml
    return _read_helper(root, file_name, attempts_remaining=retries)
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 1627, in _read_helper
    result = read_yaml(root, file_name)
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\utils\yaml_utils.py", line 107, in read_yaml
    raise MissingConfigException(f"Yaml file '{file_path}' does not exist."

Best run ID: ce06b66ec4ff48e6a994d2485c63701f with metrics:
Accuracy: 0.795875
F1_negatif: 0.7963459499906467
F1_positif: 0.7954018668170143
Precision_negatif: 0.7945128779395296
Precision_positif: 0.7972497802335803
Recall_negatif: 0.7981875
Recall_positif: 0.7935625
ROC_AUC: 0.874631751953125
Best run parameters:
embedding_name: Glove_twitter_200
epochs: 50
latent_dim: 200
learning_rate: 0.001
max_len: 30
min_count: 2
num_words: 20000
rnn_size: 64
stemmer: None
Setting tag embedding_name = Glove_twitter_200 in registered model


Created version '2' of model 'simple_rnn_best_pre'.


Setting tag epochs = 50 in registered model
Setting tag latent_dim = 200 in registered model
Setting tag learning_rate = 0.001 in registered model
Setting tag max_len = 30 in registered model
Setting tag min_count = 2 in registered model
Setting tag num_words = 20000 in registered model
Setting tag rnn_size = 64 in registered model
Setting tag stemmer = None in registered model


# Comparaison des différentes couches de notre réseau de neurones : SimpleRNN vs GRU vs LSTM

## Préparation 

On reprend les paramètres d'embedding de la meilleure run sur embeddings customs et embeddings préentrainés. 

In [5]:
## Prétraitement
min_count = 2
num_words = 20000
max_len   = 30
# Prétraitement
X_sentence_train, tokenizer, sentences_train = preprocess_data_embedding(X_raw=X_train, 
                                                        stem_lem_func=None,
                                                        tokenizer=None, 
                                                        stop_words=None, 
                                                        min_count=min_count,
                                                        max_len = max_len, 
                                                        num_words=num_words, 
                                                        return_sentences=True) 
X_sentence_val = preprocess_data_embedding(X_raw=X_val, 
                                                        stem_lem_func=None,
                                                        tokenizer=tokenizer, 
                                                        stop_words=None, 
                                                        min_count=1, # mincount = 1 car on est sur le jeu de validation
                                                        max_len = max_len, 
                                                        num_words=num_words) 


model_vectors = glove_tw200
latent_dim = glove_tw200.vector_size

embedding_matrix, vocab_size = build_embedding_matrix(tokenizer=tokenizer,
                                embedding_model=model_vectors, 
                                latent_dim=latent_dim
                              )

Embedding matrix shape: (24711, 200)
Words found in pretrained embeddings: 23716/24711 (95.97%)


In [6]:
from Source.preprocess_data import *
rnn_layer_name_list = ['SimpleRNN','GRU','LSTM']


## Fonction de base

In [7]:
## Modèle
rnn_size = 64
## Entrainement
epochs = 50
lr = 1e-3
## Savepath des poids du modèle

def rnn_layer_experiment(rnn_layer_name):
     with mlflow.start_run():
        mlflow.log_params(params={
            'num_words':num_words,               
            'max_len': max_len,
            'min_count': min_count,
            'stemmer': 'None', 
            'latent_dim': latent_dim, 
            'rnn_size': rnn_size, 
            'epochs': epochs, 
            'learning_rate': lr,
            'embedding_name':'Glove_twitter_200',
            'rnn_layer_name':rnn_layer_name 
        })
        model_savepath = "./Models/"+rnn_layer_name+"_model_exp.h5"
        # Modèle
        if rnn_layer_name == 'SimpleRNN':
            model =  build_base_RNN(vocab_size=vocab_size, 
                            latent_dim=latent_dim,
                            input_length=max_len, 
                            embedding_matrix=embedding_matrix,
                            rnn_size = rnn_size)
        elif rnn_layer_name == 'GRU':
            model = build_gru_RNN(vocab_size=vocab_size, 
                            latent_dim=latent_dim,
                            input_length=max_len, 
                            embedding_matrix=embedding_matrix,
                            rnn_size = rnn_size)
        elif rnn_layer_name=='LSTM':
            model = build_lstm_RNN(vocab_size=vocab_size, 
                            latent_dim=latent_dim,
                            input_length=max_len, 
                            embedding_matrix=embedding_matrix,
                            rnn_size = rnn_size)
        ## Callbacks
        checkpoint = ModelCheckpoint(model_savepath, monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=True, mode='min')
        es = EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=10)
        lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, verbose=0, min_lr=1e-5)
        callbacks_list = [checkpoint, es, lr_scheduler]
        ## Compilation
        optimizer = Adam(learning_rate=lr)
        model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])

        #Entrainement
        with tf.device("/GPU:0"):
            history = model.fit(X_sentence_train, y_train, epochs=epochs, batch_size=64, validation_data=(X_sentence_val,y_val), callbacks=callbacks_list, verbose=0)

        model.load_weights(model_savepath)

                # Prédictions sur le jeu de validation
        y_pred_proba = model.predict(X_sentence_val)
        y_pred = (y_pred_proba>0.5)


        output_dict = postprocess_model_output(y_val, y_pred, y_pred_proba) # voir postprocess_data.py

        # Logging des métriques dans MLflow
        mlflow.log_metrics(output_dict)
        # Matrice de confusion
        cm = confusion_matrix(y_val, y_pred, normalize='pred')
        fig, ax = plt.subplots()
        sns.heatmap(cm, annot=True, fmt=".2f", cmap="Blues", ax=ax, )
        plt.xlabel("Predicted")
        plt.ylabel("True")
        plt.title("Confusion Matrix - Validation Set")
        fig.savefig("confusion_matrix.png")
        plt.close(fig)
        mlflow.log_artifact("confusion_matrix.png")
        #
        fig2 = plot_training_history(history,show=False)
        fig2.savefig("learning_path.png")
        plt.close(fig2)
        mlflow.log_artifact("learning_path.png")

        # Enregistrement du modèle dans MLflow
        mlflow.tensorflow.log_model(model, "model")
        

## Definition de l'experiment dans MLFLow

In [8]:
# Création de l'étude Optuna et optimisation
print("Setting up MLflow experiment...")
mlflow.set_experiment("rnn_layer_experiment_pretrained_embedding")
exp_id = mlflow.get_experiment_by_name("rnn_layer_experiment_pretrained_embedding").experiment_id

experiment_description = (
    "Comparaison des impact des types de cellules RNN utilisées : SimpleRNN, GRU et LSTM "
    ""
)

# Provide searchable tags that define characteristics of the Runs that
# will be in this Experiment
experiment_tags = {
    "project_name": "Sentiment analysis modelling",
    "model_type": "RNN_types-pretrained-embeddings",
    "team": "Ph. Constant",
    "project_quarter": "Q3-2025",
    "mlflow.note.content": experiment_description,
}

for key, value in experiment_tags.items():
    client.set_experiment_tag(exp_id, key, value)

Traceback (most recent call last):
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 366, in search_experiments
    exp = self._get_experiment(exp_id, view_type)
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 464, in _get_experiment
    meta = FileStore._read_yaml(experiment_dir, FileStore.META_DATA_FILE_NAME)
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 1634, in _read_yaml
    return _read_helper(root, file_name, attempts_remaining=retries)
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 1627, in _read_helper
    result = read_yaml(root, file_name)
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\utils\yaml_utils.py", line 107, in read_yaml
    raise MissingConfigException(f"Yaml file '{file_path}' does not exist."

Setting up MLflow experiment...


## Lancement de l'expériment

In [9]:
for rnn_layer_name in rnn_layer_name_list:
    
    print(f"Running test with {rnn_layer_name}")
    rnn_layer_experiment(rnn_layer_name)


Running test with SimpleRNN




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpzg36l18g\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpzg36l18g\model\data\model\assets


Running test with GRU




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpmp__baop\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpmp__baop\model\data\model\assets


Running test with LSTM




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmps1u9tisp\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmps1u9tisp\model\data\model\assets


# Retour sur la longueur des séquences     

Ici on va revenir sur la longueur des séquences utilisées car LSTM permet de garder des séquences plus longues sans pour autant avoir d'évanescence de gradient. 

## Fonction de base 

In [10]:


## Modèle
rnn_size = 64
## Entrainement
epochs = 50
lr = 1e-3
## Savepath des poids du modèle

def lstm_maxlen_experiment(max_len):
     with mlflow.start_run():
        latent_dim = glove_tw200.vector_size

        mlflow.log_params(params={
            'num_words':num_words,               
            'max_len': max_len,
            'min_count': min_count,
            'stemmer': 'None', 
            'latent_dim': latent_dim, 
            'rnn_size': rnn_size, 
            'epochs': epochs, 
            'learning_rate': lr,
            'embedding_name':'Glove_twitter_200',
            'rnn_layer_name':'LSTM' 
        })

        ## Prétraitement

        # Prétraitement
        X_sentence_train, tokenizer, sentences_train = preprocess_data_embedding(X_raw=X_train, 
                                                        stem_lem_func=None,
                                                        tokenizer=None, 
                                                        stop_words=None, 
                                                        min_count=min_count,
                                                        max_len = max_len, 
                                                        num_words=num_words, 
                                                        return_sentences=True) 
        X_sentence_val = preprocess_data_embedding(X_raw=X_val, 
                                                        stem_lem_func=None,
                                                        tokenizer=tokenizer, 
                                                        stop_words=None, 
                                                        min_count=1, # mincount = 1 car on est sur le jeu de validation
                                                        max_len = max_len, 
                                                        num_words=num_words) 


        model_vectors = glove_tw200

        embedding_matrix, vocab_size = build_embedding_matrix(tokenizer=tokenizer,
                                embedding_model=model_vectors, 
                                latent_dim=latent_dim
                              )
        model_savepath = f"./Models/{rnn_layer_name}_model_exp_len{max_len}.h5"
        # Modèle

        model = build_lstm_RNN(vocab_size=vocab_size, 
                            latent_dim=latent_dim,
                            input_length=max_len, 
                            embedding_matrix=embedding_matrix,
                            rnn_size = rnn_size)
        ## Callbacks
        checkpoint = ModelCheckpoint(model_savepath, monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=True, mode='min')
        es = EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=10)
        lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, verbose=0, min_lr=1e-5)
        callbacks_list = [checkpoint, es, lr_scheduler]
        ## Compilation
        optimizer = Adam(learning_rate=lr)
        model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])

        #Entrainement
        with tf.device("/GPU:0"):
            history = model.fit(X_sentence_train, y_train, epochs=epochs, batch_size=64, validation_data=(X_sentence_val,y_val), callbacks=callbacks_list, verbose=0)

        model.load_weights(model_savepath)

                # Prédictions sur le jeu de validation
        y_pred_proba = model.predict(X_sentence_val)
        y_pred = (y_pred_proba>0.5)


        output_dict = postprocess_model_output(y_val, y_pred, y_pred_proba) # voir postprocess_data.py

        # Logging des métriques dans MLflow
        mlflow.log_metrics(output_dict)
        # Matrice de confusion
        cm = confusion_matrix(y_val, y_pred, normalize='pred')
        fig, ax = plt.subplots()
        sns.heatmap(cm, annot=True, fmt=".2f", cmap="Blues", ax=ax, )
        plt.xlabel("Predicted")
        plt.ylabel("True")
        plt.title("Confusion Matrix - Validation Set")
        fig.savefig("confusion_matrix.png")
        plt.close(fig)
        mlflow.log_artifact("confusion_matrix.png")
        #
        fig2 = plot_training_history(history,show=False)
        fig2.savefig("learning_path.png")
        plt.close(fig2)
        mlflow.log_artifact("learning_path.png")

        # Enregistrement du modèle dans MLflow
        mlflow.tensorflow.log_model(model, "model")
        

## Experiment MLFLow



In [11]:
# Création de l'étude Optuna et optimisation
print("Setting up MLflow experiment...")
mlflow.set_experiment("lstm_maxlen_experiment")
exp_id = mlflow.get_experiment_by_name("lstm_maxlen_experiment").experiment_id

experiment_description = (
    "Comparaison des impact des types de cellules RNN utilisées : SimpleRNN, GRU et LSTM "
    ""
)

# Provide searchable tags that define characteristics of the Runs that
# will be in this Experiment
experiment_tags = {
    "project_name": "Sentiment analysis modelling",
    "model_type": "LSTM_pretrained_embedding",
    "team": "Ph. Constant",
    "project_quarter": "Q3-2025",
    "mlflow.note.content": experiment_description,
}

for key, value in experiment_tags.items():
    client.set_experiment_tag(exp_id, key, value)

Traceback (most recent call last):
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 366, in search_experiments
    exp = self._get_experiment(exp_id, view_type)
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 464, in _get_experiment
    meta = FileStore._read_yaml(experiment_dir, FileStore.META_DATA_FILE_NAME)
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 1634, in _read_yaml
    return _read_helper(root, file_name, attempts_remaining=retries)
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 1627, in _read_helper
    result = read_yaml(root, file_name)
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\utils\yaml_utils.py", line 107, in read_yaml
    raise MissingConfigException(f"Yaml file '{file_path}' does not exist."

Setting up MLflow experiment...


## Lancement de l'experiment 

In [12]:
for max_len in list(range(30,100,5)):
    
    print(f"Running test with sequence length of {max_len} tokens")
    lstm_maxlen_experiment(max_len)






INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpeobtdrra\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpeobtdrra\model\data\model\assets


Running test with sequence length of 75 tokens
Embedding matrix shape: (24711, 200)
Words found in pretrained embeddings: 23716/24711 (95.97%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpbaxxk0ux\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpbaxxk0ux\model\data\model\assets


Running test with sequence length of 80 tokens
Embedding matrix shape: (24711, 200)
Words found in pretrained embeddings: 23716/24711 (95.97%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpvujtzpq5\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpvujtzpq5\model\data\model\assets


Running test with sequence length of 85 tokens
Embedding matrix shape: (24711, 200)
Words found in pretrained embeddings: 23716/24711 (95.97%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmp4kp924jp\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmp4kp924jp\model\data\model\assets


Running test with sequence length of 90 tokens
Embedding matrix shape: (24711, 200)
Words found in pretrained embeddings: 23716/24711 (95.97%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmp0qlemuot\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmp0qlemuot\model\data\model\assets


Running test with sequence length of 95 tokens
Embedding matrix shape: (24711, 200)
Words found in pretrained embeddings: 23716/24711 (95.97%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpybz59rw1\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpybz59rw1\model\data\model\assets


## Enregistrement du mailleur modèle

In [30]:
client = MlflowClient(tracking_uri="http://localhost:8080")
experiment_id = mlflow.get_experiment_by_name("lstm_maxlen_experiment").experiment_id
runs = client.search_runs(experiment_id)

# Métrique pour sélectionner le meilleur modèle
metric_to_optimize = "Accuracy" # liste des métriques enregistrées dans postprocess_data.py ou sur l'UI MLflow
best_run = max(runs, key=lambda run: run.data.metrics.get(metric_to_optimize, float('-inf')))
print(f"Best run ID: {best_run.info.run_id} with metrics:")
for key, value in best_run.data.metrics.items():
    print(f"{key}: {value}")
print(f"Best run parameters:")
for key, value in best_run.data.params.items():
    print(f"{key}: {value}")

# Enregistrement du meilleur modèle
best_model_uri = f"runs:/{best_run.info.run_id}/model"
registered_model_name = "lstm_maxlen_best"
registered_model = mlflow.register_model(best_model_uri, registered_model_name)
# Enregistrement des paramètres sous forme de tags dans le modèle enregistré
for key, value in best_run.data.params.items():
    print(f"Setting tag {key} = {value} in registered model")
    client.set_model_version_tag(
        name=registered_model_name,
        version=str(registered_model.version),
        key=str(key),
        value=str(value))

Traceback (most recent call last):
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 366, in search_experiments
    exp = self._get_experiment(exp_id, view_type)
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 464, in _get_experiment
    meta = FileStore._read_yaml(experiment_dir, FileStore.META_DATA_FILE_NAME)
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 1634, in _read_yaml
    return _read_helper(root, file_name, attempts_remaining=retries)
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 1627, in _read_helper
    result = read_yaml(root, file_name)
  File "c:\Users\bassm\.conda\envs\AI_env_P7_gpu\lib\site-packages\mlflow\utils\yaml_utils.py", line 107, in read_yaml
    raise MissingConfigException(f"Yaml file '{file_path}' does not exist."

Best run ID: b63acc4f64914a5fb77ab6b26a974e13 with metrics:
Accuracy: 0.81365625
F1_negatif: 0.811518159117489
F1_positif: 0.8157463770355035
Precision_negatif: 0.8209375199846518
Precision_positif: 0.8066980382570433
Recall_negatif: 0.8023125
Recall_positif: 0.825
ROC_AUC: 0.89485159765625
Best run parameters:
embedding_name: Glove_twitter_200
epochs: 50
latent_dim: 200
learning_rate: 0.001
max_len: 50
min_count: 2
num_words: 20000
rnn_layer_name: LSTM
rnn_size: 64
stemmer: None
Setting tag embedding_name = Glove_twitter_200 in registered model


Created version '1' of model 'lstm_maxlen_best'.


Setting tag epochs = 50 in registered model
Setting tag latent_dim = 200 in registered model
Setting tag learning_rate = 0.001 in registered model
Setting tag max_len = 50 in registered model
Setting tag min_count = 2 in registered model
Setting tag num_words = 20000 in registered model
Setting tag rnn_layer_name = LSTM in registered model
Setting tag rnn_size = 64 in registered model
Setting tag stemmer = None in registered model


Quand on regarde les améliorations obtenues en augmentant la longueur des séquences, il n'est pas réellement pertinent d'augmenter la longueur des séquences au delà de 50 tokens. 

# Essai avec architecture Bidirectionnal-LSTM 

In [4]:
import gensim.downloader as api

# GloVe Twitter (par exemple 200 dimensions)
glove_tw200 = api.load("glove-twitter-200")

In [5]:
## Prétraitement
min_count = 2
num_words = 30000
max_len   = 50
# Prétraitement
X_sentence_train, tokenizer, sentences_train = preprocess_data_embedding(X_raw=X_train, 
                                                        stem_lem_func=None,
                                                        tokenizer=None, 
                                                        stop_words=None, 
                                                        min_count=min_count,
                                                        max_len = max_len, 
                                                        num_words=num_words, 
                                                        return_sentences=True) 
X_sentence_val = preprocess_data_embedding(X_raw=X_val, 
                                                        stem_lem_func=None,
                                                        tokenizer=tokenizer, 
                                                        stop_words=None, 
                                                        min_count=1, # mincount = 1 car on est sur le jeu de validation
                                                        max_len = max_len, 
                                                        num_words=num_words) 


model_vectors = glove_tw200
latent_dim = glove_tw200.vector_size

embedding_matrix, vocab_size = build_embedding_matrix(tokenizer=tokenizer,
                                embedding_model=model_vectors, 
                                latent_dim=latent_dim
                              )

Embedding matrix shape: (24711, 200)
Words found in pretrained embeddings: 23716/24711 (95.97%)


In [6]:
## Modèle
rnn_size = 128
## Entrainement
epochs = 50
lr = 1e-3
max_len = 50
## Savepath des poids du modèle

def rnn_layer_experiment_bi(rnn_layer_name):
     with mlflow.start_run():
        mlflow.log_params(params={
            'num_words':num_words,               
            'max_len': max_len,
            'min_count': min_count,
            'stemmer': 'None', 
            'latent_dim': latent_dim, 
            'rnn_size': rnn_size, 
            'epochs': epochs, 
            'learning_rate': lr,
            'embedding_name':'Glove_twitter_200',
            'rnn_layer_name':rnn_layer_name 
        })
        model_savepath = "./Models/"+rnn_layer_name+"_model_exp.h5"
        # Modèle

        if rnn_layer_name=='LSTM':
            model = build_lstm_RNN(vocab_size=vocab_size, 
                            latent_dim=latent_dim,
                            input_length=max_len, 
                            embedding_matrix=embedding_matrix,
                            rnn_size = rnn_size)
        elif rnn_layer_name=='Bi-LSTM':
            model = build_bilstm_RNN(vocab_size=vocab_size, 
                            latent_dim=latent_dim,
                            input_length=max_len, 
                            embedding_matrix=embedding_matrix,
                            rnn_size = rnn_size)

        ## Callbacks
        checkpoint = ModelCheckpoint(model_savepath, monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=True, mode='min')
        es = EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=10)
        lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, verbose=0, min_lr=1e-5)
        callbacks_list = [checkpoint, es, lr_scheduler]
        ## Compilation
        optimizer = Adam(learning_rate=lr)
        model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])

        #Entrainement
        with tf.device("/GPU:0"):
            history = model.fit(X_sentence_train, y_train, epochs=epochs, batch_size=64, validation_data=(X_sentence_val,y_val), callbacks=callbacks_list, verbose=1)

        model.load_weights(model_savepath)

                # Prédictions sur le jeu de validation
        y_pred_proba = model.predict(X_sentence_val)
        y_pred = (y_pred_proba>0.5)


        output_dict = postprocess_model_output(y_val, y_pred, y_pred_proba) # voir postprocess_data.py

        # Logging des métriques dans MLflow
        mlflow.log_metrics(output_dict)
        # Matrice de confusion
        cm = confusion_matrix(y_val, y_pred, normalize='pred')
        fig, ax = plt.subplots()
        sns.heatmap(cm, annot=True, fmt=".2f", cmap="Blues", ax=ax, )
        plt.xlabel("Predicted")
        plt.ylabel("True")
        plt.title("Confusion Matrix - Validation Set")
        fig.savefig("confusion_matrix.png")
        plt.close(fig)
        mlflow.log_artifact("confusion_matrix.png")
        #
        fig2 = plot_training_history(history,show=False)
        fig2.savefig("learning_path.png")
        plt.close(fig2)
        mlflow.log_artifact("learning_path.png")

        # Enregistrement du modèle dans MLflow
        mlflow.tensorflow.log_model(model, "model")
        

In [7]:
# Création de l'étude Optuna et optimisation
print("Setting up MLflow experiment...")
mlflow.set_experiment("bilstm_experiment")
exp_id = mlflow.get_experiment_by_name("bilstm_experiment").experiment_id

experiment_description = (
    "Essai Bidirectionnal LSTM "
    ""
)

# Provide searchable tags that define characteristics of the Runs that
# will be in this Experiment
experiment_tags = {
    "project_name": "Sentiment analysis modelling",
    "model_type": "RNN_types-pretrained-embeddings",
    "team": "Ph. Constant",
    "project_quarter": "Q3-2025",
    "mlflow.note.content": experiment_description,
}

for key, value in experiment_tags.items():
    client.set_experiment_tag(exp_id, key, value)

Traceback (most recent call last):
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 367, in search_experiments
    exp = self._get_experiment(exp_id, view_type)
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 465, in _get_experiment
    meta = FileStore._read_yaml(experiment_dir, FileStore.META_DATA_FILE_NAME)
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 1635, in _read_yaml
    return _read_helper(root, file_name, attempts_remaining=retries)
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 1628, in _read_helper
    result = read_yaml(root, file_name)
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\utils\yaml_utils.py", line 107, in read_yaml
    raise MissingConfigException(f"Yaml file '{file_path}' d

Setting up MLflow experiment...


In [9]:

print(f"Running test with Bi-LSTM")
rnn_layer_experiment_bi("Bi-LSTM")


Running test with Bi-LSTM












Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 50, 200)           4942200   
                                                                 
 bidirectional_1 (Bidirectio  (None, 50, 256)          336896    
 nal)                                                            
                                                                 
 global_max_pooling1d_1 (Glo  (None, 256)              0         
 balMaxPooling1D)                                                
                                                                 
 dropout_2 (Dropout)         (None, 256)               0         
                                                                 
 dense_2 (Dense)             (None, 64)                16448     
                                                                 
 dropout_3 (Dropout)         (None, 64)               



INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpi_stprq1\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpi_stprq1\model\data\model\assets


In [10]:
client = MlflowClient(tracking_uri="http://localhost:8080")
experiment_id = mlflow.get_experiment_by_name("bilstm_experiment").experiment_id
runs = client.search_runs(experiment_id)

# Métrique pour sélectionner le meilleur modèle
metric_to_optimize = "Accuracy" # liste des métriques enregistrées dans postprocess_data.py ou sur l'UI MLflow
best_run = max(runs, key=lambda run: run.data.metrics.get(metric_to_optimize, float('-inf')))
print(f"Best run ID: {best_run.info.run_id} with metrics:")
for key, value in best_run.data.metrics.items():
    print(f"{key}: {value}")
print(f"Best run parameters:")
for key, value in best_run.data.params.items():
    print(f"{key}: {value}")

# Enregistrement du meilleur modèle
best_model_uri = f"runs:/{best_run.info.run_id}/model"
registered_model_name = "bilstm_best"
registered_model = mlflow.register_model(best_model_uri, registered_model_name)
# Enregistrement des paramètres sous forme de tags dans le modèle enregistré
for key, value in best_run.data.params.items():
    print(f"Setting tag {key} = {value} in registered model")
    client.set_model_version_tag(
        name=registered_model_name,
        version=str(registered_model.version),
        key=str(key),
        value=str(value))

Traceback (most recent call last):
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 367, in search_experiments
    exp = self._get_experiment(exp_id, view_type)
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 465, in _get_experiment
    meta = FileStore._read_yaml(experiment_dir, FileStore.META_DATA_FILE_NAME)
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 1635, in _read_yaml
    return _read_helper(root, file_name, attempts_remaining=retries)
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 1628, in _read_helper
    result = read_yaml(root, file_name)
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\utils\yaml_utils.py", line 107, in read_yaml
    raise MissingConfigException(f"Yaml file '{file_path}' d

Best run ID: 54b89645ebab457780a374fdf83b5ef5 with metrics:
Accuracy: 0.8191875
F1_negatif: 0.822613281010485
F1_positif: 0.8156267924287808
Precision_negatif: 0.8073173667107956
Precision_positif: 0.8320114419451307
Recall_negatif: 0.8385
Recall_positif: 0.799875
ROC_AUC: 0.9025231015624999
Best run parameters:
embedding_name: Glove_twitter_200
epochs: 50
latent_dim: 200
learning_rate: 0.001
max_len: 50
min_count: 2
num_words: 30000
rnn_layer_name: Bi-LSTM
rnn_size: 128
stemmer: None
Setting tag embedding_name = Glove_twitter_200 in registered model


Created version '1' of model 'bilstm_best'.


Setting tag epochs = 50 in registered model
Setting tag latent_dim = 200 in registered model
Setting tag learning_rate = 0.001 in registered model
Setting tag max_len = 50 in registered model
Setting tag min_count = 2 in registered model
Setting tag num_words = 30000 in registered model
Setting tag rnn_layer_name = Bi-LSTM in registered model
Setting tag rnn_size = 128 in registered model
Setting tag stemmer = None in registered model


In [11]:
import joblib

# sauvegarde
joblib.dump(tokenizer, "exp_models/bilstm_tokenizer.pkl")

['exp_models/bilstm_tokenizer.pkl']

In [12]:
X_val

5523      Bit disappointed to find out at  BSL last nigh...
146829     The count down begins...10 days left in Europe! 
46086     Aw no izzy from greys is dieing. She's one of ...
42276     @PaulPunktastic urgh im working again today ti...
85720     Photo: fatalattraction: Â Lol yess, the good g...
                                ...                        
69398     So... I'm going to spin class tonight. I'm gua...
60318                           Visiting popstaronline.com 
29232                         PICTURE STILL NOT WORKING!!! 
100704    waiting for Chelsea to get here. Her flight wa...
56427     @BroadwayBlue ikr!!!! i miss our ducks so much.  
Name: text, Length: 32000, dtype: object