In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.layers import TextVectorization, Embedding, SimpleRNN, Dense, LSTM, Flatten
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

from sklearn.model_selection import train_test_split
import gensim.downloader as gensim_downloader
import gensim
import multiprocessing
from mlflow import MlflowClient
import mlflow
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import optuna

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline

from sklearn.metrics import confusion_matrix 
from nltk.corpus import stopwords  
from nltk.tokenize import TweetTokenizer, WordPunctTokenizer, RegexpTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer, LancasterStemmer, SnowballStemmer
from tqdm import tqdm
tqdm.pandas()


import os
os.environ['CUDA_VISIBLE_DEVICES'] = "0"


import sys
from pathlib import Path
cwd = Path.cwd()
parent = cwd.parent
sys.path.append(str(parent))


from Source.preprocess_data import *  ## import all functions from preprocess_data.py
from Source.postprocess_data import * ## import all functions from postprocess_data.py
from Source.utils import *  ## import all functions from utils.py
import nltk


client = MlflowClient(tracking_uri="http://localhost:8080")

mlruns_path = Path("../mlruns").resolve() 
mlflow_uri = mlruns_path.as_uri()
mlflow.set_tracking_uri(mlflow_uri)

nw = multiprocessing.cpu_count()



os.environ["TF_KERAS"]='1'
print(tf.__version__)
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
print("GPUs disponibles :", tf.config.list_physical_devices("GPU"))
print("Version TF :", tf.__version__)

  from .autonotebook import tqdm as notebook_tqdm
  from pkg_resources import parse_version


2.10.1
Num GPUs Available:  1
GPUs disponibles : [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Version TF : 2.10.1


In [2]:
df = pd.read_csv('https://s3-eu-west-1.amazonaws.com/static.oc-static.com/prod/courses/files/AI+Engineer/Project+7%C2%A0-+D%C3%A9tectez+les+Bad+Buzz+gr%C3%A2ce+au+Deep+Learning/sentiment140.zip',
                header=None,
                compression='zip',
                encoding='cp1252')

df.columns = ['target', 'ids', 'date', 'flag', 'user', 'text']
data_size = 0.02
sample_df, _ = train_test_split(df, test_size=1-data_size, random_state=42, stratify=df['target'])
sample_df = sample_df.reset_index(drop=True)
print(f"Sample size: {sample_df.shape[0]} rows")
data_numrows = sample_df.shape[0]
# On ne garde que les colonnes 'target' et 'text'
sample_df = sample_df[['target', 'text']]
sample_df["target"] = sample_df["target"].apply(lambda x: 0 if x == 0 else 1)
sample_df.to_csv('../Data/raw_data.csv', index=False)
dataset = mlflow.data.from_pandas(
    sample_df,
    source="../Data/raw_data.csv",
    name="dataset_v1"
)


Sample size: 32000 rows


  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(


# Séparation train/validation

In [3]:
# Data
X_raw = sample_df['text']
y = sample_df['target']
X_train, X_val, y_train, y_val = train_test_split(X_raw, y, test_size=0.2, random_state=42, stratify=y)


In [4]:
type(X_val)

pandas.core.series.Series

# Préparation de l'experience de base (baseline)


## Pré-traitement des dataframes

In [5]:
num_words = 20000
max_len = 10
min_count = 3

X_sentence_train, tokenizer, sentences_train = preprocess_data_embedding(X_raw=X_train, 
                                                        stem_lem_func=PorterStemmer().stem,
                                                        tokenizer=None, 
                                                        stop_words=stopwords.words('english'), 
                                                        min_count=min_count,
                                                        max_len = max_len, 
                                                        num_words=num_words, 
                                                        return_sentences=True) 
X_sentence_val = preprocess_data_embedding(X_raw=X_val, 
                                                        stem_lem_func=PorterStemmer().stem,
                                                        tokenizer=tokenizer, 
                                                        stop_words=stopwords.words('english'), 
                                                        min_count=1, # mincount = 1 car on est sur le jeu de validation
                                                        max_len = max_len, 
                                                        num_words=num_words) 

# Création d'un embedding de base custom pour notre modèle


In [6]:
from gensim.models import Word2Vec
latent_dim = 50
print("Build & train Word2Vec model ...")

w2v_model = Word2Vec(
    sentences=sentences_train, 
    vector_size=latent_dim,  # dimension de l’espace latent
    window=5,         # taille du contexte
    min_count=min_count,      # ignorer les mots trop rares
    workers=4,        # parallélisme CPU
    sg=0,              # 1 = skip-gram, 0 = CBOW
    epochs=100
)


model_vectors = w2v_model.wv
w2v_words = model_vectors.index_to_key
print("Vocabulary size: %i" % len(w2v_words))
print("Word2Vec trained")

found = sum(1 for w in tokenizer.word_index if w in w2v_model.wv)
coverage = found / len(tokenizer.word_index)
print(f"Coverage: {coverage*100:.2f}%")

vectors = np.array([w2v_model.wv[w] for w in tokenizer.word_index if w in w2v_model.wv])
print("Mean norm:", np.mean(np.linalg.norm(vectors, axis=1)))


Build & train Word2Vec model ...
Vocabulary size: 5491
Word2Vec trained
Coverage: 99.98%
Mean norm: 6.3330092


## Création de la matrice d'embedding 

In [7]:
embedding_matrix, vocab_size = build_embedding_matrix(tokenizer=tokenizer,
                                          embedding_model=model_vectors, 
                                          latent_dim=latent_dim
                                          )

Embedding matrix shape: (5493, 50)
Words found in pretrained embeddings: 5491/5493 (99.96%)


## Création du modèle simple avec RNN

In [8]:
model =  build_base_RNN(vocab_size=vocab_size, 
                        latent_dim=latent_dim,
                        input_length=max_len, 
                        embedding_matrix=embedding_matrix,
                        rnn_size = 64)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 10, 50)            274650    
                                                                 
 simple_rnn (SimpleRNN)      (None, 10, 64)            7360      
                                                                 
 global_max_pooling1d (Globa  (None, 64)               0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 282,075
Trainable params: 7,425
Non-trainable params: 274,650
_________________________________________________________________


## Callbacks pour l'entrainement

In [9]:



checkpoint = ModelCheckpoint("./Models/baselineRNN.h5", monitor='val_loss', verbose=0, save_best_only=True, mode='min')
es = EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=10)
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, verbose=0, min_lr=1e-5)
optimizer = Adam(learning_rate=1e-3)

callbacks_list = [checkpoint, es, lr_scheduler]
model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])

## Entrainement

In [10]:
with tf.device("/GPU:0"):
    history = model.fit(X_sentence_train, y_train, epochs=50, batch_size=64, validation_data=(X_sentence_val,y_val), callbacks=callbacks_list, verbose=0)


## Post-traitement

In [11]:
y_pred_proba = model.predict(X_sentence_val)
y_pred = (y_pred_proba>0.5)


output_dict = postprocess_model_output(y_val, y_pred, y_pred_proba) # voir postprocess_data.py

output_dict



{'Accuracy': 0.70421875,
 'F1_negatif': 0.7044496487119438,
 'F1_positif': 0.7039874902267397,
 'Recall_negatif': 0.705,
 'Recall_positif': 0.7034375,
 'Precision_negatif': 0.7039001560062402,
 'Precision_positif': 0.7045383411580595,
 'ROC_AUC': 0.777373828125}

## Liste des hyper-paramètres à optimiser 


- Prétraitement : 
    - Stemming ou Lemmatisation
    - Taille du vocabulaire (num_words)
    - Nombre minimum d'occurences (min_count)

- Embedding : 
    - Word2Vec/FastText/Glove (préentrainés)
    - Word2Vec/FastText (customisés)
    - Dimension latente de l'embedding
- Modèle : 
    - Couche SimpleRNN ou LSTM
    - Dimension de la couche d'entrainement
    - Fine-tuning ou non des embeddings ? 

# Experimentations sur les modèles de RNN classiques 

## Préparation

In [12]:

Stemmer_dict = {'WordNetLemmatizer': WordNetLemmatizer().lemmatize, 
                'PorterStemmer': PorterStemmer().stem, 
                'LancasterStemmer':LancasterStemmer().stem, 
                'SnowballStemmer' : SnowballStemmer("english").stem
}

list(Stemmer_dict.keys())

['WordNetLemmatizer', 'PorterStemmer', 'LancasterStemmer', 'SnowballStemmer']

## Core fonction

In [13]:
#Fonction à optimiser pour optuna

def embedding_eva_pre(trial):
    # Hyperparamètres
    ## Prétraitement
    min_count = trial.suggest_int('min_count',1,10)
    num_words = trial.suggest_int('num_words',5000,50000)
    max_len   = trial.suggest_int('max_len',2,30)
    stemmer   = trial.suggest_categorical('stemmer',list(Stemmer_dict.keys()))
    ## Embedding
    latent_dim = 50
    ## Modèle
    rnn_size = 64
    ## Entrainement
    epochs = 50
    lr = 1e-3
    ## Savepath des poids du modèle
    model_savepath = "./Models/baselineRNN_pre.h5"



    with mlflow.start_run(nested=True):
        mlflow.log_input(dataset)

        mlflow.log_params(params={
            'num_words':num_words,               
            'max_len': max_len,
            'min_count': min_count,
            'stemmer': stemmer, 
            'latent_dim': latent_dim, 
            'rnn_size': rnn_size, 
            'epochs': epochs, 
            'learning_rate': lr,
            "data_size": data_size,
            "data_numrows": data_numrows, 
        })

        # Prétraitement
        X_sentence_train, tokenizer, sentences_train = preprocess_data_embedding(X_raw=X_train, 
                                                        stem_lem_func=Stemmer_dict[stemmer],
                                                        tokenizer=None, 
                                                        stop_words=stopwords.words('english'), 
                                                        min_count=min_count,
                                                        max_len = max_len, 
                                                        num_words=num_words, 
                                                        return_sentences=True) 
        X_sentence_val = preprocess_data_embedding(X_raw=X_val, 
                                                        stem_lem_func=PorterStemmer().stem,
                                                        tokenizer=tokenizer, 
                                                        stop_words=stopwords.words('english'), 
                                                        min_count=1, # mincount = 1 car on est sur le jeu de validation
                                                        max_len = max_len, 
                                                        num_words=num_words) 
        # Embedding(custom)
        
        w2v_model = Word2Vec(
            sentences=sentences_train, 
            vector_size=latent_dim,  # dimension de l’espace latent
            window=5,         # taille du contexte
            min_count=min_count,      # ignorer les mots trop rares
            workers=4,        # parallélisme CPU
            sg=0,              # 1 = skip-gram, 0 = CBOW
            epochs=50
            )


        model_vectors = w2v_model.wv
        w2v_words = model_vectors.index_to_key

        embedding_matrix, vocab_size = build_embedding_matrix(tokenizer=tokenizer,
                                          embedding_model=model_vectors, 
                                          latent_dim=latent_dim
                                          )

        
        # Modèle
        model =  build_base_RNN(vocab_size=vocab_size, 
                        latent_dim=latent_dim,
                        input_length=max_len, 
                        embedding_matrix=embedding_matrix,
                        rnn_size = rnn_size)
        ## Callbacks
        checkpoint = ModelCheckpoint(model_savepath, monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=True, mode='min')
        es = EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=10)
        lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, verbose=0, min_lr=1e-5)
        callbacks_list = [checkpoint, es, lr_scheduler]
        ## Compilation
        optimizer = Adam(learning_rate=lr)
        model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])

        #Entrainement
        with tf.device("/GPU:0"):
            history = model.fit(X_sentence_train, y_train, epochs=epochs, batch_size=64, validation_data=(X_sentence_val,y_val), callbacks=callbacks_list, verbose=0)

        model.load_weights(model_savepath)

        # Prédictions sur le jeu de validation
        y_pred_proba = model.predict(X_sentence_val)
        y_pred = (y_pred_proba>0.5)


        output_dict = postprocess_model_output(y_val, y_pred, y_pred_proba) # voir postprocess_data.py

        # Logging des métriques dans MLflow
        mlflow.log_metrics(output_dict)
        # Matrice de confusion
        cm = confusion_matrix(y_val, y_pred, normalize='pred')
        fig, ax = plt.subplots()
        sns.heatmap(cm, annot=True, fmt=".2f", cmap="Blues", ax=ax, )
        plt.xlabel("Predicted")
        plt.ylabel("True")
        plt.title("Confusion Matrix - Validation Set")
        fig.savefig("confusion_matrix.png")
        plt.close(fig)
        mlflow.log_artifact("confusion_matrix.png")
        #
        fig2 = plot_training_history(history,show=False)
        fig2.savefig("learning_path.png")
        plt.close(fig2)
        mlflow.log_artifact("learning_path.png")

        # Enregistrement du modèle dans MLflow
        mlflow.sklearn.log_model(model, "model")
        acc = output_dict["Accuracy"]
    return acc

## Définition de l'experiment MLFlow/Optuna

In [14]:
# Création de l'étude Optuna et optimisation
print("Starting hyperparameter optimization with Optuna...")
print("Setting up MLflow experiment...")
mlflow.set_experiment("optuna_word_embedding_experiment_preprocessin")
exp_id = mlflow.get_experiment_by_name("optuna_word_embedding_experiment_preprocessin").experiment_id

experiment_description = (
    "Cette experience contient les différents tests pour le modèle RNN simple. "
    "Ici on évalue simplement l'impact des différents prétraitements sur un modèle avec simpleRNN"
)

# Provide searchable tags that define characteristics of the Runs that
# will be in this Experiment
experiment_tags = {
    "project_name": "Sentiment analysis modelling",
    "model_type": "simple-RNN-preprocessing",
    "team": "Ph. Constant",
    "project_quarter": "Q3-2025",
    "mlflow.note.content": experiment_description,
}

for key, value in experiment_tags.items():
    client.set_experiment_tag(exp_id, key, value)

Starting hyperparameter optimization with Optuna...
Setting up MLflow experiment...


Traceback (most recent call last):
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 367, in search_experiments
    exp = self._get_experiment(exp_id, view_type)
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 465, in _get_experiment
    meta = FileStore._read_yaml(experiment_dir, FileStore.META_DATA_FILE_NAME)
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 1635, in _read_yaml
    return _read_helper(root, file_name, attempts_remaining=retries)
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 1628, in _read_helper
    result = read_yaml(root, file_name)
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\utils\yaml_utils.py", line 107, in read_yaml
    raise MissingConfigException(f"Yaml file '{file_path}' d

## Lancement de l'optimisation

In [15]:
# Lancement de l'optimisation avec Optuna
print("Starting optimization trials...")
with mlflow.start_run(run_name="optuna_word_embedding_experiment_preprocessin"):
    study = optuna.create_study(direction="maximize")
    study.optimize(embedding_eva_pre, n_trials=50)

    mlflow.log_params(study.best_params)
    mlflow.log_metric("best_accuracy", study.best_value)

print("Optimization completed.")

[I 2025-10-06 13:55:58,752] Trial 27 finished with value: 0.709375 and parameters: {'min_count': 4, 'num_words': 36201, 'max_len': 20, 'stemmer': 'PorterStemmer'}. Best is trial 15 with value: 0.7209375.


Embedding matrix shape: (3622, 50)
Words found in pretrained embeddings: 3620/3622 (99.94%)




INFO:tensorflow:Assets written to: ram://881c32db-fdf1-4e2a-87cb-24dd87795d63/assets


INFO:tensorflow:Assets written to: ram://881c32db-fdf1-4e2a-87cb-24dd87795d63/assets
[I 2025-10-06 13:58:43,259] Trial 28 finished with value: 0.71171875 and parameters: {'min_count': 5, 'num_words': 24064, 'max_len': 26, 'stemmer': 'PorterStemmer'}. Best is trial 15 with value: 0.7209375.


Embedding matrix shape: (5493, 50)
Words found in pretrained embeddings: 5491/5493 (99.96%)




INFO:tensorflow:Assets written to: ram://e7fd3748-b2c7-4407-a356-0fcaf406b1c5/assets


INFO:tensorflow:Assets written to: ram://e7fd3748-b2c7-4407-a356-0fcaf406b1c5/assets
[I 2025-10-06 14:01:20,342] Trial 29 finished with value: 0.7221875 and parameters: {'min_count': 3, 'num_words': 18037, 'max_len': 17, 'stemmer': 'PorterStemmer'}. Best is trial 29 with value: 0.7221875.


Embedding matrix shape: (3144, 50)
Words found in pretrained embeddings: 3142/3144 (99.94%)




INFO:tensorflow:Assets written to: ram://212140ed-88a9-428e-83f8-d4dc6e775936/assets


INFO:tensorflow:Assets written to: ram://212140ed-88a9-428e-83f8-d4dc6e775936/assets
[I 2025-10-06 14:03:58,701] Trial 30 finished with value: 0.70984375 and parameters: {'min_count': 6, 'num_words': 10649, 'max_len': 16, 'stemmer': 'PorterStemmer'}. Best is trial 29 with value: 0.7221875.


Embedding matrix shape: (5493, 50)
Words found in pretrained embeddings: 5491/5493 (99.96%)




INFO:tensorflow:Assets written to: ram://5a953a11-b0a7-4878-9f89-fa536c59aece/assets


INFO:tensorflow:Assets written to: ram://5a953a11-b0a7-4878-9f89-fa536c59aece/assets
[I 2025-10-06 14:06:43,178] Trial 31 finished with value: 0.711875 and parameters: {'min_count': 3, 'num_words': 18164, 'max_len': 19, 'stemmer': 'PorterStemmer'}. Best is trial 29 with value: 0.7221875.


Embedding matrix shape: (7727, 50)
Words found in pretrained embeddings: 7725/7727 (99.97%)




INFO:tensorflow:Assets written to: ram://882fe8da-6e30-4b53-bccb-d745ae60b6ea/assets


INFO:tensorflow:Assets written to: ram://882fe8da-6e30-4b53-bccb-d745ae60b6ea/assets
[I 2025-10-06 14:09:36,677] Trial 32 finished with value: 0.71703125 and parameters: {'min_count': 2, 'num_words': 23764, 'max_len': 18, 'stemmer': 'PorterStemmer'}. Best is trial 29 with value: 0.7221875.


Embedding matrix shape: (4358, 50)
Words found in pretrained embeddings: 4356/4358 (99.95%)




INFO:tensorflow:Assets written to: ram://b0a5e8bf-5599-417e-b8fc-fb7c21e19314/assets


INFO:tensorflow:Assets written to: ram://b0a5e8bf-5599-417e-b8fc-fb7c21e19314/assets
[I 2025-10-06 14:11:57,584] Trial 33 finished with value: 0.7153125 and parameters: {'min_count': 4, 'num_words': 14106, 'max_len': 14, 'stemmer': 'PorterStemmer'}. Best is trial 29 with value: 0.7221875.


Embedding matrix shape: (5493, 50)
Words found in pretrained embeddings: 5491/5493 (99.96%)




INFO:tensorflow:Assets written to: ram://cbc9603f-d7d3-44e3-b530-9277007596e8/assets


INFO:tensorflow:Assets written to: ram://cbc9603f-d7d3-44e3-b530-9277007596e8/assets
[I 2025-10-06 14:15:19,319] Trial 34 finished with value: 0.70921875 and parameters: {'min_count': 3, 'num_words': 28171, 'max_len': 27, 'stemmer': 'PorterStemmer'}. Best is trial 29 with value: 0.7221875.


Embedding matrix shape: (8730, 50)
Words found in pretrained embeddings: 8728/8730 (99.98%)




INFO:tensorflow:Assets written to: ram://f8c16d72-9d5a-4594-acf1-f8f957ec4fc4/assets


INFO:tensorflow:Assets written to: ram://f8c16d72-9d5a-4594-acf1-f8f957ec4fc4/assets
[I 2025-10-06 14:16:52,525] Trial 35 finished with value: 0.68296875 and parameters: {'min_count': 2, 'num_words': 24679, 'max_len': 10, 'stemmer': 'WordNetLemmatizer'}. Best is trial 29 with value: 0.7221875.


Embedding matrix shape: (4358, 50)
Words found in pretrained embeddings: 4356/4358 (99.95%)




INFO:tensorflow:Assets written to: ram://a62e86cc-d9eb-4017-a775-06d2bd22effa/assets


INFO:tensorflow:Assets written to: ram://a62e86cc-d9eb-4017-a775-06d2bd22effa/assets
[I 2025-10-06 14:19:58,312] Trial 36 finished with value: 0.7134375 and parameters: {'min_count': 4, 'num_words': 19325, 'max_len': 21, 'stemmer': 'PorterStemmer'}. Best is trial 29 with value: 0.7221875.


Embedding matrix shape: (5493, 50)
Words found in pretrained embeddings: 5491/5493 (99.96%)




INFO:tensorflow:Assets written to: ram://4283e85c-0e42-4174-b3e0-fc891ee04837/assets


INFO:tensorflow:Assets written to: ram://4283e85c-0e42-4174-b3e0-fc891ee04837/assets
[I 2025-10-06 14:22:51,095] Trial 37 finished with value: 0.71125 and parameters: {'min_count': 3, 'num_words': 15872, 'max_len': 23, 'stemmer': 'PorterStemmer'}. Best is trial 29 with value: 0.7221875.


Embedding matrix shape: (4031, 50)
Words found in pretrained embeddings: 4029/4031 (99.95%)




INFO:tensorflow:Assets written to: ram://ace99a52-7b7a-4bc3-81d5-870f558b991a/assets


INFO:tensorflow:Assets written to: ram://ace99a52-7b7a-4bc3-81d5-870f558b991a/assets
[I 2025-10-06 14:25:01,076] Trial 38 finished with value: 0.68375 and parameters: {'min_count': 4, 'num_words': 22354, 'max_len': 15, 'stemmer': 'LancasterStemmer'}. Best is trial 29 with value: 0.7221875.


Embedding matrix shape: (3902, 50)
Words found in pretrained embeddings: 3900/3902 (99.95%)




INFO:tensorflow:Assets written to: ram://dd7f119f-ca03-4a51-b1c2-7bec45f80a3f/assets


INFO:tensorflow:Assets written to: ram://dd7f119f-ca03-4a51-b1c2-7bec45f80a3f/assets
[I 2025-10-06 14:26:51,832] Trial 39 finished with value: 0.6765625 and parameters: {'min_count': 5, 'num_words': 26181, 'max_len': 11, 'stemmer': 'WordNetLemmatizer'}. Best is trial 29 with value: 0.7221875.


Embedding matrix shape: (2151, 50)
Words found in pretrained embeddings: 2149/2151 (99.91%)




INFO:tensorflow:Assets written to: ram://630f4dc7-eed9-4df2-9624-0d36d782d8f9/assets


INFO:tensorflow:Assets written to: ram://630f4dc7-eed9-4df2-9624-0d36d782d8f9/assets
[I 2025-10-06 14:30:16,644] Trial 40 finished with value: 0.7175 and parameters: {'min_count': 10, 'num_words': 49847, 'max_len': 30, 'stemmer': 'PorterStemmer'}. Best is trial 29 with value: 0.7221875.


Embedding matrix shape: (7727, 50)
Words found in pretrained embeddings: 7725/7727 (99.97%)




INFO:tensorflow:Assets written to: ram://36eb9868-ba3b-4465-849f-7504b6370a2f/assets


INFO:tensorflow:Assets written to: ram://36eb9868-ba3b-4465-849f-7504b6370a2f/assets
[I 2025-10-06 14:33:17,841] Trial 41 finished with value: 0.71203125 and parameters: {'min_count': 2, 'num_words': 26232, 'max_len': 22, 'stemmer': 'PorterStemmer'}. Best is trial 29 with value: 0.7221875.


Embedding matrix shape: (7727, 50)
Words found in pretrained embeddings: 7725/7727 (99.97%)




INFO:tensorflow:Assets written to: ram://baa65a37-efce-4d3d-b6d3-8a46ea44d01b/assets


INFO:tensorflow:Assets written to: ram://baa65a37-efce-4d3d-b6d3-8a46ea44d01b/assets
[I 2025-10-06 14:34:46,256] Trial 42 finished with value: 0.70265625 and parameters: {'min_count': 2, 'num_words': 29910, 'max_len': 8, 'stemmer': 'PorterStemmer'}. Best is trial 29 with value: 0.7221875.


Embedding matrix shape: (18184, 50)
Words found in pretrained embeddings: 18182/18184 (99.99%)




INFO:tensorflow:Assets written to: ram://074a1546-a4b5-43df-82e1-0c6f47adbdac/assets


INFO:tensorflow:Assets written to: ram://074a1546-a4b5-43df-82e1-0c6f47adbdac/assets
[I 2025-10-06 14:37:42,289] Trial 43 finished with value: 0.708125 and parameters: {'min_count': 1, 'num_words': 17395, 'max_len': 21, 'stemmer': 'PorterStemmer'}. Best is trial 29 with value: 0.7221875.


Embedding matrix shape: (4959, 50)
Words found in pretrained embeddings: 4957/4959 (99.96%)




INFO:tensorflow:Assets written to: ram://f0cd49dc-1862-49d0-b4df-affa2e1cbf4b/assets


INFO:tensorflow:Assets written to: ram://f0cd49dc-1862-49d0-b4df-affa2e1cbf4b/assets
[I 2025-10-06 14:40:04,476] Trial 44 finished with value: 0.69078125 and parameters: {'min_count': 3, 'num_words': 35471, 'max_len': 18, 'stemmer': 'LancasterStemmer'}. Best is trial 29 with value: 0.7221875.


Embedding matrix shape: (18184, 50)
Words found in pretrained embeddings: 18182/18184 (99.99%)




INFO:tensorflow:Assets written to: ram://22170f28-5024-41a2-9dee-55bab6a16193/assets


INFO:tensorflow:Assets written to: ram://22170f28-5024-41a2-9dee-55bab6a16193/assets
[I 2025-10-06 14:43:04,125] Trial 45 finished with value: 0.72671875 and parameters: {'min_count': 1, 'num_words': 27395, 'max_len': 24, 'stemmer': 'PorterStemmer'}. Best is trial 45 with value: 0.72671875.


Embedding matrix shape: (21117, 50)
Words found in pretrained embeddings: 21115/21117 (99.99%)




INFO:tensorflow:Assets written to: ram://990c7e45-4d02-41aa-ad83-af11bd7a8213/assets


INFO:tensorflow:Assets written to: ram://990c7e45-4d02-41aa-ad83-af11bd7a8213/assets
[I 2025-10-06 14:46:00,845] Trial 46 finished with value: 0.6853125 and parameters: {'min_count': 1, 'num_words': 22585, 'max_len': 24, 'stemmer': 'WordNetLemmatizer'}. Best is trial 45 with value: 0.72671875.


Embedding matrix shape: (18184, 50)
Words found in pretrained embeddings: 18182/18184 (99.99%)




INFO:tensorflow:Assets written to: ram://021f5bec-54eb-4068-a6b8-f671335eae04/assets


INFO:tensorflow:Assets written to: ram://021f5bec-54eb-4068-a6b8-f671335eae04/assets
[I 2025-10-06 14:49:06,432] Trial 47 finished with value: 0.71828125 and parameters: {'min_count': 1, 'num_words': 25376, 'max_len': 26, 'stemmer': 'PorterStemmer'}. Best is trial 45 with value: 0.72671875.


Embedding matrix shape: (2817, 50)
Words found in pretrained embeddings: 2815/2817 (99.93%)




INFO:tensorflow:Assets written to: ram://3eb3c6f5-b31a-49b0-a082-b18efbce855b/assets


INFO:tensorflow:Assets written to: ram://3eb3c6f5-b31a-49b0-a082-b18efbce855b/assets
[I 2025-10-06 14:52:50,153] Trial 48 finished with value: 0.718125 and parameters: {'min_count': 7, 'num_words': 9070, 'max_len': 29, 'stemmer': 'PorterStemmer'}. Best is trial 45 with value: 0.72671875.


Embedding matrix shape: (15701, 50)
Words found in pretrained embeddings: 15699/15701 (99.99%)




INFO:tensorflow:Assets written to: ram://91ba722e-3619-470f-a02d-9334bb029090/assets


INFO:tensorflow:Assets written to: ram://91ba722e-3619-470f-a02d-9334bb029090/assets
[I 2025-10-06 14:56:15,056] Trial 49 finished with value: 0.68671875 and parameters: {'min_count': 1, 'num_words': 20160, 'max_len': 27, 'stemmer': 'LancasterStemmer'}. Best is trial 45 with value: 0.72671875.


Optimization completed.


## Extraction meilleur modèle 

In [16]:
client = MlflowClient(tracking_uri="http://localhost:8080")
experiment_id = mlflow.get_experiment_by_name("optuna_word_embedding_experiment_preprocessin").experiment_id
runs = client.search_runs(experiment_id)

# Métrique pour sélectionner le meilleur modèle
metric_to_optimize = "Accuracy" # liste des métriques enregistrées dans postprocess_data.py ou sur l'UI MLflow
best_run = max(runs, key=lambda run: run.data.metrics.get(metric_to_optimize, float('-inf')))
print(f"Best run ID: {best_run.info.run_id} with metrics:")
for key, value in best_run.data.metrics.items():
    print(f"{key}: {value}")
print(f"Best run parameters:")
for key, value in best_run.data.params.items():
    print(f"{key}: {value}")

# Enregistrement du meilleur modèle
best_model_uri = f"runs:/{best_run.info.run_id}/model"
registered_model_name = "simple_rnn_best_pre"
registered_model = mlflow.register_model(best_model_uri, registered_model_name)
# Enregistrement des paramètres sous forme de tags dans le modèle enregistré
for key, value in best_run.data.params.items():
    print(f"Setting tag {key} = {value} in registered model")
    client.set_model_version_tag(
        name=registered_model_name,
        version=str(registered_model.version),
        key=str(key),
        value=str(value))

Traceback (most recent call last):
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 367, in search_experiments
    exp = self._get_experiment(exp_id, view_type)
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 465, in _get_experiment
    meta = FileStore._read_yaml(experiment_dir, FileStore.META_DATA_FILE_NAME)
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 1635, in _read_yaml
    return _read_helper(root, file_name, attempts_remaining=retries)
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 1628, in _read_helper
    result = read_yaml(root, file_name)
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\utils\yaml_utils.py", line 107, in read_yaml
    raise MissingConfigException(f"Yaml file '{file_path}' d

Best run ID: ec4aa4d9ebd448c192219024711a0f0e with metrics:
Accuracy: 0.72671875
F1_negatif: 0.7226011102299762
F1_positif: 0.730715935334873
Precision_negatif: 0.7336553945249598
Precision_positif: 0.7201820940819423
Recall_negatif: 0.711875
Recall_positif: 0.7415625
ROC_AUC: 0.8004910644531249
Best run parameters:
data_numrows: 32000
data_size: 0.02
epochs: 50
latent_dim: 50
learning_rate: 0.001
max_len: 24
min_count: 1
num_words: 27395
rnn_size: 64
stemmer: PorterStemmer


Registered model 'simple_rnn_best_pre' already exists. Creating a new version of this model...
Created version '3' of model 'simple_rnn_best_pre'.


Setting tag data_numrows = 32000 in registered model
Setting tag data_size = 0.02 in registered model
Setting tag epochs = 50 in registered model
Setting tag latent_dim = 50 in registered model
Setting tag learning_rate = 0.001 in registered model
Setting tag max_len = 24 in registered model
Setting tag min_count = 1 in registered model
Setting tag num_words = 27395 in registered model
Setting tag rnn_size = 64 in registered model
Setting tag stemmer = PorterStemmer in registered model


# Experimentation sur les embeddings (custom)


## Préparation

In [28]:
from gensim.models import FastText, Word2Vec

embedding_dict = {'Word2Vec':Word2Vec, 
                  'FastText':FastText}

## Core fonction

In [29]:
#Fonction à optimiser pour optuna

## Prétraitement
min_count = 2
num_words = 30000
max_len   = 30
stemmer   = 'PorterStemmer'
# Prétraitement
X_sentence_train, tokenizer, sentences_train = preprocess_data_embedding(X_raw=X_train, 
                                                        stem_lem_func=PorterStemmer().stem,
                                                        tokenizer=None, 
                                                        stop_words=stopwords.words('english'), 
                                                        min_count=min_count,
                                                        max_len = max_len, 
                                                        num_words=num_words, 
                                                        return_sentences=True) 
X_sentence_val = preprocess_data_embedding(X_raw=X_val, 
                                                        stem_lem_func=PorterStemmer().stem,
                                                        tokenizer=tokenizer, 
                                                        stop_words=stopwords.words('english'), 
                                                        min_count=1, # mincount = 1 car on est sur le jeu de validation
                                                        max_len = max_len, 
                                                        num_words=num_words) 

def embedding_eval_custom_embed(trial):
    # Hyperparamètres

    ## Embedding
    embedding_model = trial.suggest_categorical('embedding_model',list(embedding_dict.keys()))
    latent_dim = trial.suggest_int('latent_dim', 30, 150)
    window = trial.suggest_int("window", 2, 10)
    sg = trial.suggest_int('sg',0,1)

    ## Modèle
    rnn_size = 64
    ## Entrainement
    epochs = 50
    lr = 1e-3
    ## Savepath des poids du modèle
    model_savepath = "./Models/baselineRNN_pre.h5"



    with mlflow.start_run(nested=True):
        mlflow.log_input(dataset)
        mlflow.log_params(params={
            'num_words':num_words,               
            'max_len': max_len,
            'min_count': min_count,
            'stemmer': stemmer, 
            'latent_dim': latent_dim, 
            'rnn_size': rnn_size, 
            'embedding_model':embedding_model,
            'sg':sg,
            'window':window,
            'epochs': epochs, 
            'learning_rate': lr, 
            "data_size": data_size,
            "data_numrows": data_numrows,
        })


        # Embedding(custom)
        if embedding_model=='Word2Vec':
            embedding_model = Word2Vec(
                sentences=sentences_train, 
                vector_size=latent_dim,  # dimension de l’espace latent
                window=5,         # taille du contexte
                min_count=min_count,      # ignorer les mots trop rares
            workers=4,        # parallélisme CPU
            sg=sg,              # 1 = skip-gram, 0 = CBOW
            epochs=30
            )
        elif embedding_model=='FastText':
            embedding_model = FastText(
                sentences=sentences_train, 
                vector_size=latent_dim, 
                window=5, 
                min_count=min_count,
                workers=4,
                sg=sg,
                epochs=30
                )


        model_vectors = embedding_model.wv
        w2v_words = model_vectors.index_to_key

        embedding_matrix, vocab_size = build_embedding_matrix(tokenizer=tokenizer,
                                          embedding_model=model_vectors, 
                                          latent_dim=latent_dim
                                          )

        
        # Modèle
        model =  build_base_RNN(vocab_size=vocab_size, 
                        latent_dim=latent_dim,
                        input_length=max_len, 
                        embedding_matrix=embedding_matrix,
                        rnn_size = rnn_size)
        ## Callbacks
        checkpoint = ModelCheckpoint(model_savepath, monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=True, mode='min')
        es = EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=10)
        lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, verbose=0, min_lr=1e-5)
        callbacks_list = [checkpoint, es, lr_scheduler]
        ## Compilation
        optimizer = Adam(learning_rate=lr)
        model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])

        #Entrainement
        with tf.device("/GPU:0"):
            history = model.fit(X_sentence_train, y_train, epochs=epochs, batch_size=64, validation_data=(X_sentence_val,y_val), callbacks=callbacks_list, verbose=0)

        model.load_weights(model_savepath)

        # Prédictions sur le jeu de validation
        y_pred_proba = model.predict(X_sentence_val)
        y_pred = (y_pred_proba>0.5)


        output_dict = postprocess_model_output(y_val, y_pred, y_pred_proba) # voir postprocess_data.py

        # Logging des métriques dans MLflow
        mlflow.log_metrics(output_dict)
        # Matrice de confusion
        cm = confusion_matrix(y_val, y_pred, normalize='pred')
        fig, ax = plt.subplots()
        sns.heatmap(cm, annot=True, fmt=".2f", cmap="Blues", ax=ax, )
        plt.xlabel("Predicted")
        plt.ylabel("True")
        plt.title("Confusion Matrix - Validation Set")
        fig.savefig("confusion_matrix.png")
        plt.close(fig)
        mlflow.log_artifact("confusion_matrix.png")
        #
        fig2 = plot_training_history(history,show=False)
        fig2.savefig("learning_path.png")
        plt.close(fig2)
        mlflow.log_artifact("learning_path.png")

        # Enregistrement du modèle dans MLflow
        mlflow.tensorflow.log_model(model, "model")
        acc = output_dict["Accuracy"]
    return acc

## Definition de l'experiment MLFlow/Optuna



In [30]:
# Création de l'étude Optuna et optimisation
print("Starting hyperparameter optimization with Optuna...")
print("Setting up MLflow experiment...")
mlflow.set_experiment("optuna_word_embedding_experiment_custom_embedding")
exp_id = mlflow.get_experiment_by_name("optuna_word_embedding_experiment_custom_embedding").experiment_id

experiment_description = (
    "Cette experience contient les différents tests pour le modèle RNN simple. "
    "Ici on évalue l'impact du type d'embedding custom et de la dimension de l'espace latent sur un modèle avec simpleRNN"
)

# Provide searchable tags that define characteristics of the Runs that
# will be in this Experiment
experiment_tags = {
    "project_name": "Sentiment analysis modelling",
    "model_type": "simple-RNN-preprocessing",
    "team": "Ph. Constant",
    "project_quarter": "Q3-2025",
    "mlflow.note.content": experiment_description,
}

for key, value in experiment_tags.items():
    client.set_experiment_tag(exp_id, key, value)

Starting hyperparameter optimization with Optuna...
Setting up MLflow experiment...


Traceback (most recent call last):
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 367, in search_experiments
    exp = self._get_experiment(exp_id, view_type)
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 465, in _get_experiment
    meta = FileStore._read_yaml(experiment_dir, FileStore.META_DATA_FILE_NAME)
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 1635, in _read_yaml
    return _read_helper(root, file_name, attempts_remaining=retries)
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 1628, in _read_helper
    result = read_yaml(root, file_name)
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\utils\yaml_utils.py", line 107, in read_yaml
    raise MissingConfigException(f"Yaml file '{file_path}' d

## Lancement de l'optimisation

In [31]:
# Lancement de l'optimisation avec Optuna
print("Starting optimization trials...")
with mlflow.start_run(run_name="optuna_word_embedding_experiment_custom_embedding"):
    study = optuna.create_study(direction="maximize")
    study.optimize(embedding_eval_custom_embed, n_trials=30)
    mlflow.log_params(study.best_params)
    mlflow.log_metric("best_accuracy", study.best_value)

print("Optimization completed.")

[I 2025-10-06 17:41:40,435] A new study created in memory with name: no-name-dd804780-6d92-4aab-bb12-a75613882c2e


Starting optimization trials...
Embedding matrix shape: (7727, 84)
Words found in pretrained embeddings: 7726/7727 (99.99%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpwxmun93g\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpwxmun93g\model\data\model\assets
[I 2025-10-06 17:44:57,006] Trial 0 finished with value: 0.72015625 and parameters: {'embedding_model': 'FastText', 'latent_dim': 84, 'window': 10, 'sg': 1}. Best is trial 0 with value: 0.72015625.


Embedding matrix shape: (7727, 52)
Words found in pretrained embeddings: 7725/7727 (99.97%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmp3gq5xsa0\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmp3gq5xsa0\model\data\model\assets
[I 2025-10-06 17:49:03,052] Trial 1 finished with value: 0.7125 and parameters: {'embedding_model': 'Word2Vec', 'latent_dim': 52, 'window': 5, 'sg': 0}. Best is trial 0 with value: 0.72015625.


Embedding matrix shape: (7727, 111)
Words found in pretrained embeddings: 7726/7727 (99.99%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpyru126yg\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpyru126yg\model\data\model\assets
[I 2025-10-06 17:52:54,648] Trial 2 finished with value: 0.72046875 and parameters: {'embedding_model': 'FastText', 'latent_dim': 111, 'window': 2, 'sg': 1}. Best is trial 2 with value: 0.72046875.


Embedding matrix shape: (7727, 71)
Words found in pretrained embeddings: 7725/7727 (99.97%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmp2o0edzhp\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmp2o0edzhp\model\data\model\assets
[I 2025-10-06 17:56:33,273] Trial 3 finished with value: 0.71984375 and parameters: {'embedding_model': 'Word2Vec', 'latent_dim': 71, 'window': 7, 'sg': 1}. Best is trial 2 with value: 0.72046875.


Embedding matrix shape: (7727, 44)
Words found in pretrained embeddings: 7725/7727 (99.97%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmp83c39j8n\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmp83c39j8n\model\data\model\assets
[I 2025-10-06 17:59:36,349] Trial 4 finished with value: 0.7090625 and parameters: {'embedding_model': 'Word2Vec', 'latent_dim': 44, 'window': 9, 'sg': 1}. Best is trial 2 with value: 0.72046875.


Embedding matrix shape: (7727, 110)
Words found in pretrained embeddings: 7726/7727 (99.99%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpj56b5ulp\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpj56b5ulp\model\data\model\assets
[I 2025-10-06 18:02:56,803] Trial 5 finished with value: 0.7134375 and parameters: {'embedding_model': 'FastText', 'latent_dim': 110, 'window': 6, 'sg': 0}. Best is trial 2 with value: 0.72046875.


Embedding matrix shape: (7727, 94)
Words found in pretrained embeddings: 7725/7727 (99.97%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpnyk6utt0\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpnyk6utt0\model\data\model\assets
[I 2025-10-06 18:06:06,165] Trial 6 finished with value: 0.72640625 and parameters: {'embedding_model': 'Word2Vec', 'latent_dim': 94, 'window': 9, 'sg': 1}. Best is trial 6 with value: 0.72640625.


Embedding matrix shape: (7727, 103)
Words found in pretrained embeddings: 7725/7727 (99.97%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpqhpt4enf\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpqhpt4enf\model\data\model\assets
[I 2025-10-06 18:09:22,853] Trial 7 finished with value: 0.7278125 and parameters: {'embedding_model': 'Word2Vec', 'latent_dim': 103, 'window': 2, 'sg': 1}. Best is trial 7 with value: 0.7278125.


Embedding matrix shape: (7727, 49)
Words found in pretrained embeddings: 7726/7727 (99.99%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpx85c1xz_\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpx85c1xz_\model\data\model\assets
[I 2025-10-06 18:12:42,786] Trial 8 finished with value: 0.710625 and parameters: {'embedding_model': 'FastText', 'latent_dim': 49, 'window': 2, 'sg': 0}. Best is trial 7 with value: 0.7278125.


Embedding matrix shape: (7727, 56)
Words found in pretrained embeddings: 7725/7727 (99.97%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpxg5nci4f\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpxg5nci4f\model\data\model\assets
[I 2025-10-06 18:16:05,463] Trial 9 finished with value: 0.71296875 and parameters: {'embedding_model': 'Word2Vec', 'latent_dim': 56, 'window': 6, 'sg': 0}. Best is trial 7 with value: 0.7278125.


Embedding matrix shape: (7727, 146)
Words found in pretrained embeddings: 7725/7727 (99.97%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpnyzzzfen\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpnyzzzfen\model\data\model\assets
[I 2025-10-06 18:18:37,306] Trial 10 finished with value: 0.72234375 and parameters: {'embedding_model': 'Word2Vec', 'latent_dim': 146, 'window': 4, 'sg': 1}. Best is trial 7 with value: 0.7278125.


Embedding matrix shape: (7727, 109)
Words found in pretrained embeddings: 7725/7727 (99.97%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpje15zznh\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpje15zznh\model\data\model\assets
[I 2025-10-06 18:21:23,701] Trial 11 finished with value: 0.72140625 and parameters: {'embedding_model': 'Word2Vec', 'latent_dim': 109, 'window': 8, 'sg': 1}. Best is trial 7 with value: 0.7278125.


Embedding matrix shape: (7727, 134)
Words found in pretrained embeddings: 7725/7727 (99.97%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpq42usbvs\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpq42usbvs\model\data\model\assets
[I 2025-10-06 18:24:14,073] Trial 12 finished with value: 0.72234375 and parameters: {'embedding_model': 'Word2Vec', 'latent_dim': 134, 'window': 4, 'sg': 1}. Best is trial 7 with value: 0.7278125.


Embedding matrix shape: (7727, 93)
Words found in pretrained embeddings: 7725/7727 (99.97%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpftocrkh8\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpftocrkh8\model\data\model\assets
[I 2025-10-06 18:27:11,524] Trial 13 finished with value: 0.7225 and parameters: {'embedding_model': 'Word2Vec', 'latent_dim': 93, 'window': 10, 'sg': 1}. Best is trial 7 with value: 0.7278125.


Embedding matrix shape: (7727, 92)
Words found in pretrained embeddings: 7725/7727 (99.97%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmphs4t3uc7\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmphs4t3uc7\model\data\model\assets
[I 2025-10-06 18:30:28,821] Trial 14 finished with value: 0.7125 and parameters: {'embedding_model': 'Word2Vec', 'latent_dim': 92, 'window': 8, 'sg': 1}. Best is trial 7 with value: 0.7278125.


Embedding matrix shape: (7727, 125)
Words found in pretrained embeddings: 7725/7727 (99.97%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmps0yaiji2\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmps0yaiji2\model\data\model\assets
[I 2025-10-06 18:33:29,830] Trial 15 finished with value: 0.71953125 and parameters: {'embedding_model': 'Word2Vec', 'latent_dim': 125, 'window': 3, 'sg': 1}. Best is trial 7 with value: 0.7278125.


Embedding matrix shape: (7727, 31)
Words found in pretrained embeddings: 7725/7727 (99.97%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpkqdzdolo\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpkqdzdolo\model\data\model\assets
[I 2025-10-06 18:36:38,857] Trial 16 finished with value: 0.7140625 and parameters: {'embedding_model': 'Word2Vec', 'latent_dim': 31, 'window': 8, 'sg': 1}. Best is trial 7 with value: 0.7278125.


Embedding matrix shape: (7727, 75)
Words found in pretrained embeddings: 7725/7727 (99.97%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpjcro7ccm\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpjcro7ccm\model\data\model\assets
[I 2025-10-06 18:39:47,801] Trial 17 finished with value: 0.7215625 and parameters: {'embedding_model': 'Word2Vec', 'latent_dim': 75, 'window': 5, 'sg': 0}. Best is trial 7 with value: 0.7278125.


Embedding matrix shape: (7727, 99)
Words found in pretrained embeddings: 7726/7727 (99.99%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmp9_3qubdz\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmp9_3qubdz\model\data\model\assets
[I 2025-10-06 18:42:32,203] Trial 18 finished with value: 0.71546875 and parameters: {'embedding_model': 'FastText', 'latent_dim': 99, 'window': 7, 'sg': 1}. Best is trial 7 with value: 0.7278125.


Embedding matrix shape: (7727, 129)
Words found in pretrained embeddings: 7725/7727 (99.97%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpgbs5a9fy\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpgbs5a9fy\model\data\model\assets
[I 2025-10-06 18:45:13,294] Trial 19 finished with value: 0.72625 and parameters: {'embedding_model': 'Word2Vec', 'latent_dim': 129, 'window': 3, 'sg': 1}. Best is trial 7 with value: 0.7278125.


Embedding matrix shape: (7727, 68)
Words found in pretrained embeddings: 7725/7727 (99.97%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmp4lke1yah\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmp4lke1yah\model\data\model\assets
[I 2025-10-06 18:47:41,867] Trial 20 finished with value: 0.72015625 and parameters: {'embedding_model': 'Word2Vec', 'latent_dim': 68, 'window': 9, 'sg': 0}. Best is trial 7 with value: 0.7278125.


Embedding matrix shape: (7727, 129)
Words found in pretrained embeddings: 7725/7727 (99.97%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmplnmq8wxa\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmplnmq8wxa\model\data\model\assets
[I 2025-10-06 18:50:12,804] Trial 21 finished with value: 0.72671875 and parameters: {'embedding_model': 'Word2Vec', 'latent_dim': 129, 'window': 3, 'sg': 1}. Best is trial 7 with value: 0.7278125.


Embedding matrix shape: (7727, 122)
Words found in pretrained embeddings: 7725/7727 (99.97%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpshzzx5dr\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpshzzx5dr\model\data\model\assets
[I 2025-10-06 18:52:40,006] Trial 22 finished with value: 0.725 and parameters: {'embedding_model': 'Word2Vec', 'latent_dim': 122, 'window': 3, 'sg': 1}. Best is trial 7 with value: 0.7278125.


Embedding matrix shape: (7727, 147)
Words found in pretrained embeddings: 7725/7727 (99.97%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmp0xqg6vf7\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmp0xqg6vf7\model\data\model\assets
[I 2025-10-06 18:55:11,201] Trial 23 finished with value: 0.7215625 and parameters: {'embedding_model': 'Word2Vec', 'latent_dim': 147, 'window': 2, 'sg': 1}. Best is trial 7 with value: 0.7278125.


Embedding matrix shape: (7727, 102)
Words found in pretrained embeddings: 7725/7727 (99.97%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpgfypws1z\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpgfypws1z\model\data\model\assets
[I 2025-10-06 18:58:03,395] Trial 24 finished with value: 0.724375 and parameters: {'embedding_model': 'Word2Vec', 'latent_dim': 102, 'window': 4, 'sg': 1}. Best is trial 7 with value: 0.7278125.


Embedding matrix shape: (7727, 117)
Words found in pretrained embeddings: 7725/7727 (99.97%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpdc1ar50f\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpdc1ar50f\model\data\model\assets
[I 2025-10-06 19:00:39,752] Trial 25 finished with value: 0.72265625 and parameters: {'embedding_model': 'Word2Vec', 'latent_dim': 117, 'window': 5, 'sg': 1}. Best is trial 7 with value: 0.7278125.


Embedding matrix shape: (7727, 137)
Words found in pretrained embeddings: 7726/7727 (99.99%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpgofzba4a\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpgofzba4a\model\data\model\assets
[I 2025-10-06 19:03:24,520] Trial 26 finished with value: 0.72421875 and parameters: {'embedding_model': 'FastText', 'latent_dim': 137, 'window': 3, 'sg': 1}. Best is trial 7 with value: 0.7278125.


Embedding matrix shape: (7727, 82)
Words found in pretrained embeddings: 7725/7727 (99.97%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpzbyy2kj9\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpzbyy2kj9\model\data\model\assets
[I 2025-10-06 19:06:01,129] Trial 27 finished with value: 0.71375 and parameters: {'embedding_model': 'Word2Vec', 'latent_dim': 82, 'window': 2, 'sg': 1}. Best is trial 7 with value: 0.7278125.


Embedding matrix shape: (7727, 104)
Words found in pretrained embeddings: 7725/7727 (99.97%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpeib1buk1\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpeib1buk1\model\data\model\assets
[I 2025-10-06 19:08:42,174] Trial 28 finished with value: 0.7203125 and parameters: {'embedding_model': 'Word2Vec', 'latent_dim': 104, 'window': 7, 'sg': 1}. Best is trial 7 with value: 0.7278125.


Embedding matrix shape: (7727, 85)
Words found in pretrained embeddings: 7726/7727 (99.99%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpo34oz9_u\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpo34oz9_u\model\data\model\assets
[I 2025-10-06 19:11:37,400] Trial 29 finished with value: 0.7196875 and parameters: {'embedding_model': 'FastText', 'latent_dim': 85, 'window': 10, 'sg': 1}. Best is trial 7 with value: 0.7278125.


Optimization completed.


In [32]:
client = MlflowClient(tracking_uri="http://localhost:8080")
experiment_id = mlflow.get_experiment_by_name("optuna_word_embedding_experiment_custom_embedding").experiment_id
runs = client.search_runs(experiment_id)

# Métrique pour sélectionner le meilleur modèle
metric_to_optimize = "Accuracy" # liste des métriques enregistrées dans postprocess_data.py ou sur l'UI MLflow
best_run = max(runs, key=lambda run: run.data.metrics.get(metric_to_optimize, float('-inf')))
print(f"Best run ID: {best_run.info.run_id} with metrics:")
for key, value in best_run.data.metrics.items():
    print(f"{key}: {value}")
print(f"Best run parameters:")
for key, value in best_run.data.params.items():
    print(f"{key}: {value}")

# Enregistrement du meilleur modèle
best_model_uri = f"runs:/{best_run.info.run_id}/model"
registered_model_name = "simple_rnn_best_custom_embed"
registered_model = mlflow.register_model(best_model_uri, registered_model_name)
# Enregistrement des paramètres sous forme de tags dans le modèle enregistré
for key, value in best_run.data.params.items():
    print(f"Setting tag {key} = {value} in registered model")
    client.set_model_version_tag(
        name=registered_model_name,
        version=str(registered_model.version),
        key=str(key),
        value=str(value))

Traceback (most recent call last):
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 367, in search_experiments
    exp = self._get_experiment(exp_id, view_type)
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 465, in _get_experiment
    meta = FileStore._read_yaml(experiment_dir, FileStore.META_DATA_FILE_NAME)
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 1635, in _read_yaml
    return _read_helper(root, file_name, attempts_remaining=retries)
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 1628, in _read_helper
    result = read_yaml(root, file_name)
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\utils\yaml_utils.py", line 107, in read_yaml
    raise MissingConfigException(f"Yaml file '{file_path}' d

Best run ID: 95c82edbe4b44614969cdb9308b9e4ee with metrics:
Accuracy: 0.7278125
F1_negatif: 0.7193038994521431
F1_positif: 0.7358204428268122
Precision_negatif: 0.7425149700598802
Precision_positif: 0.7147908073070124
Recall_negatif: 0.6975
Recall_positif: 0.758125
ROC_AUC: 0.8050172851562499
Best run parameters:
data_numrows: 32000
data_size: 0.02
embedding_model: Word2Vec
epochs: 50
latent_dim: 103
learning_rate: 0.001
max_len: 30
min_count: 2
num_words: 30000
rnn_size: 64
sg: 1
stemmer: PorterStemmer
window: 2


Registered model 'simple_rnn_best_custom_embed' already exists. Creating a new version of this model...
Created version '2' of model 'simple_rnn_best_custom_embed'.


Setting tag data_numrows = 32000 in registered model
Setting tag data_size = 0.02 in registered model
Setting tag embedding_model = Word2Vec in registered model
Setting tag epochs = 50 in registered model
Setting tag latent_dim = 103 in registered model
Setting tag learning_rate = 0.001 in registered model
Setting tag max_len = 30 in registered model
Setting tag min_count = 2 in registered model
Setting tag num_words = 30000 in registered model
Setting tag rnn_size = 64 in registered model
Setting tag sg = 1 in registered model
Setting tag stemmer = PorterStemmer in registered model
Setting tag window = 2 in registered model


# Experimentation sur les embeddings (préentrainés)

Les embeddings pré-entrainés ont été entrainés sur un très grand nombre de tweets et prennent donc en compte un très grand nombre de situations. On peut donc se passer de la phase de stemming et garder les stopwords. 


## Préparation

In [33]:
import gensim.downloader as api
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

# Word2Vec Google News
w2v_google = api.load("word2vec-google-news-300")  # KeyedVectors

# FastText wiki-news subwords-300
ft_wiki = api.load("fasttext-wiki-news-subwords-300")

# GloVe Twitter (par exemple 200 dimensions)
glove_tw200 = api.load("glove-twitter-200")
glove_tw100 = api.load("glove-twitter-100")
glove_tw50  = api.load("glove-twitter-50")
glove_tw25  = api.load("glove-twitter-25")



In [34]:
embedding_dict = {'Word2Vec_google':w2v_google, 
                  'FastText_wiki':ft_wiki,
                  'Glove_twitter_200':glove_tw200, 
                  'Glove_twitter_100':glove_tw100, 
                  'Glove_twitter_50':glove_tw50, 
                  'Glove_twitter_25':glove_tw25
                  }

In [35]:
## Prétraitement
min_count = 2
num_words = 30000
max_len   = 30
# Prétraitement
X_sentence_train, tokenizer, sentences_train = preprocess_data_embedding(X_raw=X_train, 
                                                        stem_lem_func=None,
                                                        tokenizer=None, 
                                                        stop_words=None, 
                                                        min_count=min_count,
                                                        max_len = max_len, 
                                                        num_words=num_words, 
                                                        return_sentences=True) 
X_sentence_val = preprocess_data_embedding(X_raw=X_val, 
                                                        stem_lem_func=None,
                                                        tokenizer=tokenizer, 
                                                        stop_words=None, 
                                                        min_count=1, # mincount = 1 car on est sur le jeu de validation
                                                        max_len = max_len, 
                                                        num_words=num_words) 

## Fonction de base 

In [36]:
# Hyperparamètres

## Modèle
rnn_size = 64
## Entrainement
epochs = 50
lr = 1e-3
## Savepath des poids du modèle
model_savepath = "./Models/baselineRNN_pretrained_embed.h5"

In [37]:
def pretrained_embed_experiment(embedding_name):
     with mlflow.start_run():
        mlflow.log_input(dataset)
        latent_dim = embedding_dict[embedding_name].vector_size
        mlflow.log_params(params={
            'num_words':num_words,               
            'max_len': max_len,
            'min_count': min_count,
            'stemmer': 'None', 
            'latent_dim': latent_dim, 
            'rnn_size': rnn_size, 
            'epochs': epochs, 
            'learning_rate': lr,
            'embedding_name':embedding_name,
            "data_size": data_size,
            "data_numrows": data_numrows,
        })


        model_vectors = embedding_dict[embedding_name]

        embedding_matrix, vocab_size = build_embedding_matrix(tokenizer=tokenizer,
                                          embedding_model=model_vectors, 
                                          latent_dim=latent_dim
                                          )

        # Modèle
        model =  build_base_RNN(vocab_size=vocab_size, 
                        latent_dim=latent_dim,
                        input_length=max_len, 
                        embedding_matrix=embedding_matrix,
                        rnn_size = rnn_size)
        ## Callbacks
        checkpoint = ModelCheckpoint(model_savepath, monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=True, mode='min')
        es = EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=10)
        lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, verbose=0, min_lr=1e-5)
        callbacks_list = [checkpoint, es, lr_scheduler]
        ## Compilation
        optimizer = Adam(learning_rate=lr)
        model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])

        #Entrainement
        with tf.device("/GPU:0"):
            history = model.fit(X_sentence_train, y_train, epochs=epochs, batch_size=64, validation_data=(X_sentence_val,y_val), callbacks=callbacks_list, verbose=0)

        model.load_weights(model_savepath)

                # Prédictions sur le jeu de validation
        y_pred_proba = model.predict(X_sentence_val)
        y_pred = (y_pred_proba>0.5)


        output_dict = postprocess_model_output(y_val, y_pred, y_pred_proba) # voir postprocess_data.py

        # Logging des métriques dans MLflow
        mlflow.log_metrics(output_dict)
        # Matrice de confusion
        cm = confusion_matrix(y_val, y_pred, normalize='pred')
        fig, ax = plt.subplots()
        sns.heatmap(cm, annot=True, fmt=".2f", cmap="Blues", ax=ax, )
        plt.xlabel("Predicted")
        plt.ylabel("True")
        plt.title("Confusion Matrix - Validation Set")
        fig.savefig("confusion_matrix.png")
        plt.close(fig)
        mlflow.log_artifact("confusion_matrix.png")
        #
        fig2 = plot_training_history(history,show=False)
        fig2.savefig("learning_path.png")
        plt.close(fig2)
        mlflow.log_artifact("learning_path.png")

        # Enregistrement du modèle dans MLflow
        mlflow.tensorflow.log_model(model, "model")
        

## Définition de l'experiment dans MLFlow

In [38]:
# Création de l'étude Optuna et optimisation
print("Starting hyperparameter optimization with Optuna...")
print("Setting up MLflow experiment...")
mlflow.set_experiment("word_embedding_experiment_pretrained_embedding")
exp_id = mlflow.get_experiment_by_name("word_embedding_experiment_pretrained_embedding").experiment_id

experiment_description = (
    "Cette experience contient les différents tests pour le modèle RNN simple. "
    "Ici on teste plusieurs embeddings préentrainées sur de larges corpora, beaucoup d'attentes par rapport aux embeddings avec Glove entrainés sur des tweets"
)

# Provide searchable tags that define characteristics of the Runs that
# will be in this Experiment
experiment_tags = {
    "project_name": "Sentiment analysis modelling",
    "model_type": "simple-RNN-pretrained-embeddings",
    "team": "Ph. Constant",
    "project_quarter": "Q3-2025",
    "mlflow.note.content": experiment_description,
}

for key, value in experiment_tags.items():
    client.set_experiment_tag(exp_id, key, value)

Traceback (most recent call last):
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 367, in search_experiments
    exp = self._get_experiment(exp_id, view_type)
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 465, in _get_experiment
    meta = FileStore._read_yaml(experiment_dir, FileStore.META_DATA_FILE_NAME)
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 1635, in _read_yaml
    return _read_helper(root, file_name, attempts_remaining=retries)
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 1628, in _read_helper
    result = read_yaml(root, file_name)
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\utils\yaml_utils.py", line 107, in read_yaml
    raise MissingConfigException(f"Yaml file '{file_path}' d

Starting hyperparameter optimization with Optuna...
Setting up MLflow experiment...


## Lancement de l'experiment

In [39]:
for embedding_name in list(embedding_dict.keys()):
    
    print(f"Running test with {embedding_name}")
    pretrained_embed_experiment(embedding_name)


Running test with Word2Vec_google
Embedding matrix shape: (9616, 300)
Words found in pretrained embeddings: 8930/9616 (92.87%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmp0l012xn1\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmp0l012xn1\model\data\model\assets


Running test with FastText_wiki
Embedding matrix shape: (9616, 300)
Words found in pretrained embeddings: 9173/9616 (95.39%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpfe_a6jft\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpfe_a6jft\model\data\model\assets


Running test with Glove_twitter_200
Embedding matrix shape: (9616, 200)
Words found in pretrained embeddings: 9452/9616 (98.29%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpmzrdx76b\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpmzrdx76b\model\data\model\assets


Running test with Glove_twitter_100
Embedding matrix shape: (9616, 100)
Words found in pretrained embeddings: 9452/9616 (98.29%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpgq0orbrf\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpgq0orbrf\model\data\model\assets


Running test with Glove_twitter_50
Embedding matrix shape: (9616, 50)
Words found in pretrained embeddings: 9452/9616 (98.29%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmppy1fe1ys\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmppy1fe1ys\model\data\model\assets


Running test with Glove_twitter_25
Embedding matrix shape: (9616, 25)
Words found in pretrained embeddings: 9452/9616 (98.29%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpv3sbe_od\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpv3sbe_od\model\data\model\assets


## Enregistrement du meilleur modèle

In [40]:
client = MlflowClient(tracking_uri="http://localhost:8080")
experiment_id = mlflow.get_experiment_by_name("word_embedding_experiment_pretrained_embedding").experiment_id
runs = client.search_runs(experiment_id)

# Métrique pour sélectionner le meilleur modèle
metric_to_optimize = "Accuracy" # liste des métriques enregistrées dans postprocess_data.py ou sur l'UI MLflow
best_run = max(runs, key=lambda run: run.data.metrics.get(metric_to_optimize, float('-inf')))
print(f"Best run ID: {best_run.info.run_id} with metrics:")
for key, value in best_run.data.metrics.items():
    print(f"{key}: {value}")
print(f"Best run parameters:")
for key, value in best_run.data.params.items():
    print(f"{key}: {value}")

# Enregistrement du meilleur modèle
best_model_uri = f"runs:/{best_run.info.run_id}/model"
registered_model_name = "simple_rnn_best_pre"
registered_model = mlflow.register_model(best_model_uri, registered_model_name)
# Enregistrement des paramètres sous forme de tags dans le modèle enregistré
for key, value in best_run.data.params.items():
    print(f"Setting tag {key} = {value} in registered model")
    client.set_model_version_tag(
        name=registered_model_name,
        version=str(registered_model.version),
        key=str(key),
        value=str(value))

Traceback (most recent call last):
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 367, in search_experiments
    exp = self._get_experiment(exp_id, view_type)
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 465, in _get_experiment
    meta = FileStore._read_yaml(experiment_dir, FileStore.META_DATA_FILE_NAME)
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 1635, in _read_yaml
    return _read_helper(root, file_name, attempts_remaining=retries)
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 1628, in _read_helper
    result = read_yaml(root, file_name)
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\utils\yaml_utils.py", line 107, in read_yaml
    raise MissingConfigException(f"Yaml file '{file_path}' d

Best run ID: ce06b66ec4ff48e6a994d2485c63701f with metrics:
Accuracy: 0.795875
F1_negatif: 0.7963459499906467
F1_positif: 0.7954018668170143
Precision_negatif: 0.7945128779395296
Precision_positif: 0.7972497802335803
Recall_negatif: 0.7981875
Recall_positif: 0.7935625
ROC_AUC: 0.874631751953125
Best run parameters:
embedding_name: Glove_twitter_200
epochs: 50
latent_dim: 200
learning_rate: 0.001
max_len: 30
min_count: 2
num_words: 20000
rnn_size: 64
stemmer: None


Registered model 'simple_rnn_best_pre' already exists. Creating a new version of this model...
Created version '4' of model 'simple_rnn_best_pre'.


Setting tag embedding_name = Glove_twitter_200 in registered model
Setting tag epochs = 50 in registered model
Setting tag latent_dim = 200 in registered model
Setting tag learning_rate = 0.001 in registered model
Setting tag max_len = 30 in registered model
Setting tag min_count = 2 in registered model
Setting tag num_words = 20000 in registered model
Setting tag rnn_size = 64 in registered model
Setting tag stemmer = None in registered model


# Comparaison des différentes couches de notre réseau de neurones : SimpleRNN vs GRU vs LSTM

## Préparation 

On reprend les paramètres d'embedding de la meilleure run sur embeddings customs et embeddings préentrainés. 

In [41]:
## Prétraitement
min_count = 2
num_words = 20000
max_len   = 30
# Prétraitement
X_sentence_train, tokenizer, sentences_train = preprocess_data_embedding(X_raw=X_train, 
                                                        stem_lem_func=None,
                                                        tokenizer=None, 
                                                        stop_words=None, 
                                                        min_count=min_count,
                                                        max_len = max_len, 
                                                        num_words=num_words, 
                                                        return_sentences=True) 
X_sentence_val = preprocess_data_embedding(X_raw=X_val, 
                                                        stem_lem_func=None,
                                                        tokenizer=tokenizer, 
                                                        stop_words=None, 
                                                        min_count=1, # mincount = 1 car on est sur le jeu de validation
                                                        max_len = max_len, 
                                                        num_words=num_words) 


model_vectors = glove_tw200
latent_dim = glove_tw200.vector_size

embedding_matrix, vocab_size = build_embedding_matrix(tokenizer=tokenizer,
                                embedding_model=model_vectors, 
                                latent_dim=latent_dim
                              )

Embedding matrix shape: (9616, 200)
Words found in pretrained embeddings: 9452/9616 (98.29%)


In [42]:
from Source.preprocess_data import *
rnn_layer_name_list = ['SimpleRNN','GRU','LSTM']


## Fonction de base

In [43]:
## Modèle
rnn_size = 64
## Entrainement
epochs = 50
lr = 1e-3
## Savepath des poids du modèle

def rnn_layer_experiment(rnn_layer_name):
     with mlflow.start_run():
        mlflow.log_input(dataset)

        mlflow.log_params(params={
            'num_words':num_words,               
            'max_len': max_len,
            'min_count': min_count,
            'stemmer': 'None', 
            'latent_dim': latent_dim, 
            'rnn_size': rnn_size, 
            'epochs': epochs, 
            'learning_rate': lr,
            'embedding_name':'Glove_twitter_200',
            'rnn_layer_name':rnn_layer_name,
            "data_size": data_size,
            "data_numrows": data_numrows,
        })
        model_savepath = "./Models/"+rnn_layer_name+"_model_exp.h5"
        # Modèle
        if rnn_layer_name == 'SimpleRNN':
            model =  build_base_RNN(vocab_size=vocab_size, 
                            latent_dim=latent_dim,
                            input_length=max_len, 
                            embedding_matrix=embedding_matrix,
                            rnn_size = rnn_size)
        elif rnn_layer_name == 'GRU':
            model = build_gru_RNN(vocab_size=vocab_size, 
                            latent_dim=latent_dim,
                            input_length=max_len, 
                            embedding_matrix=embedding_matrix,
                            rnn_size = rnn_size)
        elif rnn_layer_name=='LSTM':
            model = build_lstm_RNN(vocab_size=vocab_size, 
                            latent_dim=latent_dim,
                            input_length=max_len, 
                            embedding_matrix=embedding_matrix,
                            rnn_size = rnn_size)
        ## Callbacks
        checkpoint = ModelCheckpoint(model_savepath, monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=True, mode='min')
        es = EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=10)
        lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, verbose=0, min_lr=1e-5)
        callbacks_list = [checkpoint, es, lr_scheduler]
        ## Compilation
        optimizer = Adam(learning_rate=lr)
        model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])

        #Entrainement
        with tf.device("/GPU:0"):
            history = model.fit(X_sentence_train, y_train, epochs=epochs, batch_size=64, validation_data=(X_sentence_val,y_val), callbacks=callbacks_list, verbose=0)

        model.load_weights(model_savepath)

                # Prédictions sur le jeu de validation
        y_pred_proba = model.predict(X_sentence_val)
        y_pred = (y_pred_proba>0.5)


        output_dict = postprocess_model_output(y_val, y_pred, y_pred_proba) # voir postprocess_data.py

        # Logging des métriques dans MLflow
        mlflow.log_metrics(output_dict)
        # Matrice de confusion
        cm = confusion_matrix(y_val, y_pred, normalize='pred')
        fig, ax = plt.subplots()
        sns.heatmap(cm, annot=True, fmt=".2f", cmap="Blues", ax=ax, )
        plt.xlabel("Predicted")
        plt.ylabel("True")
        plt.title("Confusion Matrix - Validation Set")
        fig.savefig("confusion_matrix.png")
        plt.close(fig)
        mlflow.log_artifact("confusion_matrix.png")
        #
        fig2 = plot_training_history(history,show=False)
        fig2.savefig("learning_path.png")
        plt.close(fig2)
        mlflow.log_artifact("learning_path.png")

        # Enregistrement du modèle dans MLflow
        mlflow.tensorflow.log_model(model, "model")
        

## Definition de l'experiment dans MLFLow

In [44]:
# Création de l'étude Optuna et optimisation
print("Setting up MLflow experiment...")
mlflow.set_experiment("rnn_layer_experiment_pretrained_embedding")
exp_id = mlflow.get_experiment_by_name("rnn_layer_experiment_pretrained_embedding").experiment_id

experiment_description = (
    "Comparaison des impact des types de cellules RNN utilisées : SimpleRNN, GRU et LSTM "
    ""
)

# Provide searchable tags that define characteristics of the Runs that
# will be in this Experiment
experiment_tags = {
    "project_name": "Sentiment analysis modelling",
    "model_type": "RNN_types-pretrained-embeddings",
    "team": "Ph. Constant",
    "project_quarter": "Q3-2025",
    "mlflow.note.content": experiment_description,
}

for key, value in experiment_tags.items():
    client.set_experiment_tag(exp_id, key, value)

Traceback (most recent call last):
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 367, in search_experiments
    exp = self._get_experiment(exp_id, view_type)
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 465, in _get_experiment
    meta = FileStore._read_yaml(experiment_dir, FileStore.META_DATA_FILE_NAME)
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 1635, in _read_yaml
    return _read_helper(root, file_name, attempts_remaining=retries)
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 1628, in _read_helper
    result = read_yaml(root, file_name)
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\utils\yaml_utils.py", line 107, in read_yaml
    raise MissingConfigException(f"Yaml file '{file_path}' d

Setting up MLflow experiment...


## Lancement de l'expériment

In [45]:
for rnn_layer_name in rnn_layer_name_list:
    
    print(f"Running test with {rnn_layer_name}")
    rnn_layer_experiment(rnn_layer_name)


Running test with SimpleRNN




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmphoourp89\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmphoourp89\model\data\model\assets


Running test with GRU




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpobygs7eb\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpobygs7eb\model\data\model\assets


Running test with LSTM




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpy3apnxxi\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpy3apnxxi\model\data\model\assets


# Retour sur la longueur des séquences     

Ici on va revenir sur la longueur des séquences utilisées car LSTM permet de garder des séquences plus longues sans pour autant avoir d'évanescence de gradient. 

## Fonction de base 

In [46]:


## Modèle
rnn_size = 64
## Entrainement
epochs = 50
lr = 1e-3
## Savepath des poids du modèle

def lstm_maxlen_experiment(max_len):
     with mlflow.start_run():
        latent_dim = glove_tw200.vector_size
        mlflow.log_input(dataset)
        mlflow.log_params(params={
            'num_words':num_words,               
            'max_len': max_len,
            'min_count': min_count,
            'stemmer': 'None', 
            'latent_dim': latent_dim, 
            'rnn_size': rnn_size, 
            'epochs': epochs, 
            'learning_rate': lr,
            'embedding_name':'Glove_twitter_200',
            'rnn_layer_name':'LSTM',
            "data_size": data_size,
            "data_numrows": data_numrows,
        })

        ## Prétraitement

        # Prétraitement
        X_sentence_train, tokenizer, sentences_train = preprocess_data_embedding(X_raw=X_train, 
                                                        stem_lem_func=None,
                                                        tokenizer=None, 
                                                        stop_words=None, 
                                                        min_count=min_count,
                                                        max_len = max_len, 
                                                        num_words=num_words, 
                                                        return_sentences=True) 
        X_sentence_val = preprocess_data_embedding(X_raw=X_val, 
                                                        stem_lem_func=None,
                                                        tokenizer=tokenizer, 
                                                        stop_words=None, 
                                                        min_count=1, # mincount = 1 car on est sur le jeu de validation
                                                        max_len = max_len, 
                                                        num_words=num_words) 


        model_vectors = glove_tw200

        embedding_matrix, vocab_size = build_embedding_matrix(tokenizer=tokenizer,
                                embedding_model=model_vectors, 
                                latent_dim=latent_dim
                              )
        model_savepath = f"./Models/{rnn_layer_name}_model_exp_len{max_len}.h5"
        # Modèle

        model = build_lstm_RNN(vocab_size=vocab_size, 
                            latent_dim=latent_dim,
                            input_length=max_len, 
                            embedding_matrix=embedding_matrix,
                            rnn_size = rnn_size)
        ## Callbacks
        checkpoint = ModelCheckpoint(model_savepath, monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=True, mode='min')
        es = EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=10)
        lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, verbose=0, min_lr=1e-5)
        callbacks_list = [checkpoint, es, lr_scheduler]
        ## Compilation
        optimizer = Adam(learning_rate=lr)
        model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])

        #Entrainement
        with tf.device("/GPU:0"):
            history = model.fit(X_sentence_train, y_train, epochs=epochs, batch_size=64, validation_data=(X_sentence_val,y_val), callbacks=callbacks_list, verbose=0)

        model.load_weights(model_savepath)

                # Prédictions sur le jeu de validation
        y_pred_proba = model.predict(X_sentence_val)
        y_pred = (y_pred_proba>0.5)


        output_dict = postprocess_model_output(y_val, y_pred, y_pred_proba) # voir postprocess_data.py

        # Logging des métriques dans MLflow
        mlflow.log_metrics(output_dict)
        # Matrice de confusion
        cm = confusion_matrix(y_val, y_pred, normalize='pred')
        fig, ax = plt.subplots()
        sns.heatmap(cm, annot=True, fmt=".2f", cmap="Blues", ax=ax, )
        plt.xlabel("Predicted")
        plt.ylabel("True")
        plt.title("Confusion Matrix - Validation Set")
        fig.savefig("confusion_matrix.png")
        plt.close(fig)
        mlflow.log_artifact("confusion_matrix.png")
        #
        fig2 = plot_training_history(history,show=False)
        fig2.savefig("learning_path.png")
        plt.close(fig2)
        mlflow.log_artifact("learning_path.png")

        # Enregistrement du modèle dans MLflow
        mlflow.tensorflow.log_model(model, "model")
        

## Experiment MLFLow



In [47]:
# Création de l'étude Optuna et optimisation
print("Setting up MLflow experiment...")
mlflow.set_experiment("lstm_maxlen_experiment")
exp_id = mlflow.get_experiment_by_name("lstm_maxlen_experiment").experiment_id

experiment_description = (
    "Comparaison des impact des types de cellules RNN utilisées : SimpleRNN, GRU et LSTM "
    ""
)

# Provide searchable tags that define characteristics of the Runs that
# will be in this Experiment
experiment_tags = {
    "project_name": "Sentiment analysis modelling",
    "model_type": "LSTM_pretrained_embedding",
    "team": "Ph. Constant",
    "project_quarter": "Q3-2025",
    "mlflow.note.content": experiment_description,
}

for key, value in experiment_tags.items():
    client.set_experiment_tag(exp_id, key, value)

Traceback (most recent call last):
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 367, in search_experiments
    exp = self._get_experiment(exp_id, view_type)
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 465, in _get_experiment
    meta = FileStore._read_yaml(experiment_dir, FileStore.META_DATA_FILE_NAME)
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 1635, in _read_yaml
    return _read_helper(root, file_name, attempts_remaining=retries)
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 1628, in _read_helper
    result = read_yaml(root, file_name)
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\utils\yaml_utils.py", line 107, in read_yaml
    raise MissingConfigException(f"Yaml file '{file_path}' d

Setting up MLflow experiment...


## Lancement de l'experiment 

In [48]:
for max_len in list(range(30,100,5)):
    
    print(f"Running test with sequence length of {max_len} tokens")
    lstm_maxlen_experiment(max_len)


Running test with sequence length of 30 tokens
Embedding matrix shape: (9616, 200)
Words found in pretrained embeddings: 9452/9616 (98.29%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmprbce8r0f\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmprbce8r0f\model\data\model\assets


Running test with sequence length of 35 tokens
Embedding matrix shape: (9616, 200)
Words found in pretrained embeddings: 9452/9616 (98.29%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpqzjtp3nh\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpqzjtp3nh\model\data\model\assets


Running test with sequence length of 40 tokens
Embedding matrix shape: (9616, 200)
Words found in pretrained embeddings: 9452/9616 (98.29%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpuvvczdl0\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpuvvczdl0\model\data\model\assets


Running test with sequence length of 45 tokens
Embedding matrix shape: (9616, 200)
Words found in pretrained embeddings: 9452/9616 (98.29%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmp82smw54e\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmp82smw54e\model\data\model\assets


Running test with sequence length of 50 tokens
Embedding matrix shape: (9616, 200)
Words found in pretrained embeddings: 9452/9616 (98.29%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmp7pc90lw4\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmp7pc90lw4\model\data\model\assets


Running test with sequence length of 55 tokens
Embedding matrix shape: (9616, 200)
Words found in pretrained embeddings: 9452/9616 (98.29%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmp7q0nw3h4\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmp7q0nw3h4\model\data\model\assets


Running test with sequence length of 60 tokens
Embedding matrix shape: (9616, 200)
Words found in pretrained embeddings: 9452/9616 (98.29%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpr194bjsa\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpr194bjsa\model\data\model\assets


Running test with sequence length of 65 tokens
Embedding matrix shape: (9616, 200)
Words found in pretrained embeddings: 9452/9616 (98.29%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpm2tl1wyw\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpm2tl1wyw\model\data\model\assets


Running test with sequence length of 70 tokens
Embedding matrix shape: (9616, 200)
Words found in pretrained embeddings: 9452/9616 (98.29%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmp604ozebu\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmp604ozebu\model\data\model\assets


Running test with sequence length of 75 tokens
Embedding matrix shape: (9616, 200)
Words found in pretrained embeddings: 9452/9616 (98.29%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmp_nacz27e\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmp_nacz27e\model\data\model\assets


Running test with sequence length of 80 tokens
Embedding matrix shape: (9616, 200)
Words found in pretrained embeddings: 9452/9616 (98.29%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmprcbsl3pq\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmprcbsl3pq\model\data\model\assets


Running test with sequence length of 85 tokens
Embedding matrix shape: (9616, 200)
Words found in pretrained embeddings: 9452/9616 (98.29%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpl6hwf5_3\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpl6hwf5_3\model\data\model\assets


Running test with sequence length of 90 tokens
Embedding matrix shape: (9616, 200)
Words found in pretrained embeddings: 9452/9616 (98.29%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpij4f0m5k\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpij4f0m5k\model\data\model\assets


Running test with sequence length of 95 tokens
Embedding matrix shape: (9616, 200)
Words found in pretrained embeddings: 9452/9616 (98.29%)




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmp1756tp5f\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmp1756tp5f\model\data\model\assets


## Enregistrement du mailleur modèle

In [49]:
client = MlflowClient(tracking_uri="http://localhost:8080")
experiment_id = mlflow.get_experiment_by_name("lstm_maxlen_experiment").experiment_id
runs = client.search_runs(experiment_id)

# Métrique pour sélectionner le meilleur modèle
metric_to_optimize = "Accuracy" # liste des métriques enregistrées dans postprocess_data.py ou sur l'UI MLflow
best_run = max(runs, key=lambda run: run.data.metrics.get(metric_to_optimize, float('-inf')))
print(f"Best run ID: {best_run.info.run_id} with metrics:")
for key, value in best_run.data.metrics.items():
    print(f"{key}: {value}")
print(f"Best run parameters:")
for key, value in best_run.data.params.items():
    print(f"{key}: {value}")

# Enregistrement du meilleur modèle
best_model_uri = f"runs:/{best_run.info.run_id}/model"
registered_model_name = "lstm_maxlen_best"
registered_model = mlflow.register_model(best_model_uri, registered_model_name)
# Enregistrement des paramètres sous forme de tags dans le modèle enregistré
for key, value in best_run.data.params.items():
    print(f"Setting tag {key} = {value} in registered model")
    client.set_model_version_tag(
        name=registered_model_name,
        version=str(registered_model.version),
        key=str(key),
        value=str(value))

Traceback (most recent call last):
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 367, in search_experiments
    exp = self._get_experiment(exp_id, view_type)
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 465, in _get_experiment
    meta = FileStore._read_yaml(experiment_dir, FileStore.META_DATA_FILE_NAME)
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 1635, in _read_yaml
    return _read_helper(root, file_name, attempts_remaining=retries)
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 1628, in _read_helper
    result = read_yaml(root, file_name)
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\utils\yaml_utils.py", line 107, in read_yaml
    raise MissingConfigException(f"Yaml file '{file_path}' d

Best run ID: b63acc4f64914a5fb77ab6b26a974e13 with metrics:
Accuracy: 0.81365625
F1_negatif: 0.811518159117489
F1_positif: 0.8157463770355035
Precision_negatif: 0.8209375199846518
Precision_positif: 0.8066980382570433
Recall_negatif: 0.8023125
Recall_positif: 0.825
ROC_AUC: 0.89485159765625
Best run parameters:
embedding_name: Glove_twitter_200
epochs: 50
latent_dim: 200
learning_rate: 0.001
max_len: 50
min_count: 2
num_words: 20000
rnn_layer_name: LSTM
rnn_size: 64
stemmer: None


Registered model 'lstm_maxlen_best' already exists. Creating a new version of this model...
Created version '2' of model 'lstm_maxlen_best'.


Setting tag embedding_name = Glove_twitter_200 in registered model
Setting tag epochs = 50 in registered model
Setting tag latent_dim = 200 in registered model
Setting tag learning_rate = 0.001 in registered model
Setting tag max_len = 50 in registered model
Setting tag min_count = 2 in registered model
Setting tag num_words = 20000 in registered model
Setting tag rnn_layer_name = LSTM in registered model
Setting tag rnn_size = 64 in registered model
Setting tag stemmer = None in registered model


Quand on regarde les améliorations obtenues en augmentant la longueur des séquences, il n'est pas réellement pertinent d'augmenter la longueur des séquences au delà de 50 tokens. 