In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.layers import TextVectorization, Embedding, SimpleRNN, Dense, LSTM, Flatten
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

from sklearn.model_selection import train_test_split
import gensim.downloader as gensim_downloader
import gensim
import multiprocessing
from mlflow import MlflowClient
import mlflow
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import optuna

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline

from sklearn.metrics import confusion_matrix 
from nltk.corpus import stopwords  
from nltk.tokenize import TweetTokenizer, WordPunctTokenizer, RegexpTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer, LancasterStemmer, SnowballStemmer
from tqdm import tqdm
# Enable tqdm's pandas integration (adds the `progress_apply` family of methods).
tqdm.pandas()


import os
import sys
from pathlib import Path

# Expose only GPU index 0 to CUDA-aware libraries.
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

# Make the parent of the working directory importable so that the project's
# `Source` package can be resolved by the imports that follow.
cwd = Path.cwd()
parent = cwd.parent
# Guard against duplicates: re-running this cell must not keep growing sys.path.
if str(parent) not in sys.path:
    sys.path.append(str(parent))


from Source.preprocess_data import *  ## import all functions from preprocess_data.py
from Source.postprocess_data import * ## import all functions from postprocess_data.py
from Source.utils import *  ## import all functions from utils.py
import nltk


# Number of CPUs reported by the machine.
nw = multiprocessing.cpu_count()

# Point the fluent `mlflow.*` API at the local ../mlruns directory, expressed
# as an absolute file:// URI.
mlruns_path = Path("../mlruns").resolve()
mlflow_uri = mlruns_path.as_uri()
mlflow.set_tracking_uri(mlflow_uri)

# Low-level client bound explicitly to an HTTP tracking server.
# NOTE(review): this URI differs from the file-based tracking URI set above;
# confirm both resolve to the same backend store, otherwise calls made through
# `client` and calls made through `mlflow.*` will not see the same experiments.
client = MlflowClient(tracking_uri="http://localhost:8080")



os.environ["TF_KERAS"] = '1'

# Query the visible GPUs once and report TensorFlow version / GPU availability.
gpu_devices = tf.config.list_physical_devices('GPU')
print(tf.__version__)
print("Num GPUs Available: ", len(gpu_devices))
print("GPUs disponibles :", gpu_devices)
print("Version TF :", tf.__version__)

  from .autonotebook import tqdm as notebook_tqdm
  from pkg_resources import parse_version


2.10.1
Num GPUs Available:  1
GPUs disponibles : [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Version TF : 2.10.1


In [2]:
# Load the Sentiment140 archive (zipped CSV without a header row) from its URL.
df = pd.read_csv('https://s3-eu-west-1.amazonaws.com/static.oc-static.com/prod/courses/files/AI+Engineer/Project+7%C2%A0-+D%C3%A9tectez+les+Bad+Buzz+gr%C3%A2ce+au+Deep+Learning/sentiment140.zip',
                 header=None,
                 compression='zip',
                 encoding='cp1252')
df.columns = ['target', 'ids', 'date', 'flag', 'user', 'text']

# Draw a stratified sample of `data_size` (2%) of the rows; the rest is discarded.
data_size = 0.02
sample_df, _ = train_test_split(df, test_size=1-data_size, random_state=42, stratify=df['target'])
sample_df = sample_df.reset_index(drop=True)
print(f"Sample size: {sample_df.shape[0]} rows")
data_numrows = sample_df.shape[0]

# Keep only the 'target' and 'text' columns. The explicit copy makes the frame
# independent of its parent so the assignment below is unambiguous
# (no chained-assignment / SettingWithCopy hazard).
sample_df = sample_df[['target', 'text']].copy()

# Binarise the label: 0 stays 0, every other value becomes 1.
# Vectorised comparison instead of a per-row lambda; explicit int64 keeps the
# dtype identical across platforms.
sample_df["target"] = (sample_df["target"] != 0).astype("int64")

# Persist the sample (creating the target directory if it is missing) and
# wrap it as an MLflow dataset so runs can reference it as an input.
os.makedirs('../Data', exist_ok=True)
sample_df.to_csv('../Data/raw_data.csv', index=False)
dataset = mlflow.data.from_pandas(
    sample_df,
    source="../Data/raw_data.csv",
    name="dataset_v1"
)


Sample size: 32000 rows


  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(


In [3]:
# Features (raw tweet text) and binary labels taken from the sampled frame.
X_raw, y = sample_df['text'], sample_df['target']

# Stratified 80/20 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X_raw,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [4]:
import gensim.downloader as api
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

# Pre-trained GloVe Twitter vectors (the 200-dimensional variant), fetched
# through the gensim downloader API.
glove_tw200 = api.load(
    "glove-twitter-200",
)


In [5]:
# --- Preprocessing settings ---
min_count = 2       # passed as `min_count` for the training texts
num_words = 30000   # passed as `num_words` to both preprocessing calls
max_len = 50        # passed as `max_len` to both preprocessing calls

# Arguments that are identical for the training and the validation pass.
_shared_prep_kwargs = dict(
    stem_lem_func=None,
    stop_words=None,
    max_len=max_len,
    num_words=num_words,
)

# Training texts: no tokenizer is supplied; the call returns the processed
# sequences, a tokenizer and (because return_sentences=True) the sentences.
X_sentence_train, tokenizer, sentences_train = preprocess_data_embedding(
    X_raw=X_train,
    tokenizer=None,
    min_count=min_count,
    return_sentences=True,
    **_shared_prep_kwargs,
)

# Validation texts: reuse the tokenizer returned above.
# min_count is 1 because this is the validation set.
X_sentence_val = preprocess_data_embedding(
    X_raw=X_val,
    tokenizer=tokenizer,
    min_count=1,
    **_shared_prep_kwargs,
)

# Pre-trained vectors and their dimensionality.
model_vectors = glove_tw200
latent_dim = glove_tw200.vector_size

# Build the embedding matrix for the tokenizer's vocabulary from the
# pre-trained vectors (see build_embedding_matrix for the exact rules).
embedding_matrix, vocab_size = build_embedding_matrix(
    tokenizer=tokenizer,
    embedding_model=model_vectors,
    latent_dim=latent_dim,
)

Embedding matrix shape: (9616, 200)
Words found in pretrained embeddings: 9452/9616 (98.29%)


In [9]:
# --- Model hyper-parameters ---
rnn_size = 128          # forwarded as `rnn_size` to the model builders

# --- Training hyper-parameters ---
epochs, lr = 50, 1e-3   # maximum number of epochs, initial Adam learning rate
max_len = 50            # forwarded as `input_length` to the model builders

# The weight checkpoint path is derived per layer type inside
# rnn_layer_experiment_bi.

def rnn_layer_experiment_bi(rnn_layer_name):
    """Train, evaluate and log one recurrent classifier as a single MLflow run.

    The function opens an MLflow run, logs the dataset and hyper-parameters,
    builds the requested model on top of the pre-trained embedding matrix,
    trains it with checkpointing / early stopping / learning-rate reduction
    (all monitoring ``val_loss``), reloads the best checkpointed weights and
    then logs validation metrics, a confusion-matrix image, the learning-curve
    image and the model itself.

    It relies on notebook-level globals: ``dataset``, ``num_words``,
    ``max_len``, ``min_count``, ``latent_dim``, ``rnn_size``, ``epochs``,
    ``lr``, ``data_size``, ``data_numrows``, ``vocab_size``,
    ``embedding_matrix``, ``X_sentence_train``, ``y_train``,
    ``X_sentence_val`` and ``y_val``.

    Parameters
    ----------
    rnn_layer_name : str
        Which recurrent architecture to build: ``'LSTM'`` or ``'Bi-LSTM'``.

    Returns
    -------
    None
        All results are recorded as side effects (MLflow run, weight file
        under ``./Models/``, PNG files in the working directory).

    Raises
    ------
    ValueError
        If ``rnn_layer_name`` is not one of the supported values. The check
        happens before the MLflow run is opened, so no run is created.
    """
    supported_layers = ('LSTM', 'Bi-LSTM')
    if rnn_layer_name not in supported_layers:
        # Fail fast: previously an unknown name left `model` undefined and only
        # crashed after a run had been started and its parameters logged.
        raise ValueError(
            f"Unsupported rnn_layer_name {rnn_layer_name!r}; "
            f"expected one of {supported_layers}"
        )

    with mlflow.start_run():
        mlflow.log_input(dataset)

        mlflow.log_params(params={
            'num_words': num_words,
            'max_len': max_len,
            'min_count': min_count,
            'stemmer': 'None',
            'latent_dim': latent_dim,
            'rnn_size': rnn_size,
            'epochs': epochs,
            'learning_rate': lr,
            'embedding_name': 'Glove_twitter_200',
            'rnn_layer_name': rnn_layer_name,
            "data_size": data_size,
            "data_numrows": data_numrows,
        })

        # Where the best weights of this layer type are checkpointed.
        model_savepath = "./Models/"+rnn_layer_name+"_model_exp.h5"

        # Model: both builders take the same keyword arguments, so pick the
        # builder once and call it a single time.
        build_model = build_lstm_RNN if rnn_layer_name == 'LSTM' else build_bilstm_RNN
        model = build_model(vocab_size=vocab_size,
                            latent_dim=latent_dim,
                            input_length=max_len,
                            embedding_matrix=embedding_matrix,
                            rnn_size=rnn_size)

        # Callbacks, all driven by the validation loss:
        #  - keep only the weights of the best epoch,
        #  - stop after 10 epochs without improvement,
        #  - halve the learning rate after 2 stagnant epochs (floor 1e-5).
        checkpoint = ModelCheckpoint(model_savepath, monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=True, mode='min')
        es = EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=10)
        lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, verbose=0, min_lr=1e-5)
        callbacks_list = [checkpoint, es, lr_scheduler]

        # Compilation
        optimizer = Adam(learning_rate=lr)
        model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])

        # Training
        with tf.device("/GPU:0"):
            history = model.fit(X_sentence_train, y_train, epochs=epochs, batch_size=64, validation_data=(X_sentence_val, y_val), callbacks=callbacks_list, verbose=1)

        # Restore the best checkpoint before evaluating.
        model.load_weights(model_savepath)

        # Predictions on the validation set, thresholded at 0.5.
        y_pred_proba = model.predict(X_sentence_val)
        y_pred = (y_pred_proba > 0.5)

        # See postprocess_data.py; the result is passed to mlflow.log_metrics,
        # so it is expected to be a mapping of metric name -> value.
        output_dict = postprocess_model_output(y_val, y_pred, y_pred_proba)

        # Log the metrics in MLflow
        mlflow.log_metrics(output_dict)

        # Confusion matrix, normalised over the predicted labels.
        cm = confusion_matrix(y_val, y_pred, normalize='pred')
        fig, ax = plt.subplots()
        sns.heatmap(cm, annot=True, fmt=".2f", cmap="Blues", ax=ax)
        # Label the heatmap axes explicitly rather than via pyplot's
        # "current axes" state.
        ax.set_xlabel("Predicted")
        ax.set_ylabel("True")
        ax.set_title("Confusion Matrix - Validation Set")
        fig.savefig("confusion_matrix.png")
        plt.close(fig)
        mlflow.log_artifact("confusion_matrix.png")

        # Learning curves
        fig2 = plot_training_history(history, show=False)
        fig2.savefig("learning_path.png")
        plt.close(fig2)
        mlflow.log_artifact("learning_path.png")

        # Store the trained model in MLflow
        mlflow.tensorflow.log_model(model, "model")
        

In [10]:
# Select (or create) the MLflow experiment that groups the RNN-layer runs.
print("Setting up MLflow experiment...")
# set_experiment returns the Experiment object, so no second lookup by name is needed.
experiment = mlflow.set_experiment("rnn_layer_experiment_pretrained_embedding")
exp_id = experiment.experiment_id

experiment_description = (
    "Comparaison des impact des types de cellules RNN utilisées : SimpleRNN, GRU et LSTM "
    ""
)

# Provide searchable tags that define characteristics of the Runs that
# will be in this Experiment
experiment_tags = {
    "project_name": "Sentiment analysis modelling",
    "model_type": "RNN_types-pretrained-embeddings",
    "team": "Ph. Constant",
    "project_quarter": "Q3-2025",
    "mlflow.note.content": experiment_description,
}

# Write the tags through the fluent API so they land in the same tracking
# store as the experiment selected above. The notebook's `client` object is
# bound to a different tracking URI (http://localhost:8080) than the one the
# fluent API uses, so tagging through it could target another backend.
mlflow.set_experiment_tags(experiment_tags)

Traceback (most recent call last):
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 367, in search_experiments
    exp = self._get_experiment(exp_id, view_type)
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 465, in _get_experiment
    meta = FileStore._read_yaml(experiment_dir, FileStore.META_DATA_FILE_NAME)
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 1635, in _read_yaml
    return _read_helper(root, file_name, attempts_remaining=retries)
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\store\tracking\file_store.py", line 1628, in _read_helper
    result = read_yaml(root, file_name)
  File "c:\ProgramData\anaconda3\envs\AI_env_P7_gpu\lib\site-packages\mlflow\utils\yaml_utils.py", line 107, in read_yaml
    raise MissingConfigException(f"Yaml file '{file_path}' d

Setting up MLflow experiment...


In [11]:

# Run the experiment once per recurrent layer type, announcing each run
# (the message previously named only Bi-LSTM although LSTM was run as well).
for layer_name in ("Bi-LSTM", "LSTM"):
    print(f"Running test with {layer_name}")
    rnn_layer_experiment_bi(layer_name)



Running test with Bi-LSTM












Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 50, 200)           1923200   
                                                                 
 bidirectional_1 (Bidirectio  (None, 50, 256)          336896    
 nal)                                                            
                                                                 
 global_max_pooling1d_2 (Glo  (None, 256)              0         
 balMaxPooling1D)                                                
                                                                 
 dropout_2 (Dropout)         (None, 256)               0         
                                                                 
 dense_3 (Dense)             (None, 64)                16448     
                                                                 
 dropout_3 (Dropout)         (None, 64)               



INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpsyvb0c0u\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpsyvb0c0u\model\data\model\assets


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50




INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpvxcfotlg\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\bassm\AppData\Local\Temp\tmpvxcfotlg\model\data\model\assets
