# 1. Importation des Bibliothèques

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from transformers import BertTokenizer, TFBertModel
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, LSTM, Conv1D, GlobalMaxPooling1D, Dropout
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import mlflow
import mlflow.keras





# 2. Chargement des Données

In [4]:
# Read the dataset from disk (the cleaned text columns are assumed to already be present)
data = pd.read_csv(filepath_or_buffer='../data/database_p7_rework.csv')

# Pull the label column out of the frame, separately from the text columns
y = data.loc[:, 'target']


# 3. Préparation des Données
Tokenisation avec BERT

In [None]:
bert_model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
# Pre-trained encoder used only as a frozen feature extractor: the CNN/LSTM models
# are built for (128, 768) inputs, i.e. one 768-d BERT vector per token position,
# not for the raw integer token ids produced by the tokenizer.
bert_encoder = TFBertModel.from_pretrained(bert_model_name)


def encode_with_bert(texts, max_len=128):
    """Tokenize a collection of texts with the BERT tokenizer.

    Args:
        texts: Series (or other sequence) of strings. Missing values are
            treated as empty strings so the tokenizer does not raise on NaN.
        max_len: Length every sequence is truncated/padded to.

    Returns:
        The tokenizer output (``input_ids``, ``attention_mask``, ...) as
        TensorFlow tensors, each with ``max_len`` positions per text.
    """
    cleaned = pd.Series(texts).fillna("").astype(str).tolist()
    return tokenizer(cleaned, max_length=max_len, truncation=True, padding='max_length', return_tensors='tf')


def embed_with_bert(texts, max_len=128, batch_size=64):
    """Compute per-token BERT embeddings for a collection of texts.

    The texts are tokenized and pushed through the frozen encoder in batches
    of ``batch_size`` so that the whole corpus never has to fit in a single
    forward pass.

    Args:
        texts: Series (or other sequence) of strings.
        max_len: Number of token positions kept per text.
        batch_size: Number of texts encoded per forward pass.

    Returns:
        NumPy float array of shape ``(len(texts), max_len, hidden_size)``
        taken from the encoder's ``last_hidden_state``.
    """
    texts = pd.Series(texts)
    chunks = []
    for start in range(0, len(texts), batch_size):
        encoded = encode_with_bert(texts.iloc[start:start + batch_size], max_len=max_len)
        outputs = bert_encoder(
            input_ids=encoded['input_ids'],
            attention_mask=encoded['attention_mask'],
            training=False,
        )
        # Move each batch to NumPy right away: scikit-learn cannot index tf tensors
        chunks.append(outputs.last_hidden_state.numpy())
    if not chunks:
        return np.empty((0, max_len, bert_encoder.config.hidden_size), dtype=np.float32)
    return np.concatenate(chunks, axis=0)


# Embed the lemmatized and stemmed texts with BERT.
# NOTE: each sample is max_len x 768 float32 values (~384 KiB with the defaults);
# on a large corpus, subsample or cache the arrays on disk before running this.
X_lemma_bert = embed_with_bert(data['text_lemmatized'])
X_stem_bert = embed_with_bert(data['text_stemmed'])

# Split into train and test sets. A single call guarantees that both feature
# sets and the labels are cut on exactly the same rows.
(X_train_lemma_bert, X_test_lemma_bert,
 X_train_stem_bert, X_test_stem_bert,
 y_train, y_test) = train_test_split(X_lemma_bert, X_stem_bert, y, test_size=0.2, random_state=42)




# 4. Construction des Modèles
Modèle CNN

In [None]:
def create_cnn_model_bert(input_shape):
    """Build and compile a 1D-convolutional binary classifier.

    Args:
        input_shape: Shape of a single sample, forwarded to the first
            ``Conv1D`` layer.

    Returns:
        A compiled ``Sequential`` model: Conv1D -> global max pooling ->
        Dense(128) -> Dropout(0.2) -> single sigmoid unit, trained with
        binary cross-entropy and Adam, reporting accuracy.
    """
    stack = [
        Conv1D(128, kernel_size=3, activation='relu', input_shape=input_shape),
        GlobalMaxPooling1D(),
        Dense(128, activation='relu'),
        Dropout(0.2),
        Dense(1, activation='sigmoid'),
    ]
    classifier = Sequential(stack)
    classifier.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return classifier


Modèle LSTM

In [None]:
def create_lstm_model_bert(input_shape):
    """Build and compile an LSTM-based binary classifier.

    Args:
        input_shape: Shape of a single sample, forwarded to the ``LSTM`` layer.

    Returns:
        A compiled ``Sequential`` model: LSTM(128) with input and recurrent
        dropout of 0.2, followed by a single sigmoid unit, trained with
        binary cross-entropy and Adam, reporting accuracy.
    """
    stack = [
        LSTM(128, input_shape=input_shape, dropout=0.2, recurrent_dropout=0.2),
        Dense(1, activation='sigmoid'),
    ]
    classifier = Sequential(stack)
    classifier.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return classifier


# 5. Entraînement des Modèles
5.1 Entraînement CNN avec BERT + Lemmatization

In [None]:
# Close any MLflow run that is still active before opening a new one
mlflow.end_run()

with mlflow.start_run():
    # Record which preprocessing / architecture combination this run covers
    mlflow.log_param(key="preprocessing", value="Lemmatization")
    mlflow.log_param(key="model_type", value="CNN")

    # Build the CNN for (128, 768) samples and train it on the lemmatized training split
    cnn_model_lemma_bert = create_cnn_model_bert((128, 768))
    cnn_model_lemma_bert.fit(
        X_train_lemma_bert,
        y_train,
        batch_size=32,
        epochs=5,
        validation_split=0.2,
        verbose=1,
    )

    # Attach the trained model to the run
    mlflow.keras.log_model(cnn_model_lemma_bert, "cnn_model_lemma_bert")


5.2 Entraînement LSTM avec BERT + Lemmatization

In [None]:
# Close any MLflow run that is still active before opening a new one
mlflow.end_run()

with mlflow.start_run():
    # Record which preprocessing / architecture combination this run covers
    mlflow.log_param(key="preprocessing", value="Lemmatization")
    mlflow.log_param(key="model_type", value="LSTM")

    # Build the LSTM for (128, 768) samples and train it on the lemmatized training split
    lstm_model_lemma_bert = create_lstm_model_bert((128, 768))
    lstm_model_lemma_bert.fit(
        X_train_lemma_bert,
        y_train,
        batch_size=32,
        epochs=5,
        validation_split=0.2,
        verbose=1,
    )

    # Attach the trained model to the run
    mlflow.keras.log_model(lstm_model_lemma_bert, "lstm_model_lemma_bert")


5.3 Entraînement CNN avec BERT + Stemming

In [None]:
# Close any MLflow run that is still active before opening a new one
mlflow.end_run()

with mlflow.start_run():
    # Record which preprocessing / architecture combination this run covers
    mlflow.log_param(key="preprocessing", value="Stemming")
    mlflow.log_param(key="model_type", value="CNN")

    # Build the CNN for (128, 768) samples and train it on the stemmed training split
    cnn_model_stem_bert = create_cnn_model_bert((128, 768))
    cnn_model_stem_bert.fit(
        X_train_stem_bert,
        y_train,
        batch_size=32,
        epochs=5,
        validation_split=0.2,
        verbose=1,
    )

    # Attach the trained model to the run
    mlflow.keras.log_model(cnn_model_stem_bert, "cnn_model_stem_bert")


5.4 Entraînement LSTM avec BERT + Stemming

In [None]:
# Close any MLflow run that is still active before opening a new one
mlflow.end_run()

with mlflow.start_run():
    # Record which preprocessing / architecture combination this run covers
    mlflow.log_param(key="preprocessing", value="Stemming")
    mlflow.log_param(key="model_type", value="LSTM")

    # Build the LSTM for (128, 768) samples and train it on the stemmed training split
    lstm_model_stem_bert = create_lstm_model_bert((128, 768))
    lstm_model_stem_bert.fit(
        X_train_stem_bert,
        y_train,
        batch_size=32,
        epochs=5,
        validation_split=0.2,
        verbose=1,
    )

    # Attach the trained model to the run
    mlflow.keras.log_model(lstm_model_stem_bert, "lstm_model_stem_bert")


# 6. Évaluation des Modèles
6.1 Évaluation du Modèle CNN avec BERT + Lemmatization

In [None]:
# Predictions for CNN + Lemmatization: threshold the sigmoid outputs at 0.5
y_pred_cnn_lemma = (cnn_model_lemma_bert.predict(X_test_lemma_bert) > 0.5).astype("int32")

# Accuracy on the held-out test split
accuracy_cnn_lemma = accuracy_score(y_test, y_pred_cnn_lemma)
print("Accuracy CNN + Lemmatization:", accuracy_cnn_lemma)

# Per-class precision / recall / F1
print("Classification Report CNN + Lemmatization:")
print(classification_report(y_test, y_pred_cnn_lemma))

# Confusion matrix. confusion_matrix, ConfusionMatrixDisplay and plt all come from
# the notebook's import cell, so this cell no longer imports anything itself.
cm_cnn_lemma = confusion_matrix(y_test, y_pred_cnn_lemma)
disp = ConfusionMatrixDisplay(confusion_matrix=cm_cnn_lemma)
disp.plot(cmap="Blues")
# Title the matrix axes explicitly instead of relying on pyplot's "current axes"
disp.ax_.set_title("Confusion Matrix CNN + Lemmatization")
plt.show()


6.2 Évaluation du Modèle LSTM avec BERT + Lemmatization

In [None]:
# Predictions for LSTM + Lemmatization: threshold the sigmoid outputs at 0.5
y_pred_lstm_lemma = (lstm_model_lemma_bert.predict(X_test_lemma_bert) > 0.5).astype("int32")

# Accuracy on the held-out test split
accuracy_lstm_lemma = accuracy_score(y_test, y_pred_lstm_lemma)
print("Accuracy LSTM + Lemmatization:", accuracy_lstm_lemma)

# Per-class precision / recall / F1
print("Classification Report LSTM + Lemmatization:")
print(classification_report(y_test, y_pred_lstm_lemma))

# Confusion matrix. confusion_matrix, ConfusionMatrixDisplay and plt come from the
# notebook's import cell, so this cell does not depend on another evaluation cell.
cm_lstm_lemma = confusion_matrix(y_test, y_pred_lstm_lemma)
disp = ConfusionMatrixDisplay(confusion_matrix=cm_lstm_lemma)
disp.plot(cmap="Blues")
# Title the matrix axes explicitly instead of relying on pyplot's "current axes"
disp.ax_.set_title("Confusion Matrix LSTM + Lemmatization")
plt.show()


6.3 Évaluation du Modèle CNN avec BERT + Stemming

In [None]:
# Predictions for CNN + Stemming: threshold the sigmoid outputs at 0.5
y_pred_cnn_stem = (cnn_model_stem_bert.predict(X_test_stem_bert) > 0.5).astype("int32")

# Accuracy on the held-out test split
accuracy_cnn_stem = accuracy_score(y_test, y_pred_cnn_stem)
print("Accuracy CNN + Stemming:", accuracy_cnn_stem)

# Per-class precision / recall / F1
print("Classification Report CNN + Stemming:")
print(classification_report(y_test, y_pred_cnn_stem))

# Confusion matrix. confusion_matrix, ConfusionMatrixDisplay and plt come from the
# notebook's import cell, so this cell does not depend on another evaluation cell.
cm_cnn_stem = confusion_matrix(y_test, y_pred_cnn_stem)
disp = ConfusionMatrixDisplay(confusion_matrix=cm_cnn_stem)
disp.plot(cmap="Blues")
# Title the matrix axes explicitly instead of relying on pyplot's "current axes"
disp.ax_.set_title("Confusion Matrix CNN + Stemming")
plt.show()


6.4 Évaluation du Modèle LSTM avec BERT + Stemming

In [None]:
# Predictions for LSTM + Stemming: threshold the sigmoid outputs at 0.5
y_pred_lstm_stem = (lstm_model_stem_bert.predict(X_test_stem_bert) > 0.5).astype("int32")

# Accuracy on the held-out test split
accuracy_lstm_stem = accuracy_score(y_test, y_pred_lstm_stem)
print("Accuracy LSTM + Stemming:", accuracy_lstm_stem)

# Per-class precision / recall / F1
print("Classification Report LSTM + Stemming:")
print(classification_report(y_test, y_pred_lstm_stem))

# Confusion matrix. confusion_matrix, ConfusionMatrixDisplay and plt come from the
# notebook's import cell, so this cell does not depend on another evaluation cell.
cm_lstm_stem = confusion_matrix(y_test, y_pred_lstm_stem)
disp = ConfusionMatrixDisplay(confusion_matrix=cm_lstm_stem)
disp.plot(cmap="Blues")
# Title the matrix axes explicitly instead of relying on pyplot's "current axes"
disp.ax_.set_title("Confusion Matrix LSTM + Stemming")
plt.show()
