# Importation des packages

In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf

# Import des modules Keras et sklearn
from tensorflow.keras.layers import TextVectorization, LSTM, Dropout, Bidirectional, Dense, Embedding
from tensorflow.keras import Sequential
from tensorflow.keras.metrics import Precision, Recall
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# 1. Importation des données

Ajoutez un raccourci de ce dossier à votre Google Drive :

https://drive.google.com/drive/folders/1mx-CAzT10YKrmxHfYDP_1Oef7PVGUr7s?usp=sharing

In [None]:
# Mount Google Drive in the Colab runtime; force_remount=True is passed so
# the mount is requested again even when this cell is re-run.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

In [None]:
# Load the training CSV into a DataFrame.
# NOTE(review): this is an absolute local path, while the preceding cell
# mounts Google Drive under /content/drive — confirm which location is meant.
train_csv_path = '/Users/romain/Downloads/Classification/data_classification_commentaires_toxiques/train.csv'
df = pd.read_csv(train_csv_path)

# Quick look at the DataFrame, then at the text of the first row whose
# identity_hate column equals 1.
print(df)
first_identity_hate_row = df.loc[df.identity_hate == 1].iloc[0]
print(first_identity_hate_row.comment_text)

# 2. PRÉTRAITEMENT : VECTORISATION DU TEXTE

In [None]:
# Inputs are the raw comment strings; targets are every column from the
# third one onward, taken as a NumPy array.
x = df.comment_text
label_columns = df.columns[2:]
y = df[label_columns].values

# Vocabulary cap handed to TextVectorization as max_tokens.
MAX_FEATURES = 200000
vectorizer = TextVectorization(
    max_tokens=MAX_FEATURES,
    output_sequence_length=2000,
    output_mode='int',
)

# Fit the vectorizer on the raw comments, then report what it learned.
raw_comments = x.values
vectorizer.adapt(raw_comments)
vocabulary = vectorizer.get_vocabulary()
print("Taille du vocabulaire :", len(vocabulary))
sample_vector = vectorizer('Hello world, life is great')
print("Exemple de vectorisation :", sample_vector)

# Convert every comment into its integer sequence.
vectorized_text = vectorizer(raw_comments)

# 3. CRÉATION DU DATASET TENSORFLOW

In [None]:
# Build the input pipeline: cache, shuffle once, batch, prefetch.
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
# reshuffle_each_iteration=False is required here: the train/val/test splits
# below are carved out of this one shuffled dataset with take()/skip(). With
# the default (True) the order changes on every pass, so each iteration of
# `train`, `val` or `test` would select different samples and the evaluation
# splits would contain examples the model was trained on.
# If per-epoch reshuffling is wanted for multi-epoch training, shuffle
# `train` alone after the split.
dataset = dataset.shuffle(160000, reshuffle_each_iteration=False)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8)

# Pull one batch to check the shapes.
batch_x, batch_y = next(dataset.as_numpy_iterator())
print("Shape d'un batch (x):", batch_x.shape)
print("Shape d'un batch (y):", batch_y.shape)

# Split the batches: first 70% for training, next 20% for validation, and
# 10% starting at the 90% mark for testing.
total_batches = len(dataset)
train_batches = int(total_batches * 0.7)
val_batches = int(total_batches * 0.2)
train = dataset.take(train_batches)
val = dataset.skip(train_batches).take(val_batches)
test = dataset.skip(int(total_batches * 0.9)).take(int(total_batches * 0.1))

# 4. CONSTRUCTION ET ENTRAÎNEMENT DU MODÈLE

In [None]:
# Layer stack: embedding -> bidirectional LSTM -> three ReLU dense layers ->
# six sigmoid outputs.
model = Sequential([
    Embedding(MAX_FEATURES + 1, 32),
    Bidirectional(LSTM(32, activation='tanh')),
    Dense(128, activation='relu'),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    Dense(6, activation='sigmoid'),
])

model.summary()

# Binary cross-entropy with Adam; train for a single epoch and validate on `val`.
model.compile(optimizer='Adam', loss='BinaryCrossentropy', metrics=['accuracy'])
history = model.fit(train, validation_data=val, epochs=1)

# 5. PRÉDICTIONS ET ÉVALUATIONS INITIALES

In [None]:
# Score one hand-written comment, adding the batch axis in two different ways.
input_text = vectorizer('You freaking suck! I am going to kill you')
batch_from_list = np.array([input_text])
print("Prédiction (exemple 1) :", model.predict(batch_from_list))
batch_from_expand = np.expand_dims(input_text, 0)
print("Prédiction (exemple 2) :", model.predict(batch_from_expand))

# Compare thresholded predictions with the true labels on one test batch.
batch_x, batch_y = next(test.as_numpy_iterator())
batch_probabilities = model.predict(batch_x)
print("Prédictions sur batch test :", (batch_probabilities > 0.5).astype(int))
print("Labels réels :", batch_y)

# 6. CALCUL DES MÉTRIQUES AVEC KERAS

In [None]:
# Accumulate precision and recall over the test split with Keras metric objects.
pre = Precision()
re = Recall()
for batch_x, batch_y in test.as_numpy_iterator():
    yhat = model.predict(batch_x)
    # Flatten so that every (sample, label) pair is scored as one binary decision.
    pre.update_state(batch_y.flatten(), yhat.flatten())
    re.update_state(batch_y.flatten(), yhat.flatten())

precision = pre.result().numpy()
recall = re.result().numpy()
# Guard the harmonic mean: when precision and recall are both 0 the plain
# formula divides 0 by 0 and yields NaN; report 0.0 in that case instead.
pr_sum = precision + recall
f1score = (2 * precision * recall) / pr_sum if pr_sum > 0 else 0.0
print(f"Precision: {precision}\nRecall: {recall}\nF1-score: {f1score}")

# 7. CALCUL DES MÉTRIQUES AVEC SKLEARN

In [None]:
# Gather true labels and predicted probabilities in one pass over the test
# split, so that the two lists stay aligned batch by batch.
y_true = []
y_pred = []
for features, labels in test.as_numpy_iterator():
    probabilities = model.predict(features)
    y_true.append(labels)
    y_pred.append(probabilities)

# Stack the per-batch arrays row-wise into single arrays.
y_true = np.concatenate(y_true, axis=0)
y_pred = np.concatenate(y_pred, axis=0)
# Binarize the probabilities with a 0.5 threshold.
y_pred_bin = (y_pred > 0.5).astype(int)

# Scores for each label separately (average=None returns one value per label).
precision_per_label = precision_score(y_true, y_pred_bin, average=None)
recall_per_label = recall_score(y_true, y_pred_bin, average=None)
f1_per_label = f1_score(y_true, y_pred_bin, average=None)

print(f"Precision par label: {precision_per_label}")
print(f"Recall par label: {recall_per_label}")
print(f"F1-score par label: {f1_per_label}")

# Aggregate F1: macro and micro averages.
f1_macro = f1_score(y_true, y_pred_bin, average="macro")
f1_micro = f1_score(y_true, y_pred_bin, average="micro")
print(f"F1-score macro (moyenne): {f1_macro}")
print(f"F1-score micro (global): {f1_micro}")

# 8. SAUVEGARDE ET CHARGEMENT DU MODÈLE

In [None]:
# Write the trained model to an .h5 file, then load it back from that file.
model_path = 'Toxicity.h5'
model.save(model_path)
model = tf.keras.models.load_model(model_path)
print("Modèle chargé :", model)

# 9. FONCTION DE SCORING D'UN COMMENTAIRE

In [None]:
def score_comment(comment):
    """Return one "<label>: <bool>" line per label column for ``comment``.

    The comment is vectorized with the module-level ``vectorizer`` and scored
    by the module-level ``model``. Label names are read from
    ``df.columns[2:]``; each label is reported as ``True`` when its predicted
    value is above 0.5. Every line, including the last, ends with a newline.
    """
    results = model.predict(vectorizer([comment]))
    return "".join(
        f'{col}: {results[0][idx] > 0.5}\n'
        for idx, col in enumerate(df.columns[2:])
    )

# Example usage: score a sample comment and print the per-label report.
sample_report = score_comment("Fuck you")
print(sample_report)

# 10. ÉVALUATION COMPLÉMENTAIRE AVEC ACCURACY_SCORE

In [None]:
# Score one hand-written comment and print the label names for reference.
txt = "I hate you"
vec_txt = vectorizer(txt)
print("Prédiction pour 'I hate you' :", model.predict(np.expand_dims(vec_txt, 0)))
print("Labels :", df.columns[2:])

# Build the evaluation arrays: the hand-labelled comment above followed by
# every batch of the test split. Chunks are collected in lists and joined
# once; concatenating inside the loop would re-copy the growing arrays on
# every batch.
x_chunks = [np.expand_dims(vec_txt.numpy(), 0)]
# Label row assigned by hand to the "I hate you" comment.
y_chunks = [np.array([[1, 0, 0, 0, 0, 0]])]
for batch_x, batch_y in test.as_numpy_iterator():
    x_chunks.append(batch_x)
    y_chunks.append(batch_y)
x_test = np.concatenate(x_chunks)
y_test = np.concatenate(y_chunks)

# Element-wise accuracy over every (sample, label) pair, thresholded at 0.5.
yhat = model.predict(x_test)
yhat = (yhat > 0.5).astype(int)
print("Accuracy :", accuracy_score(np.array(y_test).flatten(), yhat.flatten()))