# Classification des Commentaires Toxiques (Simple RNN)

Ce notebook implémente un **modèle RNN simple** pour classer les commentaires comme **toxiques ou non toxiques**, en utilisant TensorFlow et Keras.

In [None]:

import pickle
import re
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout, Input
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
    

## 1. Chargement des données

In [None]:

# Load the training set from its absolute location under /mnt/data.
df = pd.read_csv('/mnt/data/train.csv')
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print(df.describe())
    

## 2. Préparation des labels binaires

In [None]:

# Binary target: 1 when the row-wise sum of every column from position 2
# onward is positive, 0 otherwise.
label_totals = df.iloc[:, 2:].sum(axis=1)
df['toxic_label'] = label_totals.gt(0).astype(int)
    

## 3. Prétraitement du texte

In [None]:

def clean_text(text):
    """Normalize a raw comment before tokenization.

    Lower-cases the text, deletes every character listed in
    ``string.punctuation`` (backslash included), deletes runs of digits and
    trims leading/trailing whitespace.

    Args:
        text: Raw comment. ``None`` and float NaN are treated as an empty
            comment; any other non-string value is converted with ``str``.

    Returns:
        The cleaned string.
    """
    if not isinstance(text, str):
        # Missing values would otherwise fail on .lower(): NaN is the only
        # float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)

    text = text.lower()
    # Delete punctuation character by character. Building a regex class from
    # string.punctuation is fragile: its "\]" sequence is parsed as an escaped
    # "]", so the backslash itself is never removed.
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = re.sub(r"\d+", "", text)
    return text.strip()

# Clean every comment element-wise; the column is overwritten with the
# cleaned text.
df['comment_text'] = df['comment_text'].map(clean_text)
    

## 4. Tokenisation et conversion en séquences

In [None]:

# Tokenizer configured with a 20000-word limit and "<OOV>" as the
# out-of-vocabulary token, fitted on the cleaned comments.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=20000)
tokenizer.fit_on_texts(df['comment_text'])

# Integer-encode each comment, then pad the sequences with maxlen=100 so the
# model receives fixed-width rows.
sequences = tokenizer.texts_to_sequences(df['comment_text'])
X = pad_sequences(sequences=sequences, maxlen=100)
y = df['toxic_label'].to_numpy()
    

## 5. Séparation des données

In [None]:

# Hold out 20% of the rows for evaluation. Stratifying on y keeps the
# toxic / non-toxic ratio the same in both splits, so the held-out metrics
# are not skewed by an unlucky draw of the positive class.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
    

## 6. Définition du modèle Simple RNN

In [None]:

# Architecture: token ids -> 50-d embeddings -> SimpleRNN(32) -> Dense(16)
# -> single sigmoid unit.
input_layer = Input(shape=(100,))
# The sequence length is already fixed by the Input layer above, so the
# redundant `input_length` argument (deprecated in recent Keras releases)
# is not passed.
embedding = Embedding(input_dim=20000, output_dim=50)(input_layer)
rnn = SimpleRNN(32, activation='relu')(embedding)
dense = Dense(16, activation='relu')(rnn)
output_layer = Dense(1, activation='sigmoid')(dense)

model = Model(inputs=input_layer, outputs=output_layer)

# Compile the model: binary cross-entropy matches the single sigmoid output.
model.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    metrics=['accuracy']
)

model.summary()
    

## 7. Entraînement du modèle

In [None]:

# Train for 3 epochs with batches of 32. The held-out split is passed as
# validation data, i.e. the same rows that the evaluation cell scores later.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=3,
    validation_data=(X_test, y_test),
)
    

## 8. Sauvegarde du modèle et du tokenizer

In [None]:

# Save the trained model.
model.save("/mnt/data/model_toxic_comment_rnn.h5")

# Save the fitted tokenizer next to the model (pickle is imported with the
# other modules in the import cell at the top of the notebook).
# NOTE: only unpickle this file from a trusted location; loading a pickle
# can execute arbitrary code.
with open("/mnt/data/tokenizer.pkl", "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)
    

## 9. Évaluation du modèle

In [None]:

# Threshold the model's sigmoid outputs at 0.5 to obtain hard 0/1
# predictions, then print the classification report for the held-out split.
predicted_probabilities = model.predict(X_test)
y_pred = (predicted_probabilities > 0.5).astype(int)
print(classification_report(y_test, y_pred))
    

## 10. Pipeline de prédiction

In [None]:

def predict_comment(text, threshold=0.5):
    """Classify a single raw comment as toxic or not.

    The comment is passed through ``clean_text``, encoded with the fitted
    ``tokenizer``, padded with ``maxlen=100`` and scored by ``model``.

    Args:
        text: Raw comment string.
        threshold: Score above which the comment is labelled toxic.
            Defaults to 0.5, the cut-off that was previously hard-coded.

    Returns:
        ``"Toxique"`` if the model score is strictly greater than
        ``threshold``, otherwise ``"Non Toxique"``.
    """
    cleaned = clean_text(text)
    encoded = tokenizer.texts_to_sequences([cleaned])
    padded = pad_sequences(encoded, maxlen=100)
    # A single comment was submitted, so the score is the first value of the
    # first row of the prediction output.
    score = model.predict(padded)[0][0]
    return "Toxique" if score > threshold else "Non Toxique"

# Example call on a single hand-written comment.
sample_comment = "This is a bad comment!"
print(predict_comment(sample_comment))
    