# Importation des packages

In [33]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
#import tensorflow_addons as tfa
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, GlobalMaxPool1D, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to C:\Users\Samy
[nltk_data]     Bouhelassa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Samy
[nltk_data]     Bouhelassa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Samy
[nltk_data]     Bouhelassa\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

# Importation des données

Ajoutez un raccourci de ce dossier à votre google drive :

https://drive.google.com/drive/folders/1mx-CAzT10YKrmxHfYDP_1Oef7PVGUr7s?usp=sharing

In [26]:
data = pd.read_csv('./data/train.csv')
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [40]:
# Lecture des données
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

# On ne garde que les 5000 premiers exemples (pour les tests)
train = train[:15000]

# On supprime la colonne 'id' qui ne nous sert pas
train = train.drop('id', axis=1)

# Nombre de commentaires possédant un nombre de catégories
labels = train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values
labels = np.sum(labels, axis=1)

# Etude du jeu de données

In [28]:
# Your Code

# On affiche le nombre de commentaires de chaque catégorie
train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].sum()

toxic            1439
severe_toxic      158
obscene           789
threat             50
insult            749
identity_hate     133
dtype: int64

In [29]:
# Affichage d'un commentaire aléatoire pour chaque catégorie

for column in ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']:
    sample = data[data[column] == 1]['comment_text'].sample(1)
    print(f"Commentaire pour la colonne {column} :\n\"{sample}\"\n")

Commentaire pour la colonne toxic :
"155487    know that Salivo sucks
Name: comment_text, dtype: object"

Commentaire pour la colonne severe_toxic :
"82009    you're a dirty faggot \n\nlick my hairy nut sa...
Name: comment_text, dtype: object"

Commentaire pour la colonne obscene :
"144671    Skins edits \n\nI see you went a bit mental ov...
Name: comment_text, dtype: object"

Commentaire pour la colonne threat :
"155102    , death to vandalist of this Maratha sport
Name: comment_text, dtype: object"

Commentaire pour la colonne insult :
"88605    I hate Season 4. If there's one thing I hate m...
Name: comment_text, dtype: object"

Commentaire pour la colonne identity_hate :
"89577    Both of these two people are Gay kids and need...
Name: comment_text, dtype: object"



# Préparation des données

In [30]:
# On met les commentaires en minuscules
train['comment_text'] = train['comment_text'].str.lower()

# Affichage des données
train.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,explanation\nwhy the edits made under my usern...,0,0,0,0,0,0
1,d'aww! he matches this background colour i'm s...,0,0,0,0,0,0
2,"hey man, i'm really not trying to edit war. it...",0,0,0,0,0,0
3,"""\nmore\ni can't make any real suggestions on ...",0,0,0,0,0,0
4,"you, sir, are my hero. any chance you remember...",0,0,0,0,0,0


In [31]:
# On supprime les caractères spéciaux et on sépare les mots avec nltk
tokenizer = nltk.RegexpTokenizer(r'\w+')
train['comment_text'] = data['comment_text'].apply(tokenizer.tokenize)

train.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,"[Explanation, Why, the, edits, made, under, my...",0,0,0,0,0,0
1,"[D, aww, He, matches, this, background, colour...",0,0,0,0,0,0
2,"[Hey, man, I, m, really, not, trying, to, edit...",0,0,0,0,0,0
3,"[More, I, can, t, make, any, real, suggestions...",0,0,0,0,0,0
4,"[You, sir, are, my, hero, Any, chance, you, re...",0,0,0,0,0,0


In [34]:
stopwords_list = set(stopwords.words('english'))

# Fonction pour supprimer les stopwords
def remove_stopwords(word_list):
    filtered_words = [word for word in word_list if word.lower() not in stopwords_list]
    return filtered_words

# Appliquer la fonction sur la colonne 'comment_text'
train['comment_text'] = train['comment_text'].apply(remove_stopwords)

train.head()


Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,"[Explanation, edits, made, username, Hardcore,...",0,0,0,0,0,0
1,"[aww, matches, background, colour, seemingly, ...",0,0,0,0,0,0
2,"[Hey, man, really, trying, edit, war, guy, con...",0,0,0,0,0,0
3,"[make, real, suggestions, improvement, wondere...",0,0,0,0,0,0
4,"[sir, hero, chance, remember, page]",0,0,0,0,0,0


In [36]:
# Initialiser le lemmatizer
lemmatizer = WordNetLemmatizer()

# Fonction pour lemmatiser les mots
def lemmatize_words(word_list):
    lemmatized_words = [lemmatizer.lemmatize(word.lower()) for word in word_list]
    return lemmatized_words

# Appliquer la fonction sur la colonne 'comment_text'
train['comment_text'] = train['comment_text'].apply(lemmatize_words)

# Afficher les premières lignes du DataFrame après la lemmatisation
train.head()


Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,"[explanation, edits, made, username, hardcore,...",0,0,0,0,0,0
1,"[aww, match, background, colour, seemingly, st...",0,0,0,0,0,0
2,"[hey, man, really, trying, edit, war, guy, con...",0,0,0,0,0,0
3,"[make, real, suggestion, improvement, wondered...",0,0,0,0,0,0
4,"[sir, hero, chance, remember, page]",0,0,0,0,0,0


In [38]:
# Enfin, on reconstitue les phrases
sentences = list(map(lambda x : " ".join(x), train['comment_text']))

# Affichage des données
sentences[:3]

['explanation edits made username hardcore metallica fan reverted vandalism closure gas voted new york doll fac please remove template talk page since retired 89 205 38 27',
 'aww match background colour seemingly stuck thanks talk 21 51 january 11 2016 utc',
 'hey man really trying edit war guy constantly removing relevant information talking edits instead talk page seems care formatting actual info']

# Entraînement du modèle baseline

In [41]:
# Pour le premier modèle, on va utiliser un Random Forest Classifier
# Avant cela, on va vectoriser les phrases avec TF-IDF
vectorizer = TfidfVectorizer()

# On fit le vectorizer sur les phrases
vectorizer.fit(sentences)

# On transforme les phrases en vecteurs
vectors = vectorizer.transform(sentences)

# Séparation des données en train et test
train_vectors, test_vectors, train_labels, test_labels = train_test_split(vectors, labels, test_size=0.1, random_state=0)

# Affichage des dimensions des données
print(f"Train sequences shape : {train_vectors.shape}")
print(f"Test sequences shape : {test_vectors.shape}")
print(f"Train labels shape : {train_labels.shape}")
print(f"Test labels shape : {test_labels.shape}")

Train sequences shape : (13500, 42589)
Test sequences shape : (1500, 42589)
Train labels shape : (13500,)
Test labels shape : (1500,)


In [42]:
# Maintenant, on peut créer le modèle
model = RandomForestClassifier(n_estimators=15,random_state=0)

# Entrainement du modèle
model.fit(train_vectors, train_labels)

# Prédiction sur les données de test
predictions = model.predict(test_vectors)

# Affichage de l'accuracy
print(f"Accuracy : {accuracy_score(test_labels, predictions)}")

Accuracy : 0.9133333333333333


# Itération de la modélisation 

In [None]:
# Your Code 