# Importation des packages

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import re
import os

import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from joblib import dump

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense, Dropout

# Importation des données

Ajoutez un raccourci de ce dossier à votre google drive :

https://drive.google.com/drive/folders/1mx-CAzT10YKrmxHfYDP_1Oef7PVGUr7s?usp=sharing

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

KeyboardInterrupt: 

In [None]:
data_train = pd.read_csv('/content/drive/MyDrive/data_classification_commentaires_toxiques/train.csv')
data_test = pd.read_csv('/content/drive/MyDrive/data_classification_commentaires_toxiques/test.csv')
data_test_labels = pd.read_csv('/content/drive/MyDrive/data_classification_commentaires_toxiques/test_labels.csv')
data_test = pd.merge(data_test, data_test_labels, on = ['id'])

# Etude du jeu de données

## Jeu de données d'entrainements


In [None]:
data_train[20:30]

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
20,000b08c464718505,"""\n\n Regarding your recent edits \n\nOnce aga...",0,0,0,0,0,0
21,000bfd0867774845,"""\nGood to know. About me, yeah, I'm studying ...",0,0,0,0,0,0
22,000c0dfd995809fa,"""\n\n Snowflakes are NOT always symmetrical! \...",0,0,0,0,0,0
23,000c6a3f0cd3ba8e,"""\n\n The Signpost: 24 September 2012 \n\n Rea...",0,0,0,0,0,0
24,000cfee90f50d471,"""\n\nRe-considering 1st paragraph edit?\nI don...",0,0,0,0,0,0
25,000eefc67a2c930f,Radial symmetry \n\nSeveral now extinct lineag...,0,0,0,0,0,0
26,000f35deef84dc4a,There's no need to apologize. A Wikipedia arti...,0,0,0,0,0,0
27,000ffab30195c5e1,"Yes, because the mother of the child in the ca...",0,0,0,0,0,0
28,0010307a3a50a353,"""\nOk. But it will take a bit of work but I ca...",0,0,0,0,0,0
29,0010833a96e1f886,"""== A barnstar for you! ==\n\n The Real Life ...",0,0,0,0,0,0


In [None]:
number_of_lines = data_train['toxic'].count()
print('Nombre de données totale : ', number_of_lines)

Nombre de donnée totale :  159571


In [None]:
print( "Proportion de données toxique (en nombre) :\n",
      data_train.drop(columns=['id', 'comment_text']).sum()
)

Proportion de données toxique (en nombre) :
 toxic            15294
severe_toxic      1595
obscene           8449
threat             478
insult            7877
identity_hate     1405
dtype: int64


In [None]:
print( "Proportion de données (en pourcentage) :\n",
      (data_train.drop(columns=['id', 'comment_text']).sum() / number_of_lines * 100 ).round(3)
)

Proportion de données (en pourcentage) :
 toxic            9.584
severe_toxic     1.000
obscene          5.295
threat           0.300
insult           4.936
identity_hate    0.880
dtype: float64


## Jeu de données de tests

In [None]:
data_test[20:30]

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
20,00084da5d4ead7aa,==Indefinitely blocked== \n I have indefinitel...,-1,-1,-1,-1,-1,-1
21,00091c35fa9d0465,"== Arabs are committing genocide in Iraq, but ...",1,0,0,0,0,0
22,000968ce11f5ee34,Please stop. If you continue to vandalize Wiki...,0,0,0,0,0,0
23,0009734200a85047,== Energy == \n\n I have edited the introduct...,0,0,0,0,0,0
24,00097b6214686db5,":yeah, thanks for reviving the tradition of pi...",-1,-1,-1,-1,-1,-1
25,0009aef4bd9e1697,"MLM Software,NBFC software,Non Banking Financi...",-1,-1,-1,-1,-1,-1
26,000a02d807ae0254,"@RedSlash, cut it short. If you have sources s...",0,0,0,0,0,0
27,000a6c6d4e89b9bc,==================== \n Deception is the way o...,-1,-1,-1,-1,-1,-1
28,000bafe2080bba82,. \n\n Jews are not a race because y...,0,0,0,0,0,0
29,000bf0a9894b2807,:::If Ollie or others think that one list of t...,0,0,0,0,0,0


On remarque que plusieurs commentaires ont des -1, cela signifie ques ces commentaires ne sont pas utilisés. Nous allons donc les supprimer du jeu de données tests.

In [None]:
number_of_lines = data_test['toxic'].count()
print('Nombre de données totale : ', number_of_lines)

Nombre de données totale :  153164


In [None]:
number_of_lines = data_test[data_test['toxic'] >= 0]['toxic'].count()
print('Nombre de données totales exploitables (sans les -1) : ', number_of_lines)

Nombre de données totales exploitables (sans les -1) :  63978


In [None]:
print( "Proportion de données toxique(en nombre) :\n",
      data_test.drop(columns=['id', 'comment_text'])[data_test['toxic'] >= 0].sum()
)

Proportion de données toxique(en nombre) :
 toxic            6090
severe_toxic      367
obscene          3691
threat            211
insult           3427
identity_hate     712
dtype: int64


In [None]:
print( "Proportion de données toxique (en pourcentage) :\n",
      (data_test.drop(columns=['id', 'comment_text'])[data_test['toxic'] >= 0].sum() / number_of_lines * 100 ).round(3)
)

Proportion de données toxique (en pourcentage) :
 toxic            3.976
severe_toxic     0.240
obscene          2.410
threat           0.138
insult           2.237
identity_hate    0.465
dtype: float64


# Préparation des données

## Préparation des Fonctions pour nettoyer les données

In [None]:
# Télécharger les ressources nécessaires de nltk
nltk.download('punkt')
nltk.download('stopwords')

# Préparer le tokenizer et les stopwords
tokenizer = nltk.tokenize.TweetTokenizer()
stop_words = set(stopwords.words('english'))
corpus = []

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# @title Liste clean tweet

def clean(tweet):

    # Contractions
    tweet = re.sub(r"he's", "he is", tweet)
    tweet = re.sub(r"there's", "there is", tweet)
    tweet = re.sub(r"We're", "We are", tweet)
    tweet = re.sub(r"That's", "That is", tweet)
    tweet = re.sub(r"won't", "will not", tweet)
    tweet = re.sub(r"they're", "they are", tweet)
    tweet = re.sub(r"Can't", "Cannot", tweet)
    tweet = re.sub(r"wasn't", "was not", tweet)
    tweet = re.sub(r"don\x89Ûªt", "do not", tweet)
    tweet = re.sub(r"aren't", "are not", tweet)
    tweet = re.sub(r"isn't", "is not", tweet)
    tweet = re.sub(r"What's", "What is", tweet)
    tweet = re.sub(r"haven't", "have not", tweet)
    tweet = re.sub(r"hasn't", "has not", tweet)
    tweet = re.sub(r"There's", "There is", tweet)
    tweet = re.sub(r"He's", "He is", tweet)
    tweet = re.sub(r"It's", "It is", tweet)
    tweet = re.sub(r"You're", "You are", tweet)
    tweet = re.sub(r"I'M", "I am", tweet)
    tweet = re.sub(r"shouldn't", "should not", tweet)
    tweet = re.sub(r"wouldn't", "would not", tweet)
    tweet = re.sub(r"i'm", "I am", tweet)
    tweet = re.sub(r"I\x89Ûªm", "I am", tweet)
    tweet = re.sub(r"I'm", "I am", tweet)
    tweet = re.sub(r"Isn't", "is not", tweet)
    tweet = re.sub(r"Here's", "Here is", tweet)
    tweet = re.sub(r"you've", "you have", tweet)
    tweet = re.sub(r"you\x89Ûªve", "you have", tweet)
    tweet = re.sub(r"we're", "we are", tweet)
    tweet = re.sub(r"what's", "what is", tweet)
    tweet = re.sub(r"couldn't", "could not", tweet)
    tweet = re.sub(r"we've", "we have", tweet)
    tweet = re.sub(r"it\x89Ûªs", "it is", tweet)
    tweet = re.sub(r"doesn\x89Ûªt", "does not", tweet)
    tweet = re.sub(r"It\x89Ûªs", "It is", tweet)
    tweet = re.sub(r"Here\x89Ûªs", "Here is", tweet)
    tweet = re.sub(r"who's", "who is", tweet)
    tweet = re.sub(r"I\x89Ûªve", "I have", tweet)
    tweet = re.sub(r"y'all", "you all", tweet)
    tweet = re.sub(r"can\x89Ûªt", "cannot", tweet)
    tweet = re.sub(r"would've", "would have", tweet)
    tweet = re.sub(r"it'll", "it will", tweet)
    tweet = re.sub(r"we'll", "we will", tweet)
    tweet = re.sub(r"wouldn\x89Ûªt", "would not", tweet)
    tweet = re.sub(r"We've", "We have", tweet)
    tweet = re.sub(r"he'll", "he will", tweet)
    tweet = re.sub(r"Y'all", "You all", tweet)
    tweet = re.sub(r"Weren't", "Were not", tweet)
    tweet = re.sub(r"Didn't", "Did not", tweet)
    tweet = re.sub(r"they'll", "they will", tweet)
    tweet = re.sub(r"they'd", "they would", tweet)
    tweet = re.sub(r"DON'T", "DO NOT", tweet)
    tweet = re.sub(r"That\x89Ûªs", "That is", tweet)
    tweet = re.sub(r"they've", "they have", tweet)
    tweet = re.sub(r"i'd", "I would", tweet)
    tweet = re.sub(r"should've", "should have", tweet)
    tweet = re.sub(r"You\x89Ûªre", "You are", tweet)
    tweet = re.sub(r"where's", "where is", tweet)
    tweet = re.sub(r"Don\x89Ûªt", "Do not", tweet)
    tweet = re.sub(r"we'd", "we would", tweet)
    tweet = re.sub(r"i'll", "I will", tweet)
    tweet = re.sub(r"weren't", "were not", tweet)
    tweet = re.sub(r"They're", "They are", tweet)
    tweet = re.sub(r"Can\x89Ûªt", "Cannot", tweet)
    tweet = re.sub(r"you\x89Ûªll", "you will", tweet)
    tweet = re.sub(r"I\x89Ûªd", "I would", tweet)
    tweet = re.sub(r"let's", "let us", tweet)
    tweet = re.sub(r"it's", "it is", tweet)
    tweet = re.sub(r"can't", "cannot", tweet)
    tweet = re.sub(r"don't", "do not", tweet)
    tweet = re.sub(r"you're", "you are", tweet)
    tweet = re.sub(r"i've", "I have", tweet)
    tweet = re.sub(r"that's", "that is", tweet)
    tweet = re.sub(r"i'll", "I will", tweet)
    tweet = re.sub(r"doesn't", "does not", tweet)
    tweet = re.sub(r"i'd", "I would", tweet)
    tweet = re.sub(r"didn't", "did not", tweet)
    tweet = re.sub(r"ain't", "am not", tweet)
    tweet = re.sub(r"you'll", "you will", tweet)
    tweet = re.sub(r"I've", "I have", tweet)
    tweet = re.sub(r"Don't", "do not", tweet)
    tweet = re.sub(r"I'll", "I will", tweet)
    tweet = re.sub(r"I'd", "I would", tweet)
    tweet = re.sub(r"Let's", "Let us", tweet)
    tweet = re.sub(r"you'd", "You would", tweet)
    tweet = re.sub(r"It's", "It is", tweet)
    tweet = re.sub(r"Ain't", "am not", tweet)
    tweet = re.sub(r"Haven't", "Have not", tweet)
    tweet = re.sub(r"Could've", "Could have", tweet)
    tweet = re.sub(r"youve", "you have", tweet)
    tweet = re.sub(r"donå«t", "do not", tweet)

    tweet = re.sub(r"some1", "someone", tweet)
    tweet = re.sub(r"yrs", "years", tweet)
    tweet = re.sub(r"hrs", "hours", tweet)
    tweet = re.sub(r"2morow|2moro", "tomorrow", tweet)
    tweet = re.sub(r"2day", "today", tweet)
    tweet = re.sub(r"4got|4gotten", "forget", tweet)
    tweet = re.sub(r"b-day|bday", "b-day", tweet)
    tweet = re.sub(r"mother's", "mother", tweet)
    tweet = re.sub(r"mom's", "mom", tweet)
    tweet = re.sub(r"dad's", "dad", tweet)
    tweet = re.sub(r"hahah|hahaha|hahahaha", "haha", tweet)
    tweet = re.sub(r"lmao|lolz|rofl", "lol", tweet)
    tweet = re.sub(r"thanx|thnx", "thanks", tweet)
    tweet = re.sub(r"goood", "good", tweet)
    tweet = re.sub(r"some1", "someone", tweet)
    tweet = re.sub(r"some1", "someone", tweet)
    # Character entity references
    tweet = re.sub(r"&gt;", ">", tweet)
    tweet = re.sub(r"&lt;", "<", tweet)
    tweet = re.sub(r"&amp;", "&", tweet)
    # Typos, slang and informal abbreviations
    tweet = re.sub(r"w/e", "whatever", tweet)
    tweet = re.sub(r"w/", "with", tweet)
    tweet = re.sub(r"<3", "love", tweet)
    # Urls
    tweet = re.sub(r"http\S+", "", tweet)
    # Numbers
    tweet = re.sub(r'[0-9]', '', tweet)
    # Eliminating the mentions
    tweet = re.sub("(@[A-Za-z0-9_]+)","", tweet)
    # Remove punctuation and special chars (keep '!')
    for p in string.punctuation.replace('!', ''):
        tweet = tweet.replace(p, '')

    # ... and ..
    tweet = tweet.replace('...', ' ... ')
    if '...' not in tweet:
        tweet = tweet.replace('..', ' ... ')

    # Tokenize
    tweet_words = tokenizer.tokenize(tweet)

    # Eliminating the word if its length is less than 3
    tweet = [w for w in tweet_words if len(w)>2]

    # remove stopwords
    tweet = [w.lower() for w in tweet if not w in stop_words]

    corpus.append(tweet)

    # join back
    tweet = ' '.join(tweet)


    return tweet

In [None]:
# @title Liste Abbréviation
variable_name = ""
abbreviations = {
    "$" : " dollar ",
    "€" : " euro ",
    "4ao" : "for adults only",
    "a.m" : "before midday",
    "a3" : "anytime anywhere anyplace",
    "aamof" : "as a matter of fact",
    "acct" : "account",
    "adih" : "another day in hell",
    "afaic" : "as far as i am concerned",
    "afaict" : "as far as i can tell",
    "afaik" : "as far as i know",
    "afair" : "as far as i remember",
    "afk" : "away from keyboard",
    "app" : "application",
    "approx" : "approximately",
    "apps" : "applications",
    "asap" : "as soon as possible",
    "asl" : "age, sex, location",
    "atk" : "at the keyboard",
    "ave." : "avenue",
    "aymm" : "are you my mother",
    "ayor" : "at your own risk",
    "b&b" : "bed and breakfast",
    "b+b" : "bed and breakfast",
    "b.c" : "before christ",
    "b2b" : "business to business",
    "b2c" : "business to customer",
    "b4" : "before",
    "b4n" : "bye for now",
    "b@u" : "back at you",
    "bae" : "before anyone else",
    "bak" : "back at keyboard",
    "bbbg" : "bye bye be good",
    "bbc" : "british broadcasting corporation",
    "bbias" : "be back in a second",
    "bbl" : "be back later",
    "bbs" : "be back soon",
    "be4" : "before",
    "bfn" : "bye for now",
    "blvd" : "boulevard",
    "bout" : "about",
    "brb" : "be right back",
    "bros" : "brothers",
    "brt" : "be right there",
    "bsaaw" : "big smile and a wink",
    "btw" : "by the way",
    "bwl" : "bursting with laughter",
    "c/o" : "care of",
    "cet" : "central european time",
    "cf" : "compare",
    "cia" : "central intelligence agency",
    "csl" : "can not stop laughing",
    "cu" : "see you",
    "cul8r" : "see you later",
    "cv" : "curriculum vitae",
    "cwot" : "complete waste of time",
    "cya" : "see you",
    "cyt" : "see you tomorrow",
    "dae" : "does anyone else",
    "dbmib" : "do not bother me i am busy",
    "diy" : "do it yourself",
    "dm" : "direct message",
    "dwh" : "during work hours",
    "e123" : "easy as one two three",
    "eet" : "eastern european time",
    "eg" : "example",
    "embm" : "early morning business meeting",
    "encl" : "enclosed",
    "encl." : "enclosed",
    "etc" : "and so on",
    "faq" : "frequently asked questions",
    "fawc" : "for anyone who cares",
    "fb" : "facebook",
    "fc" : "fingers crossed",
    "fig" : "figure",
    "fimh" : "forever in my heart",
    "ft." : "feet",
    "ft" : "featuring",
    "ftl" : "for the loss",
    "ftw" : "for the win",
    "fwiw" : "for what it is worth",
    "fyi" : "for your information",
    "g9" : "genius",
    "gahoy" : "get a hold of yourself",
    "gal" : "get a life",
    "gcse" : "general certificate of secondary education",
    "gfn" : "gone for now",
    "gg" : "good game",
    "gl" : "good luck",
    "glhf" : "good luck have fun",
    "gmt" : "greenwich mean time",
    "gmta" : "great minds think alike",
    "gn" : "good night",
    "g.o.a.t" : "greatest of all time",
    "goat" : "greatest of all time",
    "goi" : "get over it",
    "gps" : "global positioning system",
    "gr8" : "great",
    "gratz" : "congratulations",
    "gyal" : "girl",
    "h&c" : "hot and cold",
    "hp" : "horsepower",
    "hr" : "hour",
    "hrh" : "his royal highness",
    "ht" : "height",
    "ibrb" : "i will be right back",
    "ic" : "i see",
    "icq" : "i seek you",
    "icymi" : "in case you missed it",
    "idc" : "i do not care",
    "idgadf" : "i do not give a damn fuck",
    "idgaf" : "i do not give a fuck",
    "idk" : "i do not know",
    "ie" : "that is",
    "i.e" : "that is",
    "ifyp" : "i feel your pain",
    "IG" : "instagram",
    "iirc" : "if i remember correctly",
    "ilu" : "i love you",
    "ily" : "i love you",
    "imho" : "in my humble opinion",
    "imo" : "in my opinion",
    "imu" : "i miss you",
    "iow" : "in other words",
    "irl" : "in real life",
    "j4f" : "just for fun",
    "jic" : "just in case",
    "jk" : "just kidding",
    "jsyk" : "just so you know",
    "l8r" : "later",
    "lb" : "pound",
    "lbs" : "pounds",
    "ldr" : "long distance relationship",
    "lmao" : "laugh my ass off",
    "lmfao" : "laugh my fucking ass off",
    "lol" : "laughing out loud",
    "ltd" : "limited",
    "ltns" : "long time no see",
    "m8" : "mate",
    "mf" : "motherfucker",
    "mfs" : "motherfuckers",
    "mfw" : "my face when",
    "mofo" : "motherfucker",
    "mph" : "miles per hour",
    "mr" : "mister",
    "mrw" : "my reaction when",
    "ms" : "miss",
    "mte" : "my thoughts exactly",
    "nagi" : "not a good idea",
    "nbc" : "national broadcasting company",
    "nbd" : "not big deal",
    "nfs" : "not for sale",
    "ngl" : "not going to lie",
    "nhs" : "national health service",
    "nrn" : "no reply necessary",
    "nsfl" : "not safe for life",
    "nsfw" : "not safe for work",
    "nth" : "nice to have",
    "nvr" : "never",
    "nyc" : "new york city",
    "oc" : "original content",
    "og" : "original",
    "ohp" : "overhead projector",
    "oic" : "oh i see",
    "omdb" : "over my dead body",
    "omg" : "oh my god",
    "omw" : "on my way",
    "p.a" : "per annum",
    "p.m" : "after midday",
    "pm" : "prime minister",
    "poc" : "people of color",
    "pov" : "point of view",
    "pp" : "pages",
    "ppl" : "people",
    "prw" : "parents are watching",
    "ps" : "postscript",
    "pt" : "point",
    "ptb" : "please text back",
    "pto" : "please turn over",
    "qpsa" : "what happens",
    "ratchet" : "rude",
    "rbtl" : "read between the lines",
    "rlrt" : "real life retweet",
    "rofl" : "rolling on the floor laughing",
    "roflol" : "rolling on the floor laughing out loud",
    "rotflmao" : "rolling on the floor laughing my ass off",
    "rt" : "retweet",
    "ruok" : "are you ok",
    "sfw" : "safe for work",
     "sk8" : "skate",
    "smh" : "shake my head",
    "sq" : "square",
    "srsly" : "seriously",
    "ssdd" : "same stuff different day",
    "tbh" : "to be honest",
    "tbs" : "tablespooful",
    "tbsp" : "tablespooful",
    "tfw" : "that feeling when",
    "thks" : "thank you",
    "tho" : "though",
    "thx" : "thank you",
    "tia" : "thanks in advance",
    "til" : "today i learned",
    "tl;dr" : "too long i did not read",
    "tldr" : "too long i did not read",
    "tmb" : "tweet me back",
    "tntl" : "trying not to laugh",
    "ttyl" : "talk to you later",
    "u" : "you",
    "u2" : "you too",
    "u4e" : "yours for ever",
    "utc" : "coordinated universal time",
    "w/" : "with",
    "w/o" : "without",
    "w8" : "wait",
    "wassup" : "what is up",
    "wb" : "welcome back",
    "wtf" : "what the fuck",
    "wtg" : "way to go",
    "wtpa" : "where the party at",
    "wuf" : "where are you from",
    "wuzup" : "what is up",
    "wywh" : "wish you were here",
    "yd" : "yard",
    "ygtr" : "you got that right",
    "ynk" : "you never know",
    "zzz" : "sleeping bored and tired"
}

def convert_abbrev_in_text(tweet):
    t=[]
    words=tweet.split()
    t = [abbreviations[w.lower()] if w.lower() in abbreviations.keys() else w for w in words]
    return ' '.join(t)

In [None]:
def prepare_string(tweet):
  tweet = clean(tweet)
  tweet = convert_abbrev_in_text(tweet)
  return tweet

## Nettoyage des données

In [None]:
%%time
# Apply prepare_string to all rows in 'tweets' column
data_train['comment_text'] = data_train['comment_text'].apply(lambda s : prepare_string(s))
# Drop empty values from dataframe
data_train['comment_text'].replace('', np.nan, inplace=True)
data_train.dropna(subset=['comment_text'], inplace=True)

CPU times: user 1min 48s, sys: 639 ms, total: 1min 49s
Wall time: 1min 50s


In [None]:
%%time
# Apply prepare_string to all rows in 'tweets' column
data_test['comment_text'] = data_test['comment_text'].apply(lambda s : prepare_string(s))
# Drop empty values from dataframe
data_test['comment_text'].replace('', np.nan, inplace=True)
data_test.dropna(subset=['comment_text'], inplace=True)

CPU times: user 1min 39s, sys: 616 ms, total: 1min 39s
Wall time: 1min 41s


In [None]:
data_train.drop(columns=["id"], inplace=True) #We drop "id" column from dataframe
data_test.drop(columns=["id"], inplace=True)

In [None]:
data_test.drop(
    data_test[ data_test['toxic'] == -1 ].index,
    inplace = True
)

In [None]:
data_train.to_csv('/content/drive/MyDrive/clean_comments_train.csv', index=False) #We store our datas in a csv file in our drive
data_test.to_csv('/content/drive/MyDrive/clean_comments_test.csv', index=False)

## Récupérer les données déjà nettoyer


In [None]:
data_train = pd.read_csv('/content/drive/MyDrive/clean_comments_train.csv') #Récupérer nos datas nettoyer.
data_test = pd.read_csv('/content/drive/MyDrive/clean_comments_test.csv')

In [None]:
data_train[50:60]

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
50,said wanted talk bottom lead section written i...,0,0,0,0,0,0
51,get fucked get fuckeeed got drink that you can...,1,0,1,0,0,0
52,are threatening disputing neutrality know coun...,0,0,0,0,0,0
53,thanks undeletion would hoped researching stat...,0,0,0,0,0,0
54,awesome then simply disregard notice thanks,0,0,0,0,0,0
55,stupid peace shit stop deleting stuff asshole ...,1,1,1,0,1,0
56,tony sidaway obviously fistfuckee loves arm ass,1,0,1,0,1,0
57,review sorry say fail articles gan the several...,0,0,0,0,0,0
58,band pages deletion you thought gone deleting ...,1,0,1,0,0,0
59,why cannot believe fat artie did see recent ap...,1,0,0,0,0,0


# Entraînement du modèle baseline

## Modèle RandomForest

In [28]:
# Prétraitement et vectorisation des données textuelles
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)

# Entraînement du modèle RandomForest
rf_classifier = RandomForestClassifier(n_estimators=10, random_state=42, verbose=3, n_jobs=-1)

# Création d'un pipeline pour automatiser le workflow
pipeline = Pipeline([
    ('tfidf', tfidf_vectorizer),
    ('clf', rf_classifier),
])

# Séparation des caractéristiques et des étiquettes
X_train = data_train['comment_text']
y_train = data_train.drop('comment_text', axis=1)
X_test = data_test['comment_text']
y_test = data_test.drop('comment_text', axis=1)

# Entraînement du modèle
pipeline.fit(X_train, y_train)

# Évaluation du modèle
predictions = pipeline.predict(X_test)
report = classification_report(y_test, predictions, target_names=y_train.columns)

# Sauvegarde du modèle
joblib_file_path = "/content/drive/My Drive/random_forest_classifier.joblib"
dump(pipeline, joblib_file_path)

# Affichage du rapport de classification
print(report)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.


building tree 1 of 10building tree 2 of 10

building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10
building tree 7 of 10
building tree 8 of 10
building tree 9 of 10
building tree 10 of 10


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  3.7min finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed:    1.8s finished
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


               precision    recall  f1-score   support

        toxic       0.61      0.69      0.65      6087
 severe_toxic       0.21      0.06      0.09       367
      obscene       0.65      0.57      0.61      3688
       threat       0.31      0.04      0.08       211
       insult       0.61      0.45      0.52      3424
identity_hate       0.64      0.17      0.27       712

    micro avg       0.62      0.55      0.58     14489
    macro avg       0.50      0.33      0.37     14489
 weighted avg       0.61      0.55      0.57     14489
  samples avg       0.06      0.05      0.05     14489



## Explication Chat-GPT (A changer !!!)

Pour ce qui est de l'interprétation des résultats, voici ce que nous pouvons dire des métriques de précision, rappel et score F1 de votre rapport de classification :

Précision (Precision) : La capacité du modèle à ne pas classer comme positive une instance qui est négative. Par exemple, pour toxic, une précision de 0.61 signifie que 61% des commentaires que le modèle a prédits comme toxiques étaient réellement toxiques.

Rappel (Recall) : La capacité du modèle à trouver toutes les instances positives. Pour toxic, un rappel de 0.69 signifie que le modèle a correctement identifié 69% des commentaires toxiques.

Score F1 : La moyenne harmonique de la précision et du rappel. C'est une mesure de l'exactitude du test. Un score F1 de 0.65 pour toxic signifie que le modèle a un bon équilibre entre la précision et le rappel pour cette classe.

Support : Le nombre de vraies occurrences de chaque classe dans le jeu de données. Cela vous montre combien de fois chaque catégorie est représentée. Par exemple, toxic apparaît 6087 fois dans le jeu de données de test.

Les métriques globales (micro/macro/weighted avg) fournissent une image de la performance générale du modèle :

Micro avg : Calcule la métrique globale en comptant le total des vrais positifs, faux négatifs et faux positifs.
Macro avg : Calcule la moyenne des métriques pour chaque classe, sans pondération pour le support de classe.
Weighted avg : Prend en compte le support de chaque classe pour le calcul de la moyenne.
En regardant ces métriques, vous pouvez voir que le modèle est relativement bon pour identifier les commentaires toxic et obscene, mais il a du mal avec des catégories moins fréquentes comme severe_toxic, threat, et identity_hate. Le "samples avg" très faible suggère que le modèle pourrait ne pas bien fonctionner pour les cas où plusieurs étiquettes sont vraies pour un même commentaire.

Les avertissements UndefinedMetricWarning indiquent que certaines classes n'ont pas eu de prédictions positives (zero_division par défaut est 0), ou que pour certaines classes il n'y avait pas de vrais positifs, ce qui peut biaiser la moyenne pour ces métriques.

# Itération de la modélisation

## Utilisation d'embedding pré-entrainés (GloVe)

### Prépare nos données

In [None]:
# Préparer le tokenizer
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(data_train['comment_text'])

# Convertir les textes en séquences
sequences_train = tokenizer.texts_to_sequences(data_train['comment_text'])
sequences_test = tokenizer.texts_to_sequences(data_test['comment_text'])

# Padding des séquences pour avoir la même longueur
# Choisissez une longueur max ou utilisez la longueur du commentaire le plus long
max_length = max(max(len(seq) for seq in sequences_train), max(len(seq) for seq in sequences_test))

padded_train = pad_sequences(sequences_train, maxlen=max_length, padding='post')
padded_test = pad_sequences(sequences_test, maxlen=max_length, padding='post')

In [None]:
padded_train_df = pd.DataFrame(padded_train)
print(padded_train_df)
padded_test_df = pd.DataFrame(padded_test)
print(padded_test_df)

        0     1     2     3     4     5     6     7     8     9     ...  3740  \
0        529   135    47    51   520  4371     1   926   213   201  ...     0   
1          1  2317  1275  3437  4043  2392    24     8   801    91  ...     0   
2        318   330    54   147    19   216   476  1981   355   367  ...     0   
3       1041    62    27   228  1247  1828  5343    46  2238   379  ...     0   
4          6  1488  3224   631   859   422     4     0     0     0  ...     0   
...      ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   ...   
159508    23   304    12   707    85   414  5202  1374   165    35  ...     0   
159509     6  4093    61  2948   134   105     8     4     0     0  ...     0   
159510     1  7055  4469   558     2  7435  3274     1  4205     0  ...     0   
159511    23   424    11   112   105   153    39   267    58    67  ...     0   
159512    23    54    15   150   437   285   233    66   370    63  ...     0   

        3741  3742  3743  3

In [24]:
X_train = padded_train
y_train = data_train.drop(columns=['comment_text'])
X_test = padded_test
y_test = data_test.drop(columns=['comment_text'])

In [25]:
# Définir la taille du sous-ensemble car trop de données pour la suite
subset_size = 20000

# Créer un sous-ensemble aléatoire de l'ensemble d'entraînement
indices = np.random.choice(range(len(padded_train)), subset_size, replace=False)
padded_train_subset = padded_train[indices]
y_train_subset = y_train.iloc[indices]

# même chose pour l'ensemble de test
test_indices = np.random.choice(range(len(padded_test)), subset_size, replace=False)
padded_test_subset = padded_test[test_indices]
y_test_subset = y_test.iloc[test_indices]

### Charger GloVe

In [9]:
path_to_glove_file = '/content/drive/My Drive/glove.6B/glove.6B.100d.txt'

In [10]:
# Chargement des embeddings GloVe
embeddings_index = {}
with open(path_to_glove_file, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

### Création de notre matrice d'embedding

In [11]:
# Création de la matrice d'embeddings
embedding_dim = 100
num_tokens = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [12]:
# Initialisation de la couche d'embedding
embedding_layer = Embedding(num_tokens,
                            embedding_dim,
                            embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
                            trainable=False)

### Construction du modèle et entrainement

In [13]:
embedding_dim = 100

model = Sequential()
# Ajout de la couche d'embedding pré-entraînée
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1,
                    output_dim=embedding_dim,
                    weights=[embedding_matrix],
                    input_length=max_length,
                    trainable=False))  # Important de ne pas entraîner cette couche pour garder les embeddings GloVe

# Aplatir la sortie pour la connecter à des couches denses
model.add(Flatten())

# Ajout d'une couche dense simple avec une activation ReLU
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.5))  # Dropout pour réduire le surajustement

# Couche de sortie
model.add(Dense(6, activation='softmax'))

# Compilation du modèle
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [26]:
batch_size = 64
# On utilise notre sous-ensemble pour réduire le temps d'entrainement
model.fit(padded_train_subset, y_train_subset, epochs=10,batch_size = batch_size, validation_data=(padded_test_subset, y_test_subset))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7dc98e984490>

In [27]:
# Évaluation du modèle sur l'ensemble de test
loss, accuracy = model.evaluate(padded_test_subset, y_test_subset)

print(f'Test Loss: {loss}')
print(f'Test Accuracy: {accuracy}')

Test Loss: 10.190613746643066
Test Accuracy: 0.1350499987602234
