In [3]:
# Import des bibliothèques

import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow import keras

In [6]:
# Import des données (via Kaggle : https://www.kaggle.com/mrinaal007/hate-speech-detection)

data_train = pd.read_csv('/content/toxic_train.csv')
data_test = pd.read_csv('/content/toxic_test.csv')
data_train

Unnamed: 0.1,Unnamed: 0,comment_text,toxic
0,0,Explanation\r\nWhy the edits made under my use...,0.0
1,1,D'aww! He matches this background colour I'm s...,0.0
2,2,"Hey man, I'm really not trying to edit war. It...",0.0
3,3,"""\r\nMore\r\nI can't make any real suggestions...",0.0
4,4,"You, sir, are my hero. Any chance you remember...",0.0
...,...,...,...
78520,78520,Have a look at this which describes how the Al...,0.0
78521,78521,I'm not aware of a requirement for a governmen...,0.0
78522,78522,"""\r\n\r\nooh rah, see the noticeboard too. — ...",0.0
78523,78523,"""\r\n\r\n""""To state that is not correct is con...",0.0


In [None]:
# Séparation en jeux de train, test, validation

y_train = data_train['toxic']
y_test = data_test['toxic']

X_train = data_train['comment_text']

batch_size = 1000 # 64

X_valid, y_valid = X_train[:batch_size], y_train[:batch_size]
X_train, y_train = X_train[batch_size:15000], y_train[batch_size:15000]
# X_train2, y_train2 = X_train[batch_size:], y_train[batch_size:]

In [None]:
# Text to sequence
tokenizer = keras.preprocessing.text.Tokenizer(char_level=False,
                                               filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\r\n')
                                               
tokenizer.fit_on_texts(X_train)

In [None]:
# Application des tokens sur les différents jeux

X_train = tokenizer.texts_to_sequences(X_train) #  Words are ranked by how often they occur (in the training set)
X_valid = tokenizer.texts_to_sequences(X_valid) #  Words are ranked by how often they occur (in the training set)
X_test = tokenizer.texts_to_sequences(data_test['comment_text']) #  Words are ranked by how often they occur (in the training set)

print(data_train['comment_text'][0])
print(X_train[0])

Explanation
Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27
[5838, 8983, 141, 324, 5, 1665, 15, 5838, 8983, 34, 5838, 8983, 605, 436, 78, 226, 7, 68, 35, 11, 8, 9, 220, 28, 15429, 15430, 4, 512, 22301, 748, 41, 49, 168, 216, 6, 195, 68, 35, 11, 1308, 28, 3896, 23, 5, 15431, 7, 606, 10, 12197, 474]


In [None]:
word_index = tokenizer.word_index
id_to_word = {id + 2 : word for word, id in word_index.items()}

for id, token in enumerate(('<pad>', '<sos>', '<unk>')):
  id_to_word[id] = token

print(id_to_word[0], id_to_word[1], id_to_word[2])

<pad> <sos> <unk>


In [None]:
# Liste des tokens
id_to_word

{3: 'the',
 4: 'to',
 5: 'of',
 6: 'and',
 7: 'a',
 8: 'you',
 9: 'i',
 10: 'is',
 11: 'that',
 12: 'in',
 13: 'it',
 14: 'for',
 15: 'this',
 16: 'not',
 17: 'on',
 18: 'be',
 19: 'as',
 20: 'are',
 21: 'have',
 22: 'your',
 23: 'with',
 24: 'if',
 25: 'was',
 26: 'article',
 27: 'or',
 28: 'but',
 29: 'page',
 30: 'my',
 31: 'an',
 32: 'wikipedia',
 33: 'by',
 34: 'from',
 35: 'do',
 36: 'at',
 37: 'about',
 38: 'me',
 39: 'so',
 40: 'talk',
 41: 'can',
 42: 'what',
 43: 'there',
 44: 'has',
 45: 'all',
 46: 'no',
 47: 'will',
 48: 'would',
 49: 'one',
 50: 'he',
 51: 'please',
 52: 'like',
 53: 'just',
 54: 'they',
 55: 'any',
 56: 'which',
 57: 'been',
 58: 'should',
 59: "don't",
 60: 'more',
 61: 'other',
 62: 'see',
 63: 'his',
 64: 'here',
 65: 'who',
 66: 'we',
 67: 'some',
 68: 'also',
 69: 'because',
 70: 'know',
 71: 'am',
 72: 'think',
 73: 'how',
 74: 'edit',
 75: "i'm",
 76: 'up',
 77: 'why',
 78: "it's",
 79: 'out',
 80: 'only',
 81: 'use',
 82: 'people',
 83: 'then',
 

Padding

In [None]:
from keras.preprocessing import sequence

X_train = [[elt + 2 for elt in seq] for seq in X_train]
X_valid = [[elt + 2 for elt in seq] for seq in X_valid]
X_test = [[elt + 2 for elt in seq] for seq in X_test]

max_words = max(map(len, X_train)) # 1404

X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_valid = sequence.pad_sequences(X_valid, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)

In [None]:
print(data_train['comment_text'][1])
print(X_train[1][1300:])

D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC)
[    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0   198   529  2361 12200  2005    40]


In [None]:
for i in range(10):
  print(y_train.values[i])
  print(' '.join([id_to_word[id] for id in X_train[i]]))

0
<pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <p

In [None]:
# Création du RNN
model = keras.models.Sequential([
        # keras.layers.Embedding(input_dim=max(id_to_word.keys()), output_dim=32, input_length=max_words),
        keras.layers.Embedding(input_dim=max(id_to_word.keys())+1, output_dim=32, input_length=max_words),
        keras.layers.LSTM(50, return_sequences=True),
        keras.layers.LSTM(50),
        keras.layers.Dense(1, activation='sigmoid')])

print(model.summary()) 

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 1403, 32)          1512352   
_________________________________________________________________
lstm_18 (LSTM)               (None, 1403, 50)          16600     
_________________________________________________________________
lstm_19 (LSTM)               (None, 50)                20200     
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 51        
Total params: 1,549,203
Trainable params: 1,549,203
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
print(X_train.shape, y_train.shape)
print(X_valid.shape, y_valid.shape)

(14000, 1403) (14000,)
(1000, 1403) (1000,)


In [None]:
model.compile(loss='binary_crossentropy', 
             optimizer='adam', 
             metrics=['accuracy'])

early_stopping = keras.callbacks.EarlyStopping(monitor='val_accuracy',
                                              patience=2, 
                                              restore_best_weights=True)

model.fit(X_train, y_train,
          validation_data=(X_valid, y_valid),
          batch_size=batch_size,
          epochs=2,
          callbacks=[early_stopping]) # 0.8950

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7fdb3ce21128>

In [None]:
scores = model.evaluate(X_test, y_test, verbose=0)
print('Test accuracy:', scores[1]) # 90.4 % des performances

Test accuracy: 0.9048110246658325


À tester :
* Stemming
* Remove stop word
* Clean sentence (lower case, ponctuation)
* Bag of word (pas de RNN mais un DNN)