Ce notebook utilise un modèle word2vec pré-entraîné de Google, téléchargé ici : http://mccormickml.com/2016/04/12/googles-pretrained-word2vec-model-in-python/
Notebook inspiré de https://www.kaggle.com/jhoward/improved-lstm-baseline-glove-dropout/notebook

In [1]:
import pandas as pd
import numpy as np
import configparser

# keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.layers import Dense, Input, LSTM, GlobalMaxPool1D, Bidirectional, Embedding, Dropout
from keras.models import Model

# gensim
from gensim.models.keyedvectors import KeyedVectors

Using TensorFlow backend.


# Chargement des données

In [2]:
# Read the file locations used by this notebook from the shared config file.
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed; an empty list means the config was not loaded.
# Fail here with a clear message instead of a later KeyError on 'FILES'.
if not config.read('../config.cfg'):
    raise FileNotFoundError("Configuration file '../config.cfg' is missing or unreadable")

# Path to the pretrained word-vector file (loaded further down in word2vec format).
EMBEDDING_FILE = config['FILES']['GOOGLE']
# Paths to the training and test CSV files.
TRAIN_DATA_FILE = config['FILES']['TRAIN']
TEST_DATA_FILE = config['FILES']['TEST']

In [6]:
# Names of the label columns available in the training set.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Load the training and test sets as pandas DataFrames.
train, test = map(pd.read_csv, (TRAIN_DATA_FILE, TEST_DATA_FILE))

# Target matrix: one row per training comment, one column per label.
y = train[list_classes].values

# Raw comment texts; missing comments are replaced by the "_na_" placeholder.
list_sentences_train, list_sentences_test = (
    frame["comment_text"].fillna("_na_").values for frame in (train, test)
)

In [3]:
# Dimension of each word vector.
embed_size = 300
# Number of unique words kept (i.e. number of rows in the embedding matrix).
max_features = 20000
# Maximum number of words considered per comment.
maxlen = 100

# Tokenisation

In [7]:
# The token list is built from the most frequent words of the training set,
# capped by max_features.
tokenizer = Tokenizer(num_words=max_features)

# Compute word frequencies on the training comments only.
tokenizer.fit_on_texts(list(list_sentences_train))

# Index both sets: each comment becomes a sequence of integer word indices.
list_tokenized_train, list_tokenized_test = map(
    tokenizer.texts_to_sequences,
    (list_sentences_train, list_sentences_test),
)

La documentation de Tokenizer: https://keras.io/preprocessing/text/

In [None]:
# Bring every tokenized comment to a common length of `maxlen` so the
# sequences can be fed to the network as fixed-size inputs.
X_t, X_te = (
    pad_sequences(sequences, maxlen=maxlen)
    for sequences in (list_tokenized_train, list_tokenized_test)
)

In [9]:
# Load the pretrained word vectors from the binary word2vec-format file
# referenced in the configuration.
word_vectors = KeyedVectors.load_word2vec_format(
    EMBEDDING_FILE,
    binary=True,
)

In [17]:
# Word -> integer index mapping learned by the tokenizer.
word_index = tokenizer.word_index
# Number of rows of the embedding matrix: the full vocabulary (+1, presumably
# because index 0 is not assigned to any word -- confirm against the Tokenizer
# documentation), capped at max_features.
vocabulary_size = min(len(word_index) + 1, max_features)

# Rows start at zero, so any word without a pretrained vector simply keeps an
# all-zero embedding; no explicit fallback assignment is needed.
embedding_matrix = np.zeros((vocabulary_size, embed_size))

for word, i in word_index.items():
    # Skip indices that fall outside the matrix (words beyond the cap).
    if i >= vocabulary_size:
        continue
    try:
        embedding_matrix[i] = word_vectors[word]
    except KeyError:
        # Word absent from the pretrained vectors: leave its row at zero.
        continue

In [25]:
# Input: one sequence of `maxlen` word indices per comment.
inp = Input(shape=(maxlen,))
# The embedding's vocabulary size is read from the matrix itself rather than
# from max_features: the matrix has min(len(word_index) + 1, max_features)
# rows, so hard-coding max_features makes the initial weights mismatch the
# layer shape whenever the vocabulary is smaller than the cap.
x = Embedding(embedding_matrix.shape[0], embed_size, weights=[embedding_matrix])(inp)
# Bidirectional LSTM returning the full sequence so it can be pooled over below.
x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
# Collapse the sequence dimension by taking the max of each feature.
x = GlobalMaxPool1D()(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
# Six sigmoid outputs, one per label column; combined with binary
# cross-entropy, each label is scored as its own yes/no prediction.
x = Dense(6, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [26]:
# Train on the padded training sequences; the last 10% of the samples are
# held out by Keras as a validation set.
model.fit(
    X_t,
    y,
    batch_size=32,
    epochs=2,
    validation_split=0.1,
)

Train on 143613 samples, validate on 15958 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7ffa0542c208>

In [27]:
# Load the labels of the test set from the file named in the configuration.
test_label = pd.read_csv(config['FILES']['LABEL'])

# Keep only rows whose "toxic" value is not -1 (presumably -1 marks test rows
# that have no usable label -- confirm against the dataset description).
has_label = test_label["toxic"] != -1
test_label_strip = test_label[has_label]

# Target matrix for the retained test rows, one column per label.
yt = test_label_strip[list_classes].values

In [28]:
# Evaluate on the labelled test rows only. Rows of X_te are selected with the
# index of the filtered label frame, which assumes the label file lists the
# comments in the same order as the test file -- TODO confirm.
# Returns the loss followed by the compiled metric (accuracy).
model.evaluate(
    X_te[test_label_strip.index],
    yt,
    batch_size=1024,
)



[0.06758899882982207, 0.9717455559372511]