In [76]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import TextVectorization

In [94]:
df = pd.read_csv(
        os.path.join('train.csv', 'trainself4.csv')
    )

X = df['texte']
y = df[df.columns[2:]].values
MAX_FEATURES = 200000
vectorizer = TextVectorization(max_tokens=MAX_FEATURES,
                                output_sequence_length=1800,
                                output_mode='int')
vectorizer.adapt(X.values)
vectorizer_text = vectorizer(X.values)
dataset = tf.data.Dataset.from_tensor_slices((vectorizer_text,y))
dataset = dataset.cache()
dataset = dataset.shuffle(1600)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8)

train = dataset.take(int(len(dataset)*.7))
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

model = Sequential()
model.add(Embedding(MAX_FEATURES+1, 32))
model.add(Bidirectional(LSTM(32,activation='tanh')))
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(4, activation='sigmoid'))

model.compile(loss='BinaryCrossentropy', optimizer='Adam')
model.summary()
history = model.fit(train, epochs=1000, validation_data=val)
history.history

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_12 (Embedding)    (None, None, 32)          6400032   
                                                                 
 bidirectional_12 (Bidirect  (None, 64)                16640     
 ional)                                                          
                                                                 
 dense_48 (Dense)            (None, 128)               8320      
                                                                 
 dense_49 (Dense)            (None, 256)               33024     
                                                                 
 dense_50 (Dense)            (None, 128)               32896     
                                                                 
 dense_51 (Dense)            (None, 4)                 516       
                                                     

{'loss': [0.690502405166626,
  0.6823310852050781,
  0.6677358150482178,
  0.649208128452301,
  0.6243315935134888,
  0.6064930558204651,
  0.5738067626953125,
  0.5595390796661377,
  0.5791306495666504,
  0.5790548324584961,
  0.5730195045471191,
  0.5226867198944092,
  0.5457615256309509,
  0.5568193793296814,
  0.5473740100860596,
  0.5412350296974182,
  0.5542010068893433,
  0.5201938152313232,
  0.49610304832458496,
  0.48506075143814087,
  0.4445682168006897,
  0.43731802701950073,
  0.3935573101043701,
  0.3557429909706116,
  0.3464123606681824,
  0.34253332018852234,
  0.3021036684513092,
  0.2505822777748108,
  0.2581701874732971,
  0.22209468483924866,
  0.17724847793579102,
  0.19559654593467712,
  0.174396812915802,
  0.14044323563575745,
  0.15613138675689697,
  0.09625798463821411,
  0.11756658554077148,
  0.12028180062770844,
  0.12403811514377594,
  0.09280731528997421,
  0.04701068252325058,
  0.07109232246875763,
  0.07385344803333282,
  0.03833570331335068,
  0.03912

In [100]:
proposition = vectorizer('tu es gentil')

In [96]:
df.columns[2:]

Index(['toxique', 'obscene', 'menace', 'insulte'], dtype='object')

In [101]:
model.predict(np.expand_dims(proposition,0))



array([[1.0000000e+00, 7.0913702e-05, 1.1869979e-06, 9.6190596e-01]],
      dtype=float32)

In [98]:
res = (model.predict(np.expand_dims(proposition,0)) > 0.5).astype(int)



In [99]:
res


array([[1, 0, 0, 0]])