In [1]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import TextVectorization

In [2]:
df = pd.read_csv(
        os.path.join('train.csv', 'train_200.csv')
    )

X = df['texte']
y = df[df.columns[2:]].values
MAX_FEATURES = 200000
vectorizer = TextVectorization(max_tokens=MAX_FEATURES,
                                output_sequence_length=1800,
                                output_mode='int')
vectorizer.adapt(X.values)
vectorizer_text = vectorizer(X.values)
dataset = tf.data.Dataset.from_tensor_slices((vectorizer_text,y))
dataset = dataset.cache()
dataset = dataset.shuffle(1600)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8)

train = dataset.take(int(len(dataset)*.7))
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

model = Sequential()
model.add(Embedding(MAX_FEATURES+1, 32))
model.add(Bidirectional(LSTM(32,activation='tanh')))
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(4, activation='sigmoid'))

model.compile(loss='BinaryCrossentropy', optimizer='Adam')
model.summary()
history = model.fit(train, epochs=50, validation_data=val)
history.history

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          6400032   
                                                                 
 bidirectional (Bidirection  (None, 64)                16640     
 al)                                                             
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 4)                 516       
                                                        

{'loss': [0.6649811863899231,
  0.500607967376709,
  0.4876873195171356,
  0.46599408984184265,
  0.4514164328575134,
  0.45375266671180725,
  0.416542649269104,
  0.3663748502731323,
  0.3174492418766022,
  0.26644590497016907,
  0.20831318199634552,
  0.18650926649570465,
  0.15563660860061646,
  0.10968440026044846,
  0.1250646710395813,
  0.0951600968837738,
  0.10494542866945267,
  0.10835441946983337,
  0.10175912082195282,
  0.07761300355195999,
  0.08502419292926788,
  0.08419961482286453,
  0.08926379680633545,
  0.07392778992652893,
  0.08794969320297241,
  0.09176084399223328,
  0.08852925151586533,
  0.06862760335206985,
  0.07614513486623764,
  0.07952173054218292,
  0.07967551797628403,
  0.0627032145857811,
  0.06753960251808167,
  0.06392178684473038,
  0.05057630315423012,
  0.1283441185951233,
  0.1261080950498581,
  0.10892543196678162,
  0.06394679844379425,
  0.05830276012420654,
  0.043520018458366394,
  0.03916442021727562,
  0.04511323943734169,
  0.031726598739

In [23]:
proposition = vectorizer('ça fait une video trop drole' )


In [6]:
df.columns[2:]

Index(['toxique', 'obscene', 'menace', 'insulte'], dtype='object')

In [24]:
model.predict(np.expand_dims(proposition,0))



array([[2.0073911e-03, 1.3522818e-20, 2.1658478e-15, 1.9075213e-09]],
      dtype=float32)

In [25]:
res = (model.predict(np.expand_dims(proposition,0)) > 0.5).astype(int)



In [26]:
res


array([[0, 0, 0, 0]])