In [58]:
# Importer les packages
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Bidirectional, GRU, Dropout
from sklearn.metrics import accuracy_score

# Importation des données

In [59]:
# Importer les données
path = './data/french_tweets.csv'

df = pd.read_csv(path)

# Afficher les données
print(df.head())

   label                                               text
0      0  - Awww, c'est un bummer. Tu devrais avoir davi...
1      0  Est contrarié qu'il ne puisse pas mettre à jou...
2      0  J'ai plongé plusieurs fois pour la balle. A ré...
3      0  Tout mon corps a des démangeaisons et comme si...
4      0  Non, il ne se comporte pas du tout. je suis en...


In [60]:
# Diminuons la taille du df
df = df.loc[:6000]

In [61]:
df.shape

(6001, 2)

In [62]:
print(df['text'][0])

- Awww, c'est un bummer. Tu devrais avoir david carr du troisième jour pour le faire. ;ré


# Nettoyage du df

In [63]:
!pip install contractions




[notice] A new release of pip available: 22.3.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip





- Nettoyage des données

In [33]:
# Installation de spacy et téléchargement de fr_core_news_md
!python -m spacy download fr_core_news_md

Collecting fr-core-news-md==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_md-3.7.0/fr_core_news_md-3.7.0-py3-none-any.whl (45.8 MB)
     ---------------------------------------- 45.8/45.8 MB 2.7 MB/s eta 0:00:00
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_md')



[notice] A new release of pip available: 22.3.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [64]:
import contractions
import spacy

nlp_spacy = spacy.load('fr_core_news_md')
Stopwords = stopwords.words('french')

def cleannig_tweet(text):
    expanded_all = []
    text = re.sub(r'http[s]*:?//\S+','', text)
    text = re.sub(r'@[\w\-\.]+', '', text)
    text = re.sub(r'[\w\-\.]+@[\w\-\.]+', '', text)
    text = re.sub(r'&\w+','', text)
    text = re.sub(r'[^a-zA-Z\s]+', ' ', text)
    text = re.sub(r'^\s|\s$', '', text)
    text = re.sub(r'\s{2,}', ' ', text).lower()
    
    
    for word in text.split():
        expanded_all.append(contractions.fix(word))
    text = ' '.join(expanded_all)
    
    
    text = ' '.join([word for word in text.split() if word not in Stopwords])

    tokens  = nlp_spacy(text)
    text = [word.lemma_ for word in tokens]
    text = " ".join(text)
    return text

In [65]:
# Appliquer la fonction cleannig_tweet
df['tweet'] = df.text.apply(func = cleannig_tweet)

In [66]:
print(df['text'][0])
print(df['tweet'][0])

- Awww, c'est un bummer. Tu devrais avoir david carr du troisième jour pour le faire. ;ré
awww bummer devoir avoir david carr troisi jour faire r


In [67]:
# Affichons le jeu de données
df.head()

Unnamed: 0,label,text,tweet
0,0,"- Awww, c'est un bummer. Tu devrais avoir davi...",awww bummer devoir avoir david carr troisi jou...
1,0,Est contrarié qu'il ne puisse pas mettre à jou...,contrari pouvoir mettre jour facebook maignant...
2,0,J'ai plongé plusieurs fois pour la balle. A ré...,plong plusieurs fois balle avoir r ussi conomi...
3,0,Tout mon corps a des démangeaisons et comme si...,tout corps avoir mangeaison comme si taire feu
4,0,"Non, il ne se comporte pas du tout. je suis en...",non comporte tout col r pourquoi ici parce pou...


- Vectorisation des tweets

In [68]:
# Vectorison avec Tokenizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Nombre de mots
nbr_word_unique = len(set(" ".join(df.tweet).split()))

# Initialiser le modèle Tokenizer
tokenizer = Tokenizer(num_words = nbr_word_unique, split=' ')

# Entrainer les données
tokenizer.fit_on_texts(df['tweet'].values)

# Vectoriser
vect_array = tokenizer.texts_to_sequences(df['tweet'].values)

# Padding
vect_array = pad_sequences(vect_array)

In [69]:
# Sauvegarder le tokenizer en JSON
tokenizer_json = tokenizer.to_json()
with open('french_tokenizer.json', 'w', encoding='utf-8') as json_file:
    json_file.write(tokenizer_json)

In [39]:
# Le nombre de mot uniques dans les tweets
nbr_word_unique

7682

# Définission des modèles

In [40]:
# Installons keras-tuner
!pip install -q -U keras-tuner


[notice] A new release of pip available: 22.3.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


- Séparation des données en train, test et validation

In [42]:
# Importer train_test_split
from sklearn.model_selection import train_test_split

# Labels
y = df.label.values

# Splitter en train et test
x_train, x_test, y_train, y_test = train_test_split(vect_array, y, test_size=0.2, random_state=42)

# Splitter validation et test
x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=0.5, random_state=42)

In [43]:
import tensorflow as tf
import keras_tuner as kt

In [44]:
# Définissons l'opitmizer
from keras.optimizers import Adam

In [45]:
def bidir_lstm_model_builder(hp):
    emb_dimention = hp.Int('emb_dimention', min_value=32, max_value=512, step=32)
    nbr_neurones = hp.Int('nbr_neurones', min_value=32, max_value=512, step=32)
    nbr_neurones_l2 = hp.Int('nbr_neurones', min_value=32, max_value=512, step=32)
    learning_rate = hp.Choice('learning_rate', values=[0.01, 0.001, 0.0001])
    dropout = hp.Boolean("dropout")
    
    Adam(learning_rate=learning_rate)
    
    bidir_lstm_model=Sequential()
    bidir_lstm_model.add(Embedding(nbr_word_unique, emb_dimention, input_length=vect_array.shape[1]))
    bidir_lstm_model.add(Bidirectional(LSTM(nbr_neurones, return_sequences=True)))
    if dropout :
        bidir_lstm_model.add(Dropout(0.4))
    bidir_lstm_model.add(Bidirectional(LSTM(nbr_neurones_l2)))
    if dropout: 
        bidir_lstm_model.add(Dropout(0.4))
    bidir_lstm_model.add(Dense(1,activation='sigmoid'))
    bidir_lstm_model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
    
    return bidir_lstm_model

In [46]:
def bidir_gru_model_builder(hp):
    emb_dimention = hp.Int('emb_dimention', min_value=32, max_value=512, step=32)
    nbr_neurones = hp.Int('nbr_neurones', min_value=32, max_value=512, step=32)
    nbr_neurones_l2 = hp.Int('nbr_neurones', min_value=32, max_value=512, step=32)
    learning_rate = hp.Choice('learning_rate', values=[0.01, 0.001, 0.0001])
    dropout = hp.Boolean("dropout")
    
    Adam(learning_rate=learning_rate)
    
    bidir_gru_model=Sequential()
    bidir_gru_model.add(Embedding(nbr_word_unique, emb_dimention, input_length=vect_array.shape[1]))
    bidir_gru_model.add(Bidirectional(GRU(nbr_neurones, return_sequences=True)))
    if dropout :
        bidir_gru_model.add(Dropout(0.4))
    bidir_gru_model.add(Bidirectional(GRU(nbr_neurones_l2)))
    if dropout: 
        bidir_gru_model.add(Dropout(0.4))
    bidir_gru_model.add(Dense(1,activation='sigmoid'))
    bidir_gru_model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
    
    return bidir_gru_model

In [47]:
def lstm_model_builder(hp):
    emb_dimention = hp.Int('emb_dimention', min_value=32, max_value=512, step=32)
    nbr_neurones = hp.Int('nbr_neurones', min_value=32, max_value=512, step=32)
    nbr_neurones_l2 = hp.Int('nbr_neurones', min_value=32, max_value=512, step=32)
    learning_rate = hp.Choice('learning_rate', values=[0.01, 0.001, 0.0001])
    dropout = hp.Boolean("dropout")
    
    Adam(learning_rate=learning_rate)
    
    lstm_model=Sequential()
    lstm_model.add(Embedding(nbr_word_unique, emb_dimention, input_length=vect_array.shape[1]))
    lstm_model.add(LSTM(nbr_neurones, return_sequences=True))
    if dropout :
        lstm_model.add(Dropout(0.4))
    lstm_model.add(LSTM(nbr_neurones_l2))
    if dropout: 
        lstm_model.add(Dropout(0.4))
    lstm_model.add(Dense(1,activation='sigmoid'))
    lstm_model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
    
    return lstm_model

In [48]:
def gru_model_builder(hp):
    emb_dimention = hp.Int('emb_dimention', min_value=32, max_value=512, step=32)
    nbr_neurones = hp.Int('nbr_neurones', min_value=32, max_value=512, step=32)
    nbr_neurones_l2 = hp.Int('nbr_neurones', min_value=32, max_value=512, step=32)
    learning_rate = hp.Choice('learning_rate', values=[0.01, 0.001, 0.0001])
    dropout = hp.Boolean("dropout")
    
    Adam(learning_rate=learning_rate)
    
    gru_model=Sequential()
    gru_model.add(Embedding(nbr_word_unique, emb_dimention, input_length=vect_array.shape[1]))
    gru_model.add(GRU(nbr_neurones, return_sequences=True))
    if dropout :
        gru_model.add(Dropout(0.4))
    gru_model.add(GRU(nbr_neurones_l2))
    if dropout: 
        gru_model.add(Dropout(0.4))
    gru_model.add(Dense(1,activation='sigmoid'))
    gru_model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
    
    return gru_model

In [49]:
def tune_model(model_to_tune):
    # Mettons en place un rapel d'arrêt après avoir atteint une certaine valeur
    stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
    
    # Instancions le tuner
    tuner = kt.RandomSearch(
        model_to_tune,
        objective='val_accuracy',
        max_trials = 2,
        overwrite = True,
        directory='tuners_dir',
        project_name='intro_to_kt'
    )
    
    # Exécutons la recherche des paramètres
    tuner.search(x_train, y_train, validation_data=(x_val, y_val), epochs=3, callbacks=[stop_early])
    
    return tuner

In [50]:
# Récupérons les models tunes
print("bi dir lstm")
bidir_lstm_tuner = tune_model(bidir_lstm_model_builder)

print("\nbi dir gru")
bidir_gru_tuner = tune_model(bidir_gru_model_builder)

print("\nlstm")
lstm_tuner = tune_model(lstm_model_builder)

print("\n gru")
gru_tuner = tune_model(gru_model_builder)

Trial 2 Complete [00h 00m 45s]
val_accuracy: 1.0

Best val_accuracy So Far: 1.0
Total elapsed time: 00h 05m 13s


In [51]:
# Construisons les modèles
bidir_lstm_model = bidir_lstm_tuner.hypermodel.build(bidir_lstm_tuner.get_best_hyperparameters(num_trials=1)[0])
bidir_gru_model = bidir_gru_tuner.hypermodel.build(bidir_gru_tuner.get_best_hyperparameters(num_trials=1)[0])
lstm_model = lstm_tuner.hypermodel.build(lstm_tuner.get_best_hyperparameters(num_trials=1)[0])
gru_model = gru_tuner.hypermodel.build(gru_tuner.get_best_hyperparameters(num_trials=1)[0])

In [52]:
models = [
    {"name": "bidirectional LSTM", "model": bidir_lstm_model, "score": 0},
    {"name": "LSTM", "model": lstm_model, "score": 0},
    {"name": "bidirectional GRU", "model": bidir_gru_model, "score": 0},
    {"name": "GRU", "model": gru_model, "score": 0}
]

In [53]:
for model in models:
    print(model["name"])
    print(model["model"].summary())
    print('\n\n')

bidirectional LSTM
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 25, 448)           3441536   
                                                                 
 bidirectional (Bidirection  (None, 25, 640)           1968640   
 al)                                                             
                                                                 
 dropout (Dropout)           (None, 25, 640)           0         
                                                                 
 bidirectional_1 (Bidirecti  (None, 640)               2460160   
 onal)                                                           
                                                                 
 dropout_1 (Dropout)         (None, 640)               0         
                                                                 
 dense_1 (Dense)             (None,

# Entrainement et Validation des modèles 

- Entrainement et validation

In [54]:
# entrainement et validation des modèles
for model in models:
    print(f"Modèle {model['name']}")
    model['model'].fit(x_train, y_train, validation_data=(x_val, y_val), epochs=5, batch_size=64)
    
    # Prédire les labels de x_val
    y_pred = model['model'].predict(x_val)
    
    # Arrondir les valeurs
    y_pred = np.round(y_pred)
    
    # calculons l'accuracy
    model['score'] = accuracy_score(y_pred, y_val)

Modèle bidirectional LSTM
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Modèle LSTM
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Modèle bidirectional GRU
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Modèle GRU
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [55]:
# Affichons les scores et choisissons le meilleur modèle
choosed_model = models[0]
for model in models:
    print(f"{model['name']}: {model['score']}")
    if(model['score'] > choosed_model['score']):
        choosed_model = model

bidirectional LSTM: 1.0
LSTM: 1.0
bidirectional GRU: 1.0
GRU: 1.0


In [56]:
# Affichons le modèle choisi
print("Le modèle choisi est :")
print(f"{choosed_model['name']}: {choosed_model['score']}")

Le modèle choisi est :
bidirectional LSTM: 1.0


In [57]:
# sauvegarde du modèle
choosed_model['model'].save("models/french_sentiment_analysis.h5")

  saving_api.save_model(
