# MLA 703. RNN-LSTM et architectures avancées [Analyse de sentiment]

# Importation des modules

In [1]:
# On importe les librairies usuelles
import math
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense , Input , SimpleRNN, LSTM , Embedding, Dropout
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Convolution1D,MaxPooling1D
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import  Adam

import joblib

# On désactive les warnings
import warnings
warnings.filterwarnings('ignore')


# Fonction utile 

In [2]:
def predict(model, x_test):
    """Turn the raw outputs of ``model`` on ``x_test`` into hard 0/1 labels.

    Each score returned by ``model.predict`` is thresholded at zero: a
    strictly positive score yields label 1, any other score yields label 0.
    (For a model that outputs logits, a zero logit corresponds to a
    probability of 0.5.)

    Args:
        model: Object exposing a ``predict(x)`` method.
        x_test: Inputs, forwarded unchanged to ``model.predict``.

    Returns:
        1-D float NumPy array holding one 0.0 / 1.0 label per prediction.
    """
    # Raw scores, one entry per test sample
    scores = model.predict(x_test)

    # True where the score is strictly positive (positive class)
    is_positive = np.array([bool(score > 0) for score in scores], dtype=bool)

    # Start from all-negative labels and switch the positive ones on
    labels = np.zeros(len(scores))
    labels[is_positive] = 1
    return labels

#  Charger les données

In [3]:
# Load the IMDB dataset directly from the Keras datasets module.
# load_data has several useful options (see the Keras documentation):
# 1) It returns separate training and test sets.
# 2) num_words : Top most frequent words to consider.
# 3) skip_top : Top most frequent words to ignore (they will appear as oov_char value in the sequence data).
# 4) maxlen : Maximum sequence length. In practice reviews longer than this are
#    REMOVED from the dataset (they are not truncated), so it is not used here.
# 5) seed : Seed for reproducible data shuffling.
# 6) start_char : The start of a sequence will be marked with this character. Set to 1 because 0 is usually the
#    padding character.
# 7) oov_char : words that were cut out because of the num_words or skip_top limit will be replaced with this
#    character.
# 8) index_from : Index actual words with this index and higher.


num_words = 5000   # vocabulary size: keep only the most frequent words
max_len   =  100   # number of tokens kept per review after padding/truncation

# maxlen is deliberately not passed to load_data: it would drop every review
# longer than max_len and leave only a small subset of short reviews.
# Truncation to max_len is handled by pad_sequences below instead.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=num_words)

# Bring every review to exactly max_len tokens: longer reviews lose their tail
# (truncating='post'), shorter ones are filled with the padding value 0.
x_train     = pad_sequences(x_train, maxlen=max_len, truncating='post')
x_test      = pad_sequences(x_test, maxlen=max_len, truncating='post')

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


# Plusieurs couches LSTM.

In [5]:
# Layer sizes
embed_size = 64   # embedding dimension
RNN_size   = 64   # units of the first LSTM layer

# Two stacked bidirectional LSTM layers followed by a small dense head.
# The first LSTM returns the full sequence so the second one can consume it.
# The final Dense(1) has no activation, so the network outputs a raw logit.
model = Sequential([
    Embedding(input_dim=num_words, output_dim=embed_size, mask_zero=True),
    Bidirectional(LSTM(units=RNN_size, return_sequences=True)),
    Bidirectional(LSTM(units=32)),
    Dense(units=64, activation='relu'),
    Dropout(rate=0.5),
    Dense(units=1),
])

# Loss (computed on logits), optimizer and evaluation metric
model.compile(
    loss=BinaryCrossentropy(from_logits=True),
    optimizer=Adam(learning_rate=1e-4),
    metrics=['accuracy'],
)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 64)          320000    
                                                                 
 bidirectional (Bidirectiona  (None, None, 128)        66048     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 64)               41216     
 nal)                                                            
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 6

In [6]:
# Train for 10 epochs; after each epoch the model is evaluated on the whole
# held-out set (x_test, y_test).
# validation_steps is intentionally not passed: it is meant for dataset inputs,
# and with NumPy arrays it would limit validation to the first 30 batches,
# so the reported val_loss / val_accuracy would cover only part of the data.
history = model.fit(x_train, y_train, epochs=10,
                    validation_data=(x_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Sauvegarde des résultats

In [13]:
# Persist the whole trained model object (not only its weights) with joblib.
# NOTE(review): the ".z" suffix presumably makes joblib compress the file - confirm.
model_filename = "rnn_sentiment.joblib.z"
joblib.dump(model, filename=model_filename)

Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers\bidirectional
......vars
...layers\bidirectional\backward_layer
......vars
...layers\bidirectional\backward_layer\cell
......vars
.........0
.........1
.........2
...layers\bidirectional\forward_layer
......vars
...layers\bidirectional\forward_layer\cell
......vars
.........0
.........1
.........2
...layers\bidirectional\layer
......vars
...layers\bidirectional\layer\cell
......vars
...layers\bidirectional_1
......vars
...layers\bidirectional_1\backward_layer
......vars
...layers\bidirectional_1\backward_layer\cell
......vars
.........0
.........1
.........2
...layers\bidirectional_1\forward_layer
......vars
...layers\bidirectional_1\forward_layer\cell
......vars
.........0
.........1
.........2
...layers\bidirectional_1\layer
......vars
...layers\bidirectional_1\layer\cell
......vars
...layers\dense
......vars
.........0
.........1
...layers\dense_1
......vars
.........0
.........1
...layers\dropout
......vars

['rnn_sentiment.joblib.z']