In [1]:
import json 
import tensorflow as tf
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score
from tensorflow.keras import backend as K
from keras.models import Model
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint
from keras.layers import Input, Dense, Embedding, Conv1D, Conv2D, MaxPooling1D, MaxPool2D
from keras.layers import Reshape, Flatten, Dropout, Concatenate
from keras.layers import SpatialDropout1D, concatenate, BatchNormalization
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.callbacks import Callback
from keras.optimizers import Adam
from keras.utils import plot_model

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Importer les Train et test pour les abstract 

In [2]:
data_dir = "data/abstract_only"
feature = "abstract"
label = 'label'
x_train = pd.read_csv(f"{data_dir}/train.csv")[feature]
y_train = pd.read_csv(f"{data_dir}/train.csv")[label]

x_test = pd.read_csv(f"{data_dir}/test.csv")[feature]
y_test = pd.read_csv(f"{data_dir}/test.csv")[label]

x_train.shape, y_train.shape, x_test.shape, y_test.shape,

((8000,), (8000,), (2000,), (2000,))

In [3]:
#manipulation de y:catégorie: label
def create_vocab(dt): 
  to_id = {'<PAD>': 0, '<UNK>':1}

  for sent in dt:
    for w in sent: 
      if w not in to_id.keys():
        to_id[w] = len(to_id)

  from_id = {v: k for k, v in to_id.items()}

  vocab = len(to_id.keys())

  return to_id, from_id, vocab

def preprocess_Y(Y, cat_to_id): 
  res = []
  for ex in Y: 
    if ex not in cat_to_id.keys():
      res.append(cat_to_id['<UNK>'])
    else:
      res.append(cat_to_id[ex])
  return np.array(res)

cat_to_id, cat_from_id, cat_vocab = create_vocab([y_train])
y_train_id = preprocess_Y(y_train, cat_to_id)
y_test_id = preprocess_Y(y_test, cat_to_id)

#Manipulation des titres 
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=10000)
tokenizer.fit_on_texts(x_train)

x_train_seq = tokenizer.texts_to_sequences(x_train)
x_test_seq =  tokenizer.texts_to_sequences(x_test)

x_train_seq = tf.keras.preprocessing.sequence.pad_sequences(x_train_seq, maxlen = 888, truncating='post')
x_test_seq = tf.keras.preprocessing.sequence.pad_sequences(x_test_seq, maxlen = 888, truncating='post')

In [4]:
#une fonction pour visualiser accuracy et loss
def plot_history(history, metric="acc"):
    # summarize history for accuracy
    plt.plot(history.history[metric])
    plt.plot(history.history[f'val_{metric}'])
    plt.title(f'model {metric}')
    plt.ylabel(metric)
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()
    # summarize history for loss
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()

In [5]:
# Fonction permettant de charger un embedding 

import numpy as np
import re
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

def load_glove_embeddings(fp, embedding_dim, include_empty_char=True):
    """
    Loads pre-trained word embeddings (GloVe embeddings)
        Inputs: - fp: filepath of pre-trained glove embeddings
                - embedding_dim: dimension of each vector embedding
                - generate_matrix: whether to generate an embedding matrix
        Outputs:
                - word2coefs: Dictionary. Word to its corresponding coefficients
                - word2index: Dictionary. Word to word-index
                - embedding_matrix: Embedding matrix for Keras Embedding layer
    """
    # First, build the "word2coefs" and "word2index"
    word2coefs = {} # word to its corresponding coefficients
    word2index = {} # word to word-index
    with open(fp) as f:
        for idx, line in enumerate(f):
            try:
                data = [x.strip().lower() for x in line.split()]
                word = data[0]
                coefs = np.asarray(data[1:embedding_dim+1], dtype='float32')
                word2coefs[word] = coefs
                if word not in word2index:
                    word2index[word] = len(word2index)
            except Exception as e:
                print('Exception occurred in `load_glove_embeddings`:', e)
                continue
        # End of for loop.
    # End of with open
    if include_empty_char:
        word2index[''] = len(word2index)
    # Second, build the "embedding_matrix"
    # Words not found in embedding index will be all-zeros. Hence, the "+1".
    vocab_size = len(word2coefs)+1 if include_empty_char else len(word2coefs)
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, idx in word2index.items():
        embedding_vec = word2coefs.get(word)
        if embedding_vec is not None and embedding_vec.shape[0]==embedding_dim:
            embedding_matrix[idx] = np.asarray(embedding_vec)
    # return word2coefs, word2index, embedding_matrix
    return word2index, np.asarray(embedding_matrix)

[nltk_data] Downloading package punkt to /home/ahlem/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
# Charger les embeddings à l'aide de la fonction load_glove_embeddings

word2index, embedding_matrix = load_glove_embeddings('glove.6B.50d.txt', embedding_dim=50)

In [7]:
# ecrire une fonction de tokenization custom pour preprocesser les textes

def custom_tokenize(doc):
  res = []
  for ex in word_tokenize(doc): 
    if ex not in word2index.keys():
      res.append(word2index['unk'])
    else:
      res.append(word2index[ex])
  return np.array(res)
    
# Encoder les textes avec la fonction custom
X_train_glove = [custom_tokenize(x) for x in x_train]
print(X_train_glove[0])

X_test_glove = [custom_tokenize(x) for x in x_test]
print(X_test_glove[0])


[  9247   4415   7693  11231     23   7215     24      6      0   9247
   6723     14      7   1013   2741    747    445      4   5998  26431
     46  10645  12168      3    933      5  23925  11231   4276     10
     37  11231    747     17    333   2741   5306      2   2317      1
    197      4   6326      0   9247   4415   7693    747     17    333
     23  10645     24     46  21699    933   8931     56    999      6
      0   9247    610      2     10   3568      0  11231   9009      3
   9247   4415   7693      1     37   1307  10388      7   1999 226671
   2741  12597    622      0  12886  24113  26479   1587      4  33062
      0  12838      3  21566   4341      3  19847   2817      1     42
     14    175      0 201534   2741  12597      2      0  14575    604
     17      0   9247   4415   7693  75816    273     12      1     21
  54193      0  21566  12838      1      0   1225  12597     31    219
  35743    883      5 112878 226671      5      0     68   6223      6
   107

In [8]:
# Padding des sequences

X_train_glove = tf.keras.preprocessing.sequence.pad_sequences(X_train_glove, maxlen = 888, truncating='post')
X_test_glove = tf.keras.preprocessing.sequence.pad_sequences(X_test_glove, maxlen = 888, truncating='post')

# CNN
## Modèle basique
## Embedding non préentrainés

In [9]:
embed_dim = 128
batch_size = 128

model = tf.keras.models.Sequential() # modèle séquentiel
model.add(tf.keras.layers.Embedding(10000, embed_dim,input_length = 888))# couche d'embedding de taille 128
model.add(tf.keras.layers.Conv1D(32,
                 2,
                 padding='valid',
                 activation='relu',
                 strides=1))

model.add(tf.keras.layers.GlobalMaxPooling1D())

model.add(tf.keras.layers.Dense(128))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Activation('relu'))

model.add(tf.keras.layers.Dense(cat_vocab))
model.add(tf.keras.layers.Activation('softmax'))

# Compiler le modèle 

model.compile(loss = 'sparse_categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])

# Afficher le summary du modèle
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 888, 128)          1280000   
_________________________________________________________________
conv1d (Conv1D)              (None, 887, 32)           8224      
_________________________________________________________________
global_max_pooling1d (Global (None, 32)                0         
_________________________________________________________________
dense (Dense)                (None, 128)               4224      
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
activation (Activation)      (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 93)                11997     
__________

In [11]:
# Fitter le modèle 

history = model.fit(x_train_seq, y_train_id, batch_size = batch_size, epochs = 10, validation_data=(x_test_seq, y_test_id))

Train on 8000 samples, validate on 2000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
#visualiser
plot_history(history)

In [12]:
# Evaluer le modèle
model.evaluate(x_test_seq, y_test_id)



[2.65889536857605, 0.38]

# CNN
## Modèle basique
## Embedding GloVE

In [13]:
# Entrainer un modèle en chargeant les poids des embeddings dans le layer Embedding

#embed_dim = 50

batch_size = 128

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1],input_length = 888,
                                    weights=[embedding_matrix], 
                              trainable=True))
model.add(tf.keras.layers.Conv1D(32,
                 2,
                 padding='valid',
                 activation='relu',
                 strides=1))
# we use max pooling:
model.add(tf.keras.layers.GlobalMaxPooling1D())

# We add a vanilla hidden layer:
model.add(tf.keras.layers.Dense(128))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Activation('relu'))

model.add(tf.keras.layers.Dense(cat_vocab))
model.add(tf.keras.layers.Activation('softmax'))

adam = tf.keras.optimizers.Adam(lr=1e-3)
model.compile(loss = 'sparse_categorical_crossentropy', optimizer=adam, metrics = ['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 888, 50)           20000050  
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 887, 32)           3232      
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               4224      
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
activation_2 (Activation)    (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 93)                11997     
__________

In [16]:
history = model.fit(X_train_glove, y_train_id, batch_size = batch_size, epochs = 15, validation_data=(X_test_glove, y_test_id))

Train on 8000 samples, validate on 2000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Evaluer le modèle
model.evaluate(X_test_glove, y_test_id)

# CNN
## Modèle basique, Changement hyperparamètres
## Embedding non préentrainés

In [17]:
embed_dim = 128
batch_size = 128

model = tf.keras.models.Sequential() # modèle séquentiel
model.add(tf.keras.layers.Embedding(10000, embed_dim,input_length = 888))# couche d'embedding de taille 
model.add(tf.keras.layers.Conv1D(256,
                 2,
                 padding='valid',
                 activation='relu',
                 strides=1))

model.add(tf.keras.layers.GlobalMaxPooling1D())

model.add(tf.keras.layers.Dense(128))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Activation('relu'))

model.add(tf.keras.layers.Dropout(0.2))

model.add(tf.keras.layers.Dense(cat_vocab))
model.add(tf.keras.layers.Activation('softmax'))

# Compiler le modèle 
adam = tf.keras.optimizers.Adam(lr=1e-4)
model.compile(loss = 'sparse_categorical_crossentropy', optimizer=adam ,metrics = ['accuracy'])
# Fitter le modèle 
history = model.fit(x_train_seq, y_train_id, batch_size = batch_size, epochs = 50, validation_data=(x_test_seq, y_test_id))

Train on 8000 samples, validate on 2000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [18]:
# Evaluer le modèle
model.evaluate(x_test_seq, y_test_id)



[2.2557374572753908, 0.437]

# CNN
## Modèle basique, Changement hyperparamètres
## Embedding GloVe

In [20]:
#embed_dim = 50
batch_size = 128

model = tf.keras.models.Sequential() # modèle séquentiel
model.add(tf.keras.layers.Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1],input_length = 888,
                                    weights=[embedding_matrix], 
                              trainable=True))# couche d'embedding de taille 
model.add(tf.keras.layers.Conv1D(256,
                 2,
                 padding='valid',
                 activation='relu',
                 strides=1))


model.add(tf.keras.layers.GlobalMaxPooling1D())

model.add(tf.keras.layers.Dense(128))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Activation('relu'))

model.add(tf.keras.layers.Dropout(0.2))

model.add(tf.keras.layers.Dense(cat_vocab))
model.add(tf.keras.layers.Activation('softmax'))

# Compiler le modèle 
adam = tf.keras.optimizers.Adam(lr=1e-4)
model.compile(loss = 'sparse_categorical_crossentropy', optimizer=adam ,metrics = ['accuracy'])
# Fitter le modèle 
history = model.fit(X_train_glove, y_train_id, batch_size = batch_size, epochs = 100, validation_data=(X_test_glove, y_test_id))

Train on 8000 samples, validate on 2000 samples
Epoch 1/70
Epoch 2/70
Epoch 3/70
Epoch 4/70
Epoch 5/70
Epoch 6/70
Epoch 7/70
Epoch 8/70
Epoch 9/70
Epoch 10/70
Epoch 11/70
Epoch 12/70
Epoch 13/70
Epoch 14/70
Epoch 15/70
Epoch 16/70
Epoch 17/70
Epoch 18/70
Epoch 19/70
Epoch 20/70
Epoch 21/70
Epoch 22/70
Epoch 23/70
Epoch 24/70
Epoch 25/70
Epoch 26/70
Epoch 27/70
Epoch 28/70
Epoch 29/70
Epoch 30/70
Epoch 31/70
Epoch 32/70
Epoch 33/70
Epoch 34/70
Epoch 35/70
Epoch 36/70
Epoch 37/70
Epoch 38/70
Epoch 39/70
Epoch 40/70
Epoch 41/70
Epoch 42/70
Epoch 43/70
Epoch 44/70
Epoch 45/70
Epoch 46/70
Epoch 47/70
Epoch 48/70
Epoch 49/70
Epoch 50/70
Epoch 51/70
Epoch 52/70
Epoch 53/70
Epoch 54/70
Epoch 55/70
Epoch 56/70
Epoch 57/70
Epoch 58/70
Epoch 59/70
Epoch 60/70


Epoch 61/70
Epoch 62/70
Epoch 63/70
Epoch 64/70
Epoch 65/70
Epoch 66/70
Epoch 67/70
Epoch 68/70
Epoch 69/70
Epoch 70/70


In [26]:
# Evaluer le modèle
model.evaluate(X_test_glove, y_test_id)



[2.0607304592132567, 0.448]

# CNN
## Modèle CNN multi-channel
## Embedding non préentrainés

In [28]:
def get_cnn_model():
    embedding_dim = 300
    filter_sizes = [2, 3, 5]
    num_filters = 256
    drop = 0.3
    MAX_LENGTH=888
    MAX_NB_WORDS = 10000
    
    inputs = Input(shape=(MAX_LENGTH,), dtype='int32')
    embedding = Embedding(input_dim=MAX_NB_WORDS,
                                output_dim=embedding_dim,
                                input_length=MAX_LENGTH)(inputs)

    reshape = Reshape((MAX_LENGTH, embedding_dim, 1))(embedding)
    conv_0 = Conv2D(num_filters, 
                    kernel_size=(filter_sizes[0], embedding_dim), 
                    padding='valid', kernel_initializer='normal', 
                    activation='relu')(reshape)

    conv_1 = Conv2D(num_filters, 
                    kernel_size=(filter_sizes[1], embedding_dim), 
                    padding='valid', kernel_initializer='normal', 
                    activation='relu')(reshape)
    conv_2 = Conv2D(num_filters, 
                    kernel_size=(filter_sizes[2], embedding_dim), 
                    padding='valid', kernel_initializer='normal', 
                    activation='relu')(reshape)

    maxpool_0 = MaxPool2D(pool_size=(MAX_LENGTH - filter_sizes[0] + 1, 1), 
                          strides=(1,1), padding='valid')(conv_0)

    maxpool_1 = MaxPool2D(pool_size=(MAX_LENGTH - filter_sizes[1] + 1, 1), 
                          strides=(1,1), padding='valid')(conv_1)

    maxpool_2 = MaxPool2D(pool_size=(MAX_LENGTH - filter_sizes[2] + 1, 1), 
                          strides=(1,1), padding='valid')(conv_2)
    concatenated_tensor = Concatenate(axis=1)(
        [maxpool_0, maxpool_1, maxpool_2])
    flatten = Flatten()(concatenated_tensor)
    
    dropout = Dropout(drop)(flatten)
    
    #outpout1=Dense(units=256, activation='relu')(dropout)
    #dropout1 = Dropout(drop)(outpout1)
    
    output = Dense(units=cat_vocab, activation='softmax')(dropout)

    model = Model(inputs=inputs, outputs=output)
    
    adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model.compile(optimizer=adam, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    
    return model

cnn_model_multi_channel = get_cnn_model()

batch_size = 128
epochs = 100
history = cnn_model_multi_channel.fit(x=x_train_seq, 
                    y=y_train_id, 
                    batch_size=batch_size, 
                    validation_data=(x_test_seq, y_test_id),
                    epochs=epochs)

Train on 8000 samples, validate on 2000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100


Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [29]:
# Evaluer le modèle
cnn_model_multi_channel.evaluate(x_test_seq, y_test_id)



[1.9862929067611694, 0.502]

In [None]:
plot_history(history)

# CNN
## Modèle CNN multi-channel
## Embedding GloVe

In [31]:
def get_cnn_model():
    embedding_dim = 50
    filter_sizes = [2, 3, 5]
    num_filters = 256
    drop = 0.3
    MAX_LENGTH=888
    MAX_NB_WORDS = 10000
    
    inputs = Input(shape=(MAX_LENGTH,), dtype='int32')
    embedding = Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1],input_length = 888,
                                    weights=[embedding_matrix], 
                              trainable=True)(inputs)

    reshape = Reshape((MAX_LENGTH, embedding_dim, 1))(embedding)
    conv_0 = Conv2D(num_filters, 
                    kernel_size=(filter_sizes[0], embedding_dim), 
                    padding='valid', kernel_initializer='normal', 
                    activation='relu')(reshape)

    conv_1 = Conv2D(num_filters, 
                    kernel_size=(filter_sizes[1], embedding_dim), 
                    padding='valid', kernel_initializer='normal', 
                    activation='relu')(reshape)
    conv_2 = Conv2D(num_filters, 
                    kernel_size=(filter_sizes[2], embedding_dim), 
                    padding='valid', kernel_initializer='normal', 
                    activation='relu')(reshape)

    maxpool_0 = MaxPool2D(pool_size=(MAX_LENGTH - filter_sizes[0] + 1, 1), 
                          strides=(1,1), padding='valid')(conv_0)

    maxpool_1 = MaxPool2D(pool_size=(MAX_LENGTH - filter_sizes[1] + 1, 1), 
                          strides=(1,1), padding='valid')(conv_1)

    maxpool_2 = MaxPool2D(pool_size=(MAX_LENGTH - filter_sizes[2] + 1, 1), 
                          strides=(1,1), padding='valid')(conv_2)
    concatenated_tensor = Concatenate(axis=1)(
        [maxpool_0, maxpool_1, maxpool_2])
    flatten = Flatten()(concatenated_tensor)
    
    dropout = Dropout(drop)(flatten)
    
    #outpout1=Dense(units=256, activation='relu')(dropout)
    #dropout1 = Dropout(drop)(outpout1)
    
    output = Dense(units=cat_vocab, activation='softmax')(dropout)

    model = Model(inputs=inputs, outputs=output)
    
    adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model.compile(optimizer=adam, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    
    return model

model = get_cnn_model()
batch_size = 128

In [32]:
history = model.fit(X_train_glove, y_train_id, batch_size = batch_size, epochs = 100, validation_data=(X_test_glove, y_test_id))

Train on 8000 samples, validate on 2000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100


Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [33]:
history = model.fit(X_train_glove, y_train_id, batch_size = batch_size, epochs = 20, validation_data=(X_test_glove, y_test_id))

Train on 8000 samples, validate on 2000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [34]:
history = model.fit(X_train_glove, y_train_id, batch_size = batch_size, epochs = 100, validation_data=(X_test_glove, y_test_id))

Train on 8000 samples, validate on 2000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100


Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100

KeyboardInterrupt: 

In [None]:
# Evaluer le modèle
model.evaluate(X_test_glove, y_test_id)

# RNN
## Modèle basique: LSTM
## Embedding non préentrainés

In [40]:
# Créer un réseau à base de LSTM avec au minimum:
# Embedding
# Dropout
# LSTM
# Dropout
# Classifieur

embed_dim = 128
lstm_out = 128
batch_size = 128

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(10000, embed_dim,input_length = 888))
model.add(tf.keras.layers.Dropout(0.3))

model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_out)))
model.add(tf.keras.layers.Dropout(0.3))

model.add(tf.keras.layers.Dense(128,activation='relu'))
model.add(tf.keras.layers.Dropout(0.3))

model.add(tf.keras.layers.Dense(cat_vocab,activation='softmax'))

# Compiler le modèle 
model.compile(loss = 'sparse_categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])

# Afficher le summary du modèle
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 888, 128)          1280000   
_________________________________________________________________
dropout_14 (Dropout)         (None, 888, 128)          0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 256)               263168    
_________________________________________________________________
dropout_15 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_14 (Dense)             (None, 128)               32896     
_________________________________________________________________
dropout_16 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_15 (Dense)             (None, 93)                11997     
Total para

In [41]:
# Fitter le modèle 

history = model.fit(x_train_seq, y_train_id, batch_size = batch_size, epochs = 10, validation_data=(x_test_seq, y_test_id))

Train on 8000 samples, validate on 2000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [42]:
# Evaluer le modèle
model.evaluate(x_test_seq, y_test_id)



[3.5078332405090333, 0.2715]

# RNN
## Modèle basique: LSTM
## Embedding Glove

In [43]:

lstm_out = 128
batch_size = 128

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1],input_length = 888,
                                    weights=[embedding_matrix], 
                              trainable=True))
model.add(tf.keras.layers.Dropout(0.3))

model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_out)))
model.add(tf.keras.layers.Dropout(0.3))

model.add(tf.keras.layers.Dense(128,activation='relu'))
model.add(tf.keras.layers.Dropout(0.3))

model.add(tf.keras.layers.Dense(cat_vocab,activation='softmax'))

# Compiler le modèle 
model.compile(loss = 'sparse_categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])

# Afficher le summary du modèle
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 888, 50)           20000050  
_________________________________________________________________
dropout_17 (Dropout)         (None, 888, 50)           0         
_________________________________________________________________
bidirectional_3 (Bidirection (None, 256)               183296    
_________________________________________________________________
dropout_18 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_16 (Dense)             (None, 128)               32896     
_________________________________________________________________
dropout_19 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_17 (Dense)             (None, 93)                11997     
Total para

In [44]:
history = model.fit(X_train_glove, y_train_id, batch_size = batch_size, epochs = 10, validation_data=(X_test_glove, y_test_id))

Train on 8000 samples, validate on 2000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [45]:
# Evaluer le modèle
model.evaluate(X_test_glove, y_test_id)



[2.7848419647216796, 0.2975]

# RNN
## Modèle basique: GRU
## Embedding non préentrainés

In [55]:
def get_simple_rnn_model():
    embedding_dim = 300
    MAX_NB_WORDS = 10000
    MAX_LENGTH=888
    embedding_matrix = np.random.random((MAX_NB_WORDS, embedding_dim))
    
    inp = Input(shape=(MAX_LENGTH, ))
    x = Embedding(input_dim=MAX_NB_WORDS, output_dim=embedding_dim, input_length=MAX_LENGTH, 
                  weights=[embedding_matrix], trainable=True)(inp)
    x = SpatialDropout1D(0.3)(x)
    x = Bidirectional(GRU(100, return_sequences=True))(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    outp = Dense(cat_vocab, activation="softmax")(conc)
    
    model = Model(inputs=inp, outputs=outp)
    adam = Adam(lr=1e-3)
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer= adam,
                  metrics=['accuracy'])
    return model

rnn_simple_model = get_simple_rnn_model()

In [56]:
batch_size = 128
epochs = 50
history = rnn_simple_model .fit(x=x_train_seq, 
                    y=y_train_id, 
                    batch_size=batch_size, 
                    validation_data=(x_test_seq, y_test_id),
                    epochs=epochs)

Train on 8000 samples, validate on 2000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [57]:
rnn_simple_model.evaluate(x_test_seq, y_test_id)



[3.0843389625549316, 0.405]

# RNN
## Modèle basique: GRU
## Embedding Glove

In [47]:
def get_simple_rnn_model():
    embedding_dim = 50
    MAX_NB_WORDS = 10000
    MAX_LENGTH=888
    
    inp = Input(shape=(MAX_LENGTH, ))
    x = Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1],input_length = 888,
                                    weights=[embedding_matrix], 
                              trainable=True)(inp)
    x = SpatialDropout1D(0.3)(x)
    x = Bidirectional(GRU(100, return_sequences=True))(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    outp = Dense(cat_vocab, activation="softmax")(conc)
    
    model = Model(inputs=inp, outputs=outp)
    adam = Adam(lr=1e-3)
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer= adam,
                  metrics=['accuracy'])
    return model

model = get_simple_rnn_model()

In [48]:
history = model.fit(X_train_glove, y_train_id, batch_size = 128, epochs = 50, validation_data=(X_test_glove, y_test_id))

Train on 8000 samples, validate on 2000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50

KeyboardInterrupt: 

In [49]:
model.evaluate(X_test_glove, y_test_id)



[2.0637245044708252, 0.461]

# CNN_RNN

## Embedding non préentrainés

In [51]:
def get_rnn_cnn_model():
    embedding_dim = 300
    MAX_NB_WORDS = 10000
    MAX_LENGTH=888
    embedding_matrix = np.random.random((MAX_NB_WORDS, embedding_dim))
    inp = Input(shape=(888, ))
    x = Embedding(MAX_NB_WORDS, embedding_dim, weights=[embedding_matrix], input_length=MAX_LENGTH, trainable=True)(inp)
    x = SpatialDropout1D(0.3)(x)
    x = Bidirectional(GRU(100, return_sequences=True))(x)
    x = Conv1D(64, kernel_size = 2, padding = "valid", kernel_initializer = "he_uniform")(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    outp = Dense(cat_vocab, activation="softmax")(conc)
    
    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

model = get_rnn_cnn_model()
batch_size = 128
epochs = 15
history = model.fit(x=x_train_seq, 
                    y=y_train_id, 
                    batch_size=batch_size, 
                    validation_data=(x_test_seq, y_test_id),
                    epochs=epochs)

Train on 8000 samples, validate on 2000 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [52]:
model.evaluate(x_test_seq, y_test_id)



[2.476172409057617, 0.3985]

# CNN_RNN

## Embedding GloVe

In [53]:
def get_rnn_cnn_model():
    embedding_dim = 50
    MAX_NB_WORDS = 10000
    MAX_LENGTH=888
    
    inp = Input(shape=(888, ))
    x = Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1],input_length = 888,
                                    weights=[embedding_matrix], 
                              trainable=True)(inp)
    x = SpatialDropout1D(0.3)(x)
    x = Bidirectional(GRU(100, return_sequences=True))(x)
    x = Conv1D(64, kernel_size = 2, padding = "valid", kernel_initializer = "he_uniform")(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    outp = Dense(cat_vocab, activation="softmax")(conc)
    
    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

model = get_rnn_cnn_model()
batch_size = 128
epochs = 15
history = model.fit(X_train_glove,
                    y_train_id, 
                    batch_size = batch_size,
                    epochs = epochs,
                    validation_data=(X_test_glove, y_test_id))

Train on 8000 samples, validate on 2000 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [54]:
model.evaluate(X_test_glove, y_test_id)



[2.094347032546997, 0.43]