In [2]:
import json 
import tensorflow as tf
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score
from tensorflow.keras import backend as K
from keras.models import Model
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint
from keras.layers import Input, Dense, Embedding, Conv1D, Conv2D, MaxPooling1D, MaxPool2D
from keras.layers import Reshape, Flatten, Dropout, Concatenate
from keras.layers import SpatialDropout1D, concatenate, BatchNormalization
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.callbacks import Callback
from keras.optimizers import Adam
from keras.utils import plot_model

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Importer les Train et test pour les titres 

In [3]:
data_dir = "data/title_only"
feature = "title"
label = 'label'
x_train = pd.read_csv(f"{data_dir}/train.csv")[feature]
y_train = pd.read_csv(f"{data_dir}/train.csv")[label]

x_test = pd.read_csv(f"{data_dir}/test.csv")[feature]
y_test = pd.read_csv(f"{data_dir}/test.csv")[label]

x_train.shape, y_train.shape, x_test.shape, y_test.shape,

((8000,), (8000,), (2000,), (2000,))

In [4]:
#manipulation de y:catégorie
def create_vocab(dt): 
  to_id = {'<PAD>': 0, '<UNK>':1}

  for sent in dt:
    for w in sent: 
      if w not in to_id.keys():
        to_id[w] = len(to_id)

  from_id = {v: k for k, v in to_id.items()}

  vocab = len(to_id.keys())

  return to_id, from_id, vocab

def preprocess_Y(Y, cat_to_id): 
  res = []
  for ex in Y: 
    if ex not in cat_to_id.keys():
      res.append(cat_to_id['<UNK>'])
    else:
      res.append(cat_to_id[ex])
  return np.array(res)

cat_to_id, cat_from_id, cat_vocab = create_vocab([y_train])
y_train_id = preprocess_Y(y_train, cat_to_id)
y_test_id = preprocess_Y(y_test, cat_to_id)

#Manipulation des titres 
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=10000)
tokenizer.fit_on_texts(x_train)

x_train_seq = tokenizer.texts_to_sequences(x_train)
x_test_seq =  tokenizer.texts_to_sequences(x_test)

x_train_seq = tf.keras.preprocessing.sequence.pad_sequences(x_train_seq, maxlen = 40, truncating='post')
x_test_seq = tf.keras.preprocessing.sequence.pad_sequences(x_test_seq, maxlen = 40, truncating='post')

In [4]:
#une fonction pour visualiser accuracy et loss
def plot_history(history, metric="acc"):
    # summarize history for accuracy
    plt.plot(history.history[metric])
    plt.plot(history.history[f'val_{metric}'])
    plt.title(f'model {metric}')
    plt.ylabel(metric)
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()
    # summarize history for loss
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()

In [10]:
tokenizer.word_index

{'of': 1,
 'and': 2,
 'in': 3,
 'for': 4,
 'the': 5,
 'a': 6,
 'using': 7,
 'to': 8,
 'with': 9,
 'neural': 10,
 'based': 11,
 'learning': 12,
 'on': 13,
 'analysis': 14,
 'robotic': 15,
 'network': 16,
 'by': 17,
 'networks': 18,
 'an': 19,
 'assisted': 20,
 'artificial': 21,
 'from': 22,
 'robot': 23,
 'prediction': 24,
 'machine': 25,
 'system': 26,
 'cancer': 27,
 'deep': 28,
 'data': 29,
 'laparoscopic': 30,
 'classification': 31,
 'surgery': 32,
 'model': 33,
 'approach': 34,
 'detection': 35,
 'protein': 36,
 'study': 37,
 'prostatectomy': 38,
 'human': 39,
 'fuzzy': 40,
 'radical': 41,
 'gene': 42,
 'computer': 43,
 'patients': 44,
 'identification': 45,
 'disease': 46,
 'automated': 47,
 'support': 48,
 'method': 49,
 'images': 50,
 'application': 51,
 'brain': 52,
 'diagnosis': 53,
 'image': 54,
 'control': 55,
 'novel': 56,
 'predicting': 57,
 'imaging': 58,
 'models': 59,
 'during': 60,
 'clinical': 61,
 'comparison': 62,
 'new': 63,
 'development': 64,
 'expression': 65,
 

In [5]:
# Fonction permettant de charger un embedding 

import numpy as np
import re
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

def load_glove_embeddings(fp, embedding_dim, include_empty_char=True):
    """
    Loads pre-trained word embeddings (GloVe embeddings)
        Inputs: - fp: filepath of pre-trained glove embeddings
                - embedding_dim: dimension of each vector embedding
                - generate_matrix: whether to generate an embedding matrix
        Outputs:
                - word2coefs: Dictionary. Word to its corresponding coefficients
                - word2index: Dictionary. Word to word-index
                - embedding_matrix: Embedding matrix for Keras Embedding layer
    """
    # First, build the "word2coefs" and "word2index"
    word2coefs = {} # word to its corresponding coefficients
    word2index = {} # word to word-index
    with open(fp) as f:
        for idx, line in enumerate(f):
            try:
                data = [x.strip().lower() for x in line.split()]
                word = data[0]
                coefs = np.asarray(data[1:embedding_dim+1], dtype='float32')
                word2coefs[word] = coefs
                if word not in word2index:
                    word2index[word] = len(word2index)
            except Exception as e:
                print('Exception occurred in `load_glove_embeddings`:', e)
                continue
        # End of for loop.
    # End of with open
    if include_empty_char:
        word2index[''] = len(word2index)
    # Second, build the "embedding_matrix"
    # Words not found in embedding index will be all-zeros. Hence, the "+1".
    vocab_size = len(word2coefs)+1 if include_empty_char else len(word2coefs)
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, idx in word2index.items():
        embedding_vec = word2coefs.get(word)
        if embedding_vec is not None and embedding_vec.shape[0]==embedding_dim:
            embedding_matrix[idx] = np.asarray(embedding_vec)
    # return word2coefs, word2index, embedding_matrix
    return word2index, np.asarray(embedding_matrix)

[nltk_data] Downloading package punkt to /home/ahlem/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
# Charger les embeddings à l'aide de la fonction load_glove_embeddings

word2index, embedding_matrix = load_glove_embeddings('glove.6B.50d.txt', embedding_dim=50)

In [12]:
word2index['unk']

201534

In [19]:
# ecrire une fonction de tokenization custom pour preprocesser les textes
unknown_words = set()
known_words = set()

def custom_tokenize(doc):
  res = []
  for ex in word_tokenize(doc): 
    if ex not in word2index.keys():
      res.append(word2index['unk'])
      unknown_words.add(ex)
    else:
      res.append(word2index[ex])
      known_words.add(ex)
  return np.array(res)
    
# Encoder les textes avec la fonction custom
X_train_glove = [custom_tokenize(x) for x in x_train]
print(x_train[0])
print(X_train_glove[0])

X_test_glove = [custom_tokenize(x) for x in x_test]
print(X_test_glove[0])


an improved kernel based extreme learning machine for robot execution failures.
[   29  2338 21566   243  3828  2741  2358    10  9247  4415  7693     2]
[ 6707 13867  2233     6   426  2022  2817     2]


In [22]:
print(f"With Glove, there is {len(unknown_words)} unknown words and {len(known_words)} known words.")

With Glove, there is 4592 unknown words and 11324 known words.


In [8]:
# Padding des sequences

X_train_glove = tf.keras.preprocessing.sequence.pad_sequences(X_train_glove, maxlen = 40, truncating='post')
X_test_glove = tf.keras.preprocessing.sequence.pad_sequences(X_test_glove, maxlen = 40, truncating='post')

# CNN
## Modèle basique
## Embedding non préentrainés

In [10]:
embed_dim = 128
batch_size = 128

model = tf.keras.models.Sequential() # modèle séquentiel
model.add(tf.keras.layers.Embedding(10000, embed_dim,input_length = 40))# couche d'embedding de taille 128
model.add(tf.keras.layers.Conv1D(32,
                 2,
                 padding='valid',
                 activation='relu',
                 strides=1))

model.add(tf.keras.layers.GlobalMaxPooling1D())

model.add(tf.keras.layers.Dense(128))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Activation('relu'))

model.add(tf.keras.layers.Dense(cat_vocab))
model.add(tf.keras.layers.Activation('softmax'))

# Compiler le modèle 

model.compile(loss = 'sparse_categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])

# Afficher le summary du modèle
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 40, 128)           1280000   
_________________________________________________________________
conv1d (Conv1D)              (None, 39, 32)            8224      
_________________________________________________________________
global_max_pooling1d (Global (None, 32)                0         
_________________________________________________________________
dense (Dense)                (None, 128)               4224      
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
activation (Activation)      (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 93)                11997     
__________

In [11]:
# Fitter le modèle 

history = model.fit(x_train_seq, y_train_id, batch_size = batch_size, epochs = 10, validation_data=(x_test_seq, y_test_id))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 8000 samples, validate on 2000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
#visualiser
plot_history(history)

In [12]:
# Evaluer le modèle
model.evaluate(x_test_seq, y_test_id)



[3.485855577468872, 0.3205]

# CNN
## Modèle basique
## Embedding GloVE

In [15]:
# Entrainer un modèle en chargeant les poids des embeddings dans le layer Embedding

#embed_dim = 50

batch_size = 128

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1],input_length = 40,
                                    weights=[embedding_matrix], 
                              trainable=True))
model.add(tf.keras.layers.Conv1D(32,
                 2,
                 padding='valid',
                 activation='relu',
                 strides=1))
# we use max pooling:
model.add(tf.keras.layers.GlobalMaxPooling1D())

# We add a vanilla hidden layer:
model.add(tf.keras.layers.Dense(128))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Activation('relu'))

model.add(tf.keras.layers.Dense(cat_vocab))
model.add(tf.keras.layers.Activation('softmax'))

adam = tf.keras.optimizers.Adam(lr=1e-3)
model.compile(loss = 'sparse_categorical_crossentropy', optimizer=adam, metrics = ['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 40, 50)            20000050  
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 39, 32)            3232      
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 32)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 128)               4224      
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
activation_4 (Activation)    (None, 128)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 93)                11997     
__________

In [16]:
history = model.fit(X_train_glove, y_train_id, batch_size = batch_size, epochs = 10, validation_data=(X_test_glove, y_test_id))

Train on 8000 samples, validate on 2000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [17]:
# Evaluer le modèle
model.evaluate(X_test_glove, y_test_id)



[2.541342189788818, 0.34]

# CNN
## Modèle basique, Changement hyperparamètres
## Embedding non préentrainés

In [22]:
embed_dim = 128
batch_size = 128

model = tf.keras.models.Sequential() # modèle séquentiel
model.add(tf.keras.layers.Embedding(10000, embed_dim,input_length = 40))# couche d'embedding de taille 
model.add(tf.keras.layers.Conv1D(256,
                 2,
                 padding='valid',
                 activation='relu',
                 strides=1))

model.add(tf.keras.layers.GlobalMaxPooling1D())

model.add(tf.keras.layers.Dense(128))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Activation('relu'))

model.add(tf.keras.layers.Dropout(0.2))

model.add(tf.keras.layers.Dense(cat_vocab))
model.add(tf.keras.layers.Activation('softmax'))

# Compiler le modèle 
adam = tf.keras.optimizers.Adam(lr=1e-4)
model.compile(loss = 'sparse_categorical_crossentropy', optimizer=adam ,metrics = ['accuracy'])
# Fitter le modèle 
history = model.fit(x_train_seq, y_train_id, batch_size = batch_size, epochs = 50, validation_data=(x_test_seq, y_test_id))

Train on 8000 samples, validate on 2000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [23]:
# Evaluer le modèle
model.evaluate(x_test_seq, y_test_id)



[2.938798152923584, 0.3705]

# CNN
## Modèle basique, Changement hyperparamètres
## Embedding GloVe

In [24]:
#embed_dim = 50
batch_size = 128

model = tf.keras.models.Sequential() # modèle séquentiel
model.add(tf.keras.layers.Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1],input_length = 40,
                                    weights=[embedding_matrix], 
                              trainable=True))# couche d'embedding de taille 
model.add(tf.keras.layers.Conv1D(256,
                 2,
                 padding='valid',
                 activation='relu',
                 strides=1))


model.add(tf.keras.layers.GlobalMaxPooling1D())

model.add(tf.keras.layers.Dense(128))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Activation('relu'))

model.add(tf.keras.layers.Dropout(0.2))

model.add(tf.keras.layers.Dense(cat_vocab))
model.add(tf.keras.layers.Activation('softmax'))

# Compiler le modèle 
adam = tf.keras.optimizers.Adam(lr=1e-4)
model.compile(loss = 'sparse_categorical_crossentropy', optimizer=adam ,metrics = ['accuracy'])
# Fitter le modèle 
history = model.fit(X_train_glove, y_train_id, batch_size = batch_size, epochs = 50, validation_data=(X_test_glove, y_test_id))

Train on 8000 samples, validate on 2000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [26]:
# Evaluer le modèle
model.evaluate(X_test_glove, y_test_id)



[2.3543439178466796, 0.377]

# CNN
## Modèle CNN multi-channel
## Embedding non préentrainés

In [9]:
def get_cnn_model():
    embedding_dim = 300
    filter_sizes = [2, 3, 5]
    num_filters = 256
    drop = 0.3
    MAX_LENGTH=40
    MAX_NB_WORDS = 10000
    
    inputs = Input(shape=(MAX_LENGTH,), dtype='int32')
    embedding = Embedding(input_dim=MAX_NB_WORDS,
                                output_dim=embedding_dim,
                                input_length=MAX_LENGTH)(inputs)

    reshape = Reshape((MAX_LENGTH, embedding_dim, 1))(embedding)
    conv_0 = Conv2D(num_filters, 
                    kernel_size=(filter_sizes[0], embedding_dim), 
                    padding='valid', kernel_initializer='normal', 
                    activation='relu')(reshape)

    conv_1 = Conv2D(num_filters, 
                    kernel_size=(filter_sizes[1], embedding_dim), 
                    padding='valid', kernel_initializer='normal', 
                    activation='relu')(reshape)
    conv_2 = Conv2D(num_filters, 
                    kernel_size=(filter_sizes[2], embedding_dim), 
                    padding='valid', kernel_initializer='normal', 
                    activation='relu')(reshape)

    maxpool_0 = MaxPool2D(pool_size=(MAX_LENGTH - filter_sizes[0] + 1, 1), 
                          strides=(1,1), padding='valid')(conv_0)

    maxpool_1 = MaxPool2D(pool_size=(MAX_LENGTH - filter_sizes[1] + 1, 1), 
                          strides=(1,1), padding='valid')(conv_1)

    maxpool_2 = MaxPool2D(pool_size=(MAX_LENGTH - filter_sizes[2] + 1, 1), 
                          strides=(1,1), padding='valid')(conv_2)
    concatenated_tensor = Concatenate(axis=1)(
        [maxpool_0, maxpool_1, maxpool_2])
    flatten = Flatten()(concatenated_tensor)
    
    dropout = Dropout(drop)(flatten)
    
    #outpout1=Dense(units=256, activation='relu')(dropout)
    #dropout1 = Dropout(drop)(outpout1)
    
    output = Dense(units=cat_vocab, activation='softmax')(dropout)

    model = Model(inputs=inputs, outputs=output)
    
    adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model.compile(optimizer=adam, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    
    return model

cnn_model_multi_channel = get_cnn_model()

batch_size = 128
epochs = 45
history = cnn_model_multi_channel.fit(x=x_train_seq, 
                    y=y_train_id, 
                    batch_size=batch_size, 
                    validation_data=(x_test_seq, y_test_id),
                    epochs=epochs)

Train on 8000 samples, validate on 2000 samples
Epoch 1/45
Epoch 2/45
Epoch 3/45
Epoch 4/45
Epoch 5/45
Epoch 6/45
Epoch 7/45
Epoch 8/45
Epoch 9/45
Epoch 10/45
Epoch 11/45
Epoch 12/45
Epoch 13/45
Epoch 14/45
Epoch 15/45
Epoch 16/45
Epoch 17/45
Epoch 18/45
Epoch 19/45
Epoch 20/45
Epoch 21/45
Epoch 22/45
Epoch 23/45
Epoch 24/45
Epoch 25/45
Epoch 26/45
Epoch 27/45
Epoch 28/45
Epoch 29/45
Epoch 30/45
Epoch 31/45
Epoch 32/45
Epoch 33/45
Epoch 34/45
Epoch 35/45
Epoch 36/45
Epoch 37/45
Epoch 38/45
Epoch 39/45
Epoch 40/45
Epoch 41/45
Epoch 42/45
Epoch 43/45
Epoch 44/45
Epoch 45/45


In [10]:
# Evaluer le modèle
cnn_model_multi_channel.evaluate(x_test_seq, y_test_id)



[2.5053614768981936, 0.403]

In [None]:
plot_history(history)

# CNN
## Modèle CNN multi-channel
## Embedding GloVe

In [9]:
embedding_matrix.shape

(400001, 50)

In [11]:
def get_cnn_model():
    embedding_dim = 50
    filter_sizes = [2, 3, 5]
    num_filters = 256
    drop = 0.3
    MAX_LENGTH=40
    MAX_NB_WORDS = 10000
    
    inputs = Input(shape=(MAX_LENGTH,), dtype='int32')
    embedding = Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1],input_length = 40,
                                    weights=[embedding_matrix], 
                              trainable=True)(inputs)

    reshape = Reshape((MAX_LENGTH, embedding_dim, 1))(embedding)
    conv_0 = Conv2D(num_filters, 
                    kernel_size=(filter_sizes[0], embedding_dim), 
                    padding='valid', kernel_initializer='normal', 
                    activation='relu')(reshape)

    conv_1 = Conv2D(num_filters, 
                    kernel_size=(filter_sizes[1], embedding_dim), 
                    padding='valid', kernel_initializer='normal', 
                    activation='relu')(reshape)
    conv_2 = Conv2D(num_filters, 
                    kernel_size=(filter_sizes[2], embedding_dim), 
                    padding='valid', kernel_initializer='normal', 
                    activation='relu')(reshape)

    maxpool_0 = MaxPool2D(pool_size=(MAX_LENGTH - filter_sizes[0] + 1, 1), 
                          strides=(1,1), padding='valid')(conv_0)

    maxpool_1 = MaxPool2D(pool_size=(MAX_LENGTH - filter_sizes[1] + 1, 1), 
                          strides=(1,1), padding='valid')(conv_1)

    maxpool_2 = MaxPool2D(pool_size=(MAX_LENGTH - filter_sizes[2] + 1, 1), 
                          strides=(1,1), padding='valid')(conv_2)
    concatenated_tensor = Concatenate(axis=1)(
        [maxpool_0, maxpool_1, maxpool_2])
    flatten = Flatten()(concatenated_tensor)
    
    dropout = Dropout(drop)(flatten)
    
    #outpout1=Dense(units=256, activation='relu')(dropout)
    #dropout1 = Dropout(drop)(outpout1)
    
    output = Dense(units=cat_vocab, activation='softmax')(dropout)

    model = Model(inputs=inputs, outputs=output)
    
    adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model.compile(optimizer=adam, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    
    return model

model = get_cnn_model()
batch_size = 128

In [12]:
history = model.fit(X_train_glove, y_train_id, batch_size = batch_size, epochs = 45, validation_data=(X_test_glove, y_test_id))

Train on 8000 samples, validate on 2000 samples
Epoch 1/45
Epoch 2/45
Epoch 3/45
Epoch 4/45
Epoch 5/45
Epoch 6/45
Epoch 7/45
Epoch 8/45
Epoch 9/45
Epoch 10/45
Epoch 11/45
Epoch 12/45
Epoch 13/45
Epoch 14/45
Epoch 15/45
Epoch 16/45
Epoch 17/45
Epoch 18/45
Epoch 19/45
Epoch 20/45
Epoch 21/45
Epoch 22/45
Epoch 23/45
Epoch 24/45
Epoch 25/45
Epoch 26/45
Epoch 27/45
Epoch 28/45
Epoch 29/45
Epoch 30/45
Epoch 31/45
Epoch 32/45
Epoch 33/45
Epoch 34/45
Epoch 35/45
Epoch 36/45
Epoch 37/45
Epoch 38/45
Epoch 39/45
Epoch 40/45
Epoch 41/45
Epoch 42/45
Epoch 43/45
Epoch 44/45
Epoch 45/45


In [14]:
history = model.fit(X_train_glove, y_train_id, batch_size = batch_size, epochs = 45, validation_data=(X_test_glove, y_test_id))

Train on 8000 samples, validate on 2000 samples
Epoch 1/45
Epoch 2/45
Epoch 3/45
Epoch 4/45
Epoch 5/45
Epoch 6/45
Epoch 7/45
Epoch 8/45
Epoch 9/45
Epoch 10/45
Epoch 11/45
Epoch 12/45
Epoch 13/45
Epoch 14/45
Epoch 15/45
Epoch 16/45
Epoch 17/45
Epoch 18/45
Epoch 19/45
Epoch 20/45
Epoch 21/45
Epoch 22/45
Epoch 23/45
Epoch 24/45
Epoch 25/45
Epoch 26/45
Epoch 27/45
Epoch 28/45
Epoch 29/45
Epoch 30/45
Epoch 31/45
Epoch 32/45
Epoch 33/45
Epoch 34/45
Epoch 35/45
Epoch 36/45
Epoch 37/45
Epoch 38/45
Epoch 39/45
Epoch 40/45
Epoch 41/45
Epoch 42/45
Epoch 43/45
Epoch 44/45
Epoch 45/45


In [15]:
# Evaluer le modèle
model.evaluate(X_test_glove, y_test_id)



[2.2661995372772217, 0.402]

# RNN
## Modèle basique: LSTM
## Embedding non préentrainés

In [18]:
# Créer un réseau à base de LSTM avec au minimum:
# Embedding
# Dropout
# LSTM
# Dropout
# Classifieur

embed_dim = 128
lstm_out = 128
batch_size = 128

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(10000, embed_dim,input_length = 40))
model.add(tf.keras.layers.Dropout(0.3))

model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_out)))
model.add(tf.keras.layers.Dropout(0.3))

model.add(tf.keras.layers.Dense(128,activation='relu'))
model.add(tf.keras.layers.Dropout(0.3))

model.add(tf.keras.layers.Dense(cat_vocab,activation='softmax'))

# Compiler le modèle 
model.compile(loss = 'sparse_categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])

# Afficher le summary du modèle
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 40, 128)           1280000   
_________________________________________________________________
dropout_3 (Dropout)          (None, 40, 128)           0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               263168    
_________________________________________________________________
dropout_4 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               32896     
_________________________________________________________________
dropout_5 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 93)                11997     
Total para

In [19]:
# Fitter le modèle 

history = model.fit(x_train_seq, y_train_id, batch_size = batch_size, epochs = 5, validation_data=(x_test_seq, y_test_id))

Train on 8000 samples, validate on 2000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [20]:
# Evaluer le modèle
model.evaluate(x_test_seq, y_test_id)



[2.866437623977661, 0.3265]

# RNN
## Modèle basique: LSTM
## Embedding Glove

In [23]:

lstm_out = 128
batch_size = 128

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1],input_length = 40,
                                    weights=[embedding_matrix], 
                              trainable=True))
model.add(tf.keras.layers.Dropout(0.3))

model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_out)))
model.add(tf.keras.layers.Dropout(0.3))

model.add(tf.keras.layers.Dense(128,activation='relu'))
model.add(tf.keras.layers.Dropout(0.3))

model.add(tf.keras.layers.Dense(cat_vocab,activation='softmax'))

# Compiler le modèle 
model.compile(loss = 'sparse_categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])

# Afficher le summary du modèle
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 40, 50)            20000050  
_________________________________________________________________
dropout_9 (Dropout)          (None, 40, 50)            0         
_________________________________________________________________
bidirectional_3 (Bidirection (None, 256)               183296    
_________________________________________________________________
dropout_10 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 128)               32896     
_________________________________________________________________
dropout_11 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 93)                11997     
Total para

In [24]:
history = model.fit(X_train_glove, y_train_id, batch_size = batch_size, epochs = 18, validation_data=(X_test_glove, y_test_id))

Train on 8000 samples, validate on 2000 samples
Epoch 1/18
Epoch 2/18
Epoch 3/18
Epoch 4/18
Epoch 5/18
Epoch 6/18
Epoch 7/18
Epoch 8/18
Epoch 9/18
Epoch 10/18
Epoch 11/18
Epoch 12/18
Epoch 13/18
Epoch 14/18
Epoch 15/18
Epoch 16/18
Epoch 17/18
Epoch 18/18


In [25]:
# Evaluer le modèle
model.evaluate(X_test_glove, y_test_id)



[2.340456693649292, 0.399]

# RNN
## Modèle basique: GRU
## Embedding non préentrainés

In [28]:
def get_simple_rnn_model():
    embedding_dim = 300
    MAX_NB_WORDS = 10000
    MAX_LENGTH=40
    embedding_matrix = np.random.random((MAX_NB_WORDS, embedding_dim))
    
    inp = Input(shape=(MAX_LENGTH, ))
    x = Embedding(input_dim=MAX_NB_WORDS, output_dim=embedding_dim, input_length=MAX_LENGTH, 
                  weights=[embedding_matrix], trainable=True)(inp)
    x = SpatialDropout1D(0.3)(x)
    x = Bidirectional(GRU(100, return_sequences=True))(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    outp = Dense(cat_vocab, activation="softmax")(conc)
    
    model = Model(inputs=inp, outputs=outp)
    adam = Adam(lr=1e-4)
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer= adam,
                  metrics=['accuracy'])
    return model

rnn_simple_model = get_simple_rnn_model()

In [29]:
batch_size = 128
epochs = 150
history = rnn_simple_model .fit(x=x_train_seq, 
                    y=y_train_id, 
                    batch_size=batch_size, 
                    validation_data=(x_test_seq, y_test_id),
                    epochs=epochs)

Train on 8000 samples, validate on 2000 samples
Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150


Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78/150
Epoch 79/150
Epoch 80/150
Epoch 81/150
Epoch 82/150
Epoch 83/150
Epoch 84/150
Epoch 85/150
Epoch 86/150
Epoch 87/150
Epoch 88/150
Epoch 89/150
Epoch 90/150
Epoch 91/150
Epoch 92/150
Epoch 93/150
Epoch 94/150
Epoch 95/150
Epoch 96/150
Epoch 97/150
Epoch 98/150
Epoch 99/150
Epoch 100/150
Epoch 101/150
Epoch 102/150
Epoch 103/150
Epoch 104/150
Epoch 105/150
Epoch 106/150
Epoch 107/150
Epoch 108/150
Epoch 109/150
Epoch 110/150
Epoch 111/150
Epoch 112/150
Epoch 113/150
Epoch 114/150
Epoch 115/150
Epoch 116/150
Epoch 117/150
Epoch 118/150
Epoch 119/150
Epoch 120/150
Epoch 121/150
Epoch 122/150
Epoch 123/150
Epoch 124/150
Epoch 125/150
Epoch 126/150
Epoch 127/150
Epoch 128/150
Epoch 129/150
Epoch 130/150
Epoch 131/150
Epoch 132/150
Epoch 133/150
Epoch 134/150
Epo

In [30]:
rnn_simple_model.evaluate(x_test_seq, y_test_id)



[2.580115997314453, 0.38]

# RNN
## Modèle basique: GRU
## Embedding Glove

In [31]:
def get_simple_rnn_model():
    embedding_dim = 50
    MAX_NB_WORDS = 10000
    MAX_LENGTH=40
    
    inp = Input(shape=(MAX_LENGTH, ))
    x = Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1],input_length = 40,
                                    weights=[embedding_matrix], 
                              trainable=True)(inp)
    x = SpatialDropout1D(0.3)(x)
    x = Bidirectional(GRU(100, return_sequences=True))(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    outp = Dense(cat_vocab, activation="softmax")(conc)
    
    model = Model(inputs=inp, outputs=outp)
    adam = Adam(lr=1e-4)
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer= adam,
                  metrics=['accuracy'])
    return model

model = get_simple_rnn_model()

In [32]:
history = model.fit(X_train_glove, y_train_id, batch_size = 128, epochs = 150, validation_data=(X_test_glove, y_test_id))

Train on 8000 samples, validate on 2000 samples
Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150


Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78/150
Epoch 79/150
Epoch 80/150
Epoch 81/150
Epoch 82/150
Epoch 83/150
Epoch 84/150
Epoch 85/150
Epoch 86/150
Epoch 87/150
Epoch 88/150
Epoch 89/150
Epoch 90/150
Epoch 91/150
Epoch 92/150
Epoch 93/150
Epoch 94/150
Epoch 95/150
Epoch 96/150
Epoch 97/150
Epoch 98/150
Epoch 99/150
Epoch 100/150
Epoch 101/150
Epoch 102/150
Epoch 103/150
Epoch 104/150
Epoch 105/150
Epoch 106/150
Epoch 107/150
Epoch 108/150
Epoch 109/150
Epoch 110/150
Epoch 111/150
Epoch 112/150
Epoch 113/150
Epoch 114/150
Epoch 115/150
Epoch 116/150
Epoch 117/150
Epoch 118/150
Epoch 119/150
Epoch 120/150
Epoch 121/150
Epoch 122/150
Epoch 123/150
Epoch 124/150
Epoch 125/150
Epoch 126/150
Epoch 127/150
Epoch 128/150
Epoch 129/150
Epoch 130/150
Epoch 131/150
Epoch 132/150
Epoch 133/150
Epoch 134/150
Epo

In [33]:
model.evaluate(X_test_glove, y_test_id)



[2.2582788410186767, 0.395]

# CNN_RNN

## Embedding non préentrainés

In [34]:
def get_rnn_cnn_model():
    embedding_dim = 300
    MAX_NB_WORDS = 10000
    MAX_LENGTH=40
    embedding_matrix = np.random.random((MAX_NB_WORDS, embedding_dim))
    inp = Input(shape=(40, ))
    x = Embedding(MAX_NB_WORDS, embedding_dim, weights=[embedding_matrix], input_length=MAX_LENGTH, trainable=True)(inp)
    x = SpatialDropout1D(0.3)(x)
    x = Bidirectional(GRU(100, return_sequences=True))(x)
    x = Conv1D(64, kernel_size = 2, padding = "valid", kernel_initializer = "he_uniform")(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    outp = Dense(cat_vocab, activation="softmax")(conc)
    
    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

model = get_rnn_cnn_model()
batch_size = 128
epochs = 15
history = model.fit(x=x_train_seq, 
                    y=y_train_id, 
                    batch_size=batch_size, 
                    validation_data=(x_test_seq, y_test_id),
                    epochs=epochs)

Train on 8000 samples, validate on 2000 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [35]:
model.evaluate(x_test_seq, y_test_id)



[2.803877523422241, 0.349]

# CNN_RNN

## Embedding GloVe

In [36]:
def get_rnn_cnn_model():
    embedding_dim = 50
    MAX_NB_WORDS = 10000
    MAX_LENGTH=40
    
    inp = Input(shape=(40, ))
    x = Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1],input_length = 40,
                                    weights=[embedding_matrix], 
                              trainable=True)(inp)
    x = SpatialDropout1D(0.3)(x)
    x = Bidirectional(GRU(100, return_sequences=True))(x)
    x = Conv1D(64, kernel_size = 2, padding = "valid", kernel_initializer = "he_uniform")(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    outp = Dense(cat_vocab, activation="softmax")(conc)
    
    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

model = get_rnn_cnn_model()
batch_size = 128
epochs = 15
history = model.fit(X_train_glove,
                    y_train_id, 
                    batch_size = batch_size,
                    epochs = epochs,
                    validation_data=(X_test_glove, y_test_id))

Train on 8000 samples, validate on 2000 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [37]:
model.evaluate(X_test_glove, y_test_id)



[2.3368661251068117, 0.3885]