#  DEEPLEARNING IMAGES & TEXT

In [1]:
import glob
import pandas as pd
import tensorflow as tf


from sklearn.model_selection import train_test_split
from tqdm import tqdm
import numpy as np

import os #Miscellaneous operating system interfaces
#https://docs.python.org/3/library/os.html
#get current working directory
path = os.getcwd() + '\\images\\image_train'
path


'C:\\Users\\Edgar\\Documents\\Rakuten\\images\\image_train'

# TEXT

## NON TRAITÉ (RNN_v5)

Données textuelles:

In [2]:
X = pd.read_csv(r'C:\Users\Edgar\Documents\Rakuten\X_train_update.csv',index_col =0)
y = pd.read_csv(r'C:\Users\Edgar\Documents\Rakuten\Y_train_CVw08PX.csv',index_col=0).squeeze().map(str)

In [3]:
X.fillna('',inplace=True)
X['text'] = X.apply(lambda line: line['designation'] + line['description'],axis=1)
X.head(3)

Unnamed: 0,designation,description,productid,imageid,text
0,Olivia: Personalisiertes Notizbuch / 150 Seite...,,3804725264,1263597046,Olivia: Personalisiertes Notizbuch / 150 Seite...
1,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...,,436067568,1008141237,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...
2,Grand Stylet Ergonomique Bleu Gamepad Nintendo...,PILOT STYLE Touch Pen de marque Speedlink est ...,201115110,938777978,Grand Stylet Ergonomique Bleu Gamepad Nintendo...


Separer les données en train & text:

In [4]:
# Importer la classe train_test 
from sklearn.model_selection import train_test_split

# Séparer le jeu de données en données d'entraînement et données test 
X_train_text, X_test_text, y_train, y_test = train_test_split(X,y, test_size=0.2,random_state=42)

X_train_text.head(3)

Unnamed: 0,designation,description,productid,imageid,text
60735,Carte Postale Typo Aimer - Kiub,Carte postale tendance de la collection Typo d...,2825941333,1208783386,Carte Postale Typo Aimer - KiubCarte postale t...
9118,Garçon - Le Jeu De Plateau !,A propos : Il s¿agit d¿un jeu de cartes dans l...,89102802,856119038,Garçon - Le Jeu De Plateau !A propos : Il s¿ag...
55855,Royaume Des Animaux Ab À Asc,,197015072,936925976,Royaume Des Animaux Ab À Asc


Tokeniser: texte -> sequence entier (index dans un dictionaire):

In [5]:
import tensorflow as tf
# Définition du tokenizer
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=50000)
# Mettre à jour le dictionnaire du tokenizer
tokenizer.fit_on_texts(X_train_text.text)

• Transformer chaque review X_text_train en une séquence d'entiers à l'aide de la méthode texts_to_sequences

In [6]:
X_train = tokenizer.texts_to_sequences(X_train_text.text)
X_test = tokenizer.texts_to_sequences(X_test_text.text)

Stocker le dictionnaire de correspondance:

In [7]:
# Définition des dictionnaires
word2idx = tokenizer.word_index
idx2word = tokenizer.index_word
vocab_size = tokenizer.num_words

• Transfomer la liste de sequence X_train en tableau numpy à l'aide de la fonction pad_sequences

In [8]:
maxlen = 500
X_train_txt = tf.keras.preprocessing.sequence.pad_sequences(X_train, maxlen=maxlen, padding='post')
X_test_txt = tf.keras.preprocessing.sequence.pad_sequences(X_test, maxlen=maxlen, padding='post')

In [9]:
X_train_txt

array([[  198, 10698, 40528, ...,     0,     0,     0],
       [ 2492,    10,    70, ...,     0,     0,     0],
       [ 3611,    17,   461, ...,     0,     0,     0],
       ...,
       [ 1802,  7702,  1366, ...,     0,     0,     0],
       [  268,  3140,  6942, ...,     0,     0,     0],
       [15137,  5659,   219, ...,     0,     0,     0]])

In [10]:
import json 

with open('classes.json') as f:
    categories = json.load(f)

i = 0
y_dict = {}
for category in categories:
    y_train = y_train.replace(category,categories[category])
    y_test = y_test.replace(category,categories[category])
    y_dict[i] = category
    i+=1

In [11]:
categories

{'10': 0,
 '1140': 1,
 '1160': 2,
 '1180': 3,
 '1280': 4,
 '1281': 5,
 '1300': 6,
 '1301': 7,
 '1302': 8,
 '1320': 9,
 '1560': 10,
 '1920': 11,
 '1940': 12,
 '2060': 13,
 '2220': 14,
 '2280': 15,
 '2403': 16,
 '2462': 17,
 '2522': 18,
 '2582': 19,
 '2583': 20,
 '2585': 21,
 '2705': 22,
 '2905': 23,
 '40': 24,
 '50': 25,
 '60': 26}

In [12]:
y_train[:10]

60735     9
9118      5
55855    16
42138     8
10948     0
10697     9
64862    22
19280     6
61325    10
47489     4
Name: prdtypecode, dtype: int64

### Dataset

In [13]:
# Dataset de notre jeu de données
text_train_set = tf.data.Dataset.from_tensor_slices((X_train_txt, y_train.values))

text_test_set = tf.data.Dataset.from_tensor_slices((X_test_txt, y_test.values))

In [14]:
# Ajouter la fonction load_image dans le pipeline des opérations. Séparer le résultat en lot de taille 32.
text_train_set = text_train_set.map(lambda text, y: [text, y]).batch(32).repeat(-1)
#dataset = dataset.map(lambda x, y: [load_image(x), y[:-1], y[1:]]).batch(16).repeat(-1)

text_test_set = text_test_set.map(lambda text, y: [text, y]).batch(32).repeat(-1)

### Modele pour classification de texte

In [15]:
model_type = 'RNN'
version = 'v5'
model_name = model_type + '_' +  version
model_path = 'models_output\\' + model_type + '\\' + version + '\\'
model_path_rnn_v5 = model_path

In [16]:
from tensorflow.keras.layers import Input, Embedding, GlobalAveragePooling1D, RNN, GRU, Dense,Dropout

embedding_dim = 256
voc_size_inp = len(tokenizer.word_counts)+1

text_inputs = Input(shape=(maxlen,), dtype='int32',name='input_' + model_name)
x = Embedding(voc_size_inp, embedding_dim, name= 'embed_' + model_name)(text_inputs)
#x = GRU(128, return_sequences=True, name='gru_' + model_name)(x)
x = GRU(128,return_sequences=True, name = 'GRU_' + model_name)(x)
#x = Dense(1024, activation='relu', name='dense_1_' + model_name)(x)
x = Dropout(0.3, name='dropout_1' + model_name)(x)
x = GlobalAveragePooling1D(name='batchnorm' + model_name)(x)
RNN_v5 = Dense(256, activation='relu', name='dense_2_' + model_name)(x)
#x = Dropout(0.3, name='dropout_2_' + model_name)(x)
#text_output = Dense(27, activation='softmax', name='output_' + model_name)(x)

## TEXT TRAITÉ (RNN_v6)

Données textuelles:

In [17]:
#X = pd.read_csv(r'C:\Users\Edgar\Documents\Rakuten\X_train_update.csv',index_col =0)
X_treat = pd.read_csv(r'C:\Users\Edgar\Documents\Rakuten\X_train\X_train_lemma-FR_stop_words-FR_no_num-FR_remove_accents-FR_no_special-FR_lemma-EN_stop_words-EN_stop_words-DE_lemma-DE_steem-FR_steem-EN_steem-DE.csv',index_col =0)
y = pd.read_csv(r'C:\Users\Edgar\Documents\Rakuten\Y_train_CVw08PX.csv',index_col=0).squeeze().map(str)

Nombre de mots par texte:

In [18]:
X_treat.rename(columns={'0':'text'},inplace=True)

Separer les données en train & text:

In [19]:
# Importer la classe train_test 
from sklearn.model_selection import train_test_split

# Séparer le jeu de données en données d'entraînement et données test 
X_train_text_treated, X_test_text_treated, y_train, y_test = train_test_split(X_treat,y, test_size=0.2,random_state=42)

X_train_text.head(3)

Unnamed: 0,designation,description,productid,imageid,text
60735,Carte Postale Typo Aimer - Kiub,Carte postale tendance de la collection Typo d...,2825941333,1208783386,Carte Postale Typo Aimer - KiubCarte postale t...
9118,Garçon - Le Jeu De Plateau !,A propos : Il s¿agit d¿un jeu de cartes dans l...,89102802,856119038,Garçon - Le Jeu De Plateau !A propos : Il s¿ag...
55855,Royaume Des Animaux Ab À Asc,,197015072,936925976,Royaume Des Animaux Ab À Asc


Tokeniser: texte -> sequence entier (index dans un dictionaire):

In [20]:
import tensorflow as tf
# Définition du tokenizer
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=50000)
# Mettre à jour le dictionnaire du tokenizer
tokenizer.fit_on_texts(X_train_text_treated.text)

• Transformer chaque review X_text_train en une séquence d'entiers à l'aide de la méthode texts_to_sequences

In [21]:
X_train_treated = tokenizer.texts_to_sequences(X_train_text_treated.text)
X_test_treated = tokenizer.texts_to_sequences(X_test_text_treated.text)

Stocker le dictionnaire de correspondance:

In [22]:
# Définition des dictionnaires
word2idx_treated = tokenizer.word_index
idx2word_treated = tokenizer.index_word
vocab_size_treated = tokenizer.num_words

• Transfomer la liste de sequence X_train en tableau numpy à l'aide de la fonction pad_sequences

In [23]:
maxlen = 500
X_train_txt_treated = tf.keras.preprocessing.sequence.pad_sequences(X_train_treated, maxlen=maxlen, padding='post')
X_test_txt_treated = tf.keras.preprocessing.sequence.pad_sequences(X_test_treated, maxlen=maxlen, padding='post')

In [24]:
X_train_txt_treated

array([[  112,  2600, 20822, ...,     0,     0,     0],
       [  679,    31,   754, ...,     0,     0,     0],
       [ 2721,   195,   410, ...,     0,     0,     0],
       ...,
       [ 1335,  4387,    63, ...,     0,     0,     0],
       [  208,   736,  4951, ...,     0,     0,     0],
       [ 2665,  1985,   143, ...,     0,     0,     0]])

In [25]:
import json 

with open('classes.json') as f:
    categories = json.load(f)

i = 0
y_dict = {}
for category in categories:
    y_train = y_train.replace(category,categories[category])
    y_test = y_test.replace(category,categories[category])
    y_dict[i] = category
    i+=1

In [26]:
y_train

60735     9
9118      5
55855    16
42138     8
10948     0
         ..
6265     12
54886    13
76820    11
860       8
15795     0
Name: prdtypecode, Length: 67932, dtype: int64

### Dataset

In [27]:
# Dataset de notre jeu de données
treated_text_train_set = tf.data.Dataset.from_tensor_slices((X_train_txt_treated, y_train.values))

treated_text_test_set = tf.data.Dataset.from_tensor_slices((X_test_txt_treated, y_test.values))

In [28]:
# Ajouter la fonction load_image dans le pipeline des opérations. Séparer le résultat en lot de taille 32.
treated_text_train_set = treated_text_train_set.map(lambda text, y: [text, y]).batch(32).repeat(-1)
#dataset = dataset.map(lambda x, y: [load_image(x), y[:-1], y[1:]]).batch(16).repeat(-1)

treated_text_test_set = treated_text_test_set.map(lambda text, y: [text, y]).batch(32).repeat(-1)

### Modele pour classification de texte

In [29]:
model_type = 'RNN'
version = 'v6'
model_name = model_type + '_' +  version
model_path = 'models_output\\' + model_type + '\\' + version + '\\'
model_path_rnn_v6 = model_path

In [30]:
from tensorflow.keras.layers import Input, Embedding, GlobalAveragePooling1D, RNN, GRU, Dense,Dropout

embedding_dim = 256
voc_size_inp = len(tokenizer.word_counts)+1

treated_text_inputs = Input(shape=(maxlen,), dtype='int32',name='input_' + model_name)
x = Embedding(voc_size_inp, embedding_dim, name= 'embed_' + model_name)(treated_text_inputs)
#x = GRU(128, return_sequences=True, name='gru_' + model_name)(x)
x = GRU(128,return_sequences=True, name = 'GRU_' + model_name)(x)
#x = Dense(1024, activation='relu', name='dense_1_' + model_name)(x)
x = Dropout(0.3, name='dropout_1' + model_name)(x)
x = GlobalAveragePooling1D(name='batchnorm' + model_name)(x)
RNN_v6 = Dense(256, activation='relu', name='dense_2_' + model_name)(x)
#x = Dropout(0.3, name='dropout_2_' + model_name)(x)
#text_output = Dense(27, activation='softmax', name='output_' + model_name)(x)

## IMAGES (EffNetB1)

Recuperer les données images:

In [31]:
X = pd.read_csv('X_train_update.csv',index_col=0)
y = pd.read_csv('Y_train_CVw08PX.csv',index_col=0).squeeze().map(str)

#Create a column with the name of the picture
X['image_name'] = 'image_' + X['imageid'].map(str) + '_product_' + X['productid'].map(str) + '.jpg'
X['image_path'] = path + r'\image_' + X['imageid'].map(str) + '_product_' + X['productid'].map(str) + '.jpg'
print(X['image_name'].loc[0])
print(X['image_path'].loc[0])

image_1263597046_product_3804725264.jpg
C:\Users\Edgar\Documents\Rakuten\images\image_train\image_1263597046_product_3804725264.jpg


Concatener X_train et les labels:

In [32]:
X = pd.concat([X,y],axis=1)
X.head(3)

Unnamed: 0,designation,description,productid,imageid,image_name,image_path,prdtypecode
0,Olivia: Personalisiertes Notizbuch / 150 Seite...,,3804725264,1263597046,image_1263597046_product_3804725264.jpg,C:\Users\Edgar\Documents\Rakuten\images\image_...,10
1,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...,,436067568,1008141237,image_1008141237_product_436067568.jpg,C:\Users\Edgar\Documents\Rakuten\images\image_...,2280
2,Grand Stylet Ergonomique Bleu Gamepad Nintendo...,PILOT STYLE Touch Pen de marque Speedlink est ...,201115110,938777978,image_938777978_product_201115110.jpg,C:\Users\Edgar\Documents\Rakuten\images\image_...,50


Une fois nous avons un dataset de la taille desirée on peut le séparer en train et test sets:

In [33]:
X_train_img, X_test_img, y_train, y_test = train_test_split(X[['image_name','prdtypecode']], X.prdtypecode ,test_size=0.2, random_state=42)
#X_train_path, X_test_path, y_train, y_test = train_test_split(X.image_path, X.label, random_state=42)

In [34]:
X_train_img.head(3)

Unnamed: 0,image_name,prdtypecode
60735,image_1208783386_product_2825941333.jpg,1320
9118,image_856119038_product_89102802.jpg,1281
55855,image_936925976_product_197015072.jpg,2403


Transformation des données:

In [35]:
#APPLY SOME TRANSFORMATIONS TO DATA
#from keras.applications.vgg16 import preprocess_input
from keras.applications.efficientnet import preprocess_input

batch = 32


train_datagen = tf.keras.preprocessing.image.ImageDataGenerator(#rescale = 1./255,
                                preprocessing_function = preprocess_input,
                                   #shear_range = 0.5,
                                   #zoom_range = 0.1,
                                   #rotation_range=10,
                                   #width_shift_range=0.1,
                                   #height_shift_range=0.1,
                                   #horizontal_flip=True,
                                   # brightness_range = [0.9,1.1],
                                   #fill_mode='nearest'
                                )

test_datagen = tf.keras.preprocessing.image.ImageDataGenerator(#rescale = 1./255,
                                                              preprocessing_function = preprocess_input
                                                              )

image_train_set = train_datagen.flow_from_dataframe(dataframe=X_train_img,
                                              directory=path,
                                              x_col = "image_name",
                                              y_col = "prdtypecode",
                                              class_mode ="sparse",
                                              target_size = (224, 224),
                                              batch_size = batch,
                                                   shuffle=False)

image_test_set = test_datagen.flow_from_dataframe(dataframe=X_test_img,
                                              directory=path,
                                              x_col = "image_name",
                                              y_col = "prdtypecode",
                                            class_mode ="sparse",
                                              target_size = (224, 224),
                                              batch_size = batch,
                                           shuffle=False)

Found 67932 validated image filenames belonging to 27 classes.
Found 16984 validated image filenames belonging to 27 classes.


In [36]:
image_test_set.class_indices

{'10': 0,
 '1140': 1,
 '1160': 2,
 '1180': 3,
 '1280': 4,
 '1281': 5,
 '1300': 6,
 '1301': 7,
 '1302': 8,
 '1320': 9,
 '1560': 10,
 '1920': 11,
 '1940': 12,
 '2060': 13,
 '2220': 14,
 '2280': 15,
 '2403': 16,
 '2462': 17,
 '2522': 18,
 '2582': 19,
 '2583': 20,
 '2585': 21,
 '2705': 22,
 '2905': 23,
 '40': 24,
 '50': 25,
 '60': 26}

### Modele pour classification d'images

In [37]:
model_type = 'Functional'
existing_model = 'EffNetB1'
version = 'v2'
filename = existing_model + '_' +  version
model_path = 'models_output\\' + existing_model + '\\' + version + '\\'
model_path_effnetb1_v2 = model_path

In [38]:
#from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.layers import Dense, Flatten, Dropout, Input, GlobalAveragePooling2D
from tensorflow.keras import Model

base_model = tf.keras.applications.EfficientNetB1(weights='imagenet',input_shape=(224, 224, 3),include_top=False)

image_input = Input(shape=(224, 224, 3), name= 'input_' + filename)
x = base_model(image_input)
x = GlobalAveragePooling2D()(x)
x = Dense(1024, activation='relu',name= 'dense_' + filename)(x)
#x = BatchNormalization(trainable = True,axis=1,name= 'batchnorm' + model_name)(x)
x = Dropout(0.5,name= 'dropout_' + filename)(x)
x = Dense(512, activation='relu',name= 'dense_2_' + filename)(x)
EffNetB1 = Dropout(0.2,name= 'dropout_2_' + filename)(x)
#x = Flatten()(x) 

## Generateurs

Generation des données:

In [39]:
# Définition d'un générateur python
def generator(image_set,text_set,treated_text_set):
    iter_image = iter(image_set)
    iter_text = iter(text_set)
    iter_text_treated = iter(treated_text_set)
    while True:
        X_im, y = next(iter_image)
        X_text, y_text = next(iter_text) 
        X_text_treated, y_text = next(iter_text_treated) 
        
        #print('y_text', y_text)
        #print('')
        #print('y_image', y)
        #print('')
        
        
        #print('')
        #print('X_im:',X_im.shape,'X_text:',X_text.shape,'y:',y.shape)
        #print(y)
        yield [X_im, X_text, X_text_treated], y_text

In [40]:
# Définition du générateur final.
gen_train = generator(image_train_set,text_train_set,treated_text_train_set)

gen_test = generator(image_test_set,text_test_set,treated_text_test_set)

### CONCATENATE

In [41]:
#https://towardsdatascience.com/deep-multi-input-models-transfer-learning-for-image-and-word-tag-recognition-7ae0462253dc

from tensorflow.keras.layers import concatenate
from tensorflow.keras import Model

x = concatenate([EffNetB1, RNN_v5, RNN_v6], axis=-1)
x = Dense(1024, activation='relu',name='dense_' + 'concat')(x)
x = Dropout(0.3,name= 'dropout_' + 'concat')(x)
output = Dense(27, activation='softmax',name='dense_' + 'concat_final')(x)

model = Model([image_input, text_inputs, treated_text_inputs], output)

model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_EffNetB1_v2 (InputLayer)  [(None, 224, 224, 3) 0                                            
__________________________________________________________________________________________________
efficientnetb1 (Functional)     (None, 7, 7, 1280)   6575239     input_EffNetB1_v2[0][0]          
__________________________________________________________________________________________________
input_RNN_v5 (InputLayer)       [(None, 500)]        0                                            
__________________________________________________________________________________________________
input_RNN_v6 (InputLayer)       [(None, 500)]        0                                            
______________________________________________________________________________________________

Charger les poids des modèles déjà entrainés:

In [42]:
def copy_old_models_weights(concat_model,model_name,model_path):
    """
    Copie des poids des layers des sous modèles de bases vers leurs clones
    (ayant le même nom) du modèle courant
    """
    
    for model_name, path in zip(model_name,model_path):
        old_model = tf.keras.models.load_model(path + model_name + '.hdf5')
        old_model.load_weights(path + model_name + '.hdf5')

        for old_layer in old_model.layers:
            weights = old_layer.get_weights()
            if len(weights) > 0:
                for new_layer in concat_model.layers:
                    if new_layer.name == old_layer.name:
                        print(f"      - {new_layer.name}")
                        new_layer.set_weights(weights)

In [43]:
models_name = ['RNN_v5','RNN_v6','EffNetB1_v2']
models_path = [model_path_rnn_v5,model_path_rnn_v6,model_path_effnetb1_v2]

copy_old_models_weights(model,models_name,models_path)

      - embed_RNN_v5
      - GRU_RNN_v5
      - dense_2_RNN_v5
      - embed_RNN_v6
      - GRU_RNN_v6
      - dense_2_RNN_v6
      - efficientnetb1
      - dense_EffNetB1_v2
      - dense_2_EffNetB1_v2


Fixer les layers déjà entrainés et débloquer que celles du concat:

In [44]:
for layer in model.layers: 
    layer.trainable = False

print('unfreeze the following layers:')
for layer in model.layers:
    if '_concat' in layer.name:
        print(layer.name)
        layer.trainable = True
        
model.summary()

unfreeze the following layers:
dense_concat
dropout_concat
dense_concat_final
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_EffNetB1_v2 (InputLayer)  [(None, 224, 224, 3) 0                                            
__________________________________________________________________________________________________
efficientnetb1 (Functional)     (None, 7, 7, 1280)   6575239     input_EffNetB1_v2[0][0]          
__________________________________________________________________________________________________
input_RNN_v5 (InputLayer)       [(None, 500)]        0                                            
__________________________________________________________________________________________________
input_RNN_v6 (InputLayer)       [(None, 500)]        0                                            
________________

Model compile:

In [45]:
model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])

Callbacks:

In [46]:
existing_model = 'Concat'
version = 'v1'
model_path = 'models_output\\' + existing_model + '\\' + version + '\\'
filename = 'RNN_V5' + '_' + 'RNN_V6' + '_' + 'EffNetb1_v2'

In [47]:
from tensorflow.keras import callbacks

early_stopping = callbacks.EarlyStopping(monitor='val_accuracy',
                                         patience=3,
                                         mode='max',
                                         restore_best_weights=True)


checkpoint = callbacks.ModelCheckpoint(filepath= model_path + filename + '.hdf5', 
                                       monitor='val_accuracy',
                                       save_best_only=True,
                                       save_weights_only=False,
                                       mode='max',
                                       save_freq='epoch')

red_on_plateau = callbacks.ReduceLROnPlateau(monitor='val_accuracy', 
                                             patience=2, 
                                             factor=0.1,
                                             verbose=1)

Entrainement du modèle:

In [48]:
train_steps = int(len(y_train.values)/32)
validation_steps = int(len(y_test.values)/32)

model.fit(gen_train,
          steps_per_epoch = train_steps,
          validation_data = gen_test,
          validation_steps = validation_steps,
          epochs=10,
          workers=1,
         callbacks=[early_stopping, checkpoint,red_on_plateau])

Epoch 1/10




Epoch 2/10
Epoch 3/10
Epoch 4/10

Epoch 00004: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x24a208653a0>