# **Rakuten France Classification des données des produits multimodaux**
<br>
<br>



## 4. Modèle d'agrégation

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy.stats import chi2_contingency
import os
import statsmodels.api
import PIL
import nltk
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import re
from collections import Counter
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator 

from sklearn.metrics import classification_report, roc_auc_score, accuracy_score

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Fenzhengrou\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!




### 4.1. Import des données pre-processing

In [2]:
# import des données pre-processing
df = pd.read_csv('rakuten_data_preproc.csv', index_col= 0)

In [3]:
# charger les resultats de traduction
traduction = pd.read_table("traduction_designation-description.txt", header=None, squeeze=True)
traduction.index = df.index



  traduction = pd.read_table("traduction_designation-description.txt", header=None, squeeze=True)


In [4]:
stop_words = set(stopwords.words('french') + stopwords.words('english'))
mots_vides = ["x", "cm", "mm", "h", "g", "peut", "être", 'e',"l'",'x','p','re', 'li','x','b','d','h', 'pla','br','id','al','ra','pla','sine','r','g','v','u','f']
stop_words.update(mots_vides)


def word_split(text):
    """split text into words, remove non alphabetic tokens and stopwords"""
    
    # suppression de la ponctuation
    table = str.maketrans(string.punctuation, ' '*len(string.punctuation))
    text = text.translate(table)
    
    # séparation des textes en listes de mots
    tokens = word_tokenize(text)
    
    # conversion en minuscule
    tokens = [w.lower() for w in tokens]
    
    # restriction aux charactères alphabétiques
    words = [word for word in tokens if word.isalpha()]

    # filtrage des stopwords
    words = [w for w in words if not w in stop_words]
    
    return words

In [5]:
# données textuelles netoyées et traduites
traduction = traduction.apply(word_split)

In [6]:
df['text_data'] = traduction

In [7]:
df.head()

Unnamed: 0_level_0,image,image_size,prdtypecode,designation_description,avec_description,designation_description_nbr_mot,prdcat,text_data
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,image_1263597046_product_3804725264.jpg,14010,10,"['olivia', 'personalisiertes', 'notizbuch', 's...",0,9,Livres,"[olivia, personnalisé, carnet, pages, grille, ..."
1,image_1008141237_product_436067568.jpg,14854,2280,"['journal', 'arts', 'art', 'marche', 'salon', ...",0,20,Livres,"[journal, art, art, marche, salon, art, asiati..."
2,image_938777978_product_201115110.jpg,6898,50,"['grand', 'stylet', 'ergonomique', 'bleu', 'ga...",1,80,Jeux,"[grand, stylet, ergonomique, bleu, manette, je..."
3,image_457047496_product_50418756.jpg,14404,1280,"['peluche', 'donald', 'europe', 'disneyland', ...",0,6,Jouets & figurines,"[peluche, donald, europe, disneyland, marionne..."
4,image_1077757786_product_278535884.jpg,20435,2705,"['guerre', 'tuques', 'luc', 'grandeur', 'veut'...",1,18,Livres,"[guerre, tuques, luc, grandeur, veut, organise..."



### 4.2 Préparation des donnés pour les modèles

 On sélectionne les colonnes utiles pour la modélisation : texte netoyés et traduits mais sans stemmimg, nom des images, et le code type de produit. En suite, on effectue le même traitement comme dans la partie modélisation de texte et d'image.

In [8]:
from sklearn.model_selection import train_test_split

X = df[['text_data', 'image', 'prdtypecode']]
y = df['prdtypecode']

from sklearn.preprocessing import LabelEncoder
encoder =  LabelEncoder()
Y = encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=123)

In [9]:
# Tokenization pour les textes
X_train_txt = X_train['text_data'].apply(lambda x : " ".join(w for w in x))
X_test_txt = X_test['text_data'].apply(lambda x : " ".join(w for w in x))

import tensorflow as tf
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(X_train_txt)

word2idx = tokenizer.word_index
idx2word = tokenizer.index_word
vocab_size = len(word2idx)+1 # ajouter 1 ici pour 0 = padding

X_train_txt = tokenizer.texts_to_sequences(X_train_txt)
X_test_txt = tokenizer.texts_to_sequences(X_test_txt)

max_length = 450
X_train_txt = tf.keras.preprocessing.sequence.pad_sequences(X_train_txt, maxlen=max_length, padding='post', truncating='post')
X_test_txt = tf.keras.preprocessing.sequence.pad_sequences(X_test_txt, maxlen=max_length, padding='post', truncating='post')


In [10]:
# Chargement des dimages
X_train_img = X_train[['image', 'prdtypecode']]
X_test_img = X_test[['image', 'prdtypecode']] 
X_train_img['prdtypecode2']=X_train_img['prdtypecode'].astype(str)
X_test_img['prdtypecode2']=X_test_img['prdtypecode'].astype(str)

# generateur des données d'image
import tensorflow as tf

path="../data/images/image_train/"

width = 240
height = 240 # resize 
batch = 32


train_datagen = tf.keras.preprocessing.image.ImageDataGenerator(
                                                                preprocessing_function = None,
                                                                #rotation_range = 10,
                                                                #width_shift_range = 0.1,
                                                                #height_shift_range = 0.1,
                                                                #zoom_range = 0.1,
                                                                #brightness_range=[0.9, 1.1],
                                                                #horizontal_flip = True
                                                                )

test_datagen = tf.keras.preprocessing.image.ImageDataGenerator(
                                                              preprocessing_function = None
                                                              )



train_set = train_datagen.flow_from_dataframe(dataframe=X_train_img,
                                              directory=path,
                                              x_col = "image",
                                              y_col = 'prdtypecode2',
                                              seed=42,
                                              class_mode="sparse",
                                              target_size = (width, height),
                                              batch_size = batch,
                                              shuffle=False)

test_set = test_datagen.flow_from_dataframe(dataframe=X_test_img,
                                              directory=path,
                                              x_col = "image",
                                              y_col = "prdtypecode2",
                                              class_mode="sparse",
                                              seed=42,
                                              target_size = (width, height),
                                              batch_size = batch,
                                              shuffle=False)

Found 59883 validated image filenames belonging to 27 classes.
Found 14971 validated image filenames belonging to 27 classes.



### 4.3 Chargement des meilleurs modèles 

Ensuite, on charge les modèles de texte et d'image qui donnent le meilleur score dans la phase d'entrainement.

### Load meilleur modèle text : Word2Vec (skip-gram) 

In [11]:
from tensorflow.keras.models import load_model

model_text_W2V = load_model("text_W2V_skip-gram.h5")

model_text_W2V.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 450, 300)          18850800  
                                                                 
 rnn (RNN)                   (None, 450, 128)          165120    
                                                                 
 dropout (Dropout)           (None, 450, 128)          0         
                                                                 
 global_average_pooling1d (G  (None, 128)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 256)               33024     
                                                                 
 dropout_1 (Dropout)         (None, 256)               0         
                                                        

In [12]:
# propabilites des class pour text
train_pred_txt = model_text_W2V.predict(X_train_txt)
test_pred_txt = model_text_W2V.predict(X_test_txt)



In [13]:
train_pred_class_txt = np.argmax(train_pred_txt, axis=1)
test_pred_class_txt = np.argmax(test_pred_txt, axis=1)

train_pred_class_txt = encoder.inverse_transform(train_pred_class_txt)
test_pred_class_txt = encoder.inverse_transform(test_pred_class_txt)

y_train_class = encoder.inverse_transform(y_train)
y_test_class = encoder.inverse_transform(y_test)

print("accuracy score train :", accuracy_score(y_train_class, train_pred_class_txt))
print("accuracy score test :", accuracy_score(y_test_class, test_pred_class_txt))
print(classification_report(y_test_class, test_pred_class_txt))

accuracy score train : 0.8572382813152314
accuracy score test : 0.7942021241066061
              precision    recall  f1-score   support

          10       0.39      0.58      0.47       561
          40       0.56      0.64      0.60       442
          50       0.84      0.85      0.84       286
          60       0.94      0.89      0.91       159
        1140       0.70      0.80      0.75       447
        1160       0.90      0.87      0.88       665
        1180       0.64      0.54      0.59       117
        1280       0.67      0.78      0.72       875
        1281       0.71      0.54      0.61       336
        1300       0.94      0.93      0.94       920
        1301       0.89      0.85      0.87       133
        1302       0.85      0.74      0.79       447
        1320       0.83      0.77      0.80       611
        1560       0.84      0.77      0.80       905
        1920       0.86      0.92      0.89       752
        1940       0.78      0.78      0.78       12

### Load meilleur modèle image : EfficientNetB1 dé-freezé les 20 dernières couches

In [14]:
from tensorflow.keras.models import load_model

model_image_EfficientNetB1 = load_model("image_EfficientNetB1.h5")

model_image_EfficientNetB1.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 efficientnetb1 (Functional)  (None, 8, 8, 1280)       6575239   
                                                                 
 sequential_3 (Sequential)   (None, 27)                1856539   
                                                                 
Total params: 8,431,778
Trainable params: 4,186,987
Non-trainable params: 4,244,791
_________________________________________________________________


In [15]:
# probabilites des class pour image
train_pred_img = model_image_EfficientNetB1.predict(train_set)
test_pred_img = model_image_EfficientNetB1.predict(test_set)



In [17]:
fit_labels = dict((v,k) for k,v in (train_set.class_indices).items())

train_pred_class_img = [int(fit_labels[i]) for i in np.argmax(train_pred_img, axis=1)]  
test_pred_class_img = [int(fit_labels[i]) for i in np.argmax(test_pred_img, axis=1)]  

y_train_class = encoder.inverse_transform(y_train)
y_test_class = encoder.inverse_transform(y_test)

print("accuracy score train :", accuracy_score(y_train_class, train_pred_class_img))
print("accuracy score test :", accuracy_score(y_test_class, test_pred_class_img))
print(classification_report(y_test_class, test_pred_class_img))

accuracy score train : 0.7562747357346826
accuracy score test : 0.7571304522076013
              precision    recall  f1-score   support

          10       0.64      0.60      0.62       561
          40       0.63      0.76      0.69       442
          50       0.69      0.58      0.63       286
          60       0.78      0.87      0.82       159
        1140       0.68      0.79      0.73       447
        1160       0.92      0.96      0.94       665
        1180       0.77      0.62      0.68       117
        1280       0.67      0.53      0.59       875
        1281       0.68      0.36      0.47       336
        1300       0.76      0.88      0.82       920
        1301       0.79      0.77      0.78       133
        1302       0.70      0.72      0.71       447
        1320       0.77      0.63      0.70       611
        1560       0.73      0.77      0.75       905
        1920       0.84      0.89      0.86       752
        1940       0.74      0.67      0.70       12



### 4.4. Fusion des modèles texte et image

Selon la recherche bibliographique, il existe plusieurs méthodes pour la construction d'un modèle multi-modal. Nous avons choisi l'approches qui adapte le plus à notre état actuel qui est une fusion au niveau de la décision.

</b>
Dans cette approche, on prend les class de propabilité predictes par les modèles de texte et image respectivement. On les concatene et ensuite réalise une classification finale via des couches de reseau de neurone Dense.

In [18]:
# concatenation des probabilites des modèle texte et image
train_pred_txt_img = np.concatenate([train_pred_txt, train_pred_img], axis=1)
test_pred_txt_img = np.concatenate([test_pred_txt, test_pred_img], axis=1)

In [19]:
test_pred_txt_img.shape

(14971, 54)

In [20]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout

model_fusion_proba = Sequential()
model_fusion_proba.add(Input(shape=train_pred_txt_img.shape[1]))
model_fusion_proba.add(Dense(512, activation='relu'))
model_fusion_proba.add(Dense(27, activation='softmax'))

model_fusion_proba.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 512)               28160     
                                                                 
 dense_1 (Dense)             (None, 27)                13851     
                                                                 
Total params: 42,011
Trainable params: 42,011
Non-trainable params: 0
_________________________________________________________________


In [21]:
from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor='val_accuracy', min_delta=0.005, patience=5, mode='max', 
                               restore_best_weights=True, verbose=1)

In [22]:
model_fusion_proba.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model_fusion_proba.fit(train_pred_txt_img, y_train, validation_data= [test_pred_txt_img, y_test],
                       batch_size = 32, epochs=20,
                       callbacks=[early_stopping])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 6: early stopping


<keras.callbacks.History at 0x2a75a9d15d0>

In [23]:
fusion_test_pred_txt_img = model_fusion_proba.predict(test_pred_txt_img)



In [24]:
fusion_test_pred_txt_img_class = np.argmax(fusion_test_pred_txt_img, axis=1)

fusion_test_pred_txt_img_class = encoder.inverse_transform(fusion_test_pred_txt_img_class)
y_test_class = encoder.inverse_transform(y_test)

print("accuracy score test :", accuracy_score(y_test_class, fusion_test_pred_txt_img_class))
print(classification_report(y_test_class, fusion_test_pred_txt_img_class))


accuracy score test : 0.8613987041613786
              precision    recall  f1-score   support

          10       0.68      0.76      0.72       561
          40       0.81      0.81      0.81       442
          50       0.86      0.86      0.86       286
          60       0.96      0.93      0.95       159
        1140       0.81      0.89      0.85       447
        1160       0.96      0.98      0.97       665
        1180       0.82      0.68      0.74       117
        1280       0.79      0.76      0.77       875
        1281       0.68      0.71      0.69       336
        1300       0.94      0.96      0.95       920
        1301       0.94      0.88      0.91       133
        1302       0.89      0.79      0.84       447
        1320       0.84      0.84      0.84       611
        1560       0.79      0.86      0.83       905
        1920       0.91      0.91      0.91       752
        1940       0.91      0.88      0.90       121
        2060       0.81      0.79      0

### 4.5. Conclusion sur l'agrégation des modèles

Les scores d'accuracy des modèles unimodal et du modèle agrégé sont résumés comme suit:

- Meilleur modèle de texte : **79.4 %**
- Meilleur modèle d'image : **75.7 %**
- Modèle agrégé : **86.1 %**

Le modèle agrégé multi-modal a réussi à **augmenter la performance** de prediction d'environ **7%** par rapport au modèle unimodal de text. 
