# Main workflow:
* Modular implementation.  
* High level programming (layer architecture).  
* Import low level functions from a python script.  

# Import libraries

In [7]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

%load_ext autoreload
%autoreload 2
import FusionModel_tools as fm
import importlib
importlib.reload(fm)

import os


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
## General parameters shared by every stage of the workflow

# Seed handed to train_test_split as random_state
myseed = 123

# Output folders: split datasets, preprocessed data, trained models
splitting_path, preprocessing_path, training_path = (
    './Splitted_datasets/',
    './Preprocessed_data/',
    './Trained_models_and_metrics/',
)


# Initialize raw dataset
* train test split
* Split entire dataset once here to avoid any issues (information leak, etc)

In [27]:
## Load the raw datasets: features and target
df_X = pd.read_csv('./datasets/X_train_update.csv', index_col=0)
df_y = pd.read_csv('./datasets/Y_train_CVw08PX.csv', index_col=0)

## Train/test split of the raw data: 20% goes to the test set, the split is
## stratified on the target and seeded with `myseed` so it is reproducible
df_X_train, df_X_test, df_y_train, df_y_test = train_test_split(
    df_X, df_y, test_size=0.2, random_state=myseed, stratify=df_y
)

## Save the four split dataframes under `splitting_path`.
## The list is passed as `datasets=`, the keyword used by the later
## fm.save calls in this notebook.
fm.save(
    datasets=[df_X_train, df_X_test, df_y_train, df_y_test],
    types=['dataframe', 'dataframe', 'dataframe', 'dataframe'],
    names=['df_X_train', 'df_X_test', 'df_y_train', 'df_y_test'],
    path=splitting_path,
    doit=True,
    verbose=True,
)

Saved dataset: 2308102203_df_X_train.csv
Saved dataset: 2308102203_df_X_test.csv
Saved dataset: 2308102203_df_y_train.csv
Saved dataset: 2308102203_df_y_test.csv


<div class="alert alert-info">
<i class="fa fa-info-circle"></i> &emsp; 
    From now on, the <b>test dataset</b> will only be used to assess the models' performance
</div>

# _Text data model_

## Preprocess Text Data


In [28]:

## Preprocess the text features: data cleaning & feature engineering.
## The train and test frames go through the same fm.preprocess_text_data
## call, one after the other.
df_X_train_preprocess = fm.preprocess_text_data(df_X_train, verbose=True)
df_X_test_preprocess = fm.preprocess_text_data(df_X_test, verbose=True)

## Save both preprocessed frames under `preprocessing_path`.
## The list is passed as `datasets=`, the keyword used by the later
## fm.save calls in this notebook.
fm.save(
    datasets=[df_X_train_preprocess, df_X_test_preprocess],
    types=['dataframe', 'dataframe'],
    names=['df_X_train_preprocess', 'df_X_test_preprocess'],
    path=preprocessing_path,
    doit=True,
    verbose=True,
)



Column 'designation' has been renamed as 'title' 

Columns 'title' and 'description' have been concatenated in a new variable 'title_descr' 

Column 'title_descr' has been successfully HTML parsed and decapitalized.
	 HTML parsing takes 17.68 seconds 

Column 'title_descr' has been successfully tokenized.
	 Tokenization + Lemmatization takes 21.58 seconds 

Main language detection takes 3.74 minutes.
	 Language detection correction takes 2.88 seconds 

Removing stop-words takes 17.43 seconds. 

Token counting takes 0.03 seconds. 

Column 'designation' has been renamed as 'title' 

Columns 'title' and 'description' have been concatenated in a new variable 'title_descr' 

Column 'title_descr' has been successfully HTML parsed and decapitalized.
	 HTML parsing takes 4.24 seconds 

Column 'title_descr' has been successfully tokenized.
	 Tokenization + Lemmatization takes 5.41 seconds 

Main language detection takes 0.97 minutes.
	 Language detection correction takes 2.03 seconds 

Removing

In [31]:
# Preview the first rows of the preprocessed test features
df_X_test_preprocess.head()

Unnamed: 0,title,description,productid,imageid,title_descr,lemma_tokens,language,text_token_len
35666,Spa 5 places Calios - Spalnéa - Acrylique Blan...,Dimensions : 215 x 185 x 93 cm //// Nombre et ...,1926714940,1113061652,spa 5 places calios - spalnéa - acrylique blan...,"[spa, place, calios, spalnéa, acrylique, blanc...",fr,130
19651,New Flame Coupe Humidifier Night Light Mute Bu...,New Flame Coupe Humidifier Night Light Mute Bu...,4220470182,1319055786,new flame coupe humidifier night light mute bu...,"[new, flame, coupe, humidifier, night, light, ...",fr,112
50216,Voice Control Calendar Thermometer Wooden Led ...,Voice ControlCalendar Thermometer Wooden LED D...,4079113221,1287738265,voice control calendar thermometer wooden led ...,"[voice, control, calendar, thermometer, wooden...",en,88
32733,"Enrouleur Télescopique A. PRO ""NEW LINE"" Modèl...",Utilisation de l'Enrouleur Télescopique A. PRO...,228489648,958568141,"enrouleur télescopique a. pro ""new line"" modèl...","[enrouleur, télescopique, pro, new, line, modè...",fr,48
67840,Chaise De Bureau Inclinable Cuir Artificiel Rouge,<p>Cette luxueuse chaise de bureau inclinable ...,3929324475,1265058699,chaise de bureau inclinable cuir artificiel ro...,"[chaise, bureau, inclinable, cuir, artificiel,...",fr,101


### Load preprocessed data
Optional. Restarting the kernel and loading the following datasets helps free memory.

In [3]:
## Reload the preprocessed features and the targets written by earlier runs.
## Paths are built from `preprocessing_path` so the folder is configured in
## one place (same pattern as the np.load calls for the image arrays).
## NOTE(review): the split cell saves the df_y_* files under `splitting_path`;
## confirm they were also copied into the preprocessed-data folder read here.
df_X_train_preprocess = pd.read_csv(
    os.path.join(preprocessing_path, '2308102210_df_X_train_preprocess.csv'),
    header=0, index_col=0, sep=',',
)
df_X_test_preprocess = pd.read_csv(
    os.path.join(preprocessing_path, '2308102210_df_X_test_preprocess.csv'),
    header=0, index_col=0, sep=',',
)
df_y_train = pd.read_csv(
    os.path.join(preprocessing_path, '2308102203_df_y_train.csv'),
    header=0, index_col=0, sep=',',
)
df_y_test = pd.read_csv(
    os.path.join(preprocessing_path, '2308102203_df_y_test.csv'),
    header=0, index_col=0, sep=',',
)

## Data transformation & model initialization

In [4]:
## Turn the preprocessed frames into the inputs fed to the model.
## fm.get_text_data returns the text data, the targets and the two transformers.
(text_data,
 targets,
 text_transformer,
 target_transformer) = fm.get_text_data(
    df_X_train_preprocess,
    df_X_test_preprocess,
    df_y_train,
    df_y_test,
)

## Initialize the text model; both sizes are read from the transformed data
text_model = fm.initialize_text_model(
    model_type="NN",
    Nb_features=text_data['X_train'].shape[1],
    Nb_classes=targets['y_train'].shape[1],
)


## Fit text model and save it

In [11]:
## Train the text model on the densified feature matrix;
## validation_split holds out 20% of the training data for validation
text_model.fit(
    text_data["X_train"].toarray(),
    targets["y_train"],
    epochs=20,
    batch_size=200,
    validation_split=0.2,
)



Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1a13e29f310>

In [22]:
## Persist the trained text model under `training_path`
fm.save_model(
    text_model,
    name='text_model',
    path=training_path,
    doit=True,
)

Model saved as ./Trained_models_and_metrics/2308102259_text_model.keras


In [36]:
## Load the saved text model back from `training_path` and show its layers
nn2 = fm.reload_model(
    '2308102259_text_model.keras',
    path=training_path,
    doit=True,
)
nn2.summary()

Reloaded model from ./Trained_models_and_metrics/2308102259_text_model.keras
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Input (InputLayer)          [(None, 115)]             0         
                                                                 
 Dense_1 (Dense)             (None, 512)               59392     
                                                                 
 Dense_2 (Dense)             (None, 27)                13851     
                                                                 
Total params: 73,243
Trainable params: 73,243
Non-trainable params: 0
_________________________________________________________________


# Image data

* preprocess image data: crop, resize, vectorize
* save preprocessed image_dataset
* transform image data: scale pixels, reshape for CNN
* initialize model (using transformed data dimensions)
* save trained model
* reload trained model


## preprocess image data
> * crop
> * resize
> * vectorize

In [75]:
## Preprocess the training images (crop, resize, vectorize) for the rows of
## df_X_train_preprocess; the result is requested as an array
df_image_train_preprocess = fm.preprocess_image_data(
    df_X_train_preprocess,
    threshold=230,
    new_pixel_nb=100,
    output='array',
    verbose=True,
)


0 images at time 0.00 minutes
1000 images at time 0.13 minutes
2000 images at time 0.25 minutes
3000 images at time 0.38 minutes
4000 images at time 0.50 minutes
5000 images at time 0.63 minutes
10000 images at time 1.25 minutes
15000 images at time 1.87 minutes
20000 images at time 2.49 minutes
25000 images at time 3.12 minutes
30000 images at time 3.74 minutes
35000 images at time 4.36 minutes
40000 images at time 4.99 minutes
45000 images at time 5.61 minutes
50000 images at time 6.24 minutes
55000 images at time 6.87 minutes
60000 images at time 7.49 minutes
65000 images at time 8.11 minutes
Vectorization of 67932 images takes 8.48 minutes


In [76]:
## Preprocess the test images with the same settings as the training images
df_image_test_preprocess = fm.preprocess_image_data(
    df_X_test_preprocess,
    threshold=230,
    new_pixel_nb=100,
    output='array',
    verbose=True,
)


0 images at time 0.00 minutes
1000 images at time 0.13 minutes
2000 images at time 0.25 minutes
3000 images at time 0.37 minutes
4000 images at time 0.50 minutes
5000 images at time 0.63 minutes
10000 images at time 1.25 minutes
15000 images at time 1.89 minutes
Vectorization of 16984 images takes 2.14 minutes


In [79]:
## Save both preprocessed image arrays under `preprocessing_path`
fm.save(
    datasets=[df_image_train_preprocess, df_image_test_preprocess],
    types=['array', 'array'],
    names=['df_image_train_preprocess', 'df_image_test_preprocess'],
    path=preprocessing_path,
    doit=True,
    verbose=True,
)

Saved dataset: 2308111631_df_image_train_preprocess.npy
Saved dataset: 2308111631_df_image_test_preprocess.npy


## Load preprocessed data
Optional. Restarting the kernel and loading the following datasets helps free memory.

In [None]:
## Reload the saved numpy arrays holding the preprocessed image data
import os

image_train_preprocess = np.load(os.path.join(preprocessing_path, '2308111631_df_image_train_preprocess.npy'))
image_test_preprocess = np.load(os.path.join(preprocessing_path, '2308111631_df_image_test_preprocess.npy'))


## Targets are stored as dataframes; their paths are built from
## `preprocessing_path` like the arrays above.
## NOTE(review): the split cell saves the df_y_* files under `splitting_path`;
## confirm they were also copied into the preprocessed-data folder read here.
df_y_train = pd.read_csv(
    os.path.join(preprocessing_path, '2308102203_df_y_train.csv'),
    header=0, index_col=0, sep=',',
)
df_y_test = pd.read_csv(
    os.path.join(preprocessing_path, '2308102203_df_y_test.csv'),
    header=0, index_col=0, sep=',',
)


## Transform image data

In [100]:
# Transform the preprocessed train/test image arrays with fm.get_image_data
image_data = fm.get_image_data(image_train_preprocess, image_test_preprocess)


In [105]:
# List the entries available in image_data
image_data.keys()

dict_keys(['train', 'test'])

## initialize image model

In [103]:
## Initialize the image model: the input shape is the per-image shape of the
## training array, the number of classes comes from the targets
image_model = fm.initialize_image_model(
    model_type="CNN",
    image_shape=image_data['train'].shape[1:],
    Nb_classes=targets['y_train'].shape[1],
)


In [104]:
# Display the layer-by-layer summary of the image model
image_model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input (InputLayer)          [(None, 100, 100, 3)]     0         
                                                                 
 conv2d (Conv2D)             (None, 96, 96, 32)        2432      
                                                                 
 max_pooling2d (MaxPooling2D  (None, 48, 48, 32)       0         
 )                                                               
                                                                 
 dropout (Dropout)           (None, 48, 48, 32)        0         
                                                                 
 flatten (Flatten)           (None, 73728)             0         
                                                                 
 dense (Dense)               (None, 128)               9437312   
                                                           

## Fit image model and save it

In [109]:
import time
t0 = time.time()   # timestamp taken just before training starts

## Train the image model.
## Labels are read from `targets`, the same dict that sized the model's output
## layer (Nb_classes) and that the text model is fitted on, instead of from
## the text feature dict.
training_history = image_model.fit(
    image_data["train"],
    targets["y_train"],
    validation_split=0.2,
    epochs=10,
    batch_size=200,
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [111]:
# Get the timestamp string from fm.date_time() and print it for reference
model_date_time = fm.date_time()
print(model_date_time)

2308111738


In [112]:
## Persist the trained image model under `training_path`
fm.save_model(
    image_model,
    name='image_model',
    path=training_path,
    doit=True,
)

Model saved as ./Trained_models_and_metrics/2308111739_image_model.keras


In [151]:
## Load the saved image model back from `training_path` and show its layers
cnn2 = fm.reload_model(
    '2308111739_image_model.keras',
    path=training_path,
    doit=True,
)
cnn2.summary()

Reloaded model from ./Trained_models_and_metrics/2308111739_image_model.keras
Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input (InputLayer)          [(None, 100, 100, 3)]     0         
                                                                 
 conv2d (Conv2D)             (None, 96, 96, 32)        2432      
                                                                 
 max_pooling2d (MaxPooling2D  (None, 48, 48, 32)       0         
 )                                                               
                                                                 
 dropout (Dropout)           (None, 48, 48, 32)        0         
                                                                 
 flatten (Flatten)           (None, 73728)             0         
                                                                 
 dense (Dense)               (None, 128)       

# Fusion model

**Get training data ready to feed**  
text_data = get_text_data()  
image_data = get_image_data()  

**Define headless models**  
image_model = initialize_image_model()  
text_model = initialize_text_model()  

headless_image_model = remove_classification_head(image_model)  
headless_text_model = remove_classification_head(text_model)  
headless_text_model.save("...")  
headless_image_model.save("...")  

**define train data for fusion model**  
headless_X_train_image = headless_image_model.predict(image_data["train"])  
headless_X_train_text = headless_text_model.predict(text_data["train"])  

X_train = concatenate(headless_X_train_text, headless_X_train_image)  

**define and train fusion model**  
fusion_model = build_fusion_model()  
fusion_model.fit(X_train)  
fusion_model.save("...")  


**Get text and image data**

In [4]:
## Reload the preprocessed features and the targets written by earlier runs.
## Paths are built from `preprocessing_path` so the folder is configured in
## one place (same pattern as the np.load calls for the image arrays).
## NOTE(review): the split cell saves the df_y_* files under `splitting_path`;
## confirm they were also copied into the preprocessed-data folder read here.
df_X_train_preprocess = pd.read_csv(
    os.path.join(preprocessing_path, '2308102210_df_X_train_preprocess.csv'),
    header=0, index_col=0, sep=',',
)
df_X_test_preprocess = pd.read_csv(
    os.path.join(preprocessing_path, '2308102210_df_X_test_preprocess.csv'),
    header=0, index_col=0, sep=',',
)
df_y_train = pd.read_csv(
    os.path.join(preprocessing_path, '2308102203_df_y_train.csv'),
    header=0, index_col=0, sep=',',
)
df_y_test = pd.read_csv(
    os.path.join(preprocessing_path, '2308102203_df_y_test.csv'),
    header=0, index_col=0, sep=',',
)

In [5]:
## Text data: turn the reloaded frames into the inputs fed to the model.
## fm.get_text_data returns the text data, the targets and the two transformers.
(text_data,
 targets,
 text_transformer,
 target_transformer) = fm.get_text_data(
    df_X_train_preprocess,
    df_X_test_preprocess,
    df_y_train,
    df_y_test,
)


In [8]:
## Load the saved preprocessed image arrays (train, then test)
## from `preprocessing_path`
image_train_preprocess = np.load(
    os.path.join(preprocessing_path, '2308111631_df_image_train_preprocess.npy')
)
image_test_preprocess = np.load(
    os.path.join(preprocessing_path, '2308111631_df_image_test_preprocess.npy')
)


In [9]:
## Get image data: transform the reloaded train/test arrays with fm.get_image_data
image_data = fm.get_image_data(image_train_preprocess, image_test_preprocess)

**define headless models**

In [10]:
## Parent text model: reload the pretrained network from `training_path`
## and show its layers
text_model = fm.reload_model(
    '2308102259_text_model.keras',
    path=training_path,
    doit=True,
)
text_model.summary()

Reloaded model from ./Trained_models_and_metrics/2308102259_text_model.keras
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Input (InputLayer)          [(None, 115)]             0         
                                                                 
 Dense_1 (Dense)             (None, 512)               59392     
                                                                 
 Dense_2 (Dense)             (None, 27)                13851     
                                                                 
Total params: 73,243
Trainable params: 73,243
Non-trainable params: 0
_________________________________________________________________


In [12]:
## Define the headless text model: the reloaded text model passed through
## fm.remove_classification_head

headless_text_model = fm.remove_classification_head(text_model)

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Input (InputLayer)          [(None, 115)]             0         
                                                                 
 Dense_1 (Dense)             (None, 512)               59392     
                                                                 
Total params: 59,392
Trainable params: 59,392
Non-trainable params: 0
_________________________________________________________________


None

In [13]:
## Optional check that the headless text model actually works (left disabled):

# headless_text_model.summary()
# headless_text_model.predict(text_data['X_test'].toarray())

In [14]:
## Parent image model: reload the pretrained network from `training_path`
## and show its layers
image_model = fm.reload_model(
    '2308111739_image_model.keras',
    path=training_path,
    doit=True,
)
image_model.summary()

Reloaded model from ./Trained_models_and_metrics/2308111739_image_model.keras
Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input (InputLayer)          [(None, 100, 100, 3)]     0         
                                                                 
 conv2d (Conv2D)             (None, 96, 96, 32)        2432      
                                                                 
 max_pooling2d (MaxPooling2D  (None, 48, 48, 32)       0         
 )                                                               
                                                                 
 dropout (Dropout)           (None, 48, 48, 32)        0         
                                                                 
 flatten (Flatten)           (None, 73728)             0         
                                                                 
 dense (Dense)               (None, 128)       

In [15]:
## Define the headless image model: the reloaded image model passed through
## fm.remove_classification_head

headless_image_model = fm.remove_classification_head(image_model)

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input (InputLayer)          [(None, 100, 100, 3)]     0         
                                                                 
 conv2d (Conv2D)             (None, 96, 96, 32)        2432      
                                                                 
 max_pooling2d (MaxPooling2D  (None, 48, 48, 32)       0         
 )                                                               
                                                                 
 dropout (Dropout)           (None, 48, 48, 32)        0         
                                                                 
 flatten (Flatten)           (None, 73728)             0         
                                                                 
 dense (Dense)               (None, 128)               9437312   
                                                           

None

In [16]:
## Optional check that the headless image model actually works (left disabled):

# headless_image_model.summary()
# headless_image_model.predict(image_data['test'])

In [17]:
## Persist the two headless models under `training_path`, text first
fm.save_model(
    headless_text_model,
    name='headless_text_model',
    path=training_path,
    doit=True,
)
fm.save_model(
    headless_image_model,
    name='headless_image_model',
    path=training_path,
    doit=True,
)


Model saved as ./Trained_models_and_metrics/2308111859_headless_text_model.keras
Model saved as ./Trained_models_and_metrics/2308111859_headless_image_model.keras


**define train data for fusion model**


In [18]:
## Run each headless model over its own training inputs; the outputs become
## the features of the fusion model
headless_X_train_text = headless_text_model.predict(
    text_data['X_train'].toarray()
)
headless_X_train_image = headless_image_model.predict(
    image_data['train']
)



In [30]:
## Join the text and image features column-wise (text columns first)
X_train = np.concatenate((headless_X_train_text, headless_X_train_image), axis=1)

## Save the joined array under `preprocessing_path`
fm.save(
    datasets=[X_train],
    types=['array'],
    names=['concatenated_headless_X_train'],
    path=preprocessing_path,
    doit=True,
    verbose=True,
)

Saved dataset: 2308111913_concatenated_headless_X_train.npy


In [31]:
# Show the shape of the concatenated feature array
print(X_train.shape)

(67932, 640)


**define and train fusion model**  
- fusion_model = build_fusion_model()  
- fusion_model.fit(X_train)  
- fusion_model.save("...")  

In [42]:
## Fusion model sizes: one input per concatenated feature column, one output
## per target column
params_NN = dict(
    Nb_features=X_train.shape[1],
    Nb_classes=targets['y_train'].shape[1],
)

fusion_model = fm.initialize_fusion_model('NN', params=params_NN)

Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input (InputLayer)          [(None, 640)]             0         
                                                                 
 dense_1 (Dense)             (None, 128)               82048     
                                                                 
 dense_2 (Dense)             (None, 27)                3483      
                                                                 
Total params: 85,531
Trainable params: 85,531
Non-trainable params: 0
_________________________________________________________________


None

In [48]:
### No need to freeze the headless models: the fusion model is fitted on their
### precomputed outputs (X_train), so their layers are not being trained.
### Print the shape of the training targets.
print(targets['y_train'].shape)

(67932, 27)


In [49]:
# Train the fusion model on the concatenated headless features.
# NOTE(review): unlike the text and image fits, no epochs, batch_size or
# validation_split are passed here, so the Keras defaults apply — confirm
# this is intended.
fusion_model.fit(X_train, targets['y_train'])



<keras.callbacks.History at 0x20a2d033af0>

In [50]:
## Persist the trained fusion model under `training_path`
fm.save_model(
    fusion_model,
    name='fusion_model',
    path=training_path,
    doit=True,
)


Model saved as ./Trained_models_and_metrics/2308111936_fusion_model.keras
