# 0 - Information

In [1]:
# Author: Pierre Oreistein
# Last edit: 06/06/2021

# 1 - Packages

## 1.1 - Mainstream Packages

In [2]:
# Maths packages
import numpy as np

# Data Handling Packages
import pandas as pd

# Machine Learning Packages
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

# NLP Packages
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from datasets import Dataset

# Deep learning packages
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy, CategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy, CategoricalAccuracy
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint

# Utils
import pprint
from colorama import Fore, Back, Style
from IPython.display import display, HTML
import warnings
warnings.filterwarnings('ignore')

# Static Type Checking
from typing import List, Union

## 1.2 - Personnal Utils

In [3]:
 class COLOR:
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    END = '\033[0m'
    
def display_check(boolean: bool) -> str:
    """
    Display the boolean with color and bold for easier understanding.
    """
    # If the boolean is True
    if boolean:
        return COLOR.BOLD + Back.GREEN + "True" + Style.RESET_ALL
    else:
        return COLOR.BOLD + Back.RED + "False" + Style.RESET_ALL
    
def display_title(title: str) -> None:
    """Display title"""
    print(f"----- {COLOR.BOLD} {title} {Style.RESET_ALL} -----")
    
def display_dataframe(df: pd.DataFrame) -> None:
    """Pretty display inside Jupyter Notebook of DataFrames."""
    display(HTML(df.to_html()))

# 2 - Load Data and Global Parameters

## 2.1 - Load Data

In [4]:
# Load the data
train_raw_df = pd.read_csv("./Data/01_Raw_Data/train.csv")

# Display the raw data
display_dataframe(train_raw_df.head(5));

Unnamed: 0,id,l1_category_name,l2_category_name,l3_category_name,title,category_id,price
0,7f1f33cf-a59a-4114-8f9e-cf3effaa2341,Men's Fashion,Men's Watches,,Fossil Leather Watch,20,100.56
1,88765cd5-3f53-403c-841a-12dc367843c9,Men's Fashion,Men's Watches,,* FREE DELIVERY * JDM Brand New 100% Authentic Seiko Presage White Dial & Blue Hands Men's Automatic Dress Watch SARX033,20,1008.37
2,264e2900-ba35-4ac0-8858-e2fcb0909e6d,Men's Fashion,Men's Watches,,St Dupont Classic Pen,20,220.31
3,ffc7337a-7d04-4fa8-a136-ac9d8413f8e8,Men's Fashion,Men's Watches,,CASIO G-SHOCK GX-56BB-1DR / GX-56BB-1D / GX-56BB-1 / GX-56BB TOUGH SOLAR WATCH,20,104.95
4,34a2e32c-db54-4aff-b89d-173bacbac346,Men's Fashion,Men's Watches,,Fossil Townsman Twist ME1164P,20,166.37


In [5]:
# Load the data
X_test_raw_df = pd.read_csv("./Data/01_Raw_Data/test.csv")

# Display the raw data
display_dataframe(X_test_raw_df.head(5))

Unnamed: 0,id,title,price
0,c90123c0-09a2-4d0d-8ddd-447cd12edd4a,Yeezy boost 350 V2 Core Black/Ref,402.41
1,e1dc8ac7-6a89-4ef9-a790-3f4d0835aaf7,Nakamichi Sports Bluetooth Earphones,44.93
2,520743b2-6c83-4ba9-98a2-3d2fff697fa2,Tefal Steam Cuisine 900 Turbo Diffusion,104.0
3,e5136110-bd9b-4011-81f3-de904d916c91,B5 Lofter High Gain Wireless USB Adapter 1200Mbps,21.64
4,a459aa14-7e28-4a95-8c98-690509c373db,Taiwan Customize Handmade APO Float,41.6


## 2.2 - Global Parameters

In [6]:
# Label Encoding of "l1_category_name"
LE = preprocessing.LabelEncoder()
LE.fit(train_raw_df.l1_category_name.unique())
y_train_transformed = LE.transform(train_raw_df.l1_category_name.values)

# Compute the number of classes
CLASSES_L = LE.transform(train_raw_df.l1_category_name.unique())
NUM_CLASSES = len(CLASSES_L)

display_title("Number of classes to distinguish")
print(f"Number of classes to predict? {NUM_CLASSES}")
print("\n")

# Compute class weights
CLASS_WEIGHTS_L = compute_class_weight(
    class_weight="balanced",
    classes=CLASSES_L,
    y=y_train_transformed
)
CLASS_WEIGHTS_DCT = {l1_category_id: CLASS_WEIGHTS_L[i] for i, l1_category_id in enumerate(CLASSES_L)}

display_title("Class Weights")
pprint.pprint(CLASS_WEIGHTS_DCT)
print("\n")

# Random state
RANDOM_STATE = 42

# DEEP LEARNING MODEL PARAMETERS
BATCH_SIZE = 8
EPOCHS = 10

----- [1m Number of classes to distinguish [0m -----
Number of classes to predict? 13


----- [1m Class Weights [0m -----
{0: 1.850707323936456,
 1: 19.80454421023816,
 2: 0.5897082677839274,
 3: 2.65003663003663,
 4: 2.7965210668728258,
 5: 0.5218564256448728,
 6: 0.6002671689221144,
 7: 0.35906038602986795,
 8: 6.03915021495054,
 9: 4.315685865123632,
 10: 1.7350200851369986,
 11: 0.7082953955806189,
 12: 0.8158142524484239}




# 3 - Data Cleaning and Preparation

Comments
-----

Please find below our modelisation hypotheses and choices:
* Following our observations in the notebook "1-DATA-EXPLORATION", we will discard the feature 'price' as it presents several difficulties (low information content, null values, extremely high values, etc)
* As we have only a limited number of samples for some category_id, we will train our model to recognise the target "l1_category_name". By doing, so we will decrease the number of class to predict, in the same way, as our main goal is to maximise the top-3 classification and each "l1_category_name" is associated to at most 3 category_id, it will be more natural to do a top-3 classification. Finally, by choosing the target "l1_category_name", we will have target class that are more orthogonal to each other. The learning should therefore be easier.

## 3.1 - Data Cleaning and Preparation for Model Ingestion

In [7]:
def data_preprocessing(df: pd.DataFrame) -> pd.DataFrame:
    """Preprocess the dataFrame and prepare it for model ingestion."""
    # Create a new copy of df
    new_df = df.copy(deep=True)
    
    # Set the column "id" as the index
    new_df.set_index("id", inplace=True)
    
    # Keep only the columns "l1_category_name" if it exists and "title"
    # The column 'price' is dropped for now
    columns_to_drop_l = [col for col in new_df.columns if col not in ["title", "l1_category_name"]]
    new_df.drop(columns=columns_to_drop_l, inplace=True)

    # Transform the column "l1_category_name" if it exists
    if "l1_category_name" in new_df.columns:
        new_df["l1_category_name"] = LE.transform(new_df["l1_category_name"])
    
    return new_df

In [8]:
# Preprocess the data for cleaning
train_df = data_preprocessing(train_raw_df)
X_test_df = data_preprocessing(X_test_raw_df)

# And save the preprocess dataframe
train_df.to_csv("./Data/02_Data_Features/train.csv")
X_test_df.to_csv("./Data/02_Data_Features/X_test.csv")

# Display the resulting dataFrames
display_dataframe(train_df.head(5))

Unnamed: 0_level_0,l1_category_name,title
id,Unnamed: 1_level_1,Unnamed: 2_level_1
7f1f33cf-a59a-4114-8f9e-cf3effaa2341,7,Fossil Leather Watch
88765cd5-3f53-403c-841a-12dc367843c9,7,* FREE DELIVERY * JDM Brand New 100% Authentic Seiko Presage White Dial & Blue Hands Men's Automatic Dress Watch SARX033
264e2900-ba35-4ac0-8858-e2fcb0909e6d,7,St Dupont Classic Pen
ffc7337a-7d04-4fa8-a136-ac9d8413f8e8,7,CASIO G-SHOCK GX-56BB-1DR / GX-56BB-1D / GX-56BB-1 / GX-56BB TOUGH SOLAR WATCH
34a2e32c-db54-4aff-b89d-173bacbac346,7,Fossil Townsman Twist ME1164P


# 4 - Tokenization of the Sentences & Build Models Inputs

## 4.1 - Split the Training set into a training and a validation set

In [9]:
def prepare_model_training_inputs(
    df: pd.DataFrame,
    test_size: float=0.1,
    num_classes: int=NUM_CLASSES
) -> List[Union[pd.DataFrame, np.array]]:
    """Prepare model inputs."""
    # Divide the training set between a training and validation set
    X_train_df, y_train_df = df["title"], df["l1_category_name"]
        
    # Split the training set into a training and validation set
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_df,
        y_train_df,
        test_size=test_size,
        random_state=RANDOM_STATE,
        shuffle=True,
        stratify=y_train_df.values,   
    )

    # Convert to dataFrames
    X_train_df = X_train.to_frame(name="title")
    X_val_df = X_val.to_frame(name="title")
    
    # Append the ground truth as a label
    X_train_df["label"] = y_train
    X_val_df["label"] = y_val
    
    return X_train_df, X_val_df

In [10]:
# Split the training set
X_train_df, X_val_df = prepare_model_training_inputs(train_df, test_size=0.1)

# Display resulting traning set
display_dataframe(X_train_df.head(5))

Unnamed: 0_level_0,title,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1
8020f93b-ff12-4a47-b0db-bbac8b9c46b4,wtt/wts supreme mercenary tee,7
65403f49-f3b7-4dfc-8b31-030bd68068aa,Big sister little sister red sister,0
d6fc13f0-ba1c-49ec-af92-924251543e7c,Steamboat,6
2e2c1470-7662-4856-bc05-bbfd491957c3,Kmart paperbag linen shorts navy,12
374de8af-5e93-40d9-98a0-b81c2fb5b909,Taurus 3D Crystal block-Taurus,11


In [11]:
display_dataframe(X_val_df.head(5))

Unnamed: 0_level_0,title,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1
adc6669b-dbc1-4d03-b504-aa34ac61cf61,LEGO Overwatch Hanzo vs Genji,11
e0f02990-0fba-4fa9-bdd2-dbfc70d602ec,Denim Flare Pants,12
c1940623-df11-446b-bd0c-0f20576a0f35,$5 OFF,7
cc9f1382-e2f8-4f8c-9925-bf80c697a014,2019 autumn and winter new business casual fashion printin,7
ab343b1d-8a18-4ef2-b8cd-e7afe30ff63e,Silver Simple Anchor Triple Brown Leather Tour Bracelet,7


In [12]:
display_dataframe(X_test_df.head(5))

Unnamed: 0_level_0,title
id,Unnamed: 1_level_1
c90123c0-09a2-4d0d-8ddd-447cd12edd4a,Yeezy boost 350 V2 Core Black/Ref
e1dc8ac7-6a89-4ef9-a790-3f4d0835aaf7,Nakamichi Sports Bluetooth Earphones
520743b2-6c83-4ba9-98a2-3d2fff697fa2,Tefal Steam Cuisine 900 Turbo Diffusion
e5136110-bd9b-4011-81f3-de904d916c91,B5 Lofter High Gain Wireless USB Adapter 1200Mbps
a459aa14-7e28-4a95-8c98-690509c373db,Taiwan Customize Handmade APO Float


## 4.2 - Tokenize the input title

Comments
-----

In [13]:
# Define a Tokenizer
TOKENIZER = AutoTokenizer.from_pretrained("albert-base-v2")

# Display tokenizer configuration
print(TOKENIZER)

PreTrainedTokenizerFast(name_or_path='albert-base-v2', vocab_size=30000, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '<unk>', 'sep_token': '[SEP]', 'pad_token': '<pad>', 'cls_token': '[CLS]', 'mask_token': AddedToken("[MASK]", rstrip=False, lstrip=True, single_word=False, normalized=False)})


In [14]:
def tokenize_function(example, column: str="title", tokenizer=TOKENIZER):
    """Apply the tokenizers on the text contained in row."""
    return tokenizer(str(example["title"]))

In [15]:
# Load the previous dataFrames as dataset
X_train_dataset = Dataset.from_pandas(X_train_df)
X_val_dataset = Dataset.from_pandas(X_val_df)
X_test_dataset = Dataset.from_pandas(X_test_df)

# Apply the tokenizer on our training and testing set
X_train_tokenized_dataset = X_train_dataset.map(tokenize_function)
X_val_tokenized_dataset = X_val_dataset.map(tokenize_function)
X_test_tokenized_dataset = X_test_dataset.map(tokenize_function)

# Display the resulting dataFrames
display_title("Resulting Datasets after Tokenisation")
print(X_train_tokenized_dataset)
print("\n ")
print("First Example:")
pprint.pprint(X_train_tokenized_dataset[0])

HBox(children=(FloatProgress(value=0.0, max=130222.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=14470.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=48560.0), HTML(value='')))


----- [1m Resulting Datasets after Tokenisation [0m -----
Dataset({
    features: ['attention_mask', 'id', 'input_ids', 'label', 'title', 'token_type_ids'],
    num_rows: 130222
})

 
First Example:
{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'id': '8020f93b-ff12-4a47-b0db-bbac8b9c46b4',
 'input_ids': [2, 619, 38, 38, 118, 499, 38, 18, 2510, 24666, 10366, 3],
 'label': 7,
 'title': 'wtt/wts supreme mercenary tee',
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [16]:
def prepare_datasets_for_model_ingestion(
    dataset: Dataset,
    tokenizer=TOKENIZER,
    mode: str="train"
) -> tf.data.Dataset:
    """Prepare Dataset for model ingestion."""
    # Convert dataset to tensor of tensorflow
    tf_dataset = dataset.remove_columns(["id", "title"]).with_format("tensorflow").select(range(20000))
    
    # Build training samples for tensorflow
    if mode == "train":
        tf_dataset_features = {x: tf_dataset[x].to_tensor() for x in tokenizer.model_input_names}
        y = to_categorical(tf_dataset["label"], num_classes=NUM_CLASSES)
        #y = tf_dataset["label"]
        tf_dataset_samples = tf.data.Dataset.from_tensor_slices((tf_dataset_features, y))
        tf_dataset_samples = tf_dataset_samples.shuffle(len(tf_dataset_samples)).batch(BATCH_SIZE)
    
    # Build testing samples
    if mode == "test":    
        tf_dataset_features = {x: tf_dataset[x].to_tensor() for x in tokenizer.model_input_names}
        tf_dataset_samples = tf.data.Dataset.from_tensor_slices(tf_dataset_features)
        tf_dataset_samples = tf_dataset_samples.shuffle(len(tf_dataset_samples)).batch(BATCH_SIZE)
        
    return tf_dataset_samples

In [17]:
# Prepare datatsets for model ingestion
train_tf = prepare_datasets_for_model_ingestion(X_train_tokenized_dataset, mode="train")
val_tf = prepare_datasets_for_model_ingestion(X_val_tokenized_dataset, mode="train")
test_tf = prepare_datasets_for_model_ingestion(X_test_tokenized_dataset, mode="test")

# 5 - Model Fine Tuning

Comments
-----

## 5.1 - Load Pre-Trained Model

In [18]:
# Load a pre-trained model
MODEL = TFAutoModelForSequenceClassification.from_pretrained("albert-base-v2", num_labels=NUM_CLASSES)

All model checkpoint layers were used when initializing TFAlbertForSequenceClassification.

Some layers of TFAlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 5.2 - Model Compilation

In [19]:
# Compilation of the model and training
MODEL.compile(
    optimizer=Adam(),
    loss=CategoricalCrossentropy(),
    metrics=CategoricalAccuracy(), #Accuracy(),
)

# Do not train Bert layer
for layer in MODEL.layers:
    if layer.name == "albert":
        layer.trainable = False

# Print Model summary
print(MODEL.summary())

# Define Callbacks
checkpoint_filepath = './drive/MyDrive/Carousell-Test/Logs/checkpoint-best-model-{epoch:02d}-{val_categorical_accuracy:.2f}'
model_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='val_categorical_accuracy',
    mode='max',
    save_best_only=True
)

Model: "tf_albert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
albert (TFAlbertMainLayer)   multiple                  11683584  
_________________________________________________________________
dropout_4 (Dropout)          multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  9997      
Total params: 11,693,581
Trainable params: 9,997
Non-trainable params: 11,683,584
_________________________________________________________________
None


## 5.3 - Model Training

In [20]:
# Check if a GPU is available
if tf.test.gpu_device_name(): 
    print(f"Default GPU Device: {tf.test.gpu_device_name()}")
else:
    print("Please install the GPU version of TF or no GPU is available")

Please install the GPU version of TF or no GPU is available


In [21]:
MODEL.fit(
    train_tf,
    validation_data=val_tf,
    epochs=EPOCHS,
    verbose=1,
    class_weight=CLASS_WEIGHTS_DCT,
    callbacks=[model_checkpoint_callback]
)

Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.
Epoch 1/10
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
  12/2500 [..............................] - ETA: 2:28:01 - loss: 10.8654 - categorical_accuracy: 0.1146

KeyboardInterrupt: 

Comments
-----

Despite using a GPU, my ressources were too limited to train a transformer. That is why I switched to a simpler model based on TF-IDF