<a href="https://colab.research.google.com/github/SamyFellah/hierarchical_multilabel_classification/blob/main/hierarchical_multilabel_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q kaggle keras-preprocessing
!nvidia-smi -L

/bin/bash: nvidia-smi: command not found


In [None]:

import os

import pandas as pd
import tensorflow as tf
from kaggle.api.kaggle_api_extended import KaggleApi
from keras.applications.mobilenet_v2 import MobileNetV2
from keras.utils import to_categorical
from keras_preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split


In [None]:
# Kaggle credentials, presumably picked up from the environment by
# `api.authenticate()` below. `setdefault` keeps any real credentials that are
# already exported instead of overwriting them with these placeholders.
# Replace the dots locally if needed, and never commit real values.
os.environ.setdefault('KAGGLE_USERNAME', ".........")
os.environ.setdefault('KAGGLE_KEY', "................")


def download_and_extract_kaggle_dataset(dataset_name, destination, unzip=True, quiet=False):
    """Fetch a Kaggle dataset into a local folder.

    Args:
        dataset_name: Dataset identifier handed to the Kaggle API
            (the call in this notebook uses the ``owner/dataset`` form).
        destination: Folder the files are downloaded into.
        unzip: Forwarded unchanged to ``dataset_download_files``.
        quiet: Forwarded unchanged to ``dataset_download_files``.
    """
    # Create the client and authenticate it before issuing any request.
    kaggle_client = KaggleApi()
    kaggle_client.authenticate()

    # Pull the dataset files into `destination`.
    kaggle_client.dataset_download_files(
        dataset_name,
        path=destination,
        unzip=unzip,
        quiet=quiet,
    )


# The archive is ~565 MB, so skip the download when the extracted metadata file
# is already on disk. This keeps "Restart & Run All" cheap.
if not os.path.isfile('data/styles.csv'):
    download_and_extract_kaggle_dataset('paramaggarwal/fashion-product-images-small', 'data/', unzip=True)


Downloading fashion-product-images-small.zip to data


100%|██████████| 565M/565M [00:19<00:00, 30.2MB/s]





In [None]:


class FashionModel(tf.keras.Model):
    """Four-headed image classifier on top of a frozen MobileNet backbone.

    The model emits softmax predictions for gender, master category, sub
    category and article type. The category heads are chained:

    * gender and master category each read the pooled backbone features;
    * sub category reads the master-category hidden features concatenated with
      the backbone features;
    * article type reads the sub-category hidden features concatenated with the
      backbone features.

    NOTE(review): the data pipeline in this notebook rescales pixels by 1/255,
    while the ImageNet MobileNet weights are normally paired with
    ``mobilenet.preprocess_input``. Confirm the scaling is intended.
    """

    def __init__(self, input_shape, gender_output_shape, master_category_output_shape, sub_category_output_shape, article_type_output_shape):
        """Create the backbone and the four classification branches.

        Args:
            input_shape: Image shape passed to the MobileNet backbone.
            gender_output_shape: Number of units of the gender softmax.
            master_category_output_shape: Number of units of the master-category softmax.
            sub_category_output_shape: Number of units of the sub-category softmax.
            article_type_output_shape: Number of units of the article-type softmax.
        """
        super().__init__()
        # ImageNet-pretrained feature extractor with global average pooling,
        # frozen so that only the heads are trained.
        self.features_layer = tf.keras.applications.MobileNet(include_top=False, weights='imagenet',
                                                             input_shape=input_shape, pooling='avg')
        self.features_layer.trainable = False

        # One hidden layer per branch.
        self.gender_dense = tf.keras.layers.Dense(1024, activation='relu')
        self.article_type_dense = tf.keras.layers.Dense(1024, activation='relu')
        self.master_category_dense = tf.keras.layers.Dense(1024, activation='relu')
        self.sub_category_dense = tf.keras.layers.Dense(1024, activation='relu')

        # Merge layers are created once here rather than on every forward pass
        # inside `call`. They hold no weights, so the trainable parameters are
        # unchanged.
        self.sub_category_concat = tf.keras.layers.Concatenate()
        self.article_type_concat = tf.keras.layers.Concatenate()

        # One softmax output per branch.
        self.gender_output = tf.keras.layers.Dense(gender_output_shape, activation='softmax', name='gender_output')
        self.article_type_output = tf.keras.layers.Dense(article_type_output_shape, activation='softmax', name='article_type_output')
        self.master_category_output = tf.keras.layers.Dense(master_category_output_shape, activation='softmax', name='master_category_output')
        self.sub_category_output = tf.keras.layers.Dense(sub_category_output_shape, activation='softmax', name='sub_category_output')

    def call(self, inputs):
        """Run the forward pass.

        Args:
            inputs: Batch of images fed to the backbone.

        Returns:
            Tuple ``(gender_output, master_category_output,
            sub_category_output, article_type_output)`` of softmax activations.
        """
        features = self.features_layer(inputs)

        gender_features = self.gender_dense(features)
        master_category_features = self.master_category_dense(features)

        # Each finer-grained branch is conditioned on the hidden features of the
        # coarser branch above it, plus the raw backbone features.
        input_sub_category = self.sub_category_concat([master_category_features, features])
        sub_category_features = self.sub_category_dense(input_sub_category)

        input_article_type = self.article_type_concat([sub_category_features, features])
        article_type_features = self.article_type_dense(input_article_type)

        gender_output = self.gender_output(gender_features)
        master_category_output = self.master_category_output(master_category_features)
        sub_category_output = self.sub_category_output(sub_category_features)
        article_type_output = self.article_type_output(article_type_features)

        return gender_output, master_category_output, sub_category_output, article_type_output

class DataPreprocessor:
    """Build training and validation batch generators from a CSV and an image folder.

    After ``get_data_generators`` (or ``_process_csv``) has run, these
    attributes are populated:

    * ``labels``: for each label column, the index of category names; a name's
      position is the integer code used in the data frame.
    * ``nunique_values``: for each label column, the number of classes.
    * ``data_shape`` / ``train_data_shape`` / ``test_data_shape``: shapes of the
      filtered frame and of its train / test splits.
    """

    # Label columns, in the order in which the generators yield their targets.
    LABEL_COLUMNS = ['gender', 'masterCategory', 'subCategory', 'articleType']

    def __init__(self, csv_path, images_path, input_shape, test_size, batch_size, random_state=42):
        """Store the configuration; no file is read here.

        Args:
            csv_path: Path of the CSV file describing the images.
            images_path: Folder containing ``<id>.jpg`` files.
            input_shape: Model input shape; its first two entries are used as
                the generators' ``target_size``.
            test_size: Passed to ``train_test_split`` as ``test_size``.
            batch_size: Number of images per generated batch.
            random_state: Seed used for the split and for batch shuffling.
        """
        self.csv_path = csv_path
        self.images_path = images_path
        self.input_shape = input_shape
        self.test_size = test_size
        self.batch_size = batch_size
        self.random_state = random_state

        self.nunique_values = {}
        self.labels = {}
        self.data_shape = ()
        self.train_data_shape = ()
        self.test_data_shape = ()

        print('DataPreprocessor initialized')

    def _process_csv(self):
        """Load the CSV, keep usable rows and integer-encode the label columns.

        Rows are kept only when their image file exists in ``images_path`` and
        none of the label columns is missing.

        Returns:
            Data frame with columns ``id``, the four label columns (as integer
            category codes) and ``img_path``.
        """
        df = pd.read_csv(self.csv_path, on_bad_lines='skip')
        print(f'Original Data shape: {df.shape}')

        columns = ['id'] + self.LABEL_COLUMNS

        # Work on explicit copies so the column assignments below never write
        # into a view of the original frame (avoids SettingWithCopyWarning).
        df = df[columns].copy()
        df['id'] = df['id'].astype(str)

        # Keep only the rows whose image file is actually present on disk.
        has_image = df['id'].apply(lambda x: os.path.isfile(os.path.join(self.images_path, x + '.jpg')))
        df = df[has_image].copy()
        df['img_path'] = df['id'] + '.jpg'

        print(f'Filtered Data shape: {df.shape} after removing images that are not present in the folder')

        # A missing label would be encoded as the sentinel code -1, which has no
        # entry in `self.labels`. Drop such rows so every code maps to a name.
        rows_before = len(df)
        df = df.dropna(subset=self.LABEL_COLUMNS)
        if len(df) != rows_before:
            print(f'Dropped {rows_before - len(df)} rows with missing labels')

        self.data_shape = df.shape
        for column in self.LABEL_COLUMNS:
            as_category = df[column].astype('category')
            self.labels[column] = as_category.cat.categories
            df[column] = as_category.cat.codes
            # Class count comes from the category index, so it always matches
            # the number of names stored in `self.labels`.
            self.nunique_values[column] = len(self.labels[column])

        print(df.head(10))
        return df

    def _create_generator(self, dataframe, datagen):
        """Yield ``(images, one_hot_labels)`` batches forever.

        Args:
            dataframe: Frame produced by ``_process_csv`` (or a split of it).
            datagen: ``ImageDataGenerator`` used to load and transform images.

        Yields:
            Tuple of the image batch and a list of four one-hot encoded label
            arrays, ordered as ``LABEL_COLUMNS``.
        """
        generator = datagen.flow_from_dataframe(dataframe,
                                                directory=self.images_path,
                                                x_col='img_path',
                                                y_col=list(self.LABEL_COLUMNS),
                                                target_size=self.input_shape[:2],
                                                class_mode='multi_output',
                                                batch_size=self.batch_size,
                                                shuffle=True,
                                                seed=self.random_state)
        while True:
            data = next(generator)
            images, labels = data[0], data[1]
            # One-hot encode each integer label array with its column's class count.
            one_hot_labels = [
                to_categorical(labels[position], num_classes=self.nunique_values[column])
                for position, column in enumerate(self.LABEL_COLUMNS)
            ]
            yield images, one_hot_labels

    def get_data_generators(self):
        """Prepare the data and return the train and validation generators.

        Returns:
            Tuple ``(train_generator, validation_generator)``; both are endless
            Python generators created by ``_create_generator``.
        """
        # Read and encode the CSV file.
        df = self._process_csv()

        # Split the dataset.
        train_df, test_df = train_test_split(df, test_size=self.test_size, random_state=self.random_state)

        print(f'Train data shape: {train_df.shape}')
        print(f'Test data shape: {test_df.shape}')

        self.train_data_shape = train_df.shape
        self.test_data_shape = test_df.shape

        # Both splits only rescale pixel values to [0, 1]; no augmentation.
        train_datagen = ImageDataGenerator(rescale=1. / 255)
        test_datagen = ImageDataGenerator(rescale=1. / 255)

        train_generator = self._create_generator(train_df, train_datagen)
        validation_generator = self._create_generator(test_df, test_datagen)

        return train_generator, validation_generator


# --- Input data -------------------------------------------------------------
CSV_PATH = './data/styles.csv'
IMAGE_PATH = './data/images'

# --- Output locations -------------------------------------------------------
log_dir = './logs'
checkpoint_filepath = './checkpoints/model'

# --- Training settings ------------------------------------------------------
INPUT_SHAPE = (80, 60, 3)  # first two entries become the generators' target_size
BATCH_SIZE = 32
EPOCHS = 30
TEST_SIZE = 0.2            # share of rows held out for validation
RANDOM_STATE = 42          # seed for the split and for batch shuffling

# --- Callbacks --------------------------------------------------------------
# Keep only the best full model (not just the weights), judged by the lowest
# validation loss.
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=False,
)

# Write TensorBoard logs, including histograms every epoch.
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,
)


# TODO: replace the default logger with a rich logger (no logger setup follows yet)


def main():
    """Load the data, build and compile the model, then train it.

    Returns:
        The ``History`` object returned by ``fit``, so that callers can inspect
        or plot the training curves.
    """
    print('Starting training')

    print('Loading data')
    data_preprocessor = DataPreprocessor(CSV_PATH, IMAGE_PATH, INPUT_SHAPE, TEST_SIZE, BATCH_SIZE, RANDOM_STATE)
    train_generator, validation_generator = data_preprocessor.get_data_generators()
    nunique_values = data_preprocessor.nunique_values
    print(data_preprocessor.labels)
    print('Data loaded')
    print(f'Number of unique values: {nunique_values}')

    print('Building model')
    fashion_model = FashionModel(INPUT_SHAPE, nunique_values['gender'], nunique_values['masterCategory'], nunique_values['subCategory'], nunique_values['articleType'])

    # One categorical cross-entropy loss per output head, in the model's output
    # order (gender, master category, sub category, article type).
    fashion_model.compile(optimizer='adam',
                          loss=['categorical_crossentropy'] * 4,
                          metrics=['accuracy']
                          )

    # The generators are endless, so the epoch length has to be given
    # explicitly. Clamp to at least one step so a split smaller than one batch
    # does not produce zero steps.
    steps_per_epoch = max(1, data_preprocessor.train_data_shape[0] // BATCH_SIZE)
    validation_steps = max(1, data_preprocessor.test_data_shape[0] // BATCH_SIZE)

    # `batch_size` is deliberately not passed: the generators already yield
    # batches, and Keras documents that it must not be specified for generator
    # inputs.
    history = fashion_model.fit(train_generator,
                                epochs=EPOCHS,
                                validation_data=validation_generator,
                                steps_per_epoch=steps_per_epoch,
                                validation_steps=validation_steps,
                                callbacks=[tensorboard_callback, model_checkpoint_callback],
                                verbose=1, )

    return history


# Run the whole pipeline: load the data, build the model and start training.
main()


Starting training
Loading data
DataPreprocessor initialized
Original Data shape: (44424, 10)




Filtered Data shape: (44419, 6) after removing images that are not present in the folder
      id  gender  masterCategory  subCategory  articleType   img_path
0  15970       2               1           38          104  15970.jpg
1  39386       2               1            6           56  39386.jpg
2  59263       4               0           42          139  59263.jpg
3  21379       2               1            6          127  21379.jpg
4  53759       2               1           38          133  53759.jpg
5   1855       2               1           38          133   1855.jpg
6  30805       2               1           38          104  30805.jpg
7  26960       4               1           38          104  26960.jpg
8  29114       2               0           33          110  29114.jpg
9  30039       2               0           42          139  30039.jpg
Train data shape: (35535, 6)
Test data shape: (8884, 6)
{'gender': Index(['Boys', 'Girls', 'Men', 'Unisex', 'Women'], dtype='object'), 'maste

KeyboardInterrupt: ignored