In [37]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from tensorflow.keras import utils
from tensorflow.keras.preprocessing import image
import keras_efficientnet_v2

## Import and process data

In [3]:
# Path to the Myntra catalogue CSV sample, expected in the working directory.
CSV_PATH = 'marketing_sample_for_myntra_com-ecommerce__20190601_20190831__15k_data.csv'

# on_bad_lines='skip' drops rows the parser cannot tokenise instead of aborting the load.
raw_data = pd.read_csv(CSV_PATH, on_bad_lines='skip')

In [4]:
# Sanity check: (rows, columns) of the frame that was just loaded.
raw_data.shape

(14231, 25)

In [5]:
# List the available columns before choosing which ones to keep.
raw_data.columns

Index(['uniq_id', 'crawl_timestamp', 'product_id', 'link', 'size',
       'variant_sku', 'brand', 'care_instructions', 'dominant_material',
       'title', 'actual_color', 'dominant_color', 'product_type', 'images',
       'body', 'product_details', 'size_fit', 'complete_the_look', 'type',
       'variant_price', 'variant_compare_at_price', 'ideal_for', 'is_in_stock',
       'inventory', 'specifications'],
      dtype='object')

### Selecting the feature columns
- 'dominant_material', 'dominant_color', and 'product_type' would be used as labels.

- 'images' would be used to extract images.

- 'ideal_for' would be used to separate women's data from men's.

In [6]:
# The three label columns, then the image URLs, then the audience column used for filtering.
cat_columns = [
    'dominant_material',
    'dominant_color',
    'product_type',
    'images',
    'ideal_for',
]

In [7]:
# Narrow the raw frame down to the selected columns (all rows kept).
data = raw_data.loc[:, cat_columns]

#### Separating women's data from men's and dropping the missing entries

In [8]:
# Keep only the rows marked for women and drop every row that is missing any
# of the selected columns (a label, the image URLs, or the audience).
#
# This is written as one chained expression ending in .copy() instead of
# calling dropna(inplace=True) on a filtered slice of `data`: mutating a slice
# in place triggers SettingWithCopyWarning and leaves it unclear whether later
# writes reach an independent frame. The original row index is kept as-is.
women_data = data.loc[data['ideal_for'] == 'Women'].dropna().copy()

In [10]:
# Preview the first rows; the index still carries the row labels of the raw frame.
women_data.head()


Unnamed: 0,dominant_material,dominant_color,product_type,images,ideal_for
0,Polyester,Black,Top,http://assets.myntassets.com/v1/assets/images/...,Women
3,Chiffon,Pink,Dupatta,http://assets.myntassets.com/v1/assets/images/...,Women
8,Polyester,Maroon,A-Line Kurta,http://assets.myntassets.com/v1/assets/images/...,Women
10,Viscose Rayon,Navy,Printed Palazzos,http://assets.myntassets.com/v1/assets/images/...,Women
11,Rayon,Blue,Straight Kurta,http://assets.myntassets.com/v1/assets/images/...,Women


#### As the outputs below show, there are too many product-type and material classes, and many of them are near-duplicates of one another. Hence, similar classes are merged into a single category.

In [11]:
# Number of distinct product types, followed by the distinct values themselves.
(
    women_data['product_type'].nunique(dropna=False),
    women_data['product_type'].unique(),
)

(206,
 array(['Top', 'Dupatta', 'A-Line Kurta', 'Printed Palazzos',
        'Straight Kurta', 'Kurta with Palazzos', 'Shrug',
        'Straight Palazzo', 'Kurta with Trousers & Dupatta', 'Tunic',
        'Lehenga & Blouse with Dupatta', 'Treggings', 'A-Line Dress',
        'Flared Palazzo', 'Kurta with Churidar & Dupatta',
        'Cropped Palazzo', 'Straight Kurti', 'Maxi Dress', 'Midi Skirt',
        'Regular Trousers', 'Fit and Flare Dress', 'Solid Trousers',
        'Layered A-Line Kurta', 'Skirt', 'Saree', 'Kurta with Churidar',
        'Shawl', 'Wide Leg Palazzo', 'Kurta with Trousers',
        'Kurti with Trousers', 'Kurti', 'Kurti with Palazzos',
        'Layered Maxi Dress', 'Printed Kurta', 'Winter Kurta',
        'Ready to Wear Lehenga with Blouse', 'Kaftan Top', 'Midi Dress',
        'Lehenga & Blouse', 'Solid Kurta', 'A-Line Kurti', 'Maxi Skirt',
        'A-Line Top', 'Fusion Kurta', 'Solid Maxi Dress',
        'Kurta with Ethnic Jacket', 'Maxi Flared Skirt',
        'Kurt

In [12]:
# Number of distinct materials, followed by the distinct values themselves.
(
    women_data['dominant_material'].nunique(dropna=False),
    women_data['dominant_material'].unique(),
)

(38,
 array(['Polyester', 'Chiffon', 'Viscose Rayon', 'Rayon', 'Silk',
        'viscose', 'cotton', 'Cotton', 'Net', 'Chanderi', 'polyester',
        'Liva', 'Linen', 'Acrylic', 'rayon', 'acrylic', 'Viscose', 'Modal',
        'Georgette', 'liva', 'Velvet', 'silk', 'tencil', 'Crepe',
        'Poly Silk', 'Satin', 'modal', 'geicha', 'Dupion', 'Pure Silk',
        'Wool', 'georgette', 'Khadi', 'Nylon', 'lyocell', 'linen', 'SILK',
        'wool'], dtype=object))

In [13]:
# Number of distinct colours, followed by the distinct values themselves.
(
    women_data['dominant_color'].nunique(dropna=False),
    women_data['dominant_color'].unique(),
)

(41,
 array(['Black', 'Pink', 'Maroon', 'Navy', 'Blue', 'Coffee Brown', 'White',
        'Red', 'Charcoal', 'Yellow', 'Beige', 'Orange', 'Grey',
        'Sea Green', 'Green', 'Olive', 'Mustard', 'Fuchsia', 'Brown',
        'Teal', 'Purple', 'Rust', 'Coral', 'Lime Green', 'Magenta',
        'Turquoise Blue', 'Lavender', 'Taupe', 'Mauve', 'Burgundy',
        'Silver', 'Khaki', 'Cream', 'Peach', 'Fluorescent Green', 'Rose',
        'Off White', 'Multi', 'Tan', 'Gold', 'Dark Green'], dtype=object))

In [None]:

# Ordered (keyword, merged label) rules. Keywords are matched case-insensitively
# as substrings of the ORIGINAL product type. Every rule is tested, and a later
# match overrides an earlier one, so the order of this list is significant.
PRODUCT_TYPE_RULES = [
    ('blouse', 'Blouse'),
    ('dress', 'Dress'),
    ('skirt', 'Skirt'),
    ('palazzo', 'Palazzo'),
    ('top', 'Top'),
    ('shirt', 'Shirt'),
    ('dupatta', 'Dupatta'),
    ('trouser', 'Trousers'),
    ('tunic', 'Tunic'),
]


def merge_product_type(value):
    """Collapse a detailed product type into its coarse category.

    Parameters
    ----------
    value : str
        Original product type, e.g. 'A-Line Kurta'.

    Returns
    -------
    str
        The merged category, or `value` unchanged when no rule matches.

    Notes
    -----
    The 'Kurta'/'Kurti' test is case-sensitive and is applied first; the rules
    in PRODUCT_TYPE_RULES follow and the last matching rule wins.
    NOTE(review): because of last-match-wins, a combined product such as
    'Kurta with Trousers & Dupatta' ends up as 'Trousers' - confirm that this
    is the intended category.
    """
    merged = value
    if 'Kurta' in value or 'Kurti' in value:
        merged = 'Kurta'
    lowered = value.lower()
    for keyword, label in PRODUCT_TYPE_RULES:
        if keyword in lowered:
            merged = label
    return merged


# Series.iteritems() was removed in pandas 2.0, and writing through
# `women_data.product_type[index] = ...` is chained assignment, which does not
# update the frame under copy-on-write. Mapping the column and assigning the
# result back avoids both problems and yields the same labels.
women_data = women_data.assign(
    product_type=women_data['product_type'].map(merge_product_type)
)

In [None]:
# Ordered (keyword, merged label) rules. Keywords are matched case-insensitively
# as substrings of the ORIGINAL material name. Every rule is tested, and a later
# match overrides an earlier one, so the order of this list is significant.
MATERIAL_RULES = [
    ('rayon', 'Rayon'),
    ('polyester', 'Polyester'),
    ('cotton', 'Cotton'),
    ('acrylic', 'Acrylic'),
    ('silk', 'Silk'),
    ('wool', 'Wool'),
    ('viscose', 'Viscose'),
    # NOTE(review): this label is lower-case unlike the others; it is kept
    # byte-for-byte so the resulting class names do not change.
    ('linen', 'linen'),
]


def merge_material(value):
    """Collapse a material name into its coarse category.

    Parameters
    ----------
    value : str
        Original material name, e.g. 'Poly Silk'.

    Returns
    -------
    str
        The merged category, or `value` unchanged when no rule matches.

    Notes
    -----
    The last matching rule wins, so 'Viscose Rayon' becomes 'Viscose'.
    NOTE(review): names that match no rule keep their original casing, so
    pairs that differ only in case and match no rule stay separate classes -
    confirm whether they should be merged as well.
    """
    merged = value
    lowered = value.lower()
    for keyword, label in MATERIAL_RULES:
        if keyword in lowered:
            merged = label
    return merged


# Series.iteritems() was removed in pandas 2.0, and writing through
# `women_data.dominant_material[index] = ...` is chained assignment, which does
# not update the frame under copy-on-write. Mapping the column and assigning
# the result back avoids both problems and yields the same labels.
women_data = women_data.assign(
    dominant_material=women_data['dominant_material'].map(merge_material)
)


#### Making the final lists of classes and converting the classes from strings to integers

In [16]:
# One list of distinct values per label column, in order of first appearance.
# A value's position in its list is the integer id used for that class.
color_classes, material_classes, product_type_classes = (
    list(women_data[column].unique())
    for column in ('dominant_color', 'dominant_material', 'product_type')
)

In [None]:
def encode_labels(labels, classes):
    """Replace each class name in `labels` with its position in `classes`.

    Parameters
    ----------
    labels : pandas.Series
        Column holding class names.
    classes : list
        Distinct class names; the list index becomes the integer id.

    Returns
    -------
    pandas.Series
        New series with the same index. Values that do not appear in
        `classes` are passed through unchanged, as Series.replace would do.
    """
    class_to_index = {name: position for position, name in enumerate(classes)}
    return labels.map(lambda value: class_to_index.get(value, value))


# `women_data[col].replace(..., inplace=True)` is an in-place call on a column
# selection: it is deprecated and does not update the frame under
# copy-on-write. Build the encoded columns explicitly and assign them back.
women_data = women_data.assign(
    dominant_color=encode_labels(women_data['dominant_color'], color_classes),
    dominant_material=encode_labels(women_data['dominant_material'], material_classes),
    product_type=encode_labels(women_data['product_type'], product_type_classes),
)


### We already have the data downloaded so the below commented steps won't be necessary


Every data entry has multiple image URLs. Only the first image URL is kept for each entry.

In [19]:
# for index, value in women_data.images.iteritems():
#     women_data.images[index] = value.split()[0]
    

In [20]:
# images_url = list(women_data.images)


In [21]:
# len(images_url)

In [22]:
# os.mkdir('images')

In [23]:
# for i, x in enumerate(images_url):
#     utils.get_file(os.path.join('/images', f"{i}.jpg"), origin=x)

The step below makes sure the data is retrieved in the right order, since the images downloaded from women_data['images'] were saved in order: 0, 1, 2, 3, 4, ...

In [24]:
# Only the number of entries in the folder is used: files are later opened by
# their running number (0.jpg, 1.jpg, ...), not by the listed names.
images = os.listdir('images')
images_idx = list(range(len(images)))

In [26]:
# There must be exactly one image per remaining row of women_data.
n_images = len(images_idx)
print(n_images)
assert n_images == len(women_data)

7381


#### Converting the images into arrays and creating a dataset which can be fed into the neural network.

In [None]:
IMAGE_DIR = 'images'
# load_img expects target_size as (height, width). The channel count is not
# part of it - the original 3-tuple only worked because the third entry was
# ignored.
IMAGE_SIZE = (224, 224)


def load_image_array(index):
    """Load `<IMAGE_DIR>/<index>.jpg` as a scaled pixel array.

    Parameters
    ----------
    index : int
        Running number of the image file.

    Returns
    -------
    numpy.ndarray
        Image resized to IMAGE_SIZE, with pixel values divided by 255.
    """
    path = os.path.join(IMAGE_DIR, f'{index}.jpg')
    img = image.load_img(path, target_size=IMAGE_SIZE)
    return image.img_to_array(img) / 255


# Images are read in index order so that position i lines up with the i-th
# row of women_data.
train_image = [load_image_array(i) for i in images_idx]

In [28]:
# Stack the per-image arrays into one array with the image index as first axis.
X = np.asarray(train_image)

In [29]:
# Confirm the stacked layout: (number of images, height, width, channels).
X.shape

(7381, 224, 224, 3)

In [30]:
# One label vector per output head, in the same row order as women_data.
color_labels = women_data['dominant_color'].to_numpy(copy=True)
type_labels = women_data['product_type'].to_numpy(copy=True)
material_labels = women_data['dominant_material'].to_numpy(copy=True)

# All three must have one entry per image.
color_labels.shape, type_labels.shape, material_labels.shape

((7381,), (7381,), (7381,))

#### 7381 images and 7381 labels each for color, product type, material type are available in the final dataset.

# 
## Training the model

#### EfficientNet-V2-B1 is used as the base model and transfer learning with further fine tuning is used to train the model.

In [126]:
# EfficientNetV2-B1 backbone with ImageNet weights. num_classes=0 is passed so
# that custom heads can be attached to the backbone output afterwards.
base_model = keras_efficientnet_v2.EfficientNetV2B1(
    pretrained='imagenet',
    num_classes=0,
    input_shape=(224, 224, 3),
    include_preprocessing=False,
)

>>>> Load pretrained from: C:\Users\suraj\.keras\models/efficientnetv2\efficientnetv2-b1-imagenet.h5


#### The top layers after the base model diverge into three branches for three different outputs, one each for predicting color, type, and material.

In [127]:
def build_head(features, n_classes, name):
    """Attach one classification branch to the shared feature vector.

    Parameters
    ----------
    features : tensor
        Pooled backbone output shared by all branches.
    n_classes : int
        Number of classes this branch predicts.
    name : str
        Name of the output layer.

    Returns
    -------
    tensor
        Softmax output of the branch: Dense(512) -> Dropout(0.2) ->
        Dense(256) -> Dense(n_classes).
    """
    hidden = tf.keras.layers.Dense(512, activation='relu')(features)
    hidden = tf.keras.layers.Dropout(0.2)(hidden)
    hidden = tf.keras.layers.Dense(256, activation='relu')(hidden)
    return tf.keras.layers.Dense(n_classes, activation='softmax', name=name)(hidden)


# Shared pooled features feeding all three branches.
x = tf.keras.layers.GlobalAveragePooling2D()(base_model.output)

# The branches are built in the same order as before (color, type, material).
color_output = build_head(x, len(color_classes), 'color')
type_output = build_head(x, len(product_type_classes), 'type')
material_output = build_head(x, len(material_classes), 'material')

In [146]:
# Single-input, three-output model; the output order (color, type, material)
# is the order in which labels and losses must be supplied.
model = tf.keras.Model(
    inputs=base_model.input,
    outputs=[color_output, type_output, material_output],
)

The layers of the base model are frozen initially to begin transfer learning.

In [147]:
# Mark every backbone layer as non-trainable for the first training phase.
for backbone_layer in base_model.layers:
    backbone_layer.trainable = False


In [148]:
# Phase 1: train with the backbone frozen.
# Integer class ids are used as targets, hence the sparse cross-entropy loss
# (one per output, in output order).
head_losses = ['sparse_categorical_crossentropy'] * 3
model.compile(optimizer='adam', loss=head_losses, metrics=['accuracy'])

history = model.fit(
    X,
    [color_labels, type_labels, material_labels],
    batch_size=4,
    epochs=10,
    validation_split=0.1,
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Checkpoint written after the first training phase, before any fine-tuning.
model.save('trained_bottom_layers.h5')

The first two stacks (out of 5) of the Efficientnet-v2-B1 base model are left frozen since we don't have a huge dataset and the low level features need not necessarily be trained again.

Fine-tuning is done for the layers starting from the 3rd stack. 

In [155]:
# Position of the layer named 'stack_3_block0_sortcut_conv' within model.layers.
model.layers.index(model.get_layer(name='stack_3_block0_sortcut_conv'))


45

In [156]:
# Unfreeze the backbone from the layer named 'stack_3_block0_sortcut_conv'
# onwards; everything before it keeps its current trainable flag.
# The start position is looked up by name in base_model.layers itself, rather
# than hard-coding an index that was read off a different layer list.
fine_tune_start = base_model.layers.index(
    base_model.get_layer(name='stack_3_block0_sortcut_conv')
)
for layer in base_model.layers[fine_tune_start:]:
    layer.trainable = True

In [157]:
# Phase 2: fine-tuning. compile() is called again after the trainable flags
# were changed.
# NOTE(review): this uses the default 'adam' settings, the same as phase 1;
# consider whether a smaller learning rate is more appropriate for fine-tuning.
fine_tune_losses = ['sparse_categorical_crossentropy'] * 3
model.compile(optimizer='adam', loss=fine_tune_losses, metrics=['accuracy'])

# `history` is rebound here, so it only describes this training run.
history = model.fit(
    X,
    [color_labels, type_labels, material_labels],
    batch_size=4,
    epochs=15,
    validation_split=0.1,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


A slight but definite improvement in model performance can be seen after fine-tuning.

#### The final model and the weights are saved for further uses

In [158]:
# Write the fine-tuned model to 'final_model.h5' for later use.
model.save('final_model.h5')

### THANK YOU