#### Since the data is highly imbalanced, we take 11,000 samples out of each category.
#### The categories being: 
##### 1) Baby/Children
##### 2) Divided
##### 3) Ladieswear
##### 4) Menswear
##### 5) Sport

##### We also remove samples which don't have detail_desc

##### This data will be used to train the image and text embeddings

In [1]:
import os
import pandas as pd
from PIL import Image

In [2]:
articles_df = pd.read_csv("C:/Users/ojubh\/Desktop/SEMESTER 2/Deep Learning/Final_Project/data/articles.csv")

In [3]:
len(articles_df)

105542

In [4]:
articles_df = articles_df[["article_id", "index_group_name", "detail_desc"]]

In [5]:
articles_df.head()

Unnamed: 0,article_id,index_group_name,detail_desc
0,108775015,Ladieswear,Jersey top with narrow shoulder straps.
1,108775044,Ladieswear,Jersey top with narrow shoulder straps.
2,108775051,Ladieswear,Jersey top with narrow shoulder straps.
3,110065001,Ladieswear,"Microfibre T-shirt bra with underwired, moulde..."
4,110065002,Ladieswear,"Microfibre T-shirt bra with underwired, moulde..."


In [6]:
articles_df["detail_desc"].isna().unique()

array([False,  True])

In [7]:
articles_df.dropna(subset=['detail_desc'], how='all', inplace=True)

In [8]:
## missing values have been removed
articles_df["detail_desc"].isna().unique()

array([False])

In [9]:
articles_df["index_group_name"].unique()

array(['Ladieswear', 'Baby/Children', 'Menswear', 'Sport', 'Divided'],
      dtype=object)

In [10]:
index_group = articles_df.groupby("index_group_name")

In [11]:
for name, group in index_group:
    print('name:', name, "|", "size:", len(group), "%:", len(group)/len(articles_df))

name: Baby/Children | size: 34619 %: 0.3293095903962864
name: Divided | size: 15086 %: 0.14350398569335845
name: Ladieswear | size: 39523 %: 0.3759583737610106
name: Menswear | size: 12539 %: 0.11927591651922455
name: Sport | size: 3359 %: 0.03195213363012005


In [12]:
training_split = int(0.6*len(articles_df))
print("Training split", training_split)
print("Samples for Baby/Child:", int(0.32*training_split))
print("Samples for Divided:", int(0.14*training_split))
print("Samples for Ladieswear:", int(0.37*training_split))
print("Samples for menswear", int(0.11 * training_split))
print("samples for sport", int(0.03 * training_split))


Training split 63075
Samples for Baby/Child: 20184
Samples for Divided: 8830
Samples for Ladieswear: 23337
Samples for menswear 6938
samples for sport 1892


In [13]:
#no sampling for "Sport"
sport_df_train = articles_df[articles_df["index_group_name"] == "Sport"].sample(n=int(0.03 * training_split))

baby_children_df_train = articles_df[articles_df["index_group_name"] == "Baby/Children"].sample(n=int(0.32*training_split))

divided_df_train = articles_df[articles_df["index_group_name"] == "Divided"].sample(n=int(0.14*training_split))

ladieswear_df_train = articles_df[articles_df["index_group_name"] == "Ladieswear"].sample(n=int(0.37*training_split))

menswear_df_train = articles_df[articles_df["index_group_name"] == "Menswear"].sample(n=int(0.11 * training_split))




train_df = pd.concat([sport_df_train,
                        baby_children_df_train,
                        divided_df_train,
                        ladieswear_df_train,
                        menswear_df_train], axis=0)

In [14]:
train_df.head(10)

Unnamed: 0,article_id,index_group_name,detail_desc
11549,548335002,Sport,Loose-fitting sports top in fast-drying functi...
38132,656649003,Sport,Sports tights in fast-drying functional fabric...
78501,794321004,Sport,Outdoor jacket in woven fabric with a double-l...
5698,498742004,Sport,Wide sports top in fast-drying functional fabr...
28654,623781009,Sport,Short-sleeved sports tops in fast-drying funct...
39533,661735001,Sport,"Running jacket in fast-drying, breathable func..."
9495,533406011,Sport,Sports joggers in sturdy sweatshirt fabric wit...
95025,865142001,Sport,Cropped sports top in fast-drying mesh with a ...
89255,837141001,Sport,"Slightly shorter, boxy-style sports top in sof..."
49356,694514001,Sport,Sports top in fast-drying functional fabric wi...


In [15]:
articles_df = articles_df.drop(train_df.index)

In [16]:
len(articles_df)

43945

In [17]:
train_df.reset_index()

Unnamed: 0,index,article_id,index_group_name,detail_desc
0,11549,548335002,Sport,Loose-fitting sports top in fast-drying functi...
1,38132,656649003,Sport,Sports tights in fast-drying functional fabric...
2,78501,794321004,Sport,Outdoor jacket in woven fabric with a double-l...
3,5698,498742004,Sport,Wide sports top in fast-drying functional fabr...
4,28654,623781009,Sport,Short-sleeved sports tops in fast-drying funct...
...,...,...,...,...
61176,20935,591439006,Menswear,T-shirt in cotton jersey with a print motif on...
61177,51933,699925001,Menswear,Cargo shorts in cotton twill with a regular wa...
61178,78859,795836012,Menswear,Long-sleeved shirt in washed Oxford cotton wit...
61179,51719,699580001,Menswear,Joggers in a linen and cotton weave in a relax...


In [18]:
val_split = int(len(articles_df)/2)

print("Val split", val_split)
print("Samples for Baby/Child:", int(0.32*val_split))
print("Samples for Divided:", int(0.14*val_split))
print("Samples for Ladieswear:", int(0.37*val_split))
print("Samples for menswear", int(0.11 * val_split))
print("samples for sport", int(0.03 * val_split))

Val split 21972
Samples for Baby/Child: 7031
Samples for Divided: 3076
Samples for Ladieswear: 8129
Samples for menswear 2416
samples for sport 659


In [19]:
sport_df_val = articles_df[articles_df["index_group_name"] == "Sport"].sample(n=int(0.03 * val_split))

baby_children_df_val = articles_df[articles_df["index_group_name"] == "Baby/Children"].sample(n=int(0.32*val_split))

divided_df_val = articles_df[articles_df["index_group_name"] == "Divided"].sample(n=int(0.14*val_split))

ladieswear_df_val = articles_df[articles_df["index_group_name"] == "Ladieswear"].sample(n=int(0.37*val_split))

menswear_df_val = articles_df[articles_df["index_group_name"] == "Menswear"].sample(n=int(0.11 * val_split))


val_df = pd.concat([sport_df_val,
                        baby_children_df_val,
                        divided_df_val,
                        ladieswear_df_val,
                        menswear_df_val], axis=0)

In [20]:
test_df = articles_df.drop(val_df.index)

In [21]:
val_df.reset_index()
test_df.reset_index()

Unnamed: 0,index,article_id,index_group_name,detail_desc
0,6,111565001,Ladieswear,"Semi shiny nylon stockings with a wide, reinfo..."
1,8,111586001,Ladieswear,Tights with built-in support to lift the botto...
2,10,111609001,Ladieswear,Opaque matt tights. 200 denier.
3,16,118458003,Menswear,Trousers in sweatshirt fabric with an elastica...
4,17,118458004,Menswear,Trousers in sweatshirt fabric with an elastica...
...,...,...,...,...
22629,105517,947509001,Ladieswear,Narrow belt in leather with a metal buckle. Th...
22630,105520,947599001,Ladieswear,"Long-sleeved tops in soft, organic cotton jers..."
22631,105528,949198001,Ladieswear,Joggers in sweatshirt fabric made from a cotto...
22632,105530,949551001,Divided,"Short, boxy-style top in sweatshirt fabric mad..."


In [22]:
len(val_df) + len(test_df) + len(train_df)

105126

In [23]:
def get_image_path(balanced_df):
    final_rows = []
    mypath = os.path.join('c:', os.sep, 'Users', "ojubh", "Desktop", "images")

    for (_, row) in balanced_df.iterrows():
        location = "0" + str(row["article_id"])[:2] + "/" + "0" + str(row["article_id"]) + ".jpg"
        path = os.path.join(mypath, *location.split("/"))
        if os.path.isfile(path):
            final_rows.append([row["article_id"], row["index_group_name"], row["detail_desc"], location])
    
    return final_rows

In [24]:
train_rows = get_image_path(train_df)
val_rows = get_image_path(val_df)
test_rows = get_image_path(test_df)

# final_df = pd.DataFrame(final_rows, columns = ["article_id", "index_group_name", "detail_desc", "location"] )

In [25]:
len(train_rows) + len(val_rows) + len(test_rows)

104695

In [26]:
train_final = pd.DataFrame(train_rows, columns = ["article_id", "index_group_name", "detail_desc", "location"])
val_final = pd.DataFrame(val_rows, columns = ["article_id", "index_group_name", "detail_desc", "location"] )
test_final = pd.DataFrame(test_rows, columns = ["article_id", "index_group_name", "detail_desc", "location"] )

In [27]:

import shutil

def write_images(final_df, tag=""):
    image_paths = final_df["location"].tolist()
    dest_folder = tag + "_images"
    dest_path = os.path.join('c:', os.sep, 'Users', "ojubh", "Desktop", dest_folder)
    os.mkdir(dest_path)

    mypath = os.path.join('c:', os.sep, 'Users', "ojubh", "Desktop", "images")
    newsize = (256, 256)
    for image in image_paths:
        img_path = os.path.join(mypath, *image.split("/"))
        img = Image.open(img_path)
        img = img.resize(newsize)
        dest_loc = os.path.join(dest_path, image.split("/")[1])
        img.save(dest_loc)


In [28]:
import numpy as np

In [None]:
train_img = "test_images"


faulty = []
for name in os.listdir(os.path.join('c:', os.sep, 'Users', "ojubh", "Desktop", train_img)):

    img = Image.open((os.path.join('c:', os.sep, 'Users', "ojubh", "Desktop", train_img, name)))
    if len(img.getbands()) != 3:
        faulty.append(name)
        

In [None]:
faulty

'0616100001.jpg'

In [30]:
# train_final[train_final['article_id'] == '0616100001.jpg']

In [31]:
write_images(train_final, "train")

In [33]:
write_images(val_final, "val")

In [34]:
write_images(test_final, "test")

In [35]:
# export dataframe to csv
train_final.to_csv("train_data.csv")
val_final.to_csv("val_data.csv")
test_final.to_csv("test_data.csv")