#### Since the data is highly imbalanced, we take 11,000 samples out of each category.
#### The categories are:
##### 1) Baby/Children
##### 2) Divided
##### 3) Ladieswear
##### 4) Menswear
##### 5) Sport

##### We also remove samples that have no `detail_desc` value

##### This data will be used to train the image and text embeddings

In [1]:
import os
import pandas as pd
from PIL import Image

In [2]:
# Load the article metadata table. Only forward slashes are used so the path
# literal contains no backslash escape sequence ("\/" is not a valid escape).
articles_df = pd.read_csv("C:/Users/ojubh/Desktop/SEMESTER 2/Deep Learning/Final_Project/data/articles.csv")

In [3]:
len(articles_df)

105542

In [4]:
# Keep only the id, the category label and the text description; every other
# column of the loaded table is discarded from here on.
articles_df = articles_df[["article_id", "index_group_name", "detail_desc"]]

In [5]:
articles_df.head()

Unnamed: 0,article_id,index_group_name,detail_desc
0,108775015,Ladieswear,Jersey top with narrow shoulder straps.
1,108775044,Ladieswear,Jersey top with narrow shoulder straps.
2,108775051,Ladieswear,Jersey top with narrow shoulder straps.
3,110065001,Ladieswear,"Microfibre T-shirt bra with underwired, moulde..."
4,110065002,Ladieswear,"Microfibre T-shirt bra with underwired, moulde..."


In [6]:
articles_df["detail_desc"].isna().unique()

array([False,  True])

In [7]:
# Drop rows whose description is missing. The result is reassigned instead of
# using inplace=True: articles_df is itself a column selection of the loaded
# table, and in-place edits on such a frame can raise SettingWithCopyWarning.
# (With a single-column subset, how="all" and the default how="any" are identical.)
articles_df = articles_df.dropna(subset=["detail_desc"])

In [8]:
## missing values have been removed
articles_df["detail_desc"].isna().unique()

array([False])

In [9]:
articles_df["index_group_name"].unique()

array(['Ladieswear', 'Baby/Children', 'Menswear', 'Sport', 'Divided'],
      dtype=object)

In [10]:
index_group = articles_df.groupby("index_group_name")

In [11]:
# Report the row count of every index group and its share of all rows.
# The value printed after "%:" is a fraction between 0 and 1, not a percentage.
total_rows = len(articles_df)
for name, group in index_group:
    group_rows = len(group)
    print('name:', name, "|", "size:", group_rows, "%:", group_rows / total_rows)

name: Baby/Children | size: 34619 %: 0.3293095903962864
name: Divided | size: 15086 %: 0.14350398569335845
name: Ladieswear | size: 39523 %: 0.3759583737610106
name: Menswear | size: 12539 %: 0.11927591651922455
name: Sport | size: 3359 %: 0.03195213363012005


In [12]:
# Target size of the training split: 60% of the rows that still have a description.
training_split = int(0.6 * len(articles_df))
print("Training split", training_split)

# Planned number of training samples per category. The fractions add up to
# 0.97, so the sampled training set ends up slightly below training_split.
for label, fraction in (
    ("Samples for Baby/Child:", 0.32),
    ("Samples for Divided:", 0.14),
    ("Samples for Ladieswear:", 0.37),
    ("Samples for menswear", 0.11),
    ("samples for sport", 0.03),
):
    print(label, int(fraction * training_split))


Training split 63075
Samples for Baby/Child: 20184
Samples for Divided: 8830
Samples for Ladieswear: 23337
Samples for menswear 6938
samples for sport 1892


In [13]:
# Draw the training rows one category at a time, each with a fixed fraction of
# training_split (every category, including "Sport", is sampled).
# A fixed random_state makes the split reproducible after a kernel restart;
# without it every run selects different rows.
TRAIN_SEED = 42

sport_df_train = articles_df[articles_df["index_group_name"] == "Sport"].sample(
    n=int(0.03 * training_split), random_state=TRAIN_SEED)

baby_children_df_train = articles_df[articles_df["index_group_name"] == "Baby/Children"].sample(
    n=int(0.32 * training_split), random_state=TRAIN_SEED)

divided_df_train = articles_df[articles_df["index_group_name"] == "Divided"].sample(
    n=int(0.14 * training_split), random_state=TRAIN_SEED)

ladieswear_df_train = articles_df[articles_df["index_group_name"] == "Ladieswear"].sample(
    n=int(0.37 * training_split), random_state=TRAIN_SEED)

menswear_df_train = articles_df[articles_df["index_group_name"] == "Menswear"].sample(
    n=int(0.11 * training_split), random_state=TRAIN_SEED)

# Stack the per-category samples; rows keep their original index labels, which
# are used afterwards to remove them from articles_df.
train_df = pd.concat([sport_df_train,
                      baby_children_df_train,
                      divided_df_train,
                      ladieswear_df_train,
                      menswear_df_train], axis=0)

In [14]:
train_df.head(10)

Unnamed: 0,article_id,index_group_name,detail_desc
89559,838825002,Sport,"Cropped, sleeveless sports top in fast-drying ..."
101107,894355001,Sport,Fully lined sports bra in fast-drying function...
100370,889816008,Sport,Sports tights in fast-drying functional fabric...
80150,801092002,Sport,Longjohns with an elasticated waist and flatlo...
17291,573452003,Sport,Sports bra in fast-drying functional fabric wi...
103673,914453001,Sport,Ankle-length sports tights in fast-drying func...
88392,832723003,Sport,Sports hoodie in sturdy jersey with a print mo...
35313,646429001,Sport,Sports tights in fast-drying functional fabric...
9488,533404042,Sport,Sporty jacket in sturdy jersey with a zip down...
2756,426609042,Sport,"Short-sleeved, patterned sports top in fast-dr..."


In [15]:
# Remove the rows selected for training (matched by index label), leaving only
# the rows from which the validation and test sets are taken.
articles_df = articles_df.drop(train_df.index)

In [16]:
len(articles_df)

43945

In [17]:
# Preview only: reset_index() returns a new frame and the result is not
# assigned, so train_df itself keeps its original index labels.
train_df.reset_index()

Unnamed: 0,index,article_id,index_group_name,detail_desc
0,89559,838825002,Sport,"Cropped, sleeveless sports top in fast-drying ..."
1,101107,894355001,Sport,Fully lined sports bra in fast-drying function...
2,100370,889816008,Sport,Sports tights in fast-drying functional fabric...
3,80150,801092002,Sport,Longjohns with an elasticated waist and flatlo...
4,17291,573452003,Sport,Sports bra in fast-drying functional fabric wi...
...,...,...,...,...
61176,62093,735123004,Menswear,Jacket in woven fabric with a concealed two-wa...
61177,75712,782616025,Menswear,T-shirt in soft cotton jersey.
61178,46178,685600014,Menswear,Swim shorts in woven fabric with an elasticate...
61179,43226,675853002,Menswear,NICK VEASEY x H&M. Long-sleeved top in printed...


In [18]:
# Target size of the validation split: half of the rows left after removing
# the training rows.
val_split = int(len(articles_df) / 2)

print("Val split", val_split)
# Planned number of validation samples per category (same fractions as for
# the training split).
for label, fraction in (
    ("Samples for Baby/Child:", 0.32),
    ("Samples for Divided:", 0.14),
    ("Samples for Ladieswear:", 0.37),
    ("Samples for menswear", 0.11),
    ("samples for sport", 0.03),
):
    print(label, int(fraction * val_split))

Val split 21972
Samples for Baby/Child: 7031
Samples for Divided: 3076
Samples for Ladieswear: 8129
Samples for menswear 2416
samples for sport 659


In [19]:
# Draw the validation rows one category at a time from the rows that remain in
# articles_df. A fixed random_state makes the split reproducible after a
# kernel restart; without it every run selects different rows.
VAL_SEED = 42

sport_df_val = articles_df[articles_df["index_group_name"] == "Sport"].sample(
    n=int(0.03 * val_split), random_state=VAL_SEED)

baby_children_df_val = articles_df[articles_df["index_group_name"] == "Baby/Children"].sample(
    n=int(0.32 * val_split), random_state=VAL_SEED)

divided_df_val = articles_df[articles_df["index_group_name"] == "Divided"].sample(
    n=int(0.14 * val_split), random_state=VAL_SEED)

ladieswear_df_val = articles_df[articles_df["index_group_name"] == "Ladieswear"].sample(
    n=int(0.37 * val_split), random_state=VAL_SEED)

menswear_df_val = articles_df[articles_df["index_group_name"] == "Menswear"].sample(
    n=int(0.11 * val_split), random_state=VAL_SEED)

# Stack the per-category samples; rows keep their original index labels, which
# are used afterwards to separate the test rows.
val_df = pd.concat([sport_df_val,
                    baby_children_df_val,
                    divided_df_val,
                    ladieswear_df_val,
                    menswear_df_val], axis=0)

In [20]:
# Every remaining row that was not picked for validation (matched by index
# label) becomes part of the test set.
test_df = articles_df.drop(val_df.index)

In [21]:
# Neither result is assigned, so val_df and test_df keep their original index
# labels. Only the last expression (the re-indexed copy of test_df) is shown
# as the cell output; the first call has no visible effect.
val_df.reset_index()
test_df.reset_index()

Unnamed: 0,index,article_id,index_group_name,detail_desc
0,0,108775015,Ladieswear,Jersey top with narrow shoulder straps.
1,4,110065002,Ladieswear,"Microfibre T-shirt bra with underwired, moulde..."
2,6,111565001,Ladieswear,"Semi shiny nylon stockings with a wide, reinfo..."
3,8,111586001,Ladieswear,Tights with built-in support to lift the botto...
4,12,112679052,Baby/Children,Sweatshirt in soft organic cotton with a pres...
...,...,...,...,...
22629,105533,950449002,Baby/Children,"Small, folding hair brush with a rhinestone-de..."
22630,105535,952937003,Ladieswear,"Fitted, calf-length dress in viscose jersey wi..."
22631,105536,952938001,Ladieswear,Fitted top in jersey with a round neckline and...
22632,105537,953450001,Menswear,Socks in a fine-knit cotton blend with a small...


In [22]:
len(val_df) + len(test_df) + len(train_df)

105126

In [23]:
def get_image_path(balanced_df, image_root=None):
    """Keep only the rows of `balanced_df` whose product image exists on disk.

    An image is looked up at ``<image_root>/<first 3 digits>/<10-digit id>.jpg``,
    where the 10-digit id is ``article_id`` left-padded with zeros.

    Parameters
    ----------
    balanced_df : pandas.DataFrame
        Must contain the columns ``article_id``, ``index_group_name`` and
        ``detail_desc``.
    image_root : str, optional
        Directory that holds the image sub-folders. When omitted, the
        ``images`` folder on the original author's Desktop is used.

    Returns
    -------
    list of list
        One ``[article_id, index_group_name, detail_desc, location]`` entry for
        every row whose image file exists. ``location`` is the path relative to
        `image_root` and always uses a forward slash.
    """
    if image_root is None:
        image_root = os.path.join('c:', os.sep, 'Users', "ojubh", "Desktop", "images")

    final_rows = []
    # Walking plain column lists avoids building one Series per row, which is
    # what iterrows() does.
    columns = zip(
        balanced_df["article_id"].tolist(),
        balanced_df["index_group_name"].tolist(),
        balanced_df["detail_desc"].tolist(),
    )
    for article_id, group_name, description in columns:
        # zfill yields the 10-digit file name whether the id is a 9-digit
        # integer or an already padded string, instead of assuming 9 digits.
        padded_id = str(article_id).zfill(10)
        location = padded_id[:3] + "/" + padded_id + ".jpg"
        path = os.path.join(image_root, *location.split("/"))
        if os.path.isfile(path):
            final_rows.append([article_id, group_name, description, location])

    return final_rows

In [24]:
# Filter each split down to the rows that have an image file on disk
# (processed in the order train, validation, test).
train_rows, val_rows, test_rows = (
    get_image_path(split_df) for split_df in (train_df, val_df, test_df)
)

# final_df = pd.DataFrame(final_rows, columns = ["article_id", "index_group_name", "detail_desc", "location"] )

In [25]:
len(train_rows) + len(val_rows) + len(test_rows)

104696

In [26]:
# Turn the filtered row lists back into DataFrames; all three splits share the
# same column layout, with "location" holding the relative image path.
final_columns = ["article_id", "index_group_name", "detail_desc", "location"]
train_final = pd.DataFrame(train_rows, columns=final_columns)
val_final = pd.DataFrame(val_rows, columns=final_columns)
test_final = pd.DataFrame(test_rows, columns=final_columns)

In [31]:

import shutil

def write_images(final_df, tag="", image_root=None, dest_root=None, size=(256, 256)):
    """Resize every image listed in `final_df` and save it into one flat folder.

    Parameters
    ----------
    final_df : pandas.DataFrame
        Must contain a ``location`` column with ``"<folder>/<file>"`` paths
        relative to `image_root`.
    tag : str, optional
        Prefix of the destination folder, which is named ``<tag>_images``.
    image_root : str, optional
        Directory holding the source images. When omitted, the ``images``
        folder on the original author's Desktop is used.
    dest_root : str, optional
        Directory in which the destination folder is created. When omitted,
        the original author's Desktop is used.
    size : tuple of int, optional
        ``(width, height)`` passed to ``Image.resize``; defaults to 256x256.

    Returns
    -------
    None
        Images are written to disk under their original file name.
    """
    desktop = os.path.join('c:', os.sep, 'Users', "ojubh", "Desktop")
    if image_root is None:
        image_root = os.path.join(desktop, "images")
    if dest_root is None:
        dest_root = desktop

    dest_path = os.path.join(dest_root, tag + "_images")
    # exist_ok lets the cell be re-run; os.mkdir raised FileExistsError when
    # the folder was already there.
    os.makedirs(dest_path, exist_ok=True)

    for image in final_df["location"].tolist():
        parts = image.split("/")
        img_path = os.path.join(image_root, *parts)
        # The context manager releases the source file handle even when
        # resizing or saving fails, which matters over many thousands of files.
        with Image.open(img_path) as img:
            resized = img.resize(size)
            # Only the file name (second path component) is kept, so all
            # images land directly inside dest_path.
            resized.save(os.path.join(dest_path, parts[1]))


In [42]:
import numpy as np

In [58]:
# Name of the Desktop folder whose images are inspected.
# NOTE: despite the variable name, this currently points at the validation images.
train_img = "val_images"
img_dir = os.path.join('c:', os.sep, 'Users', "ojubh", "Desktop", train_img)

bands = []     # number of bands of every image, in os.listdir order
band_one = []  # file names of the images that do not have exactly 3 bands
for name in os.listdir(img_dir):
    # Only the header is needed for getbands(); the context manager closes each
    # file right away instead of leaving one open handle per image.
    with Image.open(os.path.join(img_dir, name)) as img:
        n_bands = len(img.getbands())
    bands.append(n_bands)
    if n_bands != 3:
        band_one.append(name)
        

In [59]:
band_one

[]

'0616100001.jpg'

In [28]:
# article_id holds the numeric article ids, so comparing it with a file name
# can never match. Look the image up through "location" instead, which stores
# "<folder>/<file name>".
train_final[train_final["location"].str.endswith("/0616100001.jpg")]

Unnamed: 0,article_id,index_group_name,detail_desc,location


In [32]:
# Resize and copy the training images into the "train_images" folder.
write_images(train_final, "train")

In [33]:
# Resize and copy the validation images into the "val_images" folder.
write_images(val_final, "val")

In [34]:
# Resize and copy the test images into the "test_images" folder.
write_images(test_final, "test")

In [29]:
# Write each split to a CSV file in the current working directory. The
# DataFrame index is written as well (pandas' default), as the first column.
for split_frame, csv_name in (
    (train_final, "train_data.csv"),
    (val_final, "val_data.csv"),
    (test_final, "test_data.csv"),
):
    split_frame.to_csv(csv_name)