#### Since the data is highly imbalanced, we take at most 15,000 samples out of each category (categories with fewer articles are kept in full).
#### The categories being: 
##### 1) Baby/Children
##### 2) Divided
##### 3) Ladieswear
##### 4) Menswear
##### 5) Sport

##### We also remove samples which don't have detail_desc

##### This data will be used to train the image and text embeddings

In [26]:
import os
import pandas as pd
from PIL import Image

In [2]:
# Load the article metadata. The path uses forward slashes only: the previous
# literal contained "\/", which is not a valid Python escape sequence and
# triggers a deprecation/syntax warning on recent interpreters.
articles_df = pd.read_csv("C:/Users/ojubh/Desktop/SEMESTER 2/Deep Learning/Final_Project/data/articles.csv")

In [3]:
# Number of rows in the raw articles table.
len(articles_df.index)

105542

In [4]:
# Keep only the identifier, the category label and the text description.
kept_columns = ["article_id", "index_group_name", "detail_desc"]
articles_df = articles_df.loc[:, kept_columns]

In [5]:
# Preview the first five rows of the trimmed table.
articles_df.head(n=5)

Unnamed: 0,article_id,index_group_name,detail_desc
0,108775015,Ladieswear,Jersey top with narrow shoulder straps.
1,108775044,Ladieswear,Jersey top with narrow shoulder straps.
2,108775051,Ladieswear,Jersey top with narrow shoulder straps.
3,110065001,Ladieswear,"Microfibre T-shirt bra with underwired, moulde..."
4,110065002,Ladieswear,"Microfibre T-shirt bra with underwired, moulde..."


In [6]:
# Distinct category labels present in the data.
articles_df.loc[:, "index_group_name"].unique()

array(['Ladieswear', 'Baby/Children', 'Menswear', 'Sport', 'Divided'],
      dtype=object)

In [7]:
# Group the articles by category so the per-category sizes can be inspected.
index_group = articles_df.groupby(by="index_group_name")

In [8]:
# Report how many articles fall into each category.
for name, row_count in index_group.size().items():
    print('name:', name, "|", "size:", int(row_count))

name: Baby/Children | size: 34711
name: Divided | size: 15149
name: Ladieswear | size: 39737
name: Menswear | size: 12553
name: Sport | size: 3392


In [9]:
# Cap each index group at MAX_PER_GROUP rows. Groups that are already at or
# below the cap (e.g. "Sport", "Menswear") are kept in full, without sampling.
MAX_PER_GROUP = 15000
# Fixed seed so that Restart & Run All draws the same rows every time; without
# it every run produced a different balanced_df.
SAMPLE_SEED = 42


def _take_group(frame, group_name, cap=MAX_PER_GROUP, seed=SAMPLE_SEED):
    """Return the rows of `frame` belonging to `group_name`, limited to `cap` rows.

    Parameters
    ----------
    frame : pd.DataFrame
        Table with an "index_group_name" column.
    group_name : str
        Value of "index_group_name" to select.
    cap : int
        Maximum number of rows to return.
    seed : int
        Seed passed to `DataFrame.sample` so the draw is reproducible.

    Returns
    -------
    pd.DataFrame
        All matching rows if there are at most `cap` of them, otherwise a
        random sample of exactly `cap` rows.
    """
    members = frame[frame["index_group_name"] == group_name]
    if len(members) <= cap:
        return members
    return members.sample(n=cap, random_state=seed)


sport_df = _take_group(articles_df, "Sport")
baby_children_df = _take_group(articles_df, "Baby/Children")
divided_df = _take_group(articles_df, "Divided")
ladieswear_df = _take_group(articles_df, "Ladieswear")
menswear_df = _take_group(articles_df, "Menswear")

# Stack the per-category frames in the same order as before and renumber the index.
balanced_df = pd.concat(
    [sport_df, baby_children_df, divided_df, ladieswear_df, menswear_df],
    axis=0,
    ignore_index=True,
)

In [10]:
# Preview the first five rows of the balanced table.
balanced_df.head(n=5)

Unnamed: 0,article_id,index_group_name,detail_desc
0,145872001,Sport,"Long-sleeved sports top in fast-drying, breath..."
1,145872037,Sport,"Long-sleeved sports top in fast-drying, breath..."
2,145872043,Sport,"Long-sleeved sports top in fast-drying, breath..."
3,145872051,Sport,"Long-sleeved sports top in fast-drying, breath..."
4,145872052,Sport,"Long-sleeved sports top in fast-drying, breath..."


In [11]:
# Total number of rows after balancing.
len(balanced_df.index)

60945

In [12]:
## Check whether any row lacks a "detail_desc" value
## (a True in the result means missing descriptions exist)
pd.isna(balanced_df["detail_desc"]).unique()

array([False,  True])

In [13]:
# Discard rows without a description; rebinding the name instead of mutating in place.
balanced_df = balanced_df.dropna(subset=["detail_desc"])

In [14]:
## Confirm that no missing "detail_desc" values remain (expect only False)
pd.isna(balanced_df["detail_desc"]).unique()

array([False])

In [15]:
# Row count once rows without a description have been removed.
len(balanced_df.index)

60724

In [16]:
# Root folder that holds the raw article images.
mypath = os.path.join('c:', os.sep, 'Users', "ojubh", "Desktop", "images")

# Keep only the articles whose image file actually exists on disk. The relative
# location is "<'0' + first two id digits>/<'0' + id>.jpg".
final_rows = []
row_fields = zip(
    balanced_df["article_id"],
    balanced_df["index_group_name"],
    balanced_df["detail_desc"],
)
for article_id, group_name, description in row_fields:
    padded_id = "0" + str(article_id)
    location = padded_id[:3] + "/" + padded_id + ".jpg"
    path = os.path.join(mypath, *location.split("/"))
    if not os.path.isfile(path):
        continue
    final_rows.append([article_id, group_name, description, location])
    

In [17]:
# Turn the collected row lists into a DataFrame; the column order matches the
# order in which each row list was built.
final_columns = ["article_id", "index_group_name", "detail_desc", "location"]
final_df = pd.DataFrame(data=final_rows, columns=final_columns)

In [18]:
# Number of articles that have both a description and an image on disk.
len(final_df.index)

60478

In [23]:
# shutil.rmtree("balanced_images")

In [27]:
# Relative "<folder>/<file>.jpg" locations of the retained images; show the first one.
image_paths = list(final_df["location"])
image_paths[0]

'014/0145872001.jpg'

In [29]:

import shutil  # not used by the resize loop; kept for the optional rmtree/copy alternatives

# Resize every retained image to 256x256 and write it, flattened into a single
# folder, under its original file name.
image_paths = final_df["location"].tolist()
dest_path = os.path.join('c:', os.sep, 'Users', "ojubh", "Desktop", "balanced_images_crop")
# exist_ok=True lets this cell be re-run: os.mkdir raised FileExistsError as
# soon as the folder had been created by an earlier run.
os.makedirs(dest_path, exist_ok=True)

mypath = os.path.join('c:', os.sep, 'Users', "ojubh", "Desktop", "images")
newsize = (256, 256)
for image in image_paths:
    folder_name, file_name = image.split("/")
    img_path = os.path.join(mypath, folder_name, file_name)
    dest_loc = os.path.join(dest_path, file_name)
    # The context manager closes the source file after each image; without it
    # one open handle per image was left for the garbage collector.
    with Image.open(img_path) as img:
        # resize() stretches to the exact target size (aspect ratio is not preserved).
        resized = img.resize(newsize)
        resized.save(dest_loc)
    # Alternative without resizing: shutil.copy(img_path, dest_loc)


In [25]:
# Write the balanced, image-verified table to disk for the embedding training.
# NOTE(review): the DataFrame index is written as an unnamed first column;
# pass index=False here if no consumer relies on it — confirm before changing.
output_csv = "balanced_data.csv"
final_df.to_csv(output_csv)