#### Since the data is highly imbalanced, we take 11,000 samples from each category (except Sport, which has only 3,392 articles and is kept in full).
#### The categories are:
##### 1) Baby/Children
##### 2) Divided
##### 3) Ladieswear
##### 4) Menswear
##### 5) Sport

##### We also remove samples that don't have a `detail_desc`.

##### This data will be used to train the image and text embeddings.

In [1]:
import os
import pandas as pd

In [2]:
# Load the H&M article metadata. The original literal contained "\/", which is an
# invalid escape sequence in a normal Python string; the stray backslash is removed
# (the path still points at the same file).
articles_df = pd.read_csv("C:/Users/ojubh/Desktop/SEMESTER 2/Deep Learning/Final_Project/data/articles.csv")

In [3]:
# Number of rows in the raw article table.
articles_df.shape[0]

105542

In [4]:
# Keep only the id, the category label and the text description.
kept_columns = ["article_id", "index_group_name", "detail_desc"]
articles_df = articles_df[kept_columns]

In [5]:
# Preview the first five rows of the trimmed table.
articles_df.head(n=5)

Unnamed: 0,article_id,index_group_name,detail_desc
0,108775015,Ladieswear,Jersey top with narrow shoulder straps.
1,108775044,Ladieswear,Jersey top with narrow shoulder straps.
2,108775051,Ladieswear,Jersey top with narrow shoulder straps.
3,110065001,Ladieswear,"Microfibre T-shirt bra with underwired, moulde..."
4,110065002,Ladieswear,"Microfibre T-shirt bra with underwired, moulde..."


In [6]:
# Distinct category labels, in order of first appearance.
pd.unique(articles_df["index_group_name"])

array(['Ladieswear', 'Baby/Children', 'Menswear', 'Sport', 'Divided'],
      dtype=object)

In [7]:
# Group the articles by category so per-category sizes can be inspected.
index_group = articles_df.groupby(by="index_group_name")

In [8]:
# Report how many articles fall into each category.
for group_name, group_size in index_group.size().items():
    print('name:', group_name, "|", "size:", group_size)

name: Baby/Children | size: 34711
name: Divided | size: 15149
name: Ladieswear | size: 39737
name: Menswear | size: 12553
name: Sport | size: 3392


In [9]:
# "Sport" is kept in full: no sampling is applied to this category.
is_sport = articles_df["index_group_name"] == "Sport"
sport_df = articles_df[is_sport]

In [10]:
# Draw 11,000 "Baby/Children" rows without replacement; the fixed random_state makes
# the subsample reproducible across kernel restarts.
baby_children_df = articles_df[articles_df["index_group_name"] == "Baby/Children"].sample(n=11000, random_state=42)

In [11]:
# Draw 11,000 "Divided" rows without replacement; the fixed random_state makes
# the subsample reproducible across kernel restarts.
divided_df = articles_df[articles_df["index_group_name"] == "Divided"].sample(n=11000, random_state=42)

In [12]:
# Draw 11,000 "Ladieswear" rows without replacement; the fixed random_state makes
# the subsample reproducible across kernel restarts.
ladieswear_df = articles_df[articles_df["index_group_name"] == "Ladieswear"].sample(n=11000, random_state=42)

In [13]:
# Draw 11,000 "Menswear" rows without replacement; the fixed random_state makes
# the subsample reproducible across kernel restarts.
menswear_df = articles_df[articles_df["index_group_name"] == "Menswear"].sample(n=11000, random_state=42)

In [14]:
# Stack the per-category frames row-wise into one frame with a fresh 0..n-1 index.
category_frames = [sport_df, baby_children_df, divided_df, ladieswear_df, menswear_df]
balanced_df = pd.concat(category_frames, ignore_index=True)

In [15]:
# Preview the first five rows of the combined frame.
balanced_df.head(n=5)

Unnamed: 0,article_id,index_group_name,detail_desc
0,145872001,Sport,"Long-sleeved sports top in fast-drying, breath..."
1,145872037,Sport,"Long-sleeved sports top in fast-drying, breath..."
2,145872043,Sport,"Long-sleeved sports top in fast-drying, breath..."
3,145872051,Sport,"Long-sleeved sports top in fast-drying, breath..."
4,145872052,Sport,"Long-sleeved sports top in fast-drying, breath..."


In [16]:
# Row count of the combined frame before removing missing descriptions.
balanced_df.shape[0]

47392

In [17]:
## some of the rows are missing "detail_desc"
pd.isna(balanced_df["detail_desc"]).unique()

array([False,  True])

In [18]:
# Drop rows without a description. Only one column is checked, so the `how`
# argument makes no difference and is left at its default.
balanced_df.dropna(subset=["detail_desc"], inplace=True)

In [19]:
## missing values have been removed
pd.isna(balanced_df["detail_desc"]).unique()

array([False])

In [20]:
# Row count after removing rows with a missing description.
balanced_df.shape[0]

47211

In [21]:
# Build the relative image path for every article, one entry per row, in row order:
#   "0" + first two characters of the id + "/" + "0" + full id + ".jpg"
# e.g. 145872001 -> "014/0145872001.jpg".
# Vectorized string operations replace the row-by-row iterrows() loop; the result
# is the same plain Python list of strings.
article_ids = balanced_df["article_id"].astype(str)
location_arr = ("0" + article_ids.str[:2] + "/" + "0" + article_ids + ".jpg").tolist()
    

In [22]:
# Peek at the first three generated image paths.
location_arr[0:3]

['014/0145872001.jpg', '014/0145872037.jpg', '014/0145872043.jpg']

In [23]:
# Attach the image paths as a new "location" column.
balanced_df = balanced_df.assign(location=location_arr)

In [48]:

# NOTE: this whole cell is disabled scratch code and does not run.
# When enabled it would copy the images for the first rows of `balanced_df`
# from a local "images" folder into a new "balanced_images" folder.
# trial_df = balanced_df.head()
# import imageio as iio
# import shutil

# image_paths = trial_df["location"].tolist()
# dest_folder = "balanced_images"
# os.mkdir(dest_folder)

# mypath = os.path.join('c:', os.sep, 'Users', "ojubh", "Desktop", "images")

# # img = iio.imread(img_path)
# # iio.imwrite("test.jpg", img)
# i = 0
# for image in image_paths[:2]:
#     img_path = os.path.join(mypath, *image.split("/"))
#     dest_path = os.path.join(dest_folder, image.split("/")[1])
#     shutil.copy(img_path, dest_path)


In [25]:
# export dataframe to csv
# NOTE(review): the DataFrame index is written as the first (unnamed) column;
# confirm downstream readers expect it before switching to index=False.
output_csv = "balanced_data.csv"
balanced_df.to_csv(output_csv)