In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

FOLDERNAME = 'Colab\ Notebooks'

%cd drive/MyDrive/$FOLDERNAME

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks


In [2]:
import os
import glob

import pandas as pd
from PIL import Image
from tqdm import tqdm
from tqdm import trange

In [3]:
data_root = "./Dataset"

files = glob.glob(os.path.join(data_root, "*/*.jpg"))
print(f"{len(files)} images")

11243 images


In [4]:
df = []

for f in tqdm(files):
    cat = f.split('/')[-2]
    try: 
        img = Image.open(f)
        w, h = img.size

        df.append(
            dict(
                img_path=f,
                cat=cat,
                w=w,
                h=h,
                max_dim=max(w, h)
            )
        )
    except:
        continue
    
df = pd.DataFrame(df)
df

100%|██████████| 11243/11243 [30:22<00:00,  6.17it/s]


Unnamed: 0,img_path,cat,w,h,max_dim
0,./Dataset/Dracaena_trifasciata/resize_Dracaena...,Dracaena_trifasciata,767,1024,1024
1,./Dataset/Dracaena_trifasciata/resize_Dracaena...,Dracaena_trifasciata,1024,768,1024
2,./Dataset/Dracaena_trifasciata/resize_Dracaena...,Dracaena_trifasciata,768,1024,1024
3,./Dataset/Dracaena_trifasciata/resize_Dracaena...,Dracaena_trifasciata,682,1024,1024
4,./Dataset/Dracaena_trifasciata/resize_Dracaena...,Dracaena_trifasciata,768,1024,1024
...,...,...,...,...,...
11236,./Dataset/Nephrolepis_cordifolia/2271155729_47...,Nephrolepis_cordifolia,1024,768,1024
11237,./Dataset/Nephrolepis_cordifolia/23200175311_f...,Nephrolepis_cordifolia,1024,1024,1024
11238,./Dataset/Nephrolepis_cordifolia/2263518760_6f...,Nephrolepis_cordifolia,1024,768,1024
11239,./Dataset/Nephrolepis_cordifolia/9049930900_be...,Nephrolepis_cordifolia,1024,767,1024


In [5]:
val_frac = 0.05
test_frac = 0.05

df["subset"] = "train"
df.loc[df.query("subset == 'train'").sample(frac=val_frac).index, "subset"] = "val"
df.loc[df.query("subset == 'train'").sample(frac=val_frac).index, "subset"] = "test"
df.head()

Unnamed: 0,img_path,cat,w,h,max_dim,subset
0,./Dataset/Dracaena_trifasciata/resize_Dracaena...,Dracaena_trifasciata,767,1024,1024,test
1,./Dataset/Dracaena_trifasciata/resize_Dracaena...,Dracaena_trifasciata,1024,768,1024,train
2,./Dataset/Dracaena_trifasciata/resize_Dracaena...,Dracaena_trifasciata,768,1024,1024,train
3,./Dataset/Dracaena_trifasciata/resize_Dracaena...,Dracaena_trifasciata,682,1024,1024,train
4,./Dataset/Dracaena_trifasciata/resize_Dracaena...,Dracaena_trifasciata,768,1024,1024,train


In [6]:
# df.to_csv('sample_lst.csv', index=False)

In [4]:
df = pd.read_csv('sample_lst.csv')

In [7]:
df['cat'].value_counts().reset_index().sort_values('cat', ascending=False)

Unnamed: 0,index,cat
0,Hoya_carnosa,1295
1,Monstera_deliciosa,996
2,Asplenium_nidus,885
3,Tradescantia_zebrina,811
4,Zamioculcas,766
5,Epipremnum_aureum,728
6,Aglaonema_commutatum,669
7,Dypsis_lutescens,583
8,Rhapis_excelsa,513
9,Maranta_leuconeura,497


In [8]:
pd.pivot_table(df, index='cat', columns='subset', values='img_path', aggfunc = 'count').sort_values('train', ascending=False)

subset,test,train,val
cat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Hoya_carnosa,56,1157,82
Monstera_deliciosa,44,906,46
Asplenium_nidus,39,806,40
Tradescantia_zebrina,35,733,43
Zamioculcas,54,661,51
Epipremnum_aureum,31,658,39
Aglaonema_commutatum,27,605,37
Dypsis_lutescens,22,541,20
Rhapis_excelsa,22,463,28
Maranta_leuconeura,31,444,22


In [6]:
export_dst = "./Final_data"


for i in trange(len(df)):
    try: 
        image_path, cat, subset = df.iloc[i][["img_path", "cat", "subset"]]
        img = Image.open(image_path)
        w, h = img.size
        os.makedirs(os.path.join(export_dst, subset, cat), exist_ok=True)

        if w > h:
            w_dst = 1024
            h_dst = int(h / w * 1024)
        else:
            h_dst = 1024
            w_dst = int(w / h * 1024)


        img = img.resize((w_dst, h_dst), resample=Image.BILINEAR)
        img.save(os.path.join(export_dst, subset, cat, os.path.basename(image_path)))
        
        
    except:
        continue

100%|██████████| 11241/11241 [16:33<00:00, 11.32it/s]
