In [5]:
import os
import pandas as pd
from tqdm import tqdm
from shutil import copyfile as copy
    
# Read the image metadata table from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='Imgs.csv')
df.head(n=5)

Unnamed: 0,img_link,tags,likes,comments,img_id,img_path
0,https://cdn.pixabay.com/photo/2022/03/06/05/30...,"Clouds, Sky, Atmosphere, Blue Sky",196,55,A00000001,Imgs/A00000001.jpg
1,https://cdn.pixabay.com/photo/2022/04/07/11/45...,"Bird, Ornithology, Hummingbird",76,20,A00000002,Imgs/A00000002.jpg
2,https://cdn.pixabay.com/photo/2022/02/28/15/28...,"Sea, Rainbow, Rainfall, Subtropical",282,106,A00000003,Imgs/A00000003.jpg
3,https://cdn.pixabay.com/photo/2022/04/04/02/52...,"Cherry Blossoms, Road, Japan, Sakura",42,11,A00000004,Imgs/A00000004.jpg
4,https://cdn.pixabay.com/photo/2022/04/09/18/06...,"Cape Marguerite, Flower, Plant",39,15,A00000005,Imgs/A00000005.jpg


### 1. Remove Unnecessary Columns

In [6]:
# Drop the columns that are not needed for sorting images into tag folders.
# errors='ignore' keeps the cell re-runnable: the previous `del df[...]`
# statements raise KeyError as soon as the cell is executed a second time,
# because the columns are already gone.
df = df.drop(columns=['img_link', 'img_id', 'likes', 'comments'], errors='ignore')

df.head()

Unnamed: 0,tags,img_path
0,"Clouds, Sky, Atmosphere, Blue Sky",Imgs/A00000001.jpg
1,"Bird, Ornithology, Hummingbird",Imgs/A00000002.jpg
2,"Sea, Rainbow, Rainfall, Subtropical",Imgs/A00000003.jpg
3,"Cherry Blossoms, Road, Japan, Sakura",Imgs/A00000004.jpg
4,"Cape Marguerite, Flower, Plant",Imgs/A00000005.jpg


### 2. Finding all the tags

In [9]:
# Collect every individual tag found in the comma-separated 'tags' column.
# Missing cells (NaN, which pandas stores as a float) are replaced with an
# empty string first so that .split() is never called on a float.
t = []

for row_tags in df['tags'].fillna(''):
    # Strip surrounding whitespace and drop empty fragments (blank cells,
    # doubled or trailing commas) so that no empty tag name is produced.
    t += [tag.strip() for tag in str(row_tags).split(',') if tag.strip()]

# De-duplicate. Sorting makes the order reproducible: iteration order of a
# set of strings can change from one interpreter run to the next.
tags = sorted(set(t))

### 3. Creating Folders for each Tag

In [10]:
# Create one sub-folder of Dataset/ per tag.
# The parent folder is created up front; without it every os.mkdir call
# below would fail.
os.makedirs('Dataset', exist_ok=True)

# (tag, exception) pairs for folders that could not be created, kept so the
# problem tags can be inspected instead of being silently ignored.
mkdir_failures = []

for tag in tqdm(tags):

    try:
        os.mkdir('Dataset/' + tag)
    except FileExistsError:
        # Folder is already there (e.g. the cell was run before) - nothing to do.
        continue
    except OSError as exc:
        # Any other failure (for instance a tag that is not a valid folder
        # name) is recorded; the loop carries on with the remaining tags.
        mkdir_failures.append((tag, exc))

if mkdir_failures:
    print(f"Could not create {len(mkdir_failures)} tag folder(s): "
          f"{[name for name, _exc in mkdir_failures]}")

100%|██████████| 940/940 [00:00<00:00, 12792.03it/s]


### 4. Saving Images in Specific Folders

In [11]:
from shutil import copy
from tqdm import tqdm

# Copy every image into the folder of each of its tags.
#   error         - number of copies that failed
#   failed_copies - (src, dst, exception) for each failed copy, for inspection
# The loop deliberately uses its own variable names so that the global `tags`
# list (all unique tags) is not overwritten by a per-row value.
error = 0
failed_copies = []

for raw_tags, src in tqdm(zip(df['tags'], df['img_path']), total=len(df)):

    # A missing tag cell is NaN (a float). Such a row belongs to no tag
    # folder, so it is skipped rather than turned into a literal 'nan' tag.
    if pd.isna(raw_tags):
        continue

    file_name = os.path.basename(src)

    for tag in str(raw_tags).split(','):
        tag = tag.strip()

        # An empty fragment would resolve to the Dataset/ root itself and
        # drop the image next to the tag folders.
        if not tag:
            continue

        dst = os.path.join('Dataset', tag, file_name)

        try:
            copy(src, dst)
        except OSError as e:
            # Best effort: remember what failed and continue with the rest.
            error += 1
            failed_copies.append((src, dst, e))

# After the loop, print the error count
print(f"Total errors encountered: {error}")


100%|██████████| 529/529 [00:03<00:00, 158.19it/s]

Total errors encountered: 7





### 5. Checking Number of Folders

In [12]:
# List every entry that sits directly inside Dataset/ and report how many
# there are.
folders = os.listdir(path='Dataset')
print(len(folders))

935


### 6. Checking number of Images in Each Folder

In [13]:
# For each entry of Dataset/, record its name and how many items it holds.
# folder_ and freq are parallel lists: freq[k] is the item count of folder_[k].
folder_ = []
freq    = []

# (name, exception) for entries that could not be listed, e.g. a plain file
# lying in Dataset/ instead of a folder.
unreadable = []

for folder in tqdm(folders):

    try:
        n_items = len(os.listdir('Dataset/' + folder))
    except OSError as exc:
        # Leave the entry out of the statistics, but keep a record of it
        # instead of dropping it silently.
        unreadable.append((folder, exc))
        continue

    freq.append(n_items)
    folder_.append(folder)

if unreadable:
    print(f"Skipped {len(unreadable)} unreadable entr(y/ies): "
          f"{[name for name, _exc in unreadable]}")

100%|██████████| 935/935 [00:00<00:00, 42676.53it/s]


### 7. Top 10 Folders with the Most Images

In [14]:
# Assemble the per-folder counts into a table, then show the ten folders
# holding the most items.
df_ = pd.DataFrame({'folder': folder_, 'freq': freq})

df_.sort_values('freq', ascending=False).head(10)

Unnamed: 0,folder,freq
565,Nature,57
346,Flowers,53
73,Bird,53
344,Flower,47
23,Animal,33
588,Ornithology,26
778,Spring,25
645,Plant,23
774,Species,22
84,Bloom,21


### 8. Top 10 Folders with the Fewest Images

In [15]:
# Ten folders holding the fewest items (ascending order is the default).
df_.sort_values('freq').head(10)

Unnamed: 0,folder,freq
466,In einem kleinen Städtchen,0
916,Wise Men,1
915,wintering in regions without snow cover and in...,1
18,America,1
17,Alpine,1
16,Alley,1
15,Alarm Clocks,1
14,Airspace,1
13,Airport,1
12,Aircraft,1


### 9. Folders with at Least 10 Images

In [16]:
# Keep only the folders whose item count is 10 or more.
df_.loc[df_['freq'] >= 10]

Unnamed: 0,folder,freq
23,Animal,33
64,Beach,12
73,Bird,53
75,Birds,10
84,Bloom,21
86,Blossom,18
97,Botany,10
148,Cat,10
173,City,10
321,Fauna,12


### 10. Removing the Folders with Fewer than 4 Images (moved to `Temp/`)

In [17]:
import shutil
# Move every tag folder holding fewer than MIN_IMAGES_PER_FOLDER items out of
# Dataset/ and into Temp/ (the folders are relocated, not deleted).
MIN_IMAGES_PER_FOLDER = 4

# Make sure the destination parent exists before moving anything into it.
os.makedirs('Temp', exist_ok=True)

# Folders left in place because Temp/ already holds an entry of that name.
not_moved = []

for i in tqdm(df_[df_['freq'] < MIN_IMAGES_PER_FOLDER]['folder']):

    src = 'Dataset/' + i
    dst = 'Temp/' + i

    # df_ is a snapshot taken earlier; on a re-run the folder may already
    # have been moved, in which case there is nothing left to do.
    if not os.path.isdir(src):
        continue

    # shutil.move places src *inside* dst when dst is an existing directory,
    # which would produce Temp/<tag>/<tag>. Leave such folders untouched.
    if os.path.exists(dst):
        not_moved.append(i)
        continue

    shutil.move(src, dst)

if not_moved:
    print(f"Left {len(not_moved)} folder(s) in Dataset/ because the target "
          f"already exists in Temp/: {not_moved}")

100%|██████████| 832/832 [00:00<00:00, 1675.09it/s]


In [18]:
# Load final.csv and write it straight back to the same path without the
# DataFrame index. No columns are changed in between.
# NOTE(review): final.csv is not produced by any visible cell - confirm where
# it comes from.
final_csv = 'final.csv'

df = pd.read_csv(final_csv)
df.to_csv(final_csv, index=False)


In [19]:
# Preview the first five rows of the frame loaded from final.csv.
df.head(n=5)


Unnamed: 0,tags,path
0,"Clouds, Sky, Atmosphere, Blue Sky",Imgs/A00000000.jpg
1,"Bird, Ornithology, Hummingbird",Imgs/A00000001.jpg
2,"Sea, Rainbow, Rainfall, Subtropical",Imgs/A00000002.jpg
3,"Cherry Blossoms, Road, Japan, Sakura",Imgs/A00000003.jpg
4,"Cape Marguerite, Flower, Plant",Imgs/A00000004.jpg
