This notebook uses the `raw.csv` file to sample a fixed number of rows from the dataset, sampling uniformly across views and subjects. It copies all the sampled images into a single `frames` folder and writes the sampled rows to a new `frames.csv` file.

In [6]:
import pandas as pd
import tqdm
import os
import shutil

In [7]:
# Load the full, unsampled table that every later cell reads from.
raw_csv_path = '../data/raw.csv'
df = pd.read_csv(raw_csv_path)

Explore number of items in the dataset and the number of unique subjects and views.

In [8]:
# Overall number of rows in the raw table.
total_num = df.shape[0]

# Number of distinct values in each of the C, S, A and D columns.
num_C, num_S, num_A, num_D = (df[col].nunique() for col in ('C', 'S', 'A', 'D'))

# Row counts per C value, and per (C, S) combination.
num_per_C = df.groupby('C').size()
num_per_C_S = df.groupby(['C', 'S']).size()

print(num_per_C_S)

C  S 
1  1     16775
   2      4827
   7     14269
   8      7655
   9     10224
   21    19964
2  1     13960
   2      4919
   7      7315
   8      7118
   9      6165
   21    19840
3  1     13563
   2      6398
   7     15033
   8     11791
   9      9773
   21    19913
4  1     14666
   2      9962
   7     13052
   8      8738
   9      2940
   21    20026
   22    14711
   23    21014
   24    16583
   25    18855
   26    16426
   27    21037
   28    15186
5  1     12992
   2      4527
   7      9076
   8     11086
   9      7615
   21    19988
6  1     14412
   2      7100
   7      5605
   8      6250
   9      9723
   21    20215
dtype: int64


Sample

In [13]:
# Draw a reproducible sample of N rows, balanced across every (C, S) pair.
#
# Inputs (defined in earlier cells): `df` (the raw table) and `num_C`
# (number of distinct C values; C is assumed to take the values 1..num_C).
# Output: `new_df`, a shuffled frame of exactly N rows that keeps the
# original row index of `df`.
N = 72000 # number of samples
SEED = 42 # fixed seed so Restart & Run All reproduces the same subset
N_per_C = N // num_C
subjects = [1,2,7,8,9,21]
num_S = len(subjects)

# Ceiling division: the smallest per-group size whose total reaches N.
# When N divides evenly (as it does for 72000 over 6 x 6 groups) no surplus
# rows are drawn, so no group ends up short after trimming.
num_groups = num_C * num_S
N_per_S_per_C = -(-N // num_groups)

# Collect the per-group samples and concatenate once, instead of growing a
# DataFrame inside the loop (which re-copies all rows on every iteration).
samples = []
for c in range(1, num_C+1):
    for s in subjects:
        subset = df[(df['C'] == c) & (df['S'] == s)]
        if len(subset) < N_per_S_per_C:
            # pandas would raise a ValueError here as well; this one says which group.
            raise ValueError(
                f"Group C={c}, S={s} has only {len(subset)} rows; "
                f"need {N_per_S_per_C} to sample without replacement."
            )
        # A distinct but deterministic seed per group.
        group_seed = SEED + len(samples)
        samples.append(subset.sample(n=N_per_S_per_C, random_state=group_seed))

new_df = pd.concat(samples)

# Shuffle BEFORE trimming so that any surplus rows are dropped at random
# rather than all being taken from the last (C, S) group.
new_df = new_df.sample(frac=1, random_state=SEED)
new_df = new_df[:N]
print(len(new_df))

72000


Copy images and update names

In [14]:
# Copy every sampled image into one flat folder and write the sampled rows
# to frames.csv with `img_path` reduced to the bare file name.
#
# Input (defined in an earlier cell): `new_df`, whose `img_path` column holds
# paths that are joined onto `root`.
# NOTE: after this cell runs, `new_df['img_path']` holds file names only, so
# re-running this cell without re-running the sampling cell will look for the
# images directly under `root`.
root = r"E:\data_processed"  # raw string: "\d" is not a valid escape sequence
save_dir = "../data/frames"

# exist_ok avoids the separate existence check (and the race it implies).
os.makedirs(save_dir, exist_ok=True)

# File name each image will have inside the flat output folder.
img_names = new_df['img_path'].map(os.path.basename)

# Flattening loses the directory part of each path, so two different images
# with the same file name would silently overwrite each other. Fail loudly.
clashing = img_names[img_names.duplicated(keep=False)]
if not clashing.empty:
    raise ValueError(
        f"{clashing.nunique()} file name(s) occur more than once in the sample "
        f"(e.g. {clashing.iloc[0]!r}); copying into a single folder would "
        f"overwrite images."
    )

for rel_path, img_name in tqdm.tqdm(zip(new_df['img_path'], img_names), total=len(new_df)):
    shutil.copy(os.path.join(root, rel_path), os.path.join(save_dir, img_name))

# Update the column in one step instead of mutating the frame row by row
# while iterating over it.
new_df['img_path'] = img_names

new_df.to_csv('../data/frames.csv', index=False)

100%|██████████| 72000/72000 [52:03<00:00, 23.05it/s]  
