In [None]:
import os, warnings
import wandb

import pandas as pd
from fastai.vision.all import *
from sklearn.model_selection import StratifiedKFold

import params
# Install a global 'ignore' filter so Python warnings are not shown in the cells below.
warnings.filterwarnings('ignore')

In [None]:
# Open a W&B run for this notebook; the job type marks it as the data-splitting step.
run = wandb.init(
    project=params.WANDB_PROJECT,
    job_type="data_split",
)

In [None]:
# Declare the ':latest' version of the artifact named by params.RAW_DATA_AT as an
# input of this run, then download it and keep the local directory as a Path.
raw_data_at = run.use_artifact(f'{params.RAW_DATA_AT}:latest')
path = Path(raw_data_at.download())

In [None]:
# Show what was downloaded (ls is presumably the Path helper patched in by fastai).
path.ls()

To split the data into training, validation and test sets, we need the file names. We previously saved them to the EDA table; below we list the image files and then retrieve that table.

In [None]:
# Collect the image file paths found under the downloaded directory
# (get_image_files presumably comes from the fastai star import).
fnames = get_image_files(path)

In [None]:
# Keep only the paths whose string form does not contain "media" anywhere.
fnames = [fname for fname in fnames if "media" not in str(fname)]

In [None]:
# Number of image files remaining after the "media" filter.
len(fnames)

In [None]:
# Fetch the object stored under "eda_table3" in the raw-data artifact; it is joined
# with the split table further down.
orig_eda_table = raw_data_at.get("eda_table3")

In [None]:
# Class labels are read from the shared params module, not discovered from the folder layout.
# NOTE(review): `labels` is only displayed here; no later cell visible in this notebook uses it.
labels = params.BDD_CLASSES
labels

In [None]:
# Build one row per image:
#   File_Name - the path relative to the image's grandparent directory, i.e. "<folder>/<file>"
#   Label     - the name of the folder that directly contains the image
# The label is taken from the path object itself rather than by splitting the string
# on '/', which would return the whole path on platforms whose separator is '\'.
df = pd.DataFrame({
    'File_Name': [str(f.relative_to(f.parent.parent)) for f in fnames],
    'Label': [f.parent.name for f in fnames],
})

# File_Name is later used as a join key, so surface any repeated values.
num_duplicates = int(df['File_Name'].duplicated().sum())
if num_duplicates > 0:
    print(f"Warning: {num_duplicates} duplicate file names found in the DataFrame.")

# Print the value counts for each label
print(df['Label'].value_counts())

In [None]:
# Rebuild the index as 0..n-1 so that row labels coincide with row positions
# (StratifiedKFold.split yields positional indices).
df = df.reset_index(drop=True)

In [None]:
# Ten stratified, shuffled folds with a fixed seed so the split is reproducible.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# cv.split yields *positional* row indices, so record them with .iloc on a Series
# aligned to df. Label-based .loc would only be correct while the index is exactly
# 0..n-1. -1 marks "not assigned yet"; every row ends up in exactly one test fold.
fold_of_row = pd.Series(-1, index=df.index)
for i, (_, test_idxs) in enumerate(cv.split(df['File_Name'], df['Label'])):
    fold_of_row.iloc[test_idxs] = i

# Fold 0 -> test, fold 1 -> valid, the remaining eight folds -> train.
df['Stage'] = fold_of_row.map({0: 'test', 1: 'valid'}).fillna('train')

df.Stage.value_counts()

In [None]:
# Write the File_Name / Label / Stage rows to a CSV in the working directory,
# without the index column.
df.to_csv('data_split.csv', index=False)

We will now create a new artifact and add our data there.

In [None]:
# New artifact that the following cells fill with the split CSV, the image
# directory and the joined table.
processed_data_at = wandb.Artifact('data_split', type="split_data")

In [None]:
# Attach the split CSV and the whole downloaded raw-data directory to the artifact.
processed_data_at.add_file('data_split.csv')
processed_data_at.add_dir(path)

Finally, the split information may be relevant for our analyses. Rather than uploading the images again, we will save the split information to a new table and join it with the EDA table we created previously.

In [None]:
# Only the join key and the assigned stage go into the W&B table.
data_split_table = wandb.Table(
    dataframe=df.loc[:, ['File_Name', 'Stage']],
)

In [None]:
# Join the EDA table with the split table on "File_Name"
# (assumes the EDA table also has a File_Name column — TODO confirm).
join_table = wandb.JoinedTable(orig_eda_table, data_split_table, "File_Name")

Let's add it to our artifact, log it and finish our run.

In [None]:
# Store the joined table in the artifact under the name "eda_table_data_split".
processed_data_at.add(join_table, "eda_table_data_split")

In [None]:
# Log the artifact as an output of this run, then close the run.
run.log_artifact(processed_data_at)
run.finish()