In [1]:
import os, warnings
import wandb

import pandas as pd
from fastai.vision.all import *
from sklearn.model_selection import StratifiedKFold

import params
warnings.filterwarnings('ignore')

In [2]:
# Open a W&B run for the data-splitting step; the project name is read from params
run = wandb.init(
    project=params.WANDB_PROJECT,
    job_type="data_split",
)

[34m[1mwandb[0m: Currently logged in as: [33msolab5[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
# Declare the raw-data artifact as an input of this run, then download its files.
# NOTE: ":latest" resolves to whichever artifact version is newest when the cell runs.
raw_data_at = run.use_artifact(f'{params.RAW_DATA_AT}:latest')
local_dir = raw_data_at.download()
path = Path(local_dir)

[34m[1mwandb[0m: Downloading large artifact my_images:latest, 2268.98MB. 36307 files... 
[34m[1mwandb[0m:   36307 of 36307 files downloaded.  
Done. 0:0:43.1


In [4]:
# List the top-level contents of the downloaded artifact directory
path.ls()

(#12) [Path('artifacts/my_images:v2/Tomato_Yellow_Leaf_Curl_Virus'),Path('artifacts/my_images:v2/healthy'),Path('artifacts/my_images:v2/Late_blight'),Path('artifacts/my_images:v2/Septoria_leaf_spot'),Path('artifacts/my_images:v2/Leaf_Mold'),Path('artifacts/my_images:v2/Spider_mites'),Path('artifacts/my_images:v2/Tomato_mosaic_virus'),Path('artifacts/my_images:v2/eda_table3.table.json'),Path('artifacts/my_images:v2/Early_blight'),Path('artifacts/my_images:v2/Bacterial_spot')...]

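To split the data into training, validation and test sets, we need the file names. We collect them from the downloaded artifact directory below; the EDA table we saved previously also has a `File_Name` column, and we retrieve that table afterwards so the split can be joined to it.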

In [5]:
# Collect the image files found under the downloaded artifact directory
fnames = get_image_files(path)

In [7]:
# Filter out images stored under the artifact's "media" folder
# (presumably copies logged together with the EDA table - TODO confirm).
# Match on whole path components below the artifact root rather than on a substring
# of the full path: a substring test would also discard any image whose name merely
# contains "media" (for example "intermediate.jpg").
fnames = [f for f in fnames if "media" not in f.relative_to(path).parts]

In [10]:
# Number of image files left after filtering
len(fnames)

18160

In [11]:
# Fetch the "eda_table3" entry from the raw-data artifact; it is joined with the split table further down
orig_eda_table = raw_data_at.get("eda_table3")

[34m[1mwandb[0m: Downloading large artifact my_images:latest, 2268.98MB. 36307 files... 
[34m[1mwandb[0m:   36307 of 36307 files downloaded.  
Done. 0:0:8.0


In [12]:
# Class names as declared in the project configuration (params.BDD_CLASSES);
# they are read from params, not discovered from the folder listing.
labels = params.BDD_CLASSES
labels

{0: ['Bacterial_spot',
  'Early_blight',
  'Late_blight',
  'Leaf_Mold',
  'Septoria_leaf_spot',
  'Spider_mites',
  'Target_Spot',
  'Tomato_Yellow_Leaf_Curl_Virus',
  'Tomato_mosaic_virus',
  'healthy']}

In [18]:
# Build one row per image: File_Name is "<class folder>/<image file>" and Label is the class folder.
# Both are taken from the Path object itself instead of splitting a string on '/',
# so the result does not depend on the operating system's path separator.
df = pd.DataFrame({
    'File_Name': [f.relative_to(f.parent.parent).as_posix() for f in fnames],
    'Label': [f.parent.name for f in fnames],
})

# File_Name is used as the join key with the EDA table later on, so report any duplicates
num_duplicates = int(df['File_Name'].duplicated().sum())
if num_duplicates > 0:
    print(f"Warning: {num_duplicates} duplicate file names found in the DataFrame.")

# Print the value counts for each label
print(df['Label'].value_counts())

Tomato_Yellow_Leaf_Curl_Virus    5357
Bacterial_spot                   2127
Late_blight                      1909
Septoria_leaf_spot               1771
Spider_mites                     1676
healthy                          1591
Target_Spot                      1404
Early_blight                     1000
Leaf_Mold                         952
Tomato_mosaic_virus               373
Name: Label, dtype: int64


In [21]:
# Reset the index of the DataFrame to a clean 0..n-1 range (the old index is discarded),
# so that row positions and index labels coincide
df = df.reset_index(drop=True)

In [22]:
# Stratified split by Label: 10 folds, fold 0 -> test, fold 1 -> valid, remaining 8 folds -> train.
# The fixed random_state keeps the assignment reproducible across runs.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# cv.split yields *positional* row indices, so they are written with .iloc into a separate
# Series. This stays correct whatever index df carries and avoids a temporary float column
# on df that has to be dropped afterwards.
fold = pd.Series(-1, index=df.index, dtype='int64')
for i, (train_idxs, test_idxs) in enumerate(cv.split(df['File_Name'], df['Label'])):
    fold.iloc[test_idxs] = i

# Every row must land in exactly one held-out fold
assert (fold >= 0).all(), "some rows were not assigned to a fold"

df['Stage'] = fold.map({0: 'test', 1: 'valid'}).fillna('train')

df.Stage.value_counts()

train    14528
test      1816
valid     1816
Name: Stage, dtype: int64

In [24]:
# Persist the split assignment as a CSV file (row index omitted)
df.to_csv('data_split.csv', index=False)

We will now create a new artifact and add our data there.

In [25]:
# New artifact (name 'data_split', type "split_data") that will hold the split outputs
processed_data_at = wandb.Artifact('data_split', type="split_data")

In [26]:
# Attach the split CSV written above, plus the full contents of the downloaded raw-data directory
processed_data_at.add_file('data_split.csv')
processed_data_at.add_dir(path)

[34m[1mwandb[0m: Adding directory to artifact (./artifacts/my_images:v2)... Done. 48.7s


Finally, the split information may be relevant for our analyses. Rather than uploading the images again, we will save the split information to a new table and join it with the EDA table we created previously.

In [27]:
# Only the join key (File_Name) and the split assignment (Stage) go into the table
split_columns = df[['File_Name', 'Stage']]
data_split_table = wandb.Table(dataframe=split_columns)

In [28]:
# Join the EDA table and the split table on their shared "File_Name" column
join_table = wandb.JoinedTable(orig_eda_table, data_split_table, "File_Name")

Let's add it to our artifact, log it and finish our run.

In [29]:
# Store the joined table in the artifact under the name "eda_table_data_split"
processed_data_at.add(join_table, "eda_table_data_split")

ArtifactManifestEntry(path='eda_table_data_split.joined-table.json', digest='+wvs9fM0Ux3IcToorzvzmA==', ref=None, birth_artifact_id=None, size=128, extra={}, local_path='/root/.local/share/wandb/artifacts/staging/tmp9xooq5ms')

In [30]:
# Log the artifact to the run, then close the run
run.log_artifact(processed_data_at)
run.finish()