In [1]:
import os
import re
import glob
import shutil
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# Seed passed as random_state to both train_test_split calls below so the split is repeatable.
RANDSTATE = 42

### Looking at Initial Train/Test/Val Percentages

In [2]:
folders = ["test", "train", "val"]

# One JPEG count per split directory, in the same order as `folders`.
counts = [
    len(glob.glob("chest_xray/raw/folder/*/*.jpeg".replace("folder", split_name)))
    for split_name in folders
]

counts

[624, 5216, 16]

In [3]:
# Fraction of all images that sits in each original split folder.
split_counts = np.array(counts)
split_counts / split_counts.sum()

array([0.10655738, 0.89071038, 0.00273224])

### Creating a custom split with our own percentages

In [4]:
folder_path = "chest_xray/raw/*/*/*.jpeg"
# glob gives no ordering guarantee (it depends on the file system). Sorting
# fixes the row order, so the seeded train/val/test split further down
# assigns the same files to the same split on every machine.
files = sorted(glob.glob(folder_path))
len(files)

5856

In [5]:
# Single-column frame holding every image path found by the glob.
df = pd.DataFrame(files, columns=["paths"])
print(df.shape[0])
df.head(2)

5856


Unnamed: 0,paths
0,chest_xray/raw\test\NORMAL\IM-0001-0001.jpeg
1,chest_xray/raw\test\NORMAL\IM-0003-0001.jpeg


In [6]:
def stratifyCol(path):
    """Return the stratification label for an image path.

    The label is derived from naming patterns in the path:

    * a path component starting with ``IM-``  -> ``"IM"``
    * ``NORMAL2-IM`` anywhere in the path     -> ``"NORMAL2-IM"``
    * ``bacteria`` anywhere in the path       -> ``"bacteria"``
    * ``virus`` anywhere in the path          -> ``"virus"``

    Checks run in that order and the first match wins.

    Parameters
    ----------
    path : str
        File path using ``/``, ``\\`` or a mix of both as separators.

    Returns
    -------
    str or None
        The label, or ``None`` when no pattern matches.
    """
    # Accept either separator so the result does not depend on the OS the
    # notebook runs on (os.sep would only recognise the native one).
    if re.search(r"[\\/]IM-", path):
        return "IM"
    for label in ("NORMAL2-IM", "bacteria", "virus"):
        if label in path:
            return label
    return None

In [7]:
df["stratify"] = df["paths"].apply(stratifyCol)

# stratifyCol yields None for paths it does not recognise. Stop here with a
# clear message instead of letting the stratified split fail later on
# missing labels.
unlabelled = df.loc[df["stratify"].isna(), "paths"]
if not unlabelled.empty:
    raise ValueError(
        f"{len(unlabelled)} file(s) have no stratification label, "
        f"e.g. {unlabelled.iloc[0]!r}"
    )

In [8]:
# Show the frame together with its "stratify" column.
df

Unnamed: 0,paths,stratify
0,chest_xray/raw\test\NORMAL\IM-0001-0001.jpeg,IM
1,chest_xray/raw\test\NORMAL\IM-0003-0001.jpeg,IM
2,chest_xray/raw\test\NORMAL\IM-0005-0001.jpeg,IM
3,chest_xray/raw\test\NORMAL\IM-0006-0001.jpeg,IM
4,chest_xray/raw\test\NORMAL\IM-0007-0001.jpeg,IM
...,...,...
5851,chest_xray/raw\val\PNEUMONIA\person1949_bacter...,bacteria
5852,chest_xray/raw\val\PNEUMONIA\person1950_bacter...,bacteria
5853,chest_xray/raw\val\PNEUMONIA\person1951_bacter...,bacteria
5854,chest_xray/raw\val\PNEUMONIA\person1952_bacter...,bacteria


In [9]:
# First cut: 70 % train, 30 % held out. Second cut: the held-out part is
# halved, giving 15 % validation and 15 % test of the full dataset. Both cuts
# are stratified on the "stratify" label and seeded with RANDSTATE.
train_df, temp_df = train_test_split(
    df,
    test_size=0.30,
    stratify=df["stratify"],
    random_state=RANDSTATE,
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.50,
    stratify=temp_df["stratify"],
    random_state=RANDSTATE,
)

# .assign() builds new frames, so no column is written into a frame derived
# from `df` (direct assignment there can raise SettingWithCopyWarning).
train_df = train_df.assign(split="train")
test_df = test_df.assign(split="test")
val_df = val_df.assign(split="val")

# Keep the original index and restore the original row order.
combined_df = pd.concat([train_df, val_df, test_df], ignore_index=False).sort_index()

In [10]:
def update_path(path, new_split):
    """Map a path under ``raw/<split>/`` to ``processed/<new_split>/``.

    Parameters
    ----------
    path : str
        Source path containing ``raw`` followed by a ``train``, ``val`` or
        ``test`` directory. Either ``/`` or ``\\`` may separate them.
    new_split : str
        Name of the split directory to use in the returned path.

    Returns
    -------
    str
        ``path`` with the first ``raw<sep><split>`` segment replaced by
        ``processed<sep><new_split>``; the separator that was found is kept.
        If no such segment exists, every ``raw`` substring is replaced by
        ``processed`` and the split directory is left as it was.
    """
    # Match both separators: a backslash-only pattern never matches on POSIX,
    # which would leave every file under its original split directory.
    split_dir = re.compile(r"raw([\\/])(?:train|val|test)(?=[\\/]|$)")
    # A callable replacement keeps backslashes in the separator or in
    # new_split from being interpreted as regex escapes.
    newPath, replaced = split_dir.subn(
        lambda m: f"processed{m.group(1)}{new_split}", path, count=1
    )
    if not replaced:
        newPath = path.replace("raw", "processed")
    return newPath

def _relocate(row):
    """Destination path for one row, built from its source path and assigned split."""
    return update_path(row["paths"], row["split"])


combined_df["newPath"] = combined_df.apply(_relocate, axis=1)
combined_df.head(4)

Unnamed: 0,paths,stratify,split,newPath
0,chest_xray/raw\test\NORMAL\IM-0001-0001.jpeg,IM,train,chest_xray/processed\train\NORMAL\IM-0001-0001...
1,chest_xray/raw\test\NORMAL\IM-0003-0001.jpeg,IM,val,chest_xray/processed\val\NORMAL\IM-0003-0001.jpeg
2,chest_xray/raw\test\NORMAL\IM-0005-0001.jpeg,IM,train,chest_xray/processed\train\NORMAL\IM-0005-0001...
3,chest_xray/raw\test\NORMAL\IM-0006-0001.jpeg,IM,train,chest_xray/processed\train\NORMAL\IM-0006-0001...


In [11]:
processed_root = "chest_xray/processed"

# Normalize path separators for the OS once, up front, so the collision check
# and the copy loop work on the same strings.
copy_plan = [
    (os.path.normpath(src), os.path.normpath(dst))
    for src, dst in zip(combined_df["paths"], combined_df["newPath"])
]

# Files from three source split folders are merged into new split folders.
# Two sources with the same class and file name would land on one destination
# and the later copy would silently overwrite the earlier one. Check before
# anything is deleted.
seen_destinations = set()
for src, dst in copy_plan:
    if dst in seen_destinations:
        raise ValueError(f"More than one source file maps to {dst!r}")
    seen_destinations.add(dst)

# If the folder exists, remove it completely
if os.path.exists(processed_root):
    shutil.rmtree(processed_root)

# Recreate the empty root folder
os.makedirs(processed_root, exist_ok=True)

for src, dst in copy_plan:
    # Make sure destination directory exists
    os.makedirs(os.path.dirname(dst), exist_ok=True)

    # Copy the file
    shutil.copy2(src, dst)

In [12]:
# Keep only the destination path and the assigned split, exposing the
# destination under the column name "path".
saveDf = (
    combined_df[["newPath", "split"]]
    .rename(columns={"newPath": "path"})
)

In [13]:
# Write the path/split table to fileSplit.csv without the DataFrame index.
saveDf.to_csv("fileSplit.csv", index=False)