In [1]:
import os
import pandas as pd
from src import _PATH_DATA

In [2]:
# Build an index of every file in the bugnist_256 dataset and store it as a
# one-column CSV ("files") whose entries look like "<folder>/<file name>".
DATASET_PATH = os.path.join(_PATH_DATA, "bugnist_256")

# os.listdir() returns entries in arbitrary, platform-dependent order. Any
# seeded sampling done on files.csv picks rows by position, so the listing is
# sorted to make the row order (and therefore the split) reproducible.
folders = sorted(
    name
    for name in os.listdir(DATASET_PATH)
    if os.path.isdir(os.path.join(DATASET_PATH, name))
)
files = [
    f"{folder}/{file}"
    for folder in folders
    for file in sorted(os.listdir(os.path.join(DATASET_PATH, folder)))
]

# Entries always use "/" as separator, independent of the host OS. The file is
# written as UTF-8 explicitly because that is what pd.read_csv assumes by
# default when the index is read back.
with open(os.path.join(DATASET_PATH, "files.csv"), "w", encoding="utf-8") as f:
    f.write("files\n")
    f.write("\n".join(files))

In [3]:
# Split the bugnist_256 file index into train / validation / test CSVs,
# stratified by class (the first two characters of each "files" entry).
CSV_PATH = "../data/bugnist_256/files.csv"
VALIDATION_PATH = "../data/bugnist_256/validation.csv"
TRAIN_PATH = "../data/bugnist_256/train.csv"
TEST_PATH = "../data/bugnist_256/test.csv"
# Fraction (0..1) drawn from every class: first for the test set, then again
# from the remaining rows for the validation set.
SAMPLE_PERCENTAGE = 0.20

SEED = 8963514


def _stratified_sample(frame, frac, seed):
    """Draw ``frac`` of the rows of every class in ``frame``.

    The class of a row is the first two characters of its ``files`` value.

    Args:
        frame: DataFrame with a string ``files`` column.
        frac: Fraction of each class to sample.
        seed: ``random_state`` passed to the grouped sampling.

    Returns:
        Tuple ``(sampled, remainder)``: the sampled rows, and the rows of
        ``frame`` whose index labels were not sampled (in sorted index order).
    """
    labels = frame["files"].map(lambda path: path[0:2])
    sampled = frame.groupby(labels.values).sample(frac=frac, random_state=seed)
    remainder = frame.loc[frame.index.difference(sampled.index)]
    return sampled, remainder


# Collect every configuration problem so they are all reported at once.
# Existing outputs count as problems to avoid accidentally overwriting data.
problems = [
    f"{path} already exists."
    for path in (VALIDATION_PATH, TRAIN_PATH, TEST_PATH)
    if os.path.exists(path)
]
if not os.path.exists(CSV_PATH):
    problems.append(f"CSV path {CSV_PATH} does not exist.")
if SAMPLE_PERCENTAGE < 0 or SAMPLE_PERCENTAGE > 1:
    problems.append(f"Sample percentage {SAMPLE_PERCENTAGE} is not between 0 and 1.")
if problems:
    raise ValueError(" ".join(problems))

original_csv = pd.read_csv(CSV_PATH)

test_subset, original_without_test_subset = _stratified_sample(
    original_csv, SAMPLE_PERCENTAGE, SEED
)
val_subset, train_subset = _stratified_sample(
    original_without_test_subset, SAMPLE_PERCENTAGE, SEED
)

# The three subsets must partition the original index exactly.
assert test_subset.index.intersection(val_subset.index).empty
assert test_subset.index.intersection(train_subset.index).empty
assert val_subset.index.intersection(train_subset.index).empty
assert len(test_subset) + len(val_subset) + len(train_subset) == len(original_csv)

# Write only after every split has been computed and checked: a failure above
# must not leave a partial set of files behind, because the overwrite guard
# would then refuse to run again until they are deleted by hand.
test_subset.to_csv(TEST_PATH, index=False)
val_subset.to_csv(VALIDATION_PATH, index=False)
train_subset.to_csv(TRAIN_PATH, index=False)

In [1]:
import torch
from glob import glob
from src import _PATH_DATA
import numpy as np
import pandas as pd

In [2]:
# Split the noisy mix volumes into train / validation / test index CSVs.
# Each row pairs an image path with its label path; both are relative to
# _PATH_DATA because the glob is rooted there.
img_paths = sorted(glob("noisy_mixes_256/mix*.tif", root_dir=_PATH_DATA))
label_paths = sorted(glob("noisy_mixes_256/label*.tif", root_dir=_PATH_DATA))

NUM_TRAIN = 7000
NUM_VALI = 1500
num_samples = len(img_paths)

# Images and labels are paired purely by their position in the sorted
# listings, so a count mismatch would silently pair the wrong files.
if len(label_paths) != num_samples:
    raise ValueError(
        f"Found {num_samples} mix files but {len(label_paths)} label files."
    )
if num_samples < NUM_TRAIN + NUM_VALI:
    raise ValueError(
        f"Need at least {NUM_TRAIN + NUM_VALI} samples for the train and "
        f"validation sets, found {num_samples}."
    )

# The legacy global RNG and the order of the two draws are kept on purpose:
# for a 10000-file dataset this reproduces the previously generated split.
np.random.seed(42)
train_set = np.random.choice(num_samples, size=NUM_TRAIN, replace=False)
remaining = np.delete(np.arange(num_samples), train_set)
vali_set = np.random.choice(remaining, size=NUM_VALI, replace=False)
# Everything that is neither train nor validation, in ascending index order.
test_set = np.setdiff1d(remaining, vali_set)

img_array = np.array(img_paths)
label_array = np.array(label_paths)

df_train = pd.DataFrame(
    data={"image_path": img_array[train_set], "label_path": label_array[train_set]}
)
df_vali = pd.DataFrame(
    data={"image_path": img_array[vali_set], "label_path": label_array[vali_set]}
)
df_test = pd.DataFrame(
    data={"image_path": img_array[test_set], "label_path": label_array[test_set]}
)

df_train.to_csv(_PATH_DATA+"/train_noisy_mix.csv", index=False)
df_vali.to_csv(_PATH_DATA+"/validation_noisy_mix.csv", index=False)
df_test.to_csv(_PATH_DATA+"/test_noisy_mix.csv", index=False)

In [None]:
import pandas as pd
from src import _PATH_DATA

In [None]:
# Build the combined training index from the noisy mix index and the two
# bugnist indices. The bugnist CSVs only have a "files" column, so it is
# renamed to "image_path" to line up with the noisy mix CSV.
df1 = pd.read_csv(_PATH_DATA+"/train_noisy_mix.csv")
df2 = pd.read_csv(_PATH_DATA+"/bugnist_512/train.csv").rename(columns={"files":"image_path"})
df3 = pd.read_csv(_PATH_DATA+"/bugnist_256/train.csv").rename(columns={"files":"image_path"})

# Prefix every path with its dataset folder and fill label_path with the
# placeholder "none". This is done as whole-column operations: the previous
# per-row `df.image_path[i] = ...` is chained assignment, which pandas warns
# about and which does not modify the frame when Copy-on-Write is enabled.
df2 = df2.assign(image_path="bugnist_512/" + df2["image_path"], label_path="none")
df3 = df3.assign(image_path="bugnist_256/" + df3["image_path"], label_path="none")

df = pd.concat([df1,df2,df3],ignore_index=True)
# NOTE(review): the inputs are read from _PATH_DATA but the output goes to a
# path relative to the working directory - confirm both point at the same place.
df.to_csv("../data/train.csv",index=False)

In [None]:
# Build the combined validation index from the noisy mix index and the two
# bugnist indices. The bugnist CSVs only have a "files" column, so it is
# renamed to "image_path" to line up with the noisy mix CSV.
df1 = pd.read_csv(_PATH_DATA+"/validation_noisy_mix.csv")
df2 = pd.read_csv(_PATH_DATA+"/bugnist_512/validation.csv").rename(columns={"files":"image_path"})
df3 = pd.read_csv(_PATH_DATA+"/bugnist_256/validation.csv").rename(columns={"files":"image_path"})

# Prefix every path with its dataset folder and fill label_path with the
# placeholder "none". This is done as whole-column operations: the previous
# per-row `df.image_path[i] = ...` is chained assignment, which pandas warns
# about and which does not modify the frame when Copy-on-Write is enabled.
df2 = df2.assign(image_path="bugnist_512/" + df2["image_path"], label_path="none")
df3 = df3.assign(image_path="bugnist_256/" + df3["image_path"], label_path="none")

df = pd.concat([df1,df2,df3],ignore_index=True)
# NOTE(review): the inputs are read from _PATH_DATA but the output goes to a
# path relative to the working directory - confirm both point at the same place.
df.to_csv("../data/validation.csv",index=False)

In [None]:
# Build the combined test index from the noisy mix index and the two
# bugnist indices. The bugnist CSVs only have a "files" column, so it is
# renamed to "image_path" to line up with the noisy mix CSV.
df1 = pd.read_csv(_PATH_DATA+"/test_noisy_mix.csv")
df2 = pd.read_csv(_PATH_DATA+"/bugnist_512/test.csv").rename(columns={"files":"image_path"})
df3 = pd.read_csv(_PATH_DATA+"/bugnist_256/test.csv").rename(columns={"files":"image_path"})

# Prefix every path with its dataset folder and fill label_path with the
# placeholder "none". This is done as whole-column operations: the previous
# per-row `df.image_path[i] = ...` is chained assignment, which pandas warns
# about and which does not modify the frame when Copy-on-Write is enabled.
df2 = df2.assign(image_path="bugnist_512/" + df2["image_path"], label_path="none")
df3 = df3.assign(image_path="bugnist_256/" + df3["image_path"], label_path="none")

df = pd.concat([df1,df2,df3],ignore_index=True)
# NOTE(review): the inputs are read from _PATH_DATA but the output goes to a
# path relative to the working directory - confirm both point at the same place.
df.to_csv("../data/test.csv",index=False)