In [None]:
import pandas as pd
import numpy as np
import os

from glob import glob
from shutil import copyfile
from tqdm import tqdm

from code_base.constants import CLASSES

# Data

## V1

In [None]:
# Read the three V1 pseudo-label tables in one pass (spleen, lungs, corrected lungs).
df_spleen, df_lungs, df_corrected_lungs = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/hpa/hpa_add_spleen/ensem_082.csv",
        "data/hpa/hpa_add_lungs/ensem_082.csv",
        "data/hpa/hpa_corrected_lungs/ensem_082.csv",
    )
)

# Table whose `duplicates` column is used below to drop already-known images.
dups = pd.read_csv("data/hpa/train_with_lungs_prostate_spleen_dups.csv")

In [None]:
# Base names (extension stripped) of every file listed in the `duplicates`
# column; rows with no duplicate recorded are skipped.
dup_ids = [os.path.splitext(el)[0] for el in dups["duplicates"].dropna()]

# Cast ids to str BEFORE the membership test. `dup_ids` holds strings, so if
# read_csv parsed `id` as a numeric column, `isin` would silently match nothing
# and no duplicate would ever be removed.
df_spleen["id"] = df_spleen["id"].astype(str)
df_lungs["id"] = df_lungs["id"].astype(str)

df_spleen = df_spleen[~df_spleen["id"].isin(dup_ids)].reset_index(drop=True)
df_lungs = df_lungs[~df_lungs["id"].isin(dup_ids)].reset_index(drop=True)

# NOTE(review): df_corrected_lungs is not de-duplicated here, although the V3
# section does filter it against dup_ids -- confirm this is intentional.

# Turn ids into image file names (extension depends on the source folder).
df_spleen["id"] = df_spleen["id"] + ".jpg"
df_lungs["id"] = df_lungs["id"] + ".jpg"
df_corrected_lungs["id"] = df_corrected_lungs["id"].astype(str) + ".tiff"

In [None]:
# Collect the image files of all three V1 sources, in folder order.
all_pseudo_images_pathes = [
    image_path
    for pattern in (
        "data/hpa/hpa_add_spleen/spleen_images/*.jpg",
        "data/hpa/hpa_add_lungs/lung_images/*.jpg",
        "data/hpa/hpa_corrected_lungs/images/*.tiff",
    )
    for image_path in glob(pattern)
]
# Base names must be unique: the files are later copied into one flat folder.
assert len({os.path.basename(el) for el in all_pseudo_images_pathes}) == len(all_pseudo_images_pathes)
print(f"{len(all_pseudo_images_pathes)} images found in the folders")

In [None]:
# Copy every collected image into a single flat folder, keeping its base name.
new_root = "data/hpa/hpa_add/images"
# exist_ok=True so that re-running this cell does not fail with FileExistsError
# once the folder has been created.
os.makedirs(new_root, exist_ok=True)
for path in tqdm(all_pseudo_images_pathes):
    copyfile(
        path,
        os.path.join(new_root, os.path.basename(path))
    )

In [None]:
# Keep only the rows that have a non-null `rle` value.
df_spleen = df_spleen[df_spleen["rle"].notna()].reset_index(drop=True)
df_lungs = df_lungs[df_lungs["rle"].notna()].reset_index(drop=True)
df_corrected_lungs = df_corrected_lungs[df_corrected_lungs["rle"].notna()].reset_index(drop=True)

# Stack the three sources into one table with a fresh 0..n-1 index.
pseudo_df = pd.concat(
    [df_spleen, df_lungs, df_corrected_lungs],
    ignore_index=True,
)

In [None]:
# Display the combined V1 pseudo-label table.
pseudo_df

In [None]:
# Display the combined V1 table (this cell only shows `pseudo_df`; it changes nothing).
pseudo_df

In [None]:
# Write the V1 table to disk; index=False keeps the row index out of the CSV.
pseudo_df.to_csv("data/hpa/hpa_add/v1.csv", index=False)

## V2

In [None]:
# Reload the saved V1 table; the V2 frames are appended to it further down.
pseudo_df = pd.read_csv("data/hpa/hpa_add/v1.csv")
pseudo_df

In [None]:
# Display the CLASSES constant imported from code_base.constants.
CLASSES

In [None]:
# V2 pseudo-label tables: large intestine (colon folder), kidney and prostate.
df_largeintestine = pd.read_csv("data/hpa/hpa_add_colon/ensem_082.csv")
df_kidney = pd.read_csv("data/hpa/hpa_add_kidney/ensem_082.csv")
df_prostate = pd.read_csv("data/hpa/hpa_add_prostate/ensem_082.csv")

# Table whose `duplicates` column is used below to drop already-known images.
# Read from data/hpa/ like the other duplicate tables in this notebook (the V3
# section loads this same file from there); the bare file name only resolved
# when the working directory happened to contain a copy.
dups = pd.read_csv("data/hpa/train_with_lungs_prostate_spleen_colon_kidney_dups.csv")

In [None]:
# Extension-less names of all files recorded in the `duplicates` column.
dup_ids = [
    os.path.splitext(file_name)[0]
    for file_name in dups["duplicates"].dropna().tolist()
]

In [None]:
# Cast ids to str BEFORE the membership test. `dup_ids` holds strings, so if
# read_csv parsed `id` as a numeric column, `isin` would silently match nothing
# and no duplicate would ever be removed.
df_largeintestine["id"] = df_largeintestine["id"].astype(str)
df_kidney["id"] = df_kidney["id"].astype(str)
df_prostate["id"] = df_prostate["id"].astype(str)

# For each organ: drop ids listed as duplicates, then rows whose `rle` is
# missing or an empty string.
df_largeintestine = df_largeintestine[~df_largeintestine["id"].isin(dup_ids)].reset_index(drop=True)
df_largeintestine = df_largeintestine[~(df_largeintestine.rle.isna() | (df_largeintestine.rle == ""))].reset_index(drop=True)

df_kidney = df_kidney[~df_kidney["id"].isin(dup_ids)].reset_index(drop=True)
df_kidney = df_kidney[~(df_kidney.rle.isna() | (df_kidney.rle == ""))].reset_index(drop=True)

df_prostate = df_prostate[~df_prostate["id"].isin(dup_ids)].reset_index(drop=True)
df_prostate = df_prostate[~(df_prostate.rle.isna() | (df_prostate.rle == ""))].reset_index(drop=True)

# Turn ids into image file names.
df_largeintestine["id"] = df_largeintestine["id"] + ".jpg"
df_kidney["id"] = df_kidney["id"] + ".jpg"
df_prostate["id"] = df_prostate["id"] + ".jpg"

In [None]:
# Collect the image files of the three V2 sources, in folder order.
all_pseudo_images_pathes = [
    image_path
    for pattern in (
        "data/hpa/hpa_add_colon/colon_images/*.jpg",
        "data/hpa/hpa_add_kidney/kidney_images/*.jpg",
        "data/hpa/hpa_add_prostate/prostate_hpa/prostate_images/*.jpg",
    )
    for image_path in glob(pattern)
]
# Base names must be unique: the files are later copied into one flat folder.
assert len({os.path.basename(el) for el in all_pseudo_images_pathes}) == len(all_pseudo_images_pathes)
print(f"{len(all_pseudo_images_pathes)} images found in the folders")

In [None]:
# Copy every collected V2 image into a single flat folder, keeping its base name.
new_root = "data/hpa/hpa_add/v2/"
# copyfile does not create missing directories, so make sure the target exists;
# exist_ok=True keeps the cell safe to re-run.
os.makedirs(new_root, exist_ok=True)
for path in tqdm(all_pseudo_images_pathes):
    copyfile(
        path,
        os.path.join(new_root, os.path.basename(path))
    )

In [None]:
# Append the V2 organs to the V1 table and renumber the rows from 0.
# Caution: `pseudo_df` is both input and output, so running this cell twice
# appends the V2 frames a second time.
pseudo_df = pd.concat(
    [pseudo_df, df_largeintestine, df_kidney, df_prostate],
    ignore_index=True,
)

In [None]:
# Write the V2 table to disk; index=False keeps the row index out of the CSV.
pseudo_df.to_csv("data/hpa/hpa_add/v2.csv", index=False)

## V3 (and V3 full)

In [None]:
# Read all six V3 pseudo-label tables in one pass; the order of the names on
# the left matches the order of the CSV paths on the right.
(
    df_largeintestine,
    df_kidney,
    df_prostate,
    df_spleen,
    df_lungs,
    df_corrected_lungs,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/hpa/hpa_add_colon/ensem_083_full.csv",
        "data/hpa/hpa_add_kidney/ensem_083.csv",
        "data/hpa/hpa_add_prostate/ensem_083.csv",
        "data/hpa/hpa_add_spleen/ensem_083.csv",
        "data/hpa/hpa_add_lungs/ensem_083.csv",
        "data/hpa/hpa_corrected_lungs/ensem_083.csv",
    )
)

# Extension-less names of all files recorded in the `duplicates` column.
dups = pd.read_csv("data/hpa/train_with_lungs_prostate_spleen_colon_kidney_dups.csv")
dup_ids = [
    os.path.splitext(file_name)[0]
    for file_name in dups["duplicates"].dropna().tolist()
]

In [None]:
def _clean_pseudo_labels(frame, dup_ids, extension):
    """Drop duplicate and rle-less rows, then turn ids into file names.

    Args:
        frame: DataFrame with at least the columns ``id`` and ``rle``.
        dup_ids: collection of id strings to exclude.
        extension: suffix appended to every kept id (e.g. ``".jpg"``).

    Returns:
        A new DataFrame with a fresh 0..n-1 index; ``frame`` is not modified.
    """
    # Compare as strings: `dup_ids` holds strings, so a numeric `id` column
    # would never match in `isin` and no duplicate would be removed.
    ids = frame["id"].astype(str)
    keep = ~ids.isin(dup_ids) & frame["rle"].notna() & (frame["rle"] != "")
    return frame.loc[keep].assign(id=ids[keep] + extension).reset_index(drop=True)


df_largeintestine = _clean_pseudo_labels(df_largeintestine, dup_ids, ".jpg")
df_kidney = _clean_pseudo_labels(df_kidney, dup_ids, ".jpg")
df_prostate = _clean_pseudo_labels(df_prostate, dup_ids, ".jpg")
df_spleen = _clean_pseudo_labels(df_spleen, dup_ids, ".jpg")
df_lungs = _clean_pseudo_labels(df_lungs, dup_ids, ".jpg")
# The corrected-lungs images are stored as .tiff rather than .jpg.
df_corrected_lungs = _clean_pseudo_labels(df_corrected_lungs, dup_ids, ".tiff")

In [None]:
# Stack all six organ tables into one frame with a fresh 0..n-1 index.
pseudo_df = pd.concat(
    [df_largeintestine, df_kidney, df_prostate, df_spleen, df_lungs, df_corrected_lungs],
    ignore_index=True,
)

In [None]:
# (rows, columns) of the combined V3 table.
pseudo_df.shape

In [None]:
# Write the V3 table to disk; index=False keeps the row index out of the CSV.
pseudo_df.to_csv("data/hpa/hpa_add/v3_full.csv", index=False)