In [8]:
import os
import pandas as pd
import numpy as np
import shutil
from PIL import Image

## Inspecting the Data

In [3]:
# Root folder of the dataset; labels.csv is read from its "CSVs" sub-folder.
ROOT_PATH = "/home/co-dutt1/rds/hpc-work/ALL_DATASETS/OXML"

labels_csv = os.path.join(ROOT_PATH, "CSVs", "labels.csv")
df = pd.read_csv(labels_csv)

# Add a readable class name next to the numeric `malignant` code.
code_to_name = {-1: "Non-Detectable", 0: "Benign", 1: "Malignant"}
df['label'] = df['malignant'].map(code_to_name)

# Show the class balance, then preview the first rows.
print(df['label'].value_counts())
df.head()

Non-Detectable    36
Benign            14
Malignant         12
Name: label, dtype: int64


Unnamed: 0,id,malignant,label
0,441,-1,Non-Detectable
1,33284,-1,Non-Detectable
2,38771,1,Malignant
3,46784,-1,Non-Detectable
4,57613,-1,Non-Detectable


In [5]:
## Build stratified train / test / val splits and save each one as a CSV

from sklearn.model_selection import train_test_split

# Path of every image: <ROOT_PATH>/images/img_<id>.png
df['img_path'] = df['id'].apply(
    lambda image_id: os.path.join(ROOT_PATH, "images", f"img_{image_id}.png")
)

# Hold out 20% of all rows for testing, then 20% of what is left for
# validation. Both splits are stratified on the label and use a fixed seed.
remainder_df, test_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['label']
)
train_df, val_df = train_test_split(
    remainder_df, test_size=0.2, random_state=42, stratify=remainder_df['label']
)

# Split sizes, in train / test / val order.
for part in (train_df, test_df, val_df):
    print(len(part))

# Renumber the rows of each split from 0.
train_df, test_df, val_df = (
    part.reset_index(drop=True) for part in (train_df, test_df, val_df)
)

# Write the splits next to labels.csv.
for split_name, part in (("train", train_df), ("test", test_df), ("val", val_df)):
    part.to_csv(os.path.join(ROOT_PATH, "CSVs", f"{split_name}.csv"), index=False)


39
13
10


In [6]:
# Check the pixel dimensions of one training image (`Image` is imported in the
# first cell). Pillow keeps the underlying file open after `Image.open`, so a
# context manager is used to release the handle once the size has been read.
with Image.open(train_df['img_path'].iloc[4]) as img:
    img_size = img.size  # Pillow reports size as (width, height)

img_size

(368, 496)

In [None]:
# Creating Folders

# One directory per split under ROOT_PATH. `exist_ok=True` makes this cell safe
# to re-run: plain `os.mkdir` raises FileExistsError once the folders exist.
train_dir = os.path.join(ROOT_PATH, 'train')
val_dir = os.path.join(ROOT_PATH, 'val')
test_dir = os.path.join(ROOT_PATH, 'test')

for split_dir in (train_dir, val_dir, test_dir):
    os.makedirs(split_dir, exist_ok=True)

In [None]:
# Copy every image into ROOT_PATH/<split>/<label>/, so that each image ends up
# in a folder named after its split and its class.
#
# The per-split frame is bound to `split_df` rather than `df`, so the full
# label table held in `df` is not overwritten by this cell.
splits = {'train': train_df, 'val': val_df, 'test': test_df}

for split, split_df in splits.items():
    print(split)

    # Create each class folder once per split instead of once per image.
    for label in split_df['label'].unique():
        os.makedirs(os.path.join(ROOT_PATH, split, label), exist_ok=True)

    # Iterate over the column values directly; this does not depend on the
    # frame having a 0..n-1 index the way `split_df['label'][i]` would.
    for label, img_path in zip(split_df['label'], split_df['img_path']):
        shutil.copy(img_path, os.path.join(ROOT_PATH, split, label))

    # One summary line per split (the files are copied, not moved).
    print(f"Copied {len(split_df)} files")