This notebook prepares a train and test dataframe from the following sources:
- `DR_HAGIS_GLAUCOMA`
- `HRF_GLAUCOMA`
- `kaggle_ds/1_normal` and `kaggle_ds/2_glaucoma`
- `ORIGA_GLAUCOMA`
- `ORIGA_NORMAL`

In [1]:
import os

import pandas as pd
import matplotlib.pyplot as plt

In [4]:
glau_folders = [x for x in os.listdir() if x.endswith('GLAUCOMA')]
glau_folders.append(os.path.join('kaggle_ds', '2_glaucoma'))

print(glau_folders)

['HRF_GLAUCOMA', 'ORIGA_GLAUCOMA', 'DR_HAGIS_GLAUCOMA', 'kaggle_ds/2_glaucoma']


In [7]:
glau_df = pd.DataFrame(columns=['filename', 'label', 'path'])

for folder in glau_folders:
    filenames = os.listdir(folder)
    paths = list(map(lambda x: os.path.join(folder, x), filenames))

    df = pd.DataFrame({'filename': filenames, 'label': 'glaucoma', 'path': paths}, columns=['filename', 'label', 'path'])
    glau_df = glau_df.append(df)

glau_df

Unnamed: 0,filename,label,path
0,07_g.jpg,glaucoma,HRF_GLAUCOMA/07_g.jpg
1,05_g.jpg,glaucoma,HRF_GLAUCOMA/05_g.jpg
2,14_g.jpg,glaucoma,HRF_GLAUCOMA/14_g.jpg
3,01_g.jpg,glaucoma,HRF_GLAUCOMA/01_g.jpg
4,06_g.jpg,glaucoma,HRF_GLAUCOMA/06_g.jpg
...,...,...,...
96,Glaucoma_044.png,glaucoma,kaggle_ds/2_glaucoma/Glaucoma_044.png
97,Glaucoma_060.png,glaucoma,kaggle_ds/2_glaucoma/Glaucoma_060.png
98,Glaucoma_064.png,glaucoma,kaggle_ds/2_glaucoma/Glaucoma_064.png
99,Glaucoma_058.png,glaucoma,kaggle_ds/2_glaucoma/Glaucoma_058.png


In [8]:
normal_folders = [x for x in os.listdir() if x.endswith('NORMAL')]
normal_folders.append(os.path.join('kaggle_ds', '1_normal'))

print(normal_folders)

['ORIGA_NORMAL', 'kaggle_ds/1_normal']


In [9]:
normal_df = pd.DataFrame(columns=['filename', 'label', 'path'])

for folder in normal_folders:
    filenames = os.listdir(folder)
    paths = list(map(lambda x: os.path.join(folder, x), filenames))

    df = pd.DataFrame({'filename': filenames, 'label': 'normal', 'path': paths}, columns=['filename', 'label', 'path'])
    normal_df = normal_df.append(df)

normal_df

Unnamed: 0,filename,label,path
0,Im0176_ORIGA.jpg,normal,ORIGA_NORMAL/Im0176_ORIGA.jpg
1,Im0277_ORIGA.jpg,normal,ORIGA_NORMAL/Im0277_ORIGA.jpg
2,Im0166_ORIGA.jpg,normal,ORIGA_NORMAL/Im0166_ORIGA.jpg
3,Im0156_ORIGA.jpg,normal,ORIGA_NORMAL/Im0156_ORIGA.jpg
4,Im0439_ORIGA.jpg,normal,ORIGA_NORMAL/Im0439_ORIGA.jpg
...,...,...,...
295,NL_181.png,normal,kaggle_ds/1_normal/NL_181.png
296,NL_154.png,normal,kaggle_ds/1_normal/NL_154.png
297,NL_002.png,normal,kaggle_ds/1_normal/NL_002.png
298,NL_080.png,normal,kaggle_ds/1_normal/NL_080.png


In [10]:
df = pd.concat([glau_df, normal_df], ignore_index=True)
df

Unnamed: 0,filename,label,path
0,07_g.jpg,glaucoma,HRF_GLAUCOMA/07_g.jpg
1,05_g.jpg,glaucoma,HRF_GLAUCOMA/05_g.jpg
2,14_g.jpg,glaucoma,HRF_GLAUCOMA/14_g.jpg
3,01_g.jpg,glaucoma,HRF_GLAUCOMA/01_g.jpg
4,06_g.jpg,glaucoma,HRF_GLAUCOMA/06_g.jpg
...,...,...,...
1070,NL_181.png,normal,kaggle_ds/1_normal/NL_181.png
1071,NL_154.png,normal,kaggle_ds/1_normal/NL_154.png
1072,NL_002.png,normal,kaggle_ds/1_normal/NL_002.png
1073,NL_080.png,normal,kaggle_ds/1_normal/NL_080.png


In [11]:
df.groupby('label').count()

Unnamed: 0_level_0,filename,path
label,Unnamed: 1_level_1,Unnamed: 2_level_1
glaucoma,293,293
normal,782,782


In [13]:
df['label_encoded'] = (df['label'] == 'glaucoma').astype(int)
df

Unnamed: 0,filename,label,path,label_encoded
0,07_g.jpg,glaucoma,HRF_GLAUCOMA/07_g.jpg,1
1,05_g.jpg,glaucoma,HRF_GLAUCOMA/05_g.jpg,1
2,14_g.jpg,glaucoma,HRF_GLAUCOMA/14_g.jpg,1
3,01_g.jpg,glaucoma,HRF_GLAUCOMA/01_g.jpg,1
4,06_g.jpg,glaucoma,HRF_GLAUCOMA/06_g.jpg,1
...,...,...,...,...
1070,NL_181.png,normal,kaggle_ds/1_normal/NL_181.png,0
1071,NL_154.png,normal,kaggle_ds/1_normal/NL_154.png,0
1072,NL_002.png,normal,kaggle_ds/1_normal/NL_002.png,0
1073,NL_080.png,normal,kaggle_ds/1_normal/NL_080.png,0


In [14]:
TEST_SIZE = 50
test_df = pd.DataFrame(columns=df.columns)

for i in range(2):
    temp_df = df[df['label_encoded'] == i]
    temp_df = temp_df.sample(TEST_SIZE)

    test_df = test_df.append(temp_df, ignore_index=True)

test_df

Unnamed: 0,filename,label,path,label_encoded
0,NL_299.png,normal,kaggle_ds/1_normal/NL_299.png,0
1,Im0016_ORIGA.jpg,normal,ORIGA_NORMAL/Im0016_ORIGA.jpg,0
2,NL_248.png,normal,kaggle_ds/1_normal/NL_248.png,0
3,NL_131.png,normal,kaggle_ds/1_normal/NL_131.png,0
4,NL_161.png,normal,kaggle_ds/1_normal/NL_161.png,0
...,...,...,...,...
95,Im0521_g_ORIGA.jpg,glaucoma,ORIGA_GLAUCOMA/Im0521_g_ORIGA.jpg,1
96,11_g.jpg,glaucoma,HRF_GLAUCOMA/11_g.jpg,1
97,Glaucoma_087.png,glaucoma,kaggle_ds/2_glaucoma/Glaucoma_087.png,1
98,Glaucoma_050.png,glaucoma,kaggle_ds/2_glaucoma/Glaucoma_050.png,1


In [15]:
train_df = df[~df['path'].isin(test_df['path'])]
train_df

Unnamed: 0,filename,label,path,label_encoded
1,05_g.jpg,glaucoma,HRF_GLAUCOMA/05_g.jpg,1
2,14_g.jpg,glaucoma,HRF_GLAUCOMA/14_g.jpg,1
3,01_g.jpg,glaucoma,HRF_GLAUCOMA/01_g.jpg,1
4,06_g.jpg,glaucoma,HRF_GLAUCOMA/06_g.jpg,1
5,04_g.jpg,glaucoma,HRF_GLAUCOMA/04_g.jpg,1
...,...,...,...,...
1069,NL_220.png,normal,kaggle_ds/1_normal/NL_220.png,0
1071,NL_154.png,normal,kaggle_ds/1_normal/NL_154.png,0
1072,NL_002.png,normal,kaggle_ds/1_normal/NL_002.png,0
1073,NL_080.png,normal,kaggle_ds/1_normal/NL_080.png,0


In [16]:
test_df.to_csv('glaucoma_test.csv', index=False)
train_df.to_csv('glaucoma_train.csv', index=False)