This notebook prepares a train and test dataframe from the following sources:
- `DR_HAGIS_GLAUCOMA`
- `HRF_GLAUCOMA`
- `kaggle_ds/1_normal` and `kaggle_ds/2_glaucoma`
- `ORIGA_GLAUCOMA`
- `ORIGA_NORMAL`

In [1]:
import os

import pandas as pd
import matplotlib.pyplot as plt

In [31]:
drishti = pd.read_csv('Drishti.csv')
drishti

Unnamed: 0,Drishti-GS File,Patient ID,Marking 1,Marking 2,Marking 3,Marking 4,Additional Expert,Total,Unnamed: 8,Unnamed: 9
0,drishtiGS_001',1077987.0,-1.0,-1.0,1.0,1.0,1.0,Glaucomatous,,
1,drishtiGS_002',1167573.0,1.0,1.0,1.0,1.0,1.0,Glaucomatous,,
2,drishtiGS_003',1393265.0,1.0,1.0,1.0,1.0,1.0,Glaucomatous,,
3,drishtiGS_004',1393265.0,1.0,1.0,-1.0,1.0,1.0,Glaucomatous,,
4,drishtiGS_005',1481925.0,1.0,1.0,1.0,1.0,1.0,Glaucomatous,,
...,...,...,...,...,...,...,...,...,...,...
100,drishtiGS_101',2864841.0,-1.0,-1.0,-1.0,-1.0,-1.0,Normal,,
101,,,,,,,,,,
102,,,,,,,,,,
103,,,,,,,,,,


In [32]:
drishti = drishti.loc[:, ['Drishti-GS File', 'Total']]
drishti.columns = ['filename', 'label']
drishti['filename'] = drishti['filename'].str[:-1] + '.png'
drishti['path'] = 'Drishti/' + drishti['filename']

drishti_glau = drishti[drishti['label'] == 'Glaucomatous']
drishti_norm = drishti[drishti['label'] == 'Normal']

drishti_glau['label'] = 'glaucoma'
drishti_norm['label'] = 'normal'

drishti_glau

Unnamed: 0,filename,label,path
0,drishtiGS_001.png,glaucoma,Drishti/drishtiGS_001.png
1,drishtiGS_002.png,glaucoma,Drishti/drishtiGS_002.png
2,drishtiGS_003.png,glaucoma,Drishti/drishtiGS_003.png
3,drishtiGS_004.png,glaucoma,Drishti/drishtiGS_004.png
4,drishtiGS_005.png,glaucoma,Drishti/drishtiGS_005.png
...,...,...,...
82,drishtiGS_083.png,glaucoma,Drishti/drishtiGS_083.png
83,drishtiGS_084.png,glaucoma,Drishti/drishtiGS_084.png
85,drishtiGS_086.png,glaucoma,Drishti/drishtiGS_086.png
86,drishtiGS_087.png,glaucoma,Drishti/drishtiGS_087.png


In [33]:
glau_folders = [x for x in os.listdir() if x.endswith('GLAUCOMA')]
glau_folders.append(os.path.join('kaggle_ds', '2_glaucoma'))

print(glau_folders)

['HRF_GLAUCOMA', 'ORIGA_GLAUCOMA', 'DR_HAGIS_GLAUCOMA', 'kaggle_ds/2_glaucoma']


In [34]:
glau_df = pd.DataFrame(columns=['filename', 'label', 'path'])

for folder in glau_folders:
    filenames = os.listdir(folder)
    paths = list(map(lambda x: os.path.join(folder, x), filenames))

    df = pd.DataFrame({'filename': filenames, 'label': 'glaucoma', 'path': paths}, columns=['filename', 'label', 'path'])
    glau_df = glau_df.append(df)

glau_df

Unnamed: 0,filename,label,path
0,07_g.jpg,glaucoma,HRF_GLAUCOMA/07_g.jpg
1,05_g.jpg,glaucoma,HRF_GLAUCOMA/05_g.jpg
2,14_g.jpg,glaucoma,HRF_GLAUCOMA/14_g.jpg
3,01_g.jpg,glaucoma,HRF_GLAUCOMA/01_g.jpg
4,06_g.jpg,glaucoma,HRF_GLAUCOMA/06_g.jpg
...,...,...,...
96,Glaucoma_044.png,glaucoma,kaggle_ds/2_glaucoma/Glaucoma_044.png
97,Glaucoma_060.png,glaucoma,kaggle_ds/2_glaucoma/Glaucoma_060.png
98,Glaucoma_064.png,glaucoma,kaggle_ds/2_glaucoma/Glaucoma_064.png
99,Glaucoma_058.png,glaucoma,kaggle_ds/2_glaucoma/Glaucoma_058.png


In [35]:
glau_df = glau_df.append(drishti_glau, ignore_index=True)
glau_df

Unnamed: 0,filename,label,path
0,07_g.jpg,glaucoma,HRF_GLAUCOMA/07_g.jpg
1,05_g.jpg,glaucoma,HRF_GLAUCOMA/05_g.jpg
2,14_g.jpg,glaucoma,HRF_GLAUCOMA/14_g.jpg
3,01_g.jpg,glaucoma,HRF_GLAUCOMA/01_g.jpg
4,06_g.jpg,glaucoma,HRF_GLAUCOMA/06_g.jpg
...,...,...,...
358,drishtiGS_083.png,glaucoma,Drishti/drishtiGS_083.png
359,drishtiGS_084.png,glaucoma,Drishti/drishtiGS_084.png
360,drishtiGS_086.png,glaucoma,Drishti/drishtiGS_086.png
361,drishtiGS_087.png,glaucoma,Drishti/drishtiGS_087.png


In [36]:
normal_folders = [x for x in os.listdir() if x.endswith('NORMAL')]
normal_folders.append(os.path.join('kaggle_ds', '1_normal'))

print(normal_folders)

['ORIGA_NORMAL', 'kaggle_ds/1_normal']


In [37]:
normal_df = pd.DataFrame(columns=['filename', 'label', 'path'])

for folder in normal_folders:
    filenames = os.listdir(folder)
    paths = list(map(lambda x: os.path.join(folder, x), filenames))

    df = pd.DataFrame({'filename': filenames, 'label': 'normal', 'path': paths}, columns=['filename', 'label', 'path'])
    normal_df = normal_df.append(df)

normal_df

Unnamed: 0,filename,label,path
0,Im0176_ORIGA.jpg,normal,ORIGA_NORMAL/Im0176_ORIGA.jpg
1,Im0277_ORIGA.jpg,normal,ORIGA_NORMAL/Im0277_ORIGA.jpg
2,Im0166_ORIGA.jpg,normal,ORIGA_NORMAL/Im0166_ORIGA.jpg
3,Im0156_ORIGA.jpg,normal,ORIGA_NORMAL/Im0156_ORIGA.jpg
4,Im0439_ORIGA.jpg,normal,ORIGA_NORMAL/Im0439_ORIGA.jpg
...,...,...,...
295,NL_181.png,normal,kaggle_ds/1_normal/NL_181.png
296,NL_154.png,normal,kaggle_ds/1_normal/NL_154.png
297,NL_002.png,normal,kaggle_ds/1_normal/NL_002.png
298,NL_080.png,normal,kaggle_ds/1_normal/NL_080.png


In [38]:
normal_df = normal_df.append(drishti_norm, ignore_index=True)
normal_df

Unnamed: 0,filename,label,path
0,Im0176_ORIGA.jpg,normal,ORIGA_NORMAL/Im0176_ORIGA.jpg
1,Im0277_ORIGA.jpg,normal,ORIGA_NORMAL/Im0277_ORIGA.jpg
2,Im0166_ORIGA.jpg,normal,ORIGA_NORMAL/Im0166_ORIGA.jpg
3,Im0156_ORIGA.jpg,normal,ORIGA_NORMAL/Im0156_ORIGA.jpg
4,Im0439_ORIGA.jpg,normal,ORIGA_NORMAL/Im0439_ORIGA.jpg
...,...,...,...
808,drishtiGS_097.png,normal,Drishti/drishtiGS_097.png
809,drishtiGS_098.png,normal,Drishti/drishtiGS_098.png
810,drishtiGS_099.png,normal,Drishti/drishtiGS_099.png
811,drishtiGS_100.png,normal,Drishti/drishtiGS_100.png


In [39]:
df = pd.concat([glau_df, normal_df], ignore_index=True)
df

Unnamed: 0,filename,label,path
0,07_g.jpg,glaucoma,HRF_GLAUCOMA/07_g.jpg
1,05_g.jpg,glaucoma,HRF_GLAUCOMA/05_g.jpg
2,14_g.jpg,glaucoma,HRF_GLAUCOMA/14_g.jpg
3,01_g.jpg,glaucoma,HRF_GLAUCOMA/01_g.jpg
4,06_g.jpg,glaucoma,HRF_GLAUCOMA/06_g.jpg
...,...,...,...
1171,drishtiGS_097.png,normal,Drishti/drishtiGS_097.png
1172,drishtiGS_098.png,normal,Drishti/drishtiGS_098.png
1173,drishtiGS_099.png,normal,Drishti/drishtiGS_099.png
1174,drishtiGS_100.png,normal,Drishti/drishtiGS_100.png


In [40]:
df.groupby('label').count()

Unnamed: 0_level_0,filename,path
label,Unnamed: 1_level_1,Unnamed: 2_level_1
glaucoma,363,363
normal,813,813


In [41]:
df['label_encoded'] = (df['label'] == 'glaucoma').astype(int)
df

Unnamed: 0,filename,label,path,label_encoded
0,07_g.jpg,glaucoma,HRF_GLAUCOMA/07_g.jpg,1
1,05_g.jpg,glaucoma,HRF_GLAUCOMA/05_g.jpg,1
2,14_g.jpg,glaucoma,HRF_GLAUCOMA/14_g.jpg,1
3,01_g.jpg,glaucoma,HRF_GLAUCOMA/01_g.jpg,1
4,06_g.jpg,glaucoma,HRF_GLAUCOMA/06_g.jpg,1
...,...,...,...,...
1171,drishtiGS_097.png,normal,Drishti/drishtiGS_097.png,0
1172,drishtiGS_098.png,normal,Drishti/drishtiGS_098.png,0
1173,drishtiGS_099.png,normal,Drishti/drishtiGS_099.png,0
1174,drishtiGS_100.png,normal,Drishti/drishtiGS_100.png,0


In [42]:
TEST_SIZE = 75
test_df = pd.DataFrame(columns=df.columns)

for i in range(2):
    temp_df = df[df['label_encoded'] == i]
    temp_df = temp_df.sample(TEST_SIZE, random_state=0)

    test_df = test_df.append(temp_df, ignore_index=True)

test_df

Unnamed: 0,filename,label,path,label_encoded
0,NL_204.png,normal,kaggle_ds/1_normal/NL_204.png,0
1,Im0014_ORIGA.jpg,normal,ORIGA_NORMAL/Im0014_ORIGA.jpg,0
2,Im0266_ORIGA.jpg,normal,ORIGA_NORMAL/Im0266_ORIGA.jpg,0
3,Im0128_ORIGA.jpg,normal,ORIGA_NORMAL/Im0128_ORIGA.jpg,0
4,Im0462_ORIGA.jpg,normal,ORIGA_NORMAL/Im0462_ORIGA.jpg,0
...,...,...,...,...
145,Im0637_g_ORIGA.jpg,glaucoma,ORIGA_GLAUCOMA/Im0637_g_ORIGA.jpg,1
146,Im0635_g_ORIGA.jpg,glaucoma,ORIGA_GLAUCOMA/Im0635_g_ORIGA.jpg,1
147,Glaucoma_033.png,glaucoma,kaggle_ds/2_glaucoma/Glaucoma_033.png,1
148,Im0523_g_ORIGA.jpg,glaucoma,ORIGA_GLAUCOMA/Im0523_g_ORIGA.jpg,1


In [43]:
train_df = df[~df['path'].isin(test_df['path'])]
train_df

Unnamed: 0,filename,label,path,label_encoded
0,07_g.jpg,glaucoma,HRF_GLAUCOMA/07_g.jpg,1
1,05_g.jpg,glaucoma,HRF_GLAUCOMA/05_g.jpg,1
2,14_g.jpg,glaucoma,HRF_GLAUCOMA/14_g.jpg,1
3,01_g.jpg,glaucoma,HRF_GLAUCOMA/01_g.jpg,1
4,06_g.jpg,glaucoma,HRF_GLAUCOMA/06_g.jpg,1
...,...,...,...,...
1171,drishtiGS_097.png,normal,Drishti/drishtiGS_097.png,0
1172,drishtiGS_098.png,normal,Drishti/drishtiGS_098.png,0
1173,drishtiGS_099.png,normal,Drishti/drishtiGS_099.png,0
1174,drishtiGS_100.png,normal,Drishti/drishtiGS_100.png,0


In [44]:
test_df.to_csv('glaucoma_test.csv', index=False)
train_df.to_csv('glaucoma_train.csv', index=False)