This notebook prepares a train and test dataframe from the following sources:
- `DR_HAGIS_GLAUCOMA`
- `HRF_GLAUCOMA`
- `kaggle_ds/1_normal` and `kaggle_ds/2_glaucoma`
- `ORIGA_GLAUCOMA`
- `ORIGA_NORMAL`

In [1]:
import os

import pandas as pd
import matplotlib.pyplot as plt

## Import Drishti dataset

In [2]:
drishti = pd.read_csv('Drishti.csv')
drishti

Unnamed: 0,Drishti-GS File,Patient ID,Marking 1,Marking 2,Marking 3,Marking 4,Additional Expert,Total,Unnamed: 8,Unnamed: 9
0,drishtiGS_001',1077987.0,-1.0,-1.0,1.0,1.0,1.0,Glaucomatous,,
1,drishtiGS_002',1167573.0,1.0,1.0,1.0,1.0,1.0,Glaucomatous,,
2,drishtiGS_003',1393265.0,1.0,1.0,1.0,1.0,1.0,Glaucomatous,,
3,drishtiGS_004',1393265.0,1.0,1.0,-1.0,1.0,1.0,Glaucomatous,,
4,drishtiGS_005',1481925.0,1.0,1.0,1.0,1.0,1.0,Glaucomatous,,
...,...,...,...,...,...,...,...,...,...,...
100,drishtiGS_101',2864841.0,-1.0,-1.0,-1.0,-1.0,-1.0,Normal,,
101,,,,,,,,,,
102,,,,,,,,,,
103,,,,,,,,,,


In [3]:
drishti = drishti.loc[:, ['Drishti-GS File', 'Total']]
drishti.columns = ['filename', 'label']
drishti['filename'] = drishti['filename'].str[:-1] + '.png'
drishti['path'] = 'Drishti/' + drishti['filename']

drishti_glau = drishti[drishti['label'] == 'Glaucomatous']
drishti_norm = drishti[drishti['label'] == 'Normal']

drishti_glau['label'] = 'glaucoma'
drishti_norm['label'] = 'normal'

drishti_glau

Unnamed: 0,filename,label,path
0,drishtiGS_001.png,glaucoma,Drishti/drishtiGS_001.png
1,drishtiGS_002.png,glaucoma,Drishti/drishtiGS_002.png
2,drishtiGS_003.png,glaucoma,Drishti/drishtiGS_003.png
3,drishtiGS_004.png,glaucoma,Drishti/drishtiGS_004.png
4,drishtiGS_005.png,glaucoma,Drishti/drishtiGS_005.png
...,...,...,...
82,drishtiGS_083.png,glaucoma,Drishti/drishtiGS_083.png
83,drishtiGS_084.png,glaucoma,Drishti/drishtiGS_084.png
85,drishtiGS_086.png,glaucoma,Drishti/drishtiGS_086.png
86,drishtiGS_087.png,glaucoma,Drishti/drishtiGS_087.png


## Import RIMONE dataset (all three releases)

In [4]:
with open('RIMONE_glau.txt') as f:
    rimone_glau = f.readline().rstrip().split(',')
with open('RIMONE_norm.txt') as f:
    rimone_norm = f.readline().rstrip().split(',')

rimone_norm[:10]

['RIM_ONE/Normal/Im024.bmp',
 'RIM_ONE/Normal/Im050.bmp',
 'RIM_ONE/Normal/Im089.bmp',
 'RIM_ONE/Normal/Im143.bmp',
 'RIM_ONE/Normal/Im066.bmp',
 'RIM_ONE/Normal/Im159.bmp',
 'RIM_ONE/Normal/Im077.bmp',
 'RIM_ONE/Normal/Im161.bmp',
 'RIM_ONE/Normal/Im041.bmp',
 'RIM_ONE/Normal/Im012.bmp']

In [5]:
rimone_glau_df = pd.DataFrame({'label': 'glaucoma', 'path': rimone_glau})
rimone_norm_df = pd.DataFrame({'label': 'normal', 'path': rimone_norm})

rimone_glau_df['filename'] = rimone_glau_df['path'].str.split(os.path.sep).str[-1]
rimone_norm_df['filename'] = rimone_norm_df['path'].str.split(os.path.sep).str[-1]

def update_rimone_path(path):
    parts = path.split(os.path.sep)

    dir = parts[0]
    if   dir == 'RIM_ONE': dir = 'RIMONE_r1'
    elif dir == 'RIM_TWO': dir = 'RIMONE_r2'
    elif dir == 'RIM_THREE_CROPPED': dir = 'RIMONE_r3_CROPPED'

    return os.path.sep.join([dir] + parts[1:])

rimone_glau_df['path'] = rimone_glau_df['path'].apply(update_rimone_path)
rimone_norm_df['path'] = rimone_norm_df['path'].apply(update_rimone_path)

rimone_glau_df

Unnamed: 0,label,path,filename
0,glaucoma,RIMONE_r1/Early/Im017.bmp,Im017.bmp
1,glaucoma,RIMONE_r1/Early/Im051.bmp,Im051.bmp
2,glaucoma,RIMONE_r1/Early/Im049.bmp,Im049.bmp
3,glaucoma,RIMONE_r1/Early/Im034.bmp,Im034.bmp
4,glaucoma,RIMONE_r1/Early/Im009.bmp,Im009.bmp
...,...,...,...
291,glaucoma,RIMONE_r3_CROPPED/S-15-L.jpg,S-15-L.jpg
292,glaucoma,RIMONE_r3_CROPPED/S-27-L.jpg,S-27-L.jpg
293,glaucoma,RIMONE_r3_CROPPED/G-22-L.jpg,G-22-L.jpg
294,glaucoma,RIMONE_r3_CROPPED/S-20-L.jpg,S-20-L.jpg


In [9]:
len(rimone_norm_df[rimone_norm_df['path'].str.startswith('RIMONE_r1')])

118

## Import HRF, DR HAGIS and ORIGA datasets

In [10]:
glau_folders = [x for x in os.listdir() if x.endswith('GLAUCOMA')]
glau_folders.append(os.path.join('kaggle_ds', '2_glaucoma'))

print(glau_folders)

['HRF_GLAUCOMA', 'ORIGA_GLAUCOMA', 'DR_HAGIS_GLAUCOMA', 'kaggle_ds/2_glaucoma']


In [11]:
glau_df = pd.DataFrame(columns=['filename', 'label', 'path'])

for folder in glau_folders:
    filenames = os.listdir(folder)
    paths = list(map(lambda x: os.path.join(folder, x), filenames))

    df = pd.DataFrame({'filename': filenames, 'label': 'glaucoma', 'path': paths}, columns=['filename', 'label', 'path'])
    glau_df = glau_df.append(df)

glau_df

Unnamed: 0,filename,label,path
0,07_g.jpg,glaucoma,HRF_GLAUCOMA/07_g.jpg
1,05_g.jpg,glaucoma,HRF_GLAUCOMA/05_g.jpg
2,14_g.jpg,glaucoma,HRF_GLAUCOMA/14_g.jpg
3,01_g.jpg,glaucoma,HRF_GLAUCOMA/01_g.jpg
4,06_g.jpg,glaucoma,HRF_GLAUCOMA/06_g.jpg
...,...,...,...
96,Glaucoma_044.png,glaucoma,kaggle_ds/2_glaucoma/Glaucoma_044.png
97,Glaucoma_060.png,glaucoma,kaggle_ds/2_glaucoma/Glaucoma_060.png
98,Glaucoma_064.png,glaucoma,kaggle_ds/2_glaucoma/Glaucoma_064.png
99,Glaucoma_058.png,glaucoma,kaggle_ds/2_glaucoma/Glaucoma_058.png


## Combine all Glaucoma samples

In [12]:
glau_df = glau_df.append(drishti_glau, ignore_index=True)
glau_df = glau_df.append(rimone_glau_df, ignore_index=True)
glau_df

Unnamed: 0,filename,label,path
0,07_g.jpg,glaucoma,HRF_GLAUCOMA/07_g.jpg
1,05_g.jpg,glaucoma,HRF_GLAUCOMA/05_g.jpg
2,14_g.jpg,glaucoma,HRF_GLAUCOMA/14_g.jpg
3,01_g.jpg,glaucoma,HRF_GLAUCOMA/01_g.jpg
4,06_g.jpg,glaucoma,HRF_GLAUCOMA/06_g.jpg
...,...,...,...
654,S-15-L.jpg,glaucoma,RIMONE_r3_CROPPED/S-15-L.jpg
655,S-27-L.jpg,glaucoma,RIMONE_r3_CROPPED/S-27-L.jpg
656,G-22-L.jpg,glaucoma,RIMONE_r3_CROPPED/G-22-L.jpg
657,S-20-L.jpg,glaucoma,RIMONE_r3_CROPPED/S-20-L.jpg


In [13]:
normal_folders = [x for x in os.listdir() if x.endswith('NORMAL')]
normal_folders.append(os.path.join('kaggle_ds', '1_normal'))

print(normal_folders)

['ORIGA_NORMAL', 'HRF_NORMAL', 'kaggle_ds/1_normal']


In [14]:
normal_df = pd.DataFrame(columns=['filename', 'label', 'path'])

for folder in normal_folders:
    filenames = os.listdir(folder)
    paths = list(map(lambda x: os.path.join(folder, x), filenames))

    df = pd.DataFrame({'filename': filenames, 'label': 'normal', 'path': paths}, columns=['filename', 'label', 'path'])
    normal_df = normal_df.append(df)

normal_df

Unnamed: 0,filename,label,path
0,Im0176_ORIGA.jpg,normal,ORIGA_NORMAL/Im0176_ORIGA.jpg
1,Im0277_ORIGA.jpg,normal,ORIGA_NORMAL/Im0277_ORIGA.jpg
2,Im0166_ORIGA.jpg,normal,ORIGA_NORMAL/Im0166_ORIGA.jpg
3,Im0156_ORIGA.jpg,normal,ORIGA_NORMAL/Im0156_ORIGA.jpg
4,Im0439_ORIGA.jpg,normal,ORIGA_NORMAL/Im0439_ORIGA.jpg
...,...,...,...
295,NL_181.png,normal,kaggle_ds/1_normal/NL_181.png
296,NL_154.png,normal,kaggle_ds/1_normal/NL_154.png
297,NL_002.png,normal,kaggle_ds/1_normal/NL_002.png
298,NL_080.png,normal,kaggle_ds/1_normal/NL_080.png


## Combine all normal samples

In [15]:
normal_df = normal_df.append(drishti_norm, ignore_index=True)
normal_df = normal_df.append(rimone_norm_df, ignore_index=True)
normal_df

Unnamed: 0,filename,label,path
0,Im0176_ORIGA.jpg,normal,ORIGA_NORMAL/Im0176_ORIGA.jpg
1,Im0277_ORIGA.jpg,normal,ORIGA_NORMAL/Im0277_ORIGA.jpg
2,Im0166_ORIGA.jpg,normal,ORIGA_NORMAL/Im0166_ORIGA.jpg
3,Im0156_ORIGA.jpg,normal,ORIGA_NORMAL/Im0156_ORIGA.jpg
4,Im0439_ORIGA.jpg,normal,ORIGA_NORMAL/Im0439_ORIGA.jpg
...,...,...,...
1240,N-43-L.jpg,normal,RIMONE_r3_CROPPED/N-43-L.jpg
1241,N-67-L.jpg,normal,RIMONE_r3_CROPPED/N-67-L.jpg
1242,N-23-L.jpg,normal,RIMONE_r3_CROPPED/N-23-L.jpg
1243,N-36-R.jpg,normal,RIMONE_r3_CROPPED/N-36-R.jpg


In [16]:
len(normal_df[normal_df['path'].str.startswith('HRF')])

15

In [17]:
df = pd.concat([glau_df, normal_df], ignore_index=True)
df

Unnamed: 0,filename,label,path
0,07_g.jpg,glaucoma,HRF_GLAUCOMA/07_g.jpg
1,05_g.jpg,glaucoma,HRF_GLAUCOMA/05_g.jpg
2,14_g.jpg,glaucoma,HRF_GLAUCOMA/14_g.jpg
3,01_g.jpg,glaucoma,HRF_GLAUCOMA/01_g.jpg
4,06_g.jpg,glaucoma,HRF_GLAUCOMA/06_g.jpg
...,...,...,...
1899,N-43-L.jpg,normal,RIMONE_r3_CROPPED/N-43-L.jpg
1900,N-67-L.jpg,normal,RIMONE_r3_CROPPED/N-67-L.jpg
1901,N-23-L.jpg,normal,RIMONE_r3_CROPPED/N-23-L.jpg
1902,N-36-R.jpg,normal,RIMONE_r3_CROPPED/N-36-R.jpg


## Count number of samples for each type

In [18]:
df.groupby('label').count()

Unnamed: 0_level_0,filename,path
label,Unnamed: 1_level_1,Unnamed: 2_level_1
glaucoma,659,659
normal,1245,1245


In [19]:
df['label_encoded'] = (df['label'] == 'glaucoma').astype(int)
df

Unnamed: 0,filename,label,path,label_encoded
0,07_g.jpg,glaucoma,HRF_GLAUCOMA/07_g.jpg,1
1,05_g.jpg,glaucoma,HRF_GLAUCOMA/05_g.jpg,1
2,14_g.jpg,glaucoma,HRF_GLAUCOMA/14_g.jpg,1
3,01_g.jpg,glaucoma,HRF_GLAUCOMA/01_g.jpg,1
4,06_g.jpg,glaucoma,HRF_GLAUCOMA/06_g.jpg,1
...,...,...,...,...
1899,N-43-L.jpg,normal,RIMONE_r3_CROPPED/N-43-L.jpg,0
1900,N-67-L.jpg,normal,RIMONE_r3_CROPPED/N-67-L.jpg,0
1901,N-23-L.jpg,normal,RIMONE_r3_CROPPED/N-23-L.jpg,0
1902,N-36-R.jpg,normal,RIMONE_r3_CROPPED/N-36-R.jpg,0


In [20]:
TEST_SIZE = 100
VAL_SIZE = 100
test_df = pd.DataFrame(columns=df.columns)

for i in range(2):
    temp_df = df[df['label_encoded'] == i]
    temp_df = temp_df.sample(TEST_SIZE, random_state=0)
    test_df = test_df.append(temp_df, ignore_index=True)

test_df

Unnamed: 0,filename,label,path,label_encoded
0,Im099.jpg,normal,RIMONE_r2/Normal/Im099.jpg,0
1,Im0330_ORIGA.jpg,normal,ORIGA_NORMAL/Im0330_ORIGA.jpg,0
2,Im0165_ORIGA.jpg,normal,ORIGA_NORMAL/Im0165_ORIGA.jpg,0
3,NL_149.png,normal,kaggle_ds/1_normal/NL_149.png,0
4,Im0092_ORIGA.jpg,normal,ORIGA_NORMAL/Im0092_ORIGA.jpg,0
...,...,...,...,...
195,Im0554_g_ORIGA.jpg,glaucoma,ORIGA_GLAUCOMA/Im0554_g_ORIGA.jpg,1
196,Im017.bmp,glaucoma,RIMONE_r1/Early/Im017.bmp,1
197,Im0550_g_ORIGA.jpg,glaucoma,ORIGA_GLAUCOMA/Im0550_g_ORIGA.jpg,1
198,Im013.bmp,glaucoma,RIMONE_r1/Deep/Im013.bmp,1


In [21]:
remain_df = df[~df['path'].isin(test_df['path'])]
val_df = pd.DataFrame(columns=df.columns)

for i in range(2):
    temp_df = remain_df[remain_df['label_encoded'] == i]
    temp_df = temp_df.sample(VAL_SIZE, random_state=0)
    val_df = val_df.append(temp_df, ignore_index=True)

val_df

Unnamed: 0,filename,label,path,label_encoded
0,Im0181_ORIGA.jpg,normal,ORIGA_NORMAL/Im0181_ORIGA.jpg,0
1,NL_086.png,normal,kaggle_ds/1_normal/NL_086.png,0
2,NL_064.png,normal,kaggle_ds/1_normal/NL_064.png,0
3,drishtiGS_033.png,normal,Drishti/drishtiGS_033.png,0
4,drishtiGS_089.png,normal,Drishti/drishtiGS_089.png,0
...,...,...,...,...
195,Im283.jpg,glaucoma,RIMONE_r2/Glaucoma and glaucoma suspicious/Im2...,1
196,Im422.jpg,glaucoma,RIMONE_r2/Glaucoma and glaucoma suspicious/Im4...,1
197,S-25-L.jpg,glaucoma,RIMONE_r3_CROPPED/S-25-L.jpg,1
198,drishtiGS_031.png,glaucoma,Drishti/drishtiGS_031.png,1


In [22]:
train_df = remain_df[~remain_df['path'].isin(val_df['path'])]
train_df

Unnamed: 0,filename,label,path,label_encoded
0,07_g.jpg,glaucoma,HRF_GLAUCOMA/07_g.jpg,1
1,05_g.jpg,glaucoma,HRF_GLAUCOMA/05_g.jpg,1
2,14_g.jpg,glaucoma,HRF_GLAUCOMA/14_g.jpg,1
3,01_g.jpg,glaucoma,HRF_GLAUCOMA/01_g.jpg,1
4,06_g.jpg,glaucoma,HRF_GLAUCOMA/06_g.jpg,1
...,...,...,...,...
1898,N-74-L.jpg,normal,RIMONE_r3_CROPPED/N-74-L.jpg,0
1899,N-43-L.jpg,normal,RIMONE_r3_CROPPED/N-43-L.jpg,0
1900,N-67-L.jpg,normal,RIMONE_r3_CROPPED/N-67-L.jpg,0
1901,N-23-L.jpg,normal,RIMONE_r3_CROPPED/N-23-L.jpg,0


In [23]:
print("Test set: ", len(test_df))
print("Validation set: ", len(val_df))
print("Train set: ", len(train_df))

Test set:  200
Validation set:  200
Train set:  1504


In [24]:
test_df.to_csv('glaucoma_test.csv', index=False)
val_df.to_csv('glaucoma_val.csv', index=False)
train_df.to_csv('glaucoma_train.csv', index=False)