This notebook prepares a train and test dataframe from the following sources:
- `DR_HAGIS_GLAUCOMA`
- `HRF_GLAUCOMA`
- `kaggle_ds/1_normal` and `kaggle_ds/2_glaucoma`
- `ORIGA_GLAUCOMA`
- `ORIGA_NORMAL`

In [21]:
import os

import pandas as pd
import matplotlib.pyplot as plt

## Import Drishti dataset

In [30]:
drishti = pd.read_csv('Drishti.csv')
drishti

Unnamed: 0,Drishti-GS File,Patient ID,Marking 1,Marking 2,Marking 3,Marking 4,Additional Expert,Total,Unnamed: 8,Unnamed: 9
0,drishtiGS_001',1077987.0,-1.0,-1.0,1.0,1.0,1.0,Glaucomatous,,
1,drishtiGS_002',1167573.0,1.0,1.0,1.0,1.0,1.0,Glaucomatous,,
2,drishtiGS_003',1393265.0,1.0,1.0,1.0,1.0,1.0,Glaucomatous,,
3,drishtiGS_004',1393265.0,1.0,1.0,-1.0,1.0,1.0,Glaucomatous,,
4,drishtiGS_005',1481925.0,1.0,1.0,1.0,1.0,1.0,Glaucomatous,,
...,...,...,...,...,...,...,...,...,...,...
100,drishtiGS_101',2864841.0,-1.0,-1.0,-1.0,-1.0,-1.0,Normal,,
101,,,,,,,,,,
102,,,,,,,,,,
103,,,,,,,,,,


In [31]:
drishti = drishti.loc[:, ['Drishti-GS File', 'Total']]
drishti.columns = ['filename', 'label']
drishti['filename'] = drishti['filename'].str[:-1] + '.png'
drishti['path'] = 'Drishti/' + drishti['filename']

drishti_glau = drishti[drishti['label'] == 'Glaucomatous']
drishti_norm = drishti[drishti['label'] == 'Normal']

drishti_glau['label'] = 'glaucoma'
drishti_norm['label'] = 'normal'

drishti_glau

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,filename,label,path
0,drishtiGS_001.png,glaucoma,Drishti/drishtiGS_001.png
1,drishtiGS_002.png,glaucoma,Drishti/drishtiGS_002.png
2,drishtiGS_003.png,glaucoma,Drishti/drishtiGS_003.png
3,drishtiGS_004.png,glaucoma,Drishti/drishtiGS_004.png
4,drishtiGS_005.png,glaucoma,Drishti/drishtiGS_005.png
...,...,...,...
82,drishtiGS_083.png,glaucoma,Drishti/drishtiGS_083.png
83,drishtiGS_084.png,glaucoma,Drishti/drishtiGS_084.png
85,drishtiGS_086.png,glaucoma,Drishti/drishtiGS_086.png
86,drishtiGS_087.png,glaucoma,Drishti/drishtiGS_087.png


In [43]:
len(drishti_norm)dd

31

## Import RIMONE dataset (all three releases)

In [32]:
with open('RIMONE_glau.txt') as f:
    rimone_glau = f.readline().rstrip().split(',')
with open('RIMONE_norm.txt') as f:
    rimone_norm = f.readline().rstrip().split(',')

rimone_norm[:10]

['RIM_ONE/Normal/Im024.bmp',
 'RIM_ONE/Normal/Im050.bmp',
 'RIM_ONE/Normal/Im089.bmp',
 'RIM_ONE/Normal/Im143.bmp',
 'RIM_ONE/Normal/Im066.bmp',
 'RIM_ONE/Normal/Im159.bmp',
 'RIM_ONE/Normal/Im077.bmp',
 'RIM_ONE/Normal/Im161.bmp',
 'RIM_ONE/Normal/Im041.bmp',
 'RIM_ONE/Normal/Im012.bmp']

In [33]:
rimone_glau_df = pd.DataFrame({'label': 'glaucoma', 'path': rimone_glau})
rimone_norm_df = pd.DataFrame({'label': 'normal', 'path': rimone_norm})

rimone_glau_df['filename'] = rimone_glau_df['path'].str.split(os.path.sep).str[-1]
rimone_norm_df['filename'] = rimone_norm_df['path'].str.split(os.path.sep).str[-1]

def update_rimone_path(path):
    parts = path.split(os.path.sep)

    dir = parts[0]
    if   dir == 'RIM_ONE': dir = 'RIMONE_r1'
    elif dir == 'RIM_TWO': dir = 'RIMONE_r2'
    elif dir == 'RIM_THREE_CROPPED': dir = 'RIMONE_r3_CROPPED'

    return os.path.sep.join([dir] + parts[1:])

rimone_glau_df['path'] = rimone_glau_df['path'].apply(update_rimone_path)
rimone_norm_df['path'] = rimone_norm_df['path'].apply(update_rimone_path)

rimone_glau_df

Unnamed: 0,label,path,filename
0,glaucoma,RIMONE_r1/Early/Im017.bmp,Im017.bmp
1,glaucoma,RIMONE_r1/Early/Im051.bmp,Im051.bmp
2,glaucoma,RIMONE_r1/Early/Im049.bmp,Im049.bmp
3,glaucoma,RIMONE_r1/Early/Im034.bmp,Im034.bmp
4,glaucoma,RIMONE_r1/Early/Im009.bmp,Im009.bmp
...,...,...,...
291,glaucoma,RIMONE_r3_CROPPED/S-15-L.jpg,S-15-L.jpg
292,glaucoma,RIMONE_r3_CROPPED/S-27-L.jpg,S-27-L.jpg
293,glaucoma,RIMONE_r3_CROPPED/G-22-L.jpg,G-22-L.jpg
294,glaucoma,RIMONE_r3_CROPPED/S-20-L.jpg,S-20-L.jpg


In [26]:
len(rimone_norm_df[rimone_norm_df['path'].str.startswith('RIMONE_r1')])

118

## Import HRF, DR HAGIS and ORIGA datasets

In [34]:
glau_folders = [x for x in os.listdir() if x.endswith('GLAUCOMA')]
glau_folders.append(os.path.join('kaggle_ds', '2_glaucoma'))

print(glau_folders)

['HRF_GLAUCOMA', 'ORIGA_GLAUCOMA', 'DR_HAGIS_GLAUCOMA', 'kaggle_ds/2_glaucoma']


In [37]:
glau_df = pd.DataFrame(columns=['filename', 'label', 'path'])

for folder in glau_folders:
    filenames = os.listdir(folder)
    paths = list(map(lambda x: os.path.join(folder, x), filenames))

    df = pd.DataFrame({'filename': filenames, 'label': 'glaucoma', 'path': paths}, columns=['filename', 'label', 'path'])
    glau_df = glau_df.append(df)

glau_df

Unnamed: 0,filename,label,path
0,03_g.jpg,glaucoma,HRF_GLAUCOMA/03_g.jpg
1,05_g.jpg,glaucoma,HRF_GLAUCOMA/05_g.jpg
2,14_g.jpg,glaucoma,HRF_GLAUCOMA/14_g.jpg
3,10_g.jpg,glaucoma,HRF_GLAUCOMA/10_g.jpg
4,15_g.jpg,glaucoma,HRF_GLAUCOMA/15_g.jpg
...,...,...,...
96,Glaucoma_100.png,glaucoma,kaggle_ds/2_glaucoma/Glaucoma_100.png
97,Glaucoma_030.png,glaucoma,kaggle_ds/2_glaucoma/Glaucoma_030.png
98,Glaucoma_053.png,glaucoma,kaggle_ds/2_glaucoma/Glaucoma_053.png
99,Glaucoma_066.png,glaucoma,kaggle_ds/2_glaucoma/Glaucoma_066.png


## Combine all Glaucoma samples

In [38]:
glau_df = glau_df.append(drishti_glau, ignore_index=True)
glau_df = glau_df.append(rimone_glau_df, ignore_index=True)
glau_df

Unnamed: 0,filename,label,path
0,03_g.jpg,glaucoma,HRF_GLAUCOMA/03_g.jpg
1,05_g.jpg,glaucoma,HRF_GLAUCOMA/05_g.jpg
2,14_g.jpg,glaucoma,HRF_GLAUCOMA/14_g.jpg
3,10_g.jpg,glaucoma,HRF_GLAUCOMA/10_g.jpg
4,15_g.jpg,glaucoma,HRF_GLAUCOMA/15_g.jpg
...,...,...,...
655,S-15-L.jpg,glaucoma,RIMONE_r3_CROPPED/S-15-L.jpg
656,S-27-L.jpg,glaucoma,RIMONE_r3_CROPPED/S-27-L.jpg
657,G-22-L.jpg,glaucoma,RIMONE_r3_CROPPED/G-22-L.jpg
658,S-20-L.jpg,glaucoma,RIMONE_r3_CROPPED/S-20-L.jpg


In [39]:
normal_folders = [x for x in os.listdir() if x.endswith('NORMAL')]
normal_folders.append(os.path.join('kaggle_ds', '1_normal'))

print(normal_folders)

['ORIGA_NORMAL', 'HRF_NORMAL', 'kaggle_ds/1_normal']


In [40]:
normal_df = pd.DataFrame(columns=['filename', 'label', 'path'])

for folder in normal_folders:
    filenames = os.listdir(folder)
    paths = list(map(lambda x: os.path.join(folder, x), filenames))

    df = pd.DataFrame({'filename': filenames, 'label': 'normal', 'path': paths}, columns=['filename', 'label', 'path'])
    normal_df = normal_df.append(df)

normal_df

Unnamed: 0,filename,label,path
0,Im0132_ORIGA.jpg,normal,ORIGA_NORMAL/Im0132_ORIGA.jpg
1,Im0134_ORIGA.jpg,normal,ORIGA_NORMAL/Im0134_ORIGA.jpg
2,Im0003_ORIGA.jpg,normal,ORIGA_NORMAL/Im0003_ORIGA.jpg
3,Im0130_ORIGA.jpg,normal,ORIGA_NORMAL/Im0130_ORIGA.jpg
4,Im0176_ORIGA.jpg,normal,ORIGA_NORMAL/Im0176_ORIGA.jpg
...,...,...,...
295,NL_083.png,normal,kaggle_ds/1_normal/NL_083.png
296,NL_189.png,normal,kaggle_ds/1_normal/NL_189.png
297,NL_041.png,normal,kaggle_ds/1_normal/NL_041.png
298,NL_087.png,normal,kaggle_ds/1_normal/NL_087.png


## Combine all normal samples

In [49]:
normal_df = normal_df.append(drishti_norm, ignore_index=True)
normal_df = normal_df.append(rimone_norm_df, ignore_index=True)
normal_df

Unnamed: 0,filename,label,path
0,Im0132_ORIGA.jpg,normal,ORIGA_NORMAL/Im0132_ORIGA.jpg
1,Im0134_ORIGA.jpg,normal,ORIGA_NORMAL/Im0134_ORIGA.jpg
2,Im0003_ORIGA.jpg,normal,ORIGA_NORMAL/Im0003_ORIGA.jpg
3,Im0130_ORIGA.jpg,normal,ORIGA_NORMAL/Im0130_ORIGA.jpg
4,Im0176_ORIGA.jpg,normal,ORIGA_NORMAL/Im0176_ORIGA.jpg
...,...,...,...
1689,N-43-L.jpg,normal,RIMONE_r3_CROPPED/N-43-L.jpg
1690,N-67-L.jpg,normal,RIMONE_r3_CROPPED/N-67-L.jpg
1691,N-23-L.jpg,normal,RIMONE_r3_CROPPED/N-23-L.jpg
1692,N-36-R.jpg,normal,RIMONE_r3_CROPPED/N-36-R.jpg


In [59]:
len(normal_df[normal_df['path'].str.startswith('HRF')])

15

In [50]:
df = pd.concat([glau_df, normal_df], ignore_index=True)
df

Unnamed: 0,filename,label,path
0,03_g.jpg,glaucoma,HRF_GLAUCOMA/03_g.jpg
1,05_g.jpg,glaucoma,HRF_GLAUCOMA/05_g.jpg
2,14_g.jpg,glaucoma,HRF_GLAUCOMA/14_g.jpg
3,10_g.jpg,glaucoma,HRF_GLAUCOMA/10_g.jpg
4,15_g.jpg,glaucoma,HRF_GLAUCOMA/15_g.jpg
...,...,...,...
2349,N-43-L.jpg,normal,RIMONE_r3_CROPPED/N-43-L.jpg
2350,N-67-L.jpg,normal,RIMONE_r3_CROPPED/N-67-L.jpg
2351,N-23-L.jpg,normal,RIMONE_r3_CROPPED/N-23-L.jpg
2352,N-36-R.jpg,normal,RIMONE_r3_CROPPED/N-36-R.jpg


## Count number of samples for each type

In [51]:
df.groupby('label').count()

Unnamed: 0_level_0,filename,path
label,Unnamed: 1_level_1,Unnamed: 2_level_1
glaucoma,660,660
normal,1694,1694


In [60]:
df['label_encoded'] = (df['label'] == 'glaucoma').astype(int)
df

Unnamed: 0,filename,label,path,label_encoded
0,03_g.jpg,glaucoma,HRF_GLAUCOMA/03_g.jpg,1
1,05_g.jpg,glaucoma,HRF_GLAUCOMA/05_g.jpg,1
2,14_g.jpg,glaucoma,HRF_GLAUCOMA/14_g.jpg,1
3,10_g.jpg,glaucoma,HRF_GLAUCOMA/10_g.jpg,1
4,15_g.jpg,glaucoma,HRF_GLAUCOMA/15_g.jpg,1
...,...,...,...,...
2349,N-43-L.jpg,normal,RIMONE_r3_CROPPED/N-43-L.jpg,0
2350,N-67-L.jpg,normal,RIMONE_r3_CROPPED/N-67-L.jpg,0
2351,N-23-L.jpg,normal,RIMONE_r3_CROPPED/N-23-L.jpg,0
2352,N-36-R.jpg,normal,RIMONE_r3_CROPPED/N-36-R.jpg,0


In [61]:
TEST_SIZE = 100
VAL_SIZE = 100
test_df = pd.DataFrame(columns=df.columns)

for i in range(2):
    temp_df = df[df['label_encoded'] == i]
    temp_df = temp_df.sample(TEST_SIZE, random_state=0)
    test_df = test_df.append(temp_df, ignore_index=True)

test_df

Unnamed: 0,filename,label,path,label_encoded
0,Im0076_ORIGA.jpg,normal,ORIGA_NORMAL/Im0076_ORIGA.jpg,0
1,Im0107_ORIGA.jpg,normal,ORIGA_NORMAL/Im0107_ORIGA.jpg,0
2,Im0169_ORIGA.jpg,normal,ORIGA_NORMAL/Im0169_ORIGA.jpg,0
3,Im0266_ORIGA.jpg,normal,ORIGA_NORMAL/Im0266_ORIGA.jpg,0
4,NL_289.png,normal,kaggle_ds/1_normal/NL_289.png,0
...,...,...,...,...
195,Glaucoma_037.png,glaucoma,kaggle_ds/2_glaucoma/Glaucoma_037.png,1
196,Glaucoma_060.png,glaucoma,kaggle_ds/2_glaucoma/Glaucoma_060.png,1
197,Im0561_g_ORIGA.jpg,glaucoma,ORIGA_GLAUCOMA/Im0561_g_ORIGA.jpg,1
198,Im361.jpg,glaucoma,RIMONE_r2/Glaucoma and glaucoma suspicious/Im3...,1


In [62]:
remain_df = df[~df['path'].isin(test_df['path'])]
val_df = pd.DataFrame(columns=df.columns)

for i in range(2):
    temp_df = remain_df[remain_df['label_encoded'] == i]
    temp_df = temp_df.sample(VAL_SIZE, random_state=0)
    val_df = val_df.append(temp_df, ignore_index=True)

val_df

Unnamed: 0,filename,label,path,label_encoded
0,Im0207_ORIGA.jpg,normal,ORIGA_NORMAL/Im0207_ORIGA.jpg,0
1,Im139.jpg,normal,RIMONE_r2/Normal/Im139.jpg,0
2,NL_192.png,normal,kaggle_ds/1_normal/NL_192.png,0
3,Im0415_ORIGA.jpg,normal,ORIGA_NORMAL/Im0415_ORIGA.jpg,0
4,Im104.bmp,normal,RIMONE_r1/Normal/Im104.bmp,0
...,...,...,...,...
195,Im283.jpg,glaucoma,RIMONE_r2/Glaucoma and glaucoma suspicious/Im2...,1
196,Im330.jpg,glaucoma,RIMONE_r2/Glaucoma and glaucoma suspicious/Im3...,1
197,G-12-L.jpg,glaucoma,RIMONE_r3_CROPPED/G-12-L.jpg,1
198,drishtiGS_029.png,glaucoma,Drishti/drishtiGS_029.png,1


In [63]:
train_df = remain_df[~remain_df['path'].isin(val_df['path'])]
train_df

Unnamed: 0,filename,label,path,label_encoded
0,03_g.jpg,glaucoma,HRF_GLAUCOMA/03_g.jpg,1
1,05_g.jpg,glaucoma,HRF_GLAUCOMA/05_g.jpg,1
2,14_g.jpg,glaucoma,HRF_GLAUCOMA/14_g.jpg,1
3,10_g.jpg,glaucoma,HRF_GLAUCOMA/10_g.jpg,1
4,15_g.jpg,glaucoma,HRF_GLAUCOMA/15_g.jpg,1
...,...,...,...,...
2344,N-13-L.jpg,normal,RIMONE_r3_CROPPED/N-13-L.jpg,0
2345,N-21-L.jpg,normal,RIMONE_r3_CROPPED/N-21-L.jpg,0
2348,N-74-L.jpg,normal,RIMONE_r3_CROPPED/N-74-L.jpg,0
2349,N-43-L.jpg,normal,RIMONE_r3_CROPPED/N-43-L.jpg,0


In [64]:
print("Test set: ", len(test_df))
print("Validation set: ", len(val_df))
print("Train set: ", len(train_df))

Test set:  200
Validation set:  200
Train set:  1859


In [65]:
test_df.to_csv('glaucoma_test.csv', index=False)
val_df.to_csv('glaucoma_val.csv', index=False)
train_df.to_csv('glaucoma_train.csv', index=False)