In [165]:
import pandas as pd
import numpy as np

In [166]:
BASEPATH = "../../data/full_df.csv"
df= pd.read_csv(BASEPATH)

In [167]:
#renaming relou features
df = df.rename(columns = {'Left-Diagnostic Keywords' : 'Ldiag', 'Right-Diagnostic Keywords': 'Rdiag'})

In [168]:
df.columns

Index(['ID', 'Patient Age', 'Patient Sex', 'Left-Fundus', 'Right-Fundus',
       'Ldiag', 'Rdiag', 'N', 'D', 'G', 'C', 'A', 'H', 'M', 'O', 'filepath',
       'labels', 'target', 'filename'],
      dtype='object')

### Removing bad photos -> ['optic disk photographically invisible', 'lens dust']

In [169]:
keywords= ['optic disk photographically invisible', 'lens dust']

df = df[df['Ldiag'].map(lambda x : all(kw not in x for kw in keywords))]

### Create dico desease and check

In [170]:
#                             [N, D, G, C, A, H, M, O]
dico_ds = {
    'normal'      : pd.Series([1, 0, 0, 0, 0, 0, 0, 0]),
    'retinopathy' : pd.Series([0, 1, 0, 0, 0, 0, 0, 0]),
    'glaucoma'    : pd.Series([0, 0, 1, 0, 0, 0, 0, 0]),
    'cataract'    : pd.Series([0, 0, 0, 1, 0, 0, 0, 0]),
    'age'         : pd.Series([0, 0, 0, 0, 1, 0, 0, 0]),
    'hypertensive': pd.Series([0, 0, 0, 0, 0, 1, 0, 0]),
    'myopi'       : pd.Series([0, 0, 0, 0, 0, 0, 1, 0])
}

In [171]:
def check_diag(word, feat):
    tmp_df = df[df[feat]==1][['Ldiag', 'Rdiag']]
    mask = tmp_df.apply(lambda x : word in x[0] or word in x[1] , axis = 1)
    print(f'On a loupé {len(tmp_df[-mask])} observations pour {word}')
    

In [172]:
check_diag('normal', 'N')
check_diag('retinopathy', 'D')
check_diag('glaucoma', 'G')
check_diag('cataract', 'C')
check_diag('age', 'A')
check_diag('myopi', 'M')

On a loupé 0 observations pour normal
On a loupé 0 observations pour retinopathy
On a loupé 0 observations pour glaucoma
On a loupé 0 observations pour cataract
On a loupé 0 observations pour age
On a loupé 0 observations pour myopi


### create df unique left and right eye

In [173]:
def create_target(diag, dico = dico_ds):
    res = np.zeros(8, dtype=int)
    for kw, serie in dico.items():
        if kw in diag:
            res = res + serie
    if res.sum() == 0:
        return [0, 0, 0, 0, 0, 0, 0, 1]
    else:
        return list(res)

df_right = df[['Rdiag', 'filename','target','Patient Age', 'Patient Sex']].rename(columns ={'Rdiag' : 'Diagnostic'})
df_left = df[['Ldiag', 'filename','target','Patient Age', 'Patient Sex']].rename(columns ={'Ldiag' : 'Diagnostic'})
df_all = pd.concat([df_right, df_left]).rename(columns ={'target' : 'target_init'})
df_all['Target']=df_all.Diagnostic.map(create_target)
df_all

Unnamed: 0,Diagnostic,filename,target_init,Patient Age,Patient Sex,Target
0,normal fundus,0_right.jpg,"[1, 0, 0, 0, 0, 0, 0, 0]",69,Female,"[1, 0, 0, 0, 0, 0, 0, 0]"
1,normal fundus,1_right.jpg,"[1, 0, 0, 0, 0, 0, 0, 0]",57,Male,"[1, 0, 0, 0, 0, 0, 0, 0]"
2,moderate non proliferative retinopathy,2_right.jpg,"[0, 1, 0, 0, 0, 0, 0, 0]",42,Male,"[0, 1, 0, 0, 0, 0, 0, 0]"
3,mild nonproliferative retinopathy,4_right.jpg,"[0, 1, 0, 0, 0, 0, 0, 0]",53,Male,"[0, 1, 0, 0, 0, 0, 0, 0]"
4,moderate non proliferative retinopathy,5_right.jpg,"[0, 1, 0, 0, 0, 0, 0, 0]",50,Female,"[0, 1, 0, 0, 0, 0, 0, 0]"
...,...,...,...,...,...,...
6387,severe nonproliferative retinopathy,4686_left.jpg,"[0, 1, 0, 0, 0, 0, 0, 0]",63,Male,"[0, 1, 0, 0, 0, 0, 0, 0]"
6388,moderate non proliferative retinopathy,4688_left.jpg,"[0, 1, 0, 0, 0, 0, 0, 0]",42,Male,"[0, 1, 0, 0, 0, 0, 0, 0]"
6389,mild nonproliferative retinopathy,4689_left.jpg,"[0, 1, 0, 0, 0, 0, 0, 0]",54,Male,"[0, 1, 0, 0, 0, 0, 0, 0]"
6390,mild nonproliferative retinopathy,4690_left.jpg,"[0, 1, 0, 0, 0, 0, 0, 0]",57,Male,"[0, 1, 0, 0, 0, 0, 0, 0]"


In [None]:
# #                             [N, D, G, C, A, H, M, O]
#     'normal'      : pd.Series([1, 0, 0, 0, 0, 0, 0, 0]),
#     'retinopathy' : pd.Series([0, 1, 0, 0, 0, 0, 0, 0]),
#     'glaucoma'    : pd.Series([0, 0, 1, 0, 0, 0, 0, 0]),
#     'cataract'    : pd.Series([0, 0, 0, 1, 0, 0, 0, 0]),
#     'age'         : pd.Series([0, 0, 0, 0, 1, 0, 0, 0]),
#     'hypertensive': pd.Series([0, 0, 0, 0, 0, 1, 0, 0]),
#     'myopi'       : pd.Series([0, 0, 0, 0, 0, 0, 1, 0])

In [174]:
pd.set_option('display.max_colwidth', None)
df_all[df_all.Target.map(lambda x : sum(x)>2)].head()

Unnamed: 0,Diagnostic,filename,target_init,Patient Age,Patient Sex,Target
276,wet age-related macular degeneration，hypertensive retinopathy,315_right.jpg,"[0, 0, 0, 0, 1, 0, 0, 0]",57,Female,"[0, 1, 0, 0, 1, 1, 0, 0]"
301,hypertensive retinopathy，cataract,345_right.jpg,"[0, 0, 0, 0, 0, 1, 0, 0]",64,Female,"[0, 1, 0, 1, 0, 1, 0, 0]"
319,hypertensive retinopathy，suspected glaucoma,365_right.jpg,"[0, 0, 0, 0, 0, 1, 0, 0]",60,Female,"[0, 1, 1, 0, 0, 1, 0, 0]"
856,dry age-related macular degeneration，hypertensive retinopathy,998_right.jpg,"[0, 0, 0, 0, 1, 0, 0, 0]",36,Male,"[0, 1, 0, 0, 1, 1, 0, 0]"
990,glaucoma，myopia retinopathy,1213_right.jpg,"[0, 0, 1, 0, 0, 0, 0, 0]",76,Male,"[0, 1, 1, 0, 0, 0, 1, 0]"


In [157]:
df_all[df_all.Target.apply(lambda x : x[-1]==1)].Diagnostic.unique()

array(['vitreous degeneration', 'macular epiretinal membrane',
       'myelinated nerve fibers', 'drusen', 'epiretinal membrane',
       'epiretinal membrane over the macula', 'drusen，atrophic change',
       'refractive media opacity', 'maculopathy',
       'myelinated nerve fibers，lens dust', 'drusen，drusen',
       'branch retinal vein occlusion',
       'myelinated nerve fibers，macular epiretinal membrane',
       'macular epiretinal membrane，post laser photocoagulation',
       'drusen，lens dust', 'drusen，epiretinal membrane',
       'drusen，vitreous degeneration',
       'epiretinal membrane，vessel tortuosity',
       'old branch retinal vein occlusion',
       'macular epiretinal membrane，drusen',
       'macular epiretinal membrane，atrophic change',
       'epiretinal membrane，white vessel',
       'myelinated nerve fibers，drusen',
       'myelinated nerve fibers，old branch retinal vein occlusion',
       'branch retinal artery occlusion',
       'depigmentation of the retinal 

In [175]:
list_names =['N', 'D', 'G', 'C', 'A', 'H', 'M', 'O']

def list2str(l, list_names = list_names):
    res = ''
    for i, it in enumerate(l):
        if it :
            res=res+list_names[i]
    return res

df_all['tarstr'] = df_all.Target.map(list2str)

In [176]:
df_all.head(10)

Unnamed: 0,Diagnostic,filename,target_init,Patient Age,Patient Sex,Target,tarstr
0,normal fundus,0_right.jpg,"[1, 0, 0, 0, 0, 0, 0, 0]",69,Female,"[1, 0, 0, 0, 0, 0, 0, 0]",N
1,normal fundus,1_right.jpg,"[1, 0, 0, 0, 0, 0, 0, 0]",57,Male,"[1, 0, 0, 0, 0, 0, 0, 0]",N
2,moderate non proliferative retinopathy,2_right.jpg,"[0, 1, 0, 0, 0, 0, 0, 0]",42,Male,"[0, 1, 0, 0, 0, 0, 0, 0]",D
3,mild nonproliferative retinopathy,4_right.jpg,"[0, 1, 0, 0, 0, 0, 0, 0]",53,Male,"[0, 1, 0, 0, 0, 0, 0, 0]",D
4,moderate non proliferative retinopathy,5_right.jpg,"[0, 1, 0, 0, 0, 0, 0, 0]",50,Female,"[0, 1, 0, 0, 0, 0, 0, 0]",D
5,moderate non proliferative retinopathy，epiretinal membrane,6_right.jpg,"[0, 1, 0, 0, 0, 0, 0, 0]",60,Male,"[0, 1, 0, 0, 0, 0, 0, 0]",D
6,mild nonproliferative retinopathy,7_right.jpg,"[0, 1, 0, 0, 0, 0, 0, 0]",60,Female,"[0, 1, 0, 0, 0, 0, 0, 0]",D
7,normal fundus,8_right.jpg,"[1, 0, 0, 0, 0, 0, 0, 0]",59,Male,"[1, 0, 0, 0, 0, 0, 0, 0]",N
8,vitreous degeneration,9_right.jpg,"[0, 0, 0, 0, 0, 0, 0, 1]",54,Male,"[0, 0, 0, 0, 0, 0, 0, 1]",O
9,normal fundus,10_right.jpg,"[1, 0, 0, 0, 0, 0, 0, 0]",70,Male,"[1, 0, 0, 0, 0, 0, 0, 0]",N


In [177]:
df_all.to_csv('full_df_cleaned.csv',index = False)