In [1]:
from os import path, environ
from itertools import filterfalse, repeat, chain
from functools import partial
from operator import itemgetter, contains
from collections import namedtuple
from six import string_types
import urllib.parse

from IPython.display import Image, display, HTML

import numpy as np
import pandas as pd

df = pd \
       .read_excel('/'.join(('file://localhost',
                             path.expanduser('~').replace(path.sep, '/'),
                             'OneDrive - The University of Sydney (Students)',
                             'DR SPOC - Graders 1 and 2.xlsx')),
                   skiprows=1, header=[0,1], index_col=[0])

display(HTML('<h2>Filtered DR SPOC - Graders 1 and 2</h2>'))
df

Unnamed: 0_level_0,R1 (Right macula-centred image),R1 (Right macula-centred image),R1 (Right macula-centred image),R2 (Right optic-disc centred image),R2 (Right optic-disc centred image),R2 (Right optic-disc centred image),L1 (Left macula-centred image),L1 (Left macula-centred image),L1 (Left macula-centred image),L2 (Left optic-disc centred image),L2 (Left optic-disc centred image),L2 (Left optic-disc centred image),Overall Finding
Folder Name,Overall quality of the photographs taken,ETDRS Grading,Maculopathy,Overall quality of the photographs taken,ETDRS Grading,Maculopathy,Overall quality of the photographs taken,ETDRS Grading,Maculopathy,Overall quality of the photographs taken,ETDRS Grading,Maculopathy,Overall Finding
27,2.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,3
50,1.0,2.0,0.0,2.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,4.0,2
73,5.0,8.0,5.0,,,,5.0,8.0,5.0,,,,4
118,5.0,8.0,5.0,5.0,8.0,5.0,5.0,8.0,5.0,5.0,8.0,5.0,4
145,2.0,0.0,0.0,3.0,0.0,0.0,2.0,0.0,0.0,1.0,1.0,4.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9800,3.0,0.0,5.0,5.0,8.0,5.0,5.0,8.0,5.0,5.0,8.0,5.0,4
9819,5.0,8.0,5.0,5.0,8.0,5.0,5.0,8.0,5.0,5.0,8.0,5.0,4
9897,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,3
9938,2.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,4.0,2


In [4]:
def chain_unique(*args):
    seen = set()
    yield from (v for v in chain(*args)
                if v not in seen and not seen.add(v))

display(HTML('<h2>Columns</h2>'))

display(HTML(
    '<ul>\n{}\n</ul>'.format('\n'.join(
        '  <li>"{}"</li>'.format(col)
        for col in chain_unique(map(itemgetter(1), df.axes[1]))
))))

In [5]:
manycat2threecat = {
    'Maculopathy': (
        'non-referable',    # [0] No diabetic maculopathy
        'referable',        # [1] HEx distant from the fovea
        'referable',        # [2] HEx approaching the fovea
        'referable',        # [3] HEx involving the fovea
        'referable',        # [4] Maculopathy, unspecified
        'No gradable image' # [5] No gradable image
    ),
    'ETDRS Grading': (        
        'non-referable',    # [0] No DR
        'referable',        # [1] Mild non-proliferative (mild pre-proliferative)
        'referable',        # [2] Moderate non-proliferative/ moderate pre-proliferative
        'referable',        # [3] Severe non-proliferative/ severe pre-proliferative
        'referable',        # [4] Proliferative retinopathy
        'referable',        # [5] Pre-retinal fibrosis+/- tractional retinal detachment
        'referable',        # [6] Treated proliferative retinopathy, Unstable
        'referable',        # [7] Treated proliferative retinopathy, Stable
        'No gradable image' # [8] No gradable image
    ),
    'Overall Findings': (
        np.nan,
        'referable',        # [1] Vision-threatening retinopathy
        'referable',        # [2] Non-proliferative diabetic retinopathy
        'non-referable',    # [3] No DR
        'No gradable image' # [4] Ungradable
    ),
    'Overall Quality of the Photographs Taken': (
        np.nan,
        'No gradable image', # [1] Inadequate for any diagnostic purpose
        'No gradable image', # [2] Unable to exclude emergent findings
        'No gradable image', # [3] Only able to exclude emergent findings
        'No gradable image', # [4] Not ideal but still able to exclude subtle findings
        'referable',         # [5] Ideal quality
    )
}

axes = filter(lambda c: c[:2] in frozenset(('R1', 'R2', 'L1', 'L2')),
              map(itemgetter(0), df.axes[1]))
columns = filterfalse(
    partial(contains,
            frozenset(
                ('Overall quality of the photographs taken',
                 'Overall Finding'))),
    chain_unique(map(itemgetter(1), df.axes[1])))

def to_manycat_name(o):
    if isinstance(o, string_types):
        o = o,
    
    for e in o[::-1]:
        lower_e = e.lower()
        if lower_e == 'overall quality of the photographs taken':
            return 'Overall Quality of the Photographs Taken'
        elif e.startswith('ETDRS') or e == 'Overall Findings':
            return e
        elif 'macul' in lower_e:
            # print('matched with: {!r}'.format(e))
            return 'Maculopathy'
        elif e.startswith('Overall Finding'):
            return 'Overall Findings'
        else:
            print('no match found for: {!r}'.format(e))
    
    raise TypeError('{!r} no key found for'.format(o))

def grad_mac2(series):
    def from_s(value):
        if pd.isnull(value) or isinstance(value, string_types):
            return value
        value = np.ushort(value)
        name = series.name if series.name in manycat2threecat else to_manycat_name(series.name)

        mapped = manycat2threecat.get(name)

        return value if mapped is None or len(mapped) < value else mapped[value]

    return series if series is None else series.apply(from_s)

df = df.transform(grad_mac2)
df

Unnamed: 0_level_0,R1 (Right macula-centred image),R1 (Right macula-centred image),R1 (Right macula-centred image),R2 (Right optic-disc centred image),R2 (Right optic-disc centred image),R2 (Right optic-disc centred image),L1 (Left macula-centred image),L1 (Left macula-centred image),L1 (Left macula-centred image),L2 (Left optic-disc centred image),L2 (Left optic-disc centred image),L2 (Left optic-disc centred image),Overall Finding
Folder Name,Overall quality of the photographs taken,ETDRS Grading,Maculopathy,Overall quality of the photographs taken,ETDRS Grading,Maculopathy,Overall quality of the photographs taken,ETDRS Grading,Maculopathy,Overall quality of the photographs taken,ETDRS Grading,Maculopathy,Overall Finding
27,No gradable image,non-referable,non-referable,No gradable image,non-referable,non-referable,No gradable image,non-referable,non-referable,No gradable image,non-referable,non-referable,non-referable
50,No gradable image,referable,non-referable,No gradable image,referable,non-referable,No gradable image,non-referable,non-referable,No gradable image,referable,referable,referable
73,referable,No gradable image,No gradable image,,,,referable,No gradable image,No gradable image,,,,No gradable image
118,referable,No gradable image,No gradable image,referable,No gradable image,No gradable image,referable,No gradable image,No gradable image,referable,No gradable image,No gradable image,No gradable image
145,No gradable image,non-referable,non-referable,No gradable image,non-referable,non-referable,No gradable image,non-referable,non-referable,No gradable image,referable,referable,referable
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9800,No gradable image,non-referable,No gradable image,referable,No gradable image,No gradable image,referable,No gradable image,No gradable image,referable,No gradable image,No gradable image,No gradable image
9819,referable,No gradable image,No gradable image,referable,No gradable image,No gradable image,referable,No gradable image,No gradable image,referable,No gradable image,No gradable image,No gradable image
9897,No gradable image,non-referable,non-referable,No gradable image,non-referable,non-referable,No gradable image,non-referable,non-referable,No gradable image,non-referable,non-referable,non-referable
9938,No gradable image,referable,referable,No gradable image,referable,referable,No gradable image,referable,referable,No gradable image,referable,referable,referable


In [9]:
df.axes

[Int64Index([  27,   50,   73,  118,  145,  166,  197,  198,  202,  212,
             ...
             9644, 9648, 9689, 9716, 9784, 9800, 9819, 9897, 9938, 9998],
            dtype='int64', length=406),
 MultiIndex([(    'R1 (Right macula-centred image)', ...),
             (    'R1 (Right macula-centred image)', ...),
             (    'R1 (Right macula-centred image)', ...),
             ('R2 (Right optic-disc centred image)', ...),
             ('R2 (Right optic-disc centred image)', ...),
             ('R2 (Right optic-disc centred image)', ...),
             (     'L1 (Left macula-centred image)', ...),
             (     'L1 (Left macula-centred image)', ...),
             (     'L1 (Left macula-centred image)', ...),
             ( 'L2 (Left optic-disc centred image)', ...),
             ( 'L2 (Left optic-disc centred image)', ...),
             ( 'L2 (Left optic-disc centred image)', ...),
             (                    'Overall Finding', ...)],
            names=[None, 'Fo

In [10]:
display(HTML('<h2>Disc-centred photo counts</h2>'))
df[filter(lambda column: 'disc-centred photo' in column,
          df.columns)] \
  .apply(pd.value_counts)

27
50
73
118
145
...
9800
9819
9897
9938
9998


- no DR & no diabetic maculopathy would classify as 'healthy'
- no gradable image would classify as 'ungradable' and 
- everything else would be 'DR'

In [None]:
LocationPid = namedtuple('LocationPid', ('location', 'pid', 'position'))

def parseFname(fname):
    bname = path.splitext(path.basename(fname))[0]
    buffer, location, pid, last_char = '', '', '', ''
    i = 0
    while i < len(bname):
        location = last_char
        if location + bname[i] in frozenset(('R1', 'R2', 'L1', 'L2')):
            pid = pid[:-1]
            break
        pid += bname[i]
        last_char = bname[i]
        i += 1
    return LocationPid(location=location, pid=pid, position='macula' if location[0] == 'R' else 'disc')

# R1 = Right macula     centred photo
# R2 = Right optic disc centred photo
# L1 = Left  macula     centred photo
# L2 = Left optic disc  centred photo

parseFname('DR SPOC Photo Dataset/6146/Upload/WA112325R2-8.jpg')

In [None]:
def sql_gen(fname, diagnosis):
    location, pid = parseFname(fname)
    prefix = 'fundus_images/'
    quoted_location = urllib.parse.quote(
        '{prefix}DR SPOC Photo Dataset/6146/Upload/WA112325R2-4.jpg'.format(prefix=prefix),
        safe=''
    )
    return '\n'.join(map(lambda l: l.lstrip(' '), '''
        BEGIN TRANSACTION;

        INSERT INTO artifact_tbl (location, "contentType")
        VALUES ('{quoted_location}', 'image/jpeg');

        INSERT INTO categorise_tbl ("artifactLocation", "categoryEnumName", category, username)
        VALUES ('{quoted_location}',
                \t'Simple-categories', '{diagnosis}', 'spreadsheet');

        END TRANSACTION;
        '''.format(quoted_location=quoted_location, diagnosis=diagnosis).split('\n')))

print(sql_gen(#(
    'DR SPOC Photo Dataset/6146/Upload/WA112325R2-8.jpg', 'REFERABLE')
    #('DR SPOC Photo Dataset/6146/Upload/WA112325R2-9.jpg', 'NON-REFERABLE'),
    #('DR SPOC Photo Dataset/6146/Upload/WA112325R2-1.jpg', 'UNGRADABLE')
#))
)

In [None]:
import psycopg2

result = urllib.parse.urlparse(environ['RDBMS_URI'])

with psycopg2.connect(
    database = result.path[1:],
    user = result.username,
    password = result.password,
    host = result.hostname
) as conn:
    with conn.cursor() as curs:
        curs.execute('SELECT 5*5')
        print(curs.fetchone())