In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd

import matplotlib.pyplot as plt
import numpy as np

import isic
from isic import printenc, println, printdash

# Fixed seed constant. It is not referenced by any cell visible in this section —
# presumably consumed by later sampling/splitting code; TODO confirm.
SEED = 6

In [2]:
# Load the two dataset handles and their processed metadata tables.
# The ds* objects are used below for find_missing(); the md* objects are used
# like pandas DataFrames (columns, iloc, boolean indexing, value_counts).
ds24 = isic.get_2024() # ISIC 2024 challenge dataset handle
md24 = ds24.get_proc_metadata() # processed metadata of the 2024 set

ds_comp = isic.get_complete() # "complete" ISIC archive dataset handle
md_comp = ds_comp.get_proc_metadata() # processed metadata of the archive

In [3]:
# Integrity check kept in a cell of its own: looking for missing entries in both
# datasets is slow, so it should not be re-run together with the cheap cells.
missing_24 = ds24.find_missing()
missing_comp = ds_comp.find_missing()

# A dataset passes when nothing is reported missing.
printenc(
    "Check dataset integrity",
    "isic_24", len(missing_24) == 0,
    "isic archive (complete)", len(missing_comp) == 0,
)

# getting rid of mixed type
# print(md24.iloc[:, 51])
# print(md24.iloc[:, 51].value_counts(dropna = False))

100%|███████████████████████████████████████████████| 401059/401059 [00:25<00:00, 15894.44it/s]
100%|█████████████████████████████████████████████████| 81722/81722 [00:01<00:00, 54225.61it/s]

--------------------
Check dataset integrity
isic_24
True
isic archive (complete)
True
--------------------





In [4]:
printenc("Compare columns in datasets")

# Column-name sets of the two metadata tables.
col_24 = set(md24.columns)
col_comp = set(md_comp.columns)

# Split the names into 2024-only, archive-only and common groups.
col_unique_24 = col_24.difference(col_comp)
col_unique_comp = col_comp.difference(col_24)
col_shared = col_comp.intersection(col_24)

# Report every group in alphabetical order.
printenc("Columns unique to 2024 set", sorted(col_unique_24))
printenc("Columns unique to isic archive", sorted(col_unique_comp))
printenc("Shared columns", sorted(col_shared))

--------------------
Compare columns in datasets
--------------------
--------------------
Columns unique to 2024 set
['iddx_1', 'iddx_2', 'iddx_3', 'iddx_4', 'iddx_5', 'iddx_full', 'target', 'tbp_lv_A', 'tbp_lv_Aext', 'tbp_lv_B', 'tbp_lv_Bext', 'tbp_lv_C', 'tbp_lv_Cext', 'tbp_lv_H', 'tbp_lv_Hext', 'tbp_lv_L', 'tbp_lv_Lext', 'tbp_lv_areaMM2', 'tbp_lv_area_perim_ratio', 'tbp_lv_color_std_mean', 'tbp_lv_deltaA', 'tbp_lv_deltaB', 'tbp_lv_deltaL', 'tbp_lv_deltaLB', 'tbp_lv_deltaLBnorm', 'tbp_lv_dnn_lesion_confidence', 'tbp_lv_eccentricity', 'tbp_lv_location', 'tbp_lv_location_simple', 'tbp_lv_minorAxisMM', 'tbp_lv_nevi_confidence', 'tbp_lv_norm_border', 'tbp_lv_norm_color', 'tbp_lv_perimeterMM', 'tbp_lv_radial_color_std_max', 'tbp_lv_stdL', 'tbp_lv_stdLExt', 'tbp_lv_symm_2axis', 'tbp_lv_symm_2axis_angle', 'tbp_lv_x', 'tbp_lv_y', 'tbp_lv_z', 'tbp_tile_type']
--------------------
--------------------
Columns unique to isic archive
['acquisition_day', 'benign_malignant', 'concomitant_biopsy',

In [5]:
# Observations on the columns that exist only in the 2024 metadata, printed via
# printenc so the notes stay in the cell output next to the statistics.
# The third heading names the *train* set: the paragraph under it discusses
# columns (e.g. iddx_*) that exist in the train set only and are absent at test time.
printenc(
"""\
Study columns unique to isic_24

https://www.kaggle.com/competitions/isic-2024-challenge/data

OBSERVATIONS

- iddx columns

These are unique to the input and it must be noted that except iddx_1 (=benign_malignant)
all the others take value NaN except for malignant samples. Unlikely to be very useful.
Also all those other tags have so few samples I doubt they can be generalisable (iddx_full)

IDDX_1 WILL BE MAPPED TO BENIGN_MALIGNANT. FIRST LETTER DOWNCASED.

IDDX_3 ROUGHLY MAPS TO DIAGNOSIS BUT HAS SIGHTLY DIFFERENT VALUES

- columns only present in train set

iddx columns for example are only present in the train set (because they give diagnosis!)
Only way I can think of using them is have the model predict them, but the sample size...

- target columns

Interestingly those samples that are labelled indeterminate are sorted into target 0 (benign)

COMPLETE SET INDETERMINATE SAMPLES WILL ALSO BE PROCESSED THUS, EXCEPT INDETERMINATE/MALIGNANT

- anatom_site_general

This column is shared between 2024 and complete sets. There are surprisingly some NA values for
TBP, but this cannot be resolved as one see that in the [tbp_lv_location] column there are equal
number of locations that are simply "unknown".

- Other columns

There are indeed many numerical metrics which are potentially be of some use:
a quick visial inspection suggests there are NO NAN'S IN MANY OF THESE COLUMNS.

However these are UNIQUE TO THE 2024 set and are products of the TBP procedure,
and so again suffers from the severe lack of positive samples. I would naively think
that a model would either learn to just ignore these parameters or somewhat overfit.\
""")

# One statistics block per 2024-only column, visited alphabetically.
for col in sorted(col_unique_24):
    col_stats = isic.dtype_based_stats(md24, col)
    printenc(f"Statistics for column [{col}]", col_stats)

--------------------
Study columns unique to isic_24

https://www.kaggle.com/competitions/isic-2024-challenge/data

OBSERVATIONS

- iddx columns

These are unique to the input and it must be noted that except iddx_1 (=benign_malignant)
all the others take value NaN except for malignant samples. Unlikely to be very useful.
Also all those other tags have so few samples I doubt they can be generalisable (iddx_full)

IDDX_1 WILL BE MAPPED TO BENIGN_MALIGNANT. FIRST LETTER DOWNCASED.

IDDX_3 ROUGHLY MAPS TO DIAGNOSIS BUT HAS SIGHTLY DIFFERENT VALUES

- columns only present in test set

iddx columns for example are only present in the train set (because they give diagnosis!)
Only way I can think of using them is have the model predict them, but the sample size...

- target columns

Interestingly those samples that are labelled indeterminate are sorted into target 0 (benign)

COMPLETE SET INDETERMINATE SAMPLES WILL ALSO BE PROCESSED THUS, EXCEPT INDETERMINATE/MALIGNANT

- anatom_site_general


In [6]:
# Notes for the iddx_3 / diagnosis comparison, printed via printenc.
# Only the very last line ends with a backslash (to suppress the final newline).
# A backslash at the end of an interior line is a line continuation that swallows
# the newline and thereby removes the blank paragraph separator that follows it.
printenc(
"""\
Compare iddx_3 with diagnosis column

Overlap in some major categories while others are harder to reconcile.

One can see some there seem to mean the same thing. It would also be able to
collapse down from subcategories (eg squamous cell carcinoma, * -> squamous cell carcinoma)

IDDX_3 WILL BE MAPPED TO DIAGNOSIS WITH ISIC.MAP_IDDX3_DIAG()
NEW COLUMN DIAG_INFERRED WILL BE PRESENT TO INDICATE THIS
TODO: PLEASE REVIEW MAPPING!\
""")

def lower_first_let(s):
    """Return `s` with only its first character lower-cased.

    Everything after the first character is left untouched, so embedded
    capitals (e.g. "NOS") survive. Slicing with `s[:1]` instead of indexing
    `s[0]` lets the empty string map to itself rather than raise IndexError.

    Args:
        s: the string to convert.

    Returns:
        A new string whose first character is lower-case.
    """
    return s[:1].lower() + s[1:]

# Lower-cased iddx_3 labels of the 2024 set. The isinstance filter drops NaN and
# any other non-string entry (and, unlike `type(s) is str`, keeps str subclasses).
iddx_3_keys = {lower_first_let(s)
               for s in md24['iddx_3'].unique() if isinstance(s, str)}
# dropna() removes missing diagnoses reliably. Subtracting {np.nan} from a set only
# works when the stored NaN happens to be the very same object as np.nan, and a
# surviving NaN would make the sorted() calls below fail on a float/str comparison.
diag_keys = set(md_comp['diagnosis'].dropna().unique())

def compare_diags(iddx_keys=None, diagnosis_keys=None):
    """Print the labels unique to iddx_3, unique to diagnosis, and shared by both.

    Args:
        iddx_keys: set of iddx_3 labels to compare. Defaults to the current
            value of the global `iddx_3_keys`, so `compare_diags()` still works.
        diagnosis_keys: set of diagnosis labels to compare against. Defaults to
            the global `diag_keys`.

    Returns:
        None. The three sorted lists are written out with println.
    """
    if iddx_keys is None:
        iddx_keys = iddx_3_keys
    if diagnosis_keys is None:
        diagnosis_keys = diag_keys

    iddx_3_unique = sorted(iddx_keys - diagnosis_keys)
    diag_unique = sorted(diagnosis_keys - iddx_keys)
    diag_shared = sorted(diagnosis_keys & iddx_keys)

    println("iddx_3 unique", iddx_3_unique, sep = '\n')
    println("diagnosis unique", diag_unique, sep = '\n')
    println("shared keys", diag_shared, sep = '\n')

printenc("Compare lowercased iddx_3 with diagnosis")
# Pass the sets explicitly so the report does not depend on which value the
# global happens to hold at call time.
compare_diags(iddx_3_keys, diag_keys)

# with this information we go on to produce a mapping.
# the function resides in isic.py for reuse later in processing.
iddx_3_keys = {isic.map_iddx3_diag(key) for key in iddx_3_keys}

printenc("Map iddx_3 keys to diagnosis keys")
compare_diags(iddx_3_keys, diag_keys)

--------------------
Compare iddx_3 with diagnosis column

Overlap in some major categories while others are harder to reconcile.

One can see some there seem to mean the same thing. It would also be able to
collapse down from subcategories (eg squamous cell carcinoma, * -> squamous cell carcinoma)
IDDX_3 WILL BE MAPPED TO DIAGNOSIS WITH ISIC.MAP_IDDX3_DIAG()
NEW COLUMN DIAG_INFERRED WILL BE PRESENT TO INDICATE THIS
TODO: PLEASE REVIEW MAPPING!
--------------------
--------------------
Compare lowercased iddx_3 with diagnosis
--------------------
iddx_3 unique
['angiofibroma', 'atypical intraepithelial melanocytic proliferation', 'atypical melanocytic neoplasm', 'fibroepithelial polyp', 'hemangioma', 'hidradenoma', 'lichen planus like keratosis', 'melanoma Invasive', 'melanoma in situ', 'melanoma, NOS', 'solar or actinic keratosis', 'squamous cell carcinoma in situ', 'squamous cell carcinoma, Invasive', 'squamous cell carcinoma, NOS', 'trichilemmal or isthmic-catagen or pilar cyst']

d

In [7]:
# Notes on the columns present in both metadata tables, followed by one
# statistics block per shared column.
printenc("""\
Study statistics shared columns between 2024 and archive.

Values are checked to make sure the naming of the labels are CONSISTENT between 2024 and complete sets.

Now in this list there are a few PARAMETERS THAT ARE MORE USEFUL since they benefit
from the much larger number of positive cases in the isic archive set.
Although one will have to consider how to deal with NAN VALUES in eg sex.

There are also a few parameters that have mostly nan values (see end of each block!)

A ROUGH SUGGESTION: age_approx, anatom_site_general, clin_size_long_diam_mm, sex\
""")

for col in sorted(col_shared):
    printdash()
    println(f"Statistics for column [{col}]")
    # Same layout for both sources; the 2024 statistics go through println and the
    # archive statistics through print, exactly as in the captured output.
    for label, metadata, emit in (("[2024]", md24, println),
                                  ("[complete]", md_comp, print)):
        print(label)
        emit(isic.dtype_based_stats(metadata, col))
    printdash()

--------------------
Study statistics shared columns between 2024 and archive.

Values are checked to make sure the naming of the labels are CONSISTENT between 2024 and complete sets.

Now in this list there are a few PARAMETERS THAT ARE MORE USEFUL since they benefit
from the much larger number of positive cases in the isic archive set.
Although one will have to consider how to deal with NAN VALUES in eg sex.

There are also a few parameters that have mostly nan values (see end of each block!)

A ROUGH SUGGESTION: age_approx, anatom_site_general, clin_size_long_diam_mm, sex
--------------------
--------------------
Statistics for column [PROC_source]

[2024]
PROC_source
2024    401059
Name: count, dtype: int64

[complete]
PROC_source
complete    81722
Name: count, dtype: int64
--------------------
--------------------
Statistics for column [PROC_use]

[2024]
PROC_use
training    401059
Name: count, dtype: int64

[complete]
PROC_use
    81722
Name: count, dtype: int64
-----------------

In [8]:
printenc(
"""\
Study the named diagnoses in iddx_3

By looking at their statistics we confirm our previous guesses for
assignment to benign_malignant in the benign_malignant = nan cases in
the complete set.\
"""
)

# there can be statistics in nan diagnosis but we cannot make any logical deductions
# from a null-diagnosis, even if we have the statistics.
# dropna() is used instead of subtracting {np.nan} from the set: the subtraction only
# removes NaN when it is the same object as np.nan, and a NaN left in the set would
# make sorted() below raise on a float/str comparison.
iddx_3_keys = set(md24['iddx_3'].dropna().unique())

for key in sorted(iddx_3_keys):
    diag = isic.map_iddx3_diag(key)
    # as noted before iddx_3 = diagnosis, iddx_1 = benign_malignant
    # diag is the key as represented in the original isic archives
    rows_24 = md24['iddx_3'] == key
    rows_comp = md_comp['diagnosis'] == diag
    # .loc[mask, column] selects rows and column in one step instead of chaining
    # two [] lookups through an intermediate frame.
    printenc(f'[{key}] mapping to diagnosis [{diag}]',
             md24.loc[rows_24, 'iddx_1'].value_counts(dropna=False),
             '',
             md_comp.loc[rows_comp, 'benign_malignant'].value_counts(dropna=False))

--------------------
Study the named diagnoses in iddx_3

By looking at their statistics we confirm our previous guesses for
assignment to benign_malignant in the benign_malignant = nan cases in
the complete set.
--------------------
--------------------
[Angiofibroma] mapping to diagnosis [angiofibroma or fibrous papule]
iddx_1
Benign    2
Name: count, dtype: int64

benign_malignant
benign    4
Name: count, dtype: int64
--------------------
--------------------
[Atypical intraepithelial melanocytic proliferation] mapping to diagnosis [AIMP]
iddx_1
Indeterminate    11
Name: count, dtype: int64

benign_malignant
indeterminate              61
indeterminate/malignant    44
indeterminate/benign       14
benign                      2
Name: count, dtype: int64
--------------------
--------------------
[Atypical melanocytic neoplasm] mapping to diagnosis [atypical melanocytic proliferation]
iddx_1
Indeterminate    64
Name: count, dtype: int64

benign_malignant
indeterminate              60
in