In [76]:
import pandas as pd, numpy as np
import json

In [77]:
# Read the two input tables: the per-image dataset information and the
# training table (whose `id` column is used later to find annotation files).
df_info, df_rle = (
    pd.read_csv('../input/HuBMAP-20-dataset_information.csv'),
    pd.read_csv('../input/train.csv'),
)

In [78]:
# Display the dataset-information table that was just loaded.
df_info

Unnamed: 0,image_file,width_pixels,height_pixels,anatomical_structures_segmention_file,glomerulus_segmentation_file,patient_number,race,ethnicity,sex,age,weight_kilograms,height_centimeters,bmi_kg/m^2,laterality,percent_cortex,percent_medulla
0,aa05346ff.tiff,47340,30720,aa05346ff-anatomical-structure.json,aa05346ff.json,67347,White,Not Hispanic or Latino,Female,58,59.0,160.0,23.0,Right,80,20
1,afa5e8098.tiff,43780,36800,afa5e8098-anatomical-structure.json,afa5e8098.json,67347,White,Not Hispanic or Latino,Female,58,59.0,160.0,23.0,Right,55,45
2,54f2eec69.tiff,22240,30440,54f2eec69-anatomical-structure.json,54f2eec69.json,67548,Black or African American,Not Hispanic or Latino,Male,58,79.9,190.5,22.0,Right,75,25
3,d488c759a.tiff,29020,46660,d488c759a-anatomical-structure.json,d488c759a.json,68138,White,Not Hispanic or Latino,Female,66,81.5,158.8,32.2,Left,100,0
4,1e2425f28.tiff,32220,26780,1e2425f28-anatomical-structure.json,1e2425f28.json,63921,White,Not Hispanic or Latino,Male,48,131.5,193.0,35.3,Right,65,35
5,e79de561c.tiff,27020,16180,e79de561c-anatomical-structure.json,e79de561c.json,67026,Black or African American,Not Hispanic or Latino,Male,53,73.0,166.0,26.5,Left,55,45
6,c68fe75ea.tiff,49780,26840,c68fe75ea-anatomical-structure.json,c68fe75ea.json,67112,White,Not Hispanic or Latino,Male,56,91.2,167.6,32.5,Left,80,20
7,095bf7a1f.tiff,39000,38160,095bf7a1f-anatomical-structure.json,095bf7a1f.json,68250,White,Not Hispanic or Latino,Female,44,71.7,160.0,28.0,Right,65,35
8,26dc41664.tiff,42360,38160,26dc41664-anatomical-structure.json,26dc41664.json,68304,White,Not Hispanic or Latino,Female,66,71.3,167.6,25.4,Left,55,45
9,57512b7f1.tiff,43160,33240,57512b7f1-anatomical-structure.json,57512b7f1.json,68555,White,Not Hispanic or Latino,Female,76,93.0,157.4,37.5,Left,80,20


In [79]:
from shapely.geometry import Polygon


def _read_json(path):
    """Parse the JSON file at `path`, closing the file handle afterwards."""
    with open(path, 'r') as fh:
        return json.load(fh)


# Maps image id -> {"sizes": array of glomerulus polygon areas,
#                   "cortex_size": summed area of all Cortex polygons}.
all_infos = {}
for img_id in df_rle.id.unique():
    annot = _read_json(f"../input/train/{img_id}.json")
    structure = _read_json(f'../input/train/{img_id}-anatomical-structure.json')

    # Area of every annotated glomerulus polygon for this image.
    gloms_sizes = [
        Polygon(np.squeeze(np.array(info['geometry']['coordinates']))).area
        for info in annot
    ]

    # Collect the area of every part of every feature classified as Cortex.
    # The list is rebuilt for each image, so nothing leaks from the previous
    # image and several Cortex features are all counted.
    cortex_areas = []
    for info in structure:
        if info['properties']['classification']['name'] == 'Cortex':
            # NOTE(review): each entry of `coordinates` is treated as one
            # standalone polygon; confirm no entry is an interior ring (hole).
            for part in info['geometry']['coordinates']:
                cortex_areas.append(Polygon(np.squeeze(np.array(part))).area)

    # NaN (rather than a stale or undefined value) marks an image that has no
    # Cortex annotation; ratios derived from it become NaN downstream.
    cortex_size = sum(cortex_areas) if cortex_areas else float('nan')

    all_infos[img_id] = {"sizes": np.array(gloms_sizes), 'cortex_size': cortex_size}

In [80]:
new_cols = ['density_count', 'density', 'average_volume', 'std_volume', 'glom_count']

# Create the metric columns as floats up front: every value written below is a
# float, and writing floats into integer columns is an incompatible-dtype
# assignment in recent pandas. Rows that never get filled keep 0.0.
for col in new_cols:
    df_info[col] = 0.0

for img_id, infos in all_infos.items():
    sizes = infos["sizes"]
    glom_count = len(sizes)

    # A zero cortex area would raise ZeroDivisionError on the count ratio;
    # treat it as missing so both densities become NaN instead.
    cortex_size = infos["cortex_size"]
    if cortex_size == 0:
        cortex_size = float('nan')

    metrics = {
        "density": float(np.sum(sizes) / cortex_size),       # glomerulus area per cortex area
        "density_count": float(glom_count / cortex_size),    # glomeruli per cortex area
        "glom_count": float(glom_count),
        "std_volume": float(np.std(sizes)),
        "average_volume": float(np.mean(sizes)),
    }

    # Build the row selector once instead of once per column.
    row_mask = df_info.image_file == img_id + '.tiff'
    for col, value in metrics.items():
        df_info.loc[row_mask, col] = value

In [81]:
# Build the plotting frame: keep rows whose density_count was filled in
# (this removes the test set) and drop the outlier image b2dc8411c.
df_plot = df_info.loc[
    (df_info['density_count'] > 0) & (df_info.image_file != 'b2dc8411c.tiff')
]

In [82]:
import plotly.express as px

In [83]:
# List the distinct image file names present in the dataset-information table.
df_info.image_file.unique()

array(['aa05346ff.tiff', 'afa5e8098.tiff', '54f2eec69.tiff',
       'd488c759a.tiff', '1e2425f28.tiff', 'e79de561c.tiff',
       'c68fe75ea.tiff', '095bf7a1f.tiff', '26dc41664.tiff',
       '57512b7f1.tiff', '4ef6695ce.tiff', 'aaa6a05cc.tiff',
       'b9a3865fc.tiff', 'cb2d976f4.tiff', 'b2dc8411c.tiff',
       '2ec3f1bb9.tiff', '0486052bb.tiff', '3589adb90.tiff',
       '2f6ecfcdf.tiff', '8242609fa.tiff'], dtype=object)

In [84]:
# One box plot per derived metric, grouped and coloured by race.
# hover_name mirrors the per-sex plots so each point can be traced back to its
# image, and the title says which metric a figure shows.
for col in new_cols:
    fig = px.box(
        df_plot,
        x="race",
        y=col,
        color='race',
        points="all",
        hover_name='image_file',
        title=f"{col} by race",
    )
    fig.show()

In [85]:
# One box plot per derived metric, grouped and coloured by sex; hovering a
# point shows its image file. The title says which metric a figure shows.
for col in new_cols:
    fig = px.box(
        df_plot,
        x="sex",
        y=col,
        color='sex',
        points="all",
        hover_name='image_file',
        title=f"{col} by sex",
    )
    fig.show()

In [87]:
# Wide-form scatter: all derived metrics plotted against BMI in one figure.
# NOTE(review): the metrics share a single y-axis here; if their magnitudes
# differ a lot, the smaller ones may be unreadable. Confirm this is intended.
fig = px.scatter(
    data_frame=df_plot,
    x='bmi_kg/m^2',
    y=new_cols,
)
fig.show()

In [88]:
# Wide-form scatter: all derived metrics plotted against donor age in one figure.
# NOTE(review): the metrics share a single y-axis here; if their magnitudes
# differ a lot, the smaller ones may be unreadable. Confirm this is intended.
fig = px.scatter(
    data_frame=df_plot,
    x='age',
    y=new_cols,
)
fig.show()

In [111]:
# Density against BMI; colour encodes sex and marker symbol encodes race.
# The list of ones gives every marker the same size value.
fig = px.scatter(
    data_frame=df_plot,
    x='bmi_kg/m^2',
    y="density",
    color='sex',
    symbol='race',
    size=[1] * len(df_plot),
    title="Evolution of cortex density with BMI",
)
fig.show()

In [112]:
# Mean glomerulus size against age; colour encodes sex and marker symbol
# encodes race. The list of ones gives every marker the same size value.
fig = px.scatter(
    data_frame=df_plot,
    x='age',
    y="average_volume",
    color='sex',
    symbol='race',
    size=[1] * len(df_plot),
    title="Evolution of glomeruli volumes with age",
)
fig.show()

In [74]:
# Show the names of the derived metric columns.
new_cols

['density_count', 'density', 'average_volume', 'std_volume', 'glom_count']

In [72]:

# Draw a separate BMI scatter for each derived metric, one figure per metric.
for metric in new_cols:
    fig = px.scatter(data_frame=df_plot, x='bmi_kg/m^2', y=metric)
    fig.show()

In [69]:
# Box plot of density_count with both race and sex supplied as x columns
# (a list for x puts plotly express in wide-form mode); all points are shown.
px.box(
    data_frame=df_plot,
    x=["race", "sex"],
    y='density_count',
    points="all",
)

In [71]:
# Inspect which columns are available on the plotting frame.
df_plot.columns

Index(['image_file', 'width_pixels', 'height_pixels',
       'anatomical_structures_segmention_file', 'glomerulus_segmentation_file',
       'patient_number', 'race', 'ethnicity', 'sex', 'age', 'weight_kilograms',
       'height_centimeters', 'bmi_kg/m^2', 'laterality', 'percent_cortex',
       'percent_medulla', 'density_count', 'density', 'average_volume',
       'std_volume', 'glom_count'],
      dtype='object')

In [None]:


# Box plot of density_count by race, drawn from the full df_info table.
# NOTE(review): unlike the other plots this does not use df_plot, so rows that
# the df_plot filter removes (density_count of 0, and image b2dc8411c) are
# included. Confirm this is intended.
px.box(
    data_frame=df_info,
    x=["race"],
    y='density_count',
    points="all",
)