# N differs across deployed models
- [ ] check N between the multiseed run and the previously reported data


In [122]:
# packages
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import pandas as pd
from functools import reduce
import itertools
from tqdm import tqdm

from sklearn.utils import resample

# # local
from nb_util import autoreload, add_dirs
# autoreload()

add_dirs(['../'])
from nash_util import concordance_stats as cs
from nash_util import nash_definitions as nd
from nash_util import nash_changes as nc


add_dirs(['~/Projects/Code/ml-platform'])
from pathai.api.slides import slides_base

In [123]:
## load image ID data
# Read the ATLAS image-ID workbook and keep only the subject identifier plus
# the HE / TRICHROME slide-ID columns for the BL and W48 timepoints.
# The trailing .copy() yields a frame independent of the full sheet.
atlas_ids = pd.read_excel(
    'REFDATA/ATLAS_ImageIDs.xlsx',
    engine='openpyxl',
).loc[
    :,
    ['USUBJID', 'HE_ID_BL', 'TRICHROME_ID_BL', 'HE_ID_W48', 'TRICHROME_ID_W48'],
].copy()

In [124]:
any(atlas_ids[['USUBJID']].duplicated())

False

In [125]:
atlas_ids.shape

(616, 5)

In [126]:
atlas_ids.dropna().shape

(303, 5)

In [127]:
# Two independent copies of the ID table: `per_subject_old_df` is only read
# from (it keeps the IDs exactly as loaded), while `per_subject_df` is the
# copy that later cells modify.
per_subject_old_df = atlas_ids.copy()
per_subject_df = atlas_ids.copy()

In [128]:
# For the two subjects below, exchange the W48 HE and TRICHROME slide IDs.
# Values are read from the untouched `per_subject_old_df`, so the second
# assignment is not affected by the first one.
for subject_id in (
    "GS-US-454-4378-12657-91216",
    "GS-US-454-4378-13744-91182",
):
    target_rows = per_subject_df['USUBJID'] == subject_id
    source_rows = per_subject_old_df['USUBJID'] == subject_id
    per_subject_df.loc[target_rows, 'TRICHROME_ID_W48'] = per_subject_old_df.loc[source_rows, 'HE_ID_W48']
    per_subject_df.loc[target_rows, 'HE_ID_W48'] = per_subject_old_df.loc[source_rows, 'TRICHROME_ID_W48']


In [129]:
# atlas
# Load the FEATURES sheet of the ATLAS trichrome and HE feature workbooks and
# index each table by PATHAI_ID.
tc_atlas = pd.read_excel(
    "REFDATA/GILEAD_V2/NASH_ATLAS_NASH_Trichrome_Features_2022-02-17_14_48.xlsx",
    sheet_name='FEATURES',
    engine='openpyxl',
).set_index('PATHAI_ID')
he_atlas = pd.read_excel(
    "REFDATA/GILEAD_V2/NASH_ATLAS_NASH_HE_Features_2022-02-17_14_48.xlsx",
    sheet_name='FEATURES',
    engine='openpyxl',
).set_index('PATHAI_ID')

In [130]:
# st3
# Load the FEATURES sheet of the Stellar3 trichrome and HE feature workbooks
# and index each table by PATHAI_ID.
tc_st3 = pd.read_excel(
    "REFDATA/GILEAD_V2/GILD-S-1-NASH-Stellar3_NASH_Trichrome_Features_2022-02-22_23_33.xlsx",
    sheet_name='FEATURES',
    engine='openpyxl',
).set_index('PATHAI_ID')
he_st3 = pd.read_excel(
    "REFDATA/GILEAD_V2/GILD-S-1-NASH-Stellar3_NASH_HE_Features_2022-02-22_23_33.xlsx",
    sheet_name='FEATURES',
    engine='openpyxl',
).set_index('PATHAI_ID')

In [131]:
# st4
# Load the FEATURES sheet of the Stellar4 trichrome and HE feature workbooks
# and index each table by PATHAI_ID.
tc_st4 = pd.read_excel(
    "REFDATA/GILEAD_V2/NASH_Stellar4_NASH_Trichrome_Features_2022-02-23_15_58.xlsx",
    sheet_name='FEATURES',
    engine='openpyxl',
).set_index('PATHAI_ID')
he_st4 = pd.read_excel(
    "REFDATA/GILEAD_V2/NASH_Stellar4_NASH_HE_Features_2022-02-23_15_58.xlsx",
    sheet_name='FEATURES',
    engine='openpyxl',
).set_index('PATHAI_ID')

In [132]:
# Clinical data: read the 'PathAI Feature' sheet, then keep an independent
# copy of just the subject ID and the clinical columns listed in `clin_cols`.
clin_cols = [
    'USUBJID', 'TRT01A', 'PE48NRI', 'FI48NRI', 'NAS48NRI', 'HS48OC',
    'LI48OC', 'HB48OC', 'DIABN', 'NR48NRI', 'CIRRCATN',
]
clinical = pd.read_excel(
    "REFDATA/20191219-ATLAS-MLfeatures+clinicalData.xlsx",
    sheet_name='PathAI Feature',
    engine='openpyxl',
)
clin = clinical.loc[:, clin_cols].copy()

In [133]:
# Attach the clinical columns to the per-subject ID table. The left join on
# USUBJID keeps every row of `per_subject_df`; rows that end up as exact
# duplicates after the join are then removed.
per_subject_df = pd.merge(per_subject_df, clin, how='left', on='USUBJID')
per_subject_df = per_subject_df.drop_duplicates()

In [134]:
per_subject_df.head(5)

Unnamed: 0,USUBJID,HE_ID_BL,TRICHROME_ID_BL,HE_ID_W48,TRICHROME_ID_W48,TRT01A,PE48NRI,FI48NRI,NAS48NRI,HS48OC,LI48OC,HB48OC,DIABN,NR48NRI,CIRRCATN
0,GS-US-454-4378-00380-91025,182331.0,182333.0,183712.0,183785.0,SEL + GS-0976,N,N,N,N,N,N,1,N,1.0
1,GS-US-454-4378-00380-91106,182919.0,182930.0,183965.0,183972.0,SEL + GS-0976,N,N,N,,N,N,1,N,1.0
2,GS-US-454-4378-00380-91120,182502.0,182503.0,184008.0,184344.0,SEL + GS-0976,Y,Y,N,N,Y,N,0,N,1.0
3,GS-US-454-4378-00380-91141,182330.0,182332.0,184346.0,184015.0,GS-0976 + GS-9674,N,N,N,N,N,N,1,N,1.0
4,GS-US-454-4378-00380-91326,183260.0,183270.0,184190.0,184195.0,GS-0976 + GS-9674,N,N,N,N,N,Y,1,N,2.0


In [135]:
# Feature columns pulled from the HE feature tables.
he_cols = [
    'GNN BALLOONING_CONTINUOUS_SCORE_HE',
    'GNN BALLOONING_SCORE_HE',
    'GNN LOBULAR_CONTINUOUS_SCORE_HE',
    'GNN LOBULAR_SCORE_HE',
    'GNN STEATOSIS_CONTINUOUS_SCORE_HE',
    'GNN STEATOSIS_SCORE_HE',
    "AREA PROP [[PORTAL INFLAMMATION] OVER [TISSUE]] IN [TISSUE]_HE",
]
# Feature columns pulled from the trichrome feature tables.
tc_cols = [
    'GNN CRN_CONTINUOUS_SCORE_TRICHROME',
    'GNN CRN_SCORE_TRICHROME',
    'AREA PROP [[FIBROSIS] OVER [TISSUE]] IN [TISSUE]_TRICHROME',
]

# Stack the selected feature columns of the three feature tables (atlas, st3,
# st4) into one table per stain and timepoint. Each stacked table is built
# twice, once with a '_BL' and once with a '_W48' column suffix, so the same
# PATHAI_ID-indexed features can be joined against either the BL or the W48
# slide-ID column.
tc_sources = (tc_atlas, tc_st3, tc_st4)
he_sources = (he_atlas, he_st3, he_st4)

tc_bl = pd.concat([src[tc_cols].copy().add_suffix('_BL') for src in tc_sources])
tc_w48 = pd.concat([src[tc_cols].copy().add_suffix('_W48') for src in tc_sources])
he_bl = pd.concat([src[he_cols].copy().add_suffix('_BL') for src in he_sources])
he_w48 = pd.concat([src[he_cols].copy().add_suffix('_W48') for src in he_sources])

In [136]:
# Guard for the merges that follow: every PATHAI_ID must appear at most once
# in each stacked feature table, otherwise a left merge on the slide ID would
# produce several rows for one subject.
#
# NOTE: the earlier form `assert ~any(...)` could never fail. `any()` returns
# a Python bool and `~` is bitwise NOT on it (~True == -2, ~False == -1), and
# both results are truthy. Use logical negation instead.
for table_name, table in (
    ('tc_bl', tc_bl),
    ('tc_w48', tc_w48),
    ('he_bl', he_bl),
    ('he_w48', he_w48),
):
    duplicated_ids = table.index[table.index.duplicated()].unique()
    assert not len(duplicated_ids), (
        f"{table_name} has {len(duplicated_ids)} duplicated PATHAI_ID value(s), "
        f"e.g. {list(duplicated_ids[:10])}"
    )

In [137]:
# Build the per-subject working table: starting from a copy of the ID and
# clinical table, left-join each stacked feature table on the slide-ID column
# it belongs to (feature tables are indexed by PATHAI_ID), dropping exact
# duplicate rows after every join.
working_per_subject_df = per_subject_df.copy()
for feature_table, slide_id_column in (
    (he_bl, 'HE_ID_BL'),
    (he_w48, 'HE_ID_W48'),
    (tc_bl, 'TRICHROME_ID_BL'),
    (tc_w48, 'TRICHROME_ID_W48'),
):
    working_per_subject_df = working_per_subject_df.merge(
        feature_table,
        how='left',
        left_on=slide_id_column,
        right_index=True,
    ).drop_duplicates()

In [138]:
# Persist the merged per-subject table without the row index.
# `index` is documented as a bool; the previous `index=None` only worked
# because None happens to be falsy, so state the intent explicitly.
working_per_subject_df.to_csv('REFDATA/WORKINGDATA/ATLAS_PER_SUBJECT_V3.csv', index=False)

In [110]:
# Short-named deep copy of the merged table for the exploratory cells below,
# so they cannot modify `working_per_subject_df`.
psdf = working_per_subject_df.copy(deep=True)

In [121]:
psdf.columns

Index(['USUBJID', 'HE_ID_BL', 'TRICHROME_ID_BL', 'HE_ID_W48',
       'TRICHROME_ID_W48', 'TRT01A', 'PE48NRI', 'FI48NRI', 'NAS48NRI',
       'HS48OC', 'LI48OC', 'HB48OC', 'DIABN', 'NR48NRI', 'CIRRCATN',
       'GNN BALLOONING_CONTINUOUS_SCORE_HE_BL', 'GNN BALLOONING_SCORE_HE_BL',
       'GNN LOBULAR_CONTINUOUS_SCORE_HE_BL', 'GNN LOBULAR_SCORE_HE_BL',
       'GNN STEATOSIS_CONTINUOUS_SCORE_HE_BL', 'GNN STEATOSIS_SCORE_HE_BL',
       'AREA PROP [[PORTAL INFLAMMATION] OVER [TISSUE]] IN [TISSUE]_HE_BL',
       'GNN BALLOONING_CONTINUOUS_SCORE_HE_W48', 'GNN BALLOONING_SCORE_HE_W48',
       'GNN LOBULAR_CONTINUOUS_SCORE_HE_W48', 'GNN LOBULAR_SCORE_HE_W48',
       'GNN STEATOSIS_CONTINUOUS_SCORE_HE_W48', 'GNN STEATOSIS_SCORE_HE_W48',
       'AREA PROP [[PORTAL INFLAMMATION] OVER [TISSUE]] IN [TISSUE]_HE_W48',
       'GNN CRN_CONTINUOUS_SCORE_TRICHROME_BL', 'GNN CRN_SCORE_TRICHROME_BL',
       'GNN CRN_CONTINUOUS_SCORE_TRICHROME_W48',
       'GNN CRN_SCORE_TRICHROME_W48'],
      dtype='object')

In [111]:
psdf['TRT01A'].value_counts()

SEL + GS-0976        77
SEL + GS-9674        76
GS-0976 + GS-9674    74
GS-0976              39
GS-9674              39
Placebo              39
SEL                  38
Name: TRT01A, dtype: int64

In [112]:
psdf[psdf['TRT01A'].isin(['GS-0976 + GS-9674','Placebo'])][['USUBJID','TRT01A','HE_ID_BL','TRICHROME_ID_BL','HE_ID_W48','TRICHROME_ID_W48']].dropna()['TRT01A'].value_counts()

GS-0976 + GS-9674    63
Placebo              37
Name: TRT01A, dtype: int64

In [113]:
# Rebind `tc_cols` / `he_cols` to the suffixed score columns of the merged
# table (discrete scores first, then continuous scores; BL before W48 within
# each group). The names are generated rather than typed out, in the same
# order as the literal lists they replace.
_score_kinds = ('', 'CONTINUOUS_')
_visits = ('BL', 'W48')

tc_cols = [
    f'GNN CRN_{kind}SCORE_TRICHROME_{visit}'
    for kind in _score_kinds
    for visit in _visits
]
he_cols = [
    f'GNN {feature}_{kind}SCORE_HE_{visit}'
    for kind in _score_kinds
    for visit in _visits
    for feature in ('BALLOONING', 'LOBULAR', 'STEATOSIS')
]

In [114]:
psdf[psdf['TRT01A'].isin(['GS-0976 + GS-9674','Placebo'])][['USUBJID','TRT01A','HE_ID_BL','TRICHROME_ID_BL','HE_ID_W48','TRICHROME_ID_W48']+tc_cols+he_cols].dropna()['TRT01A'].value_counts()

GS-0976 + GS-9674    62
Placebo              37
Name: TRT01A, dtype: int64

In [115]:
# Restrict the merged table to the two treatment arms being compared and to
# the ID, trichrome-score and HE-score columns; copy so that later edits do
# not touch `psdf`.
in_selected_arm = psdf['TRT01A'].isin(['GS-0976 + GS-9674', 'Placebo'])
selected_columns = (
    ['USUBJID', 'TRT01A', 'HE_ID_BL', 'TRICHROME_ID_BL', 'HE_ID_W48', 'TRICHROME_ID_W48']
    + tc_cols
    + he_cols
)
df = psdf.loc[in_selected_arm, selected_columns].copy()

In [116]:
# Keep only the rows where all four slide IDs (HE and TRICHROME, BL and W48)
# are present.
has_all_slide_ids = df[['HE_ID_BL', 'TRICHROME_ID_BL', 'HE_ID_W48', 'TRICHROME_ID_W48']].notnull().all(axis=1)
df = df[has_all_slide_ids]

In [117]:
df[df.isnull().any(axis=1)][['USUBJID','HE_ID_BL','TRICHROME_ID_BL','HE_ID_W48','TRICHROME_ID_W48']]

Unnamed: 0,USUBJID,HE_ID_BL,TRICHROME_ID_BL,HE_ID_W48,TRICHROME_ID_W48
508,GS-US-454-4378-13961-91002,152992.0,152998.0,183692.0,183693.0


In [118]:
df[df.isnull().any(axis=1)].T

Unnamed: 0,508
USUBJID,GS-US-454-4378-13961-91002
TRT01A,GS-0976 + GS-9674
HE_ID_BL,152992
TRICHROME_ID_BL,152998
HE_ID_W48,183692
TRICHROME_ID_W48,183693
GNN CRN_SCORE_TRICHROME_BL,
GNN CRN_SCORE_TRICHROME_W48,0
GNN CRN_CONTINUOUS_SCORE_TRICHROME_BL,
GNN CRN_CONTINUOUS_SCORE_TRICHROME_W48,0.753419
