# Characteristics table

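Compute the summary statistics (age, sex, tumor stage, genome doublings, PFS and OS) needed to fill in the cohort characteristics table from the metadata.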

In [1]:
# 0 - imports
import matplotlib.pyplot as plt
import palettable as pal
from collections import Counter

import pandas as pd
import numpy as np
import matplotlib as mpl
import seaborn as sns
from lifelines import KaplanMeierFitter
from lifelines.utils import median_survival_times

from analysis.utils import load_pre_post_processing, robust_z_score
import analysis.cluster as clu
import analysis.visualizations as vis
from analysis.cox import CoxModel

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Figure defaults: 200 dpi and a white figure background.
mpl.rcParams['figure.dpi'] = 200
plt.rcParams['figure.facecolor'] = 'white'

# Colour schemes from palettable: one qualitative palette (list of colours)
# plus a sequential and a diverging matplotlib colormap.
palette = pal.cartocolors.qualitative.Safe_8.mpl_colors
scolormap = pal.scientific.sequential.Batlow_13.mpl_colormap
dcolormap = pal.scientific.diverging.Vik_18.mpl_colormap

In [2]:
# Load the three feature collections and the metadata via the project helper.
# "../" is passed as its only argument (presumably the project root relative
# to this notebook — TODO confirm). meta_data is indexed by cohort name
# ("brca", "luad", "prad") further down in this notebook.
cancer_nuhif_data, fibroblast_nuhif_data, lymphocyte_nuhif_data, meta_data = load_pre_post_processing("../")

# Summarize the metadata

In [3]:
def age_median_range(meta):
    """Print the median, range and missing count of age at diagnosis.

    Parameters
    ----------
    meta : pandas.DataFrame
        Metadata table with an ``age_at_initial_pathologic_diagnosis`` column.

    Prints
    ------
    When values are missing, the index labels of the affected rows and their
    count; then one summary line ``median, (min, max), n_missing``.
    """
    data = meta["age_at_initial_pathologic_diagnosis"]
    # pd.isna also works for integer/object columns, where np.isnan can raise.
    missing_mask = pd.isna(data)
    n_missing = int(missing_mask.sum())
    if n_missing > 0:
        print(f"missing slides: {data[missing_mask].index.values}, n = {n_missing}")

    data_values = data[~missing_mask].values
    if len(data_values) == 0:
        # np.min/np.max raise on an empty array; report NaN instead.
        print(f"Median Age, range, n missing: nan, (nan, nan), {n_missing}")
        return
    # The label promises "n missing", so the count is part of the summary line.
    print(f"Median Age, range, n missing: {np.median(data_values)}, ({np.min(data_values)}, {np.max(data_values)}), {n_missing}")
    
    
def sex_pct_missing(meta):
    """Print the tally of ``gender`` values and the count/percentage of 'MALE'.

    The percentage is taken over all rows of ``meta`` and rounded to one
    decimal place.
    """
    sexes = meta["gender"].values
    tally = Counter(sexes)
    print(tally)

    n_male = tally['MALE']
    pct_male = np.round(100*n_male/len(sexes), 1)
    print(f"Male = {n_male}, {pct_male}%")
   

def stage_pct_missing(meta):
    """Print how many rows fall into each ``ajcc_pathologic_tumor_stage`` value.

    One ``value: count`` line is printed per distinct value, in sorted order
    of the values.
    """
    tally = Counter(meta['ajcc_pathologic_tumor_stage'])
    for stage, count in sorted(tally.items(), key=lambda item: item[0]):
        print(f"{stage}: {count}")
        
def wgd_pct_missing(meta):
    """Print the distribution of ``genome_doublings`` and the missing count.

    Parameters
    ----------
    meta : pandas.DataFrame
        Metadata table with a numeric ``genome_doublings`` column.

    Prints
    ------
    A header line, then one ``value: count, pct%`` line per distinct
    non-missing value in ascending numeric order (percentages are relative to
    the non-missing rows), then the number of missing rows.
    """
    data = meta["genome_doublings"]
    print("genome doublings")
    # pd.isna also handles integer/object columns, where np.isnan can raise.
    missing_mask = pd.isna(data)
    n_missing = int(missing_mask.sum())
    data_values = data[~missing_mask].values
    counter = Counter(data_values)

    # Sort numerically and print the whole value: sorting the stringified keys
    # and keeping only their first character mislabels and misorders any
    # value >= 10 (e.g. 10.0 was shown as "1" and sorted before 2.0).
    for key in sorted(counter):
        label = int(key) if float(key).is_integer() else key
        print(f"{label}: {counter[key]}, {np.round(100*counter[key]/(len(data_values)), 1)}%")
    print(f"n missing = {n_missing}")

        
def _report_median_survival(meta, event_col, time_col, name, label, days_per_month=30):
    """Fit a Kaplan-Meier curve and print the median survival and missing count.

    Parameters
    ----------
    meta : pandas.DataFrame
        Metadata table containing ``event_col``, ``time_col`` and
        ``vital_status``.
    event_col, time_col : str
        Columns holding the event indicator and the follow-up duration.
    name : str
        Endpoint name used in the printed line (e.g. ``"os"``).
    label : str
        Label given to the fitted Kaplan-Meier estimate.
    days_per_month : int, optional
        Divisor applied to the median before printing it as "mo"
        (presumably durations are in days — TODO confirm).
    """
    cols = meta[[event_col, time_col, "vital_status"]]
    # A row counts as missing if any of the three columns is null; only
    # complete rows are used for the fit.
    incomplete = pd.isna(cols).any(axis=1)
    complete = cols.loc[~incomplete].copy()

    km_fitter = KaplanMeierFitter()
    km_fitter.fit(complete[time_col], complete[event_col], label=label)
    print(f"median {name} = {median_survival_times(km_fitter)/days_per_month} mo; missing n = {int(incomplete.sum())}")


def os_missing(meta):
    """Print the Kaplan-Meier median overall survival and the number of rows
    with a null in ``os``, ``os_time`` or ``vital_status``."""
    _report_median_survival(meta, "os", "os_time", name="os", label="OS")


def pfs_missing(meta):
    """Print the Kaplan-Meier median progression-free survival and the number
    of rows with a null in ``pfs``, ``pfs_time`` or ``vital_status``."""
    # Label was previously the typo "OPF"; it only names the fitted estimate
    # and does not appear in the printed line.
    _report_median_survival(meta, "pfs", "pfs_time", name="pfs", label="PFS")

In [4]:
# Summary printers applied to every cohort, in the order they appear in the
# characteristics table output.
reports = (
    age_median_range,
    sex_pct_missing,
    stage_pct_missing,
    wgd_pct_missing,
    pfs_missing,
    os_missing,
)

for name in ["brca", "luad", "prad"]:
    meta = meta_data[name]
    print(name)
    print(f"Total N = {len(meta)}")
    for report in reports:
        report(meta)
    # Blank lines separate the cohorts in the printed output.
    print("\n")

brca
Total N = 886
Median Age, range, n missing: 58.0, (26, 90)
Counter({'FEMALE': 876, 'MALE': 10})
Male = 10, 1.1%
Stage I: 77
Stage IA: 76
Stage IB: 5
Stage II: 6
Stage IIA: 286
Stage IIB: 207
Stage III: 2
Stage IIIA: 128
Stage IIIB: 18
Stage IIIC: 51
Stage IV: 13
Stage X: 8
[Discrepancy]: 4
[Not Available]: 5
genome doublings
0: 435, 56.5%
1: 298, 38.7%
2: 37, 4.8%
n missing = 116
median pfs = 122.3 mo; missing n = 0
median os = 131.36666666666667 mo; missing n = 0


luad
Total N = 426
Median Age, range, n missing: 66.0, (33, 88)
Counter({'FEMALE': 235, 'MALE': 191})
Male = 191, 44.8%
Stage I: 5
Stage IA: 122
Stage IB: 108
Stage II: 1
Stage IIA: 47
Stage IIB: 58
Stage IIIA: 50
Stage IIIB: 5
Stage IV: 22
[Discrepancy]: 7
[Not Available]: 1
genome doublings
0: 168, 41.9%
1: 193, 48.1%
2: 40, 10.0%
n missing = 25
median pfs = 29.3 mo; missing n = 0
median os = 48.46666666666667 mo; missing n = 0


prad
Total N = 392
Median Age, range, n missing: 61.0, (41, 77)
Counter({'MALE': 392})
M