# Notebook with ov. stats. and nonparam. analysis of cohorts

In [1]:
import matplotlib as plt
import numpy as np 
import pandas as pd
from sksurv.nonparametric import kaplan_meier_estimator
import os
import sys

# Setup paths
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.getcwd()))
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)
    
import pandas as pd
import numpy as np
from pathlib import Path
import logging
from preprocessing.data_loader import DataLoader
from preprocessing.data_container import DataContainer
from sklearn.preprocessing import OneHotEncoder

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

In [None]:
DATA_CONFIG = {
    'use_pca': False,
    'pca_threshold': 0.95,
    'gene_type': 'intersection',
    'use_imputed': True,
    'clinical_covs' : ['TISSUE']
}

dc = DataContainer(DATA_CONFIG, PROJECT_ROOT)

In [8]:
dc.load_data()

2024-11-22 18:50:49,000 - INFO - Loading data...
2024-11-22 18:53:22,293 - INFO - Loaded data: 1091 samples, 13214 features


(                                      ENSG00000000003  ENSG00000000005  \
 Atlanta_2014_Long.PT081                     -3.006631        -0.622789   
 Atlanta_2014_Long.PT127                      0.044560        -0.622789   
 Atlanta_2014_Long.PT168                      0.975390        -0.622789   
 Atlanta_2014_Long.PT184                      0.744177        -0.622789   
 Atlanta_2014_Long.PT199                      0.316196        -0.622789   
 ...                                               ...              ...   
 Stockholm_2016_Ross_Adams.STKHLM8462         1.054221        -0.221158   
 Stockholm_2016_Ross_Adams.STKHLM8659        -0.376987        -0.595446   
 Stockholm_2016_Ross_Adams.STKHLM9157        -0.096840         0.003729   
 Stockholm_2016_Ross_Adams.STKHLM9161        -0.084071        -0.351288   
 Stockholm_2016_Ross_Adams.STKHLM9246         1.247314         0.032128   
 
                                       ENSG00000000419  ENSG00000000457  \
 Atlanta_2014_Long.PT08

In [3]:
dl = DataLoader(PROJECT_ROOT)

In [106]:
pdata = dl.merged_pdata_imputed['merged_imputed_pData.csv']
X = dl.intersection_data['exprs_intersect.csv']

In [107]:
clin_data = pdata.loc[:, DATA_CONFIG['clinical_covs']] 
ohc = OneHotEncoder()
cat_cols = clin_data.select_dtypes(exclude=['number']).columns
num_cols = clin_data.select_dtypes(exclude=['object']).columns
clin_data_cat = ohc.fit_transform(clin_data.loc[:, cat_cols])
clin_data_cat = pd.DataFrame.sparse.from_spmatrix(clin_data_cat, columns=ohc.get_feature_names_out()).set_index(X.index)
clin_data_num = clin_data.loc[:, num_cols]
#print(clin_data_cat)
#print(clin_data_num)
X = pd.concat([clin_data_cat, clin_data_num, X], axis = 1)

In [108]:
X

Unnamed: 0,AGE,ENSG00000000003,ENSG00000000005,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036,ENSG00000001084,...,ENSG00000277972,ENSG00000278053,ENSG00000278195,ENSG00000278259,ENSG00000278311,ENSG00000278318,ENSG00000278505,ENSG00000278535,ENSG00000278540,ENSG00000282608
Atlanta_2014_Long.PT081,78.00000,-3.006631,-0.622789,1.705887,0.234710,0.906281,-0.372955,-0.883206,-3.477133,-3.289121,...,-0.100000,1.874472,0.412771,-0.201057,-0.023212,1.346444,-0.100000,-0.459605,1.673088,0.236356
Atlanta_2014_Long.PT127,61.00000,0.044560,-0.622789,0.520111,0.840715,0.440554,0.035672,-0.932726,0.353031,0.178056,...,-0.100000,0.464558,1.473907,-0.201057,-1.225928,0.861876,-0.100000,-0.459605,1.066850,1.450024
Atlanta_2014_Long.PT168,63.00000,0.975390,-0.622789,0.104950,1.722182,-0.308974,-0.676291,-0.529297,0.471131,0.632298,...,-0.100000,0.297391,-0.500212,-0.201057,-1.225928,0.183742,-0.100000,-0.459605,1.766876,-1.059580
Atlanta_2014_Long.PT184,58.00000,0.744177,-0.622789,0.137887,0.569978,0.234893,-0.974835,0.080468,0.867739,-0.433796,...,-0.100000,-0.102693,1.919342,-0.201057,-1.225928,0.086797,-0.100000,-0.459605,-0.051815,0.795535
Atlanta_2014_Long.PT199,57.00000,0.316196,-0.622789,0.595612,0.282694,-0.553280,-0.819195,0.212234,0.694016,0.235932,...,-0.100000,1.265752,-0.500212,-0.201057,1.013715,-0.187605,-0.100000,-0.459605,0.601972,-1.059580
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Stockholm_2016_Ross_Adams.STKHLM8462,58.35854,1.054221,-0.221158,0.914313,1.166952,-0.987476,-0.614258,-0.199152,0.936988,-0.093688,...,1.259790,-0.785377,-0.393627,-0.087867,0.629437,1.177046,-1.055679,0.131266,-0.901919,-0.153249
Stockholm_2016_Ross_Adams.STKHLM8659,51.64000,-0.376987,-0.595446,-0.413672,0.974346,0.512080,-0.778478,0.691598,-1.282891,0.338737,...,-0.770326,-1.195006,-2.131437,-0.020858,-0.426975,0.448978,0.720650,-0.539557,0.044055,-0.225257
Stockholm_2016_Ross_Adams.STKHLM9157,64.25873,-0.096840,0.003729,-0.922901,-1.304991,0.077976,-1.057037,-1.738007,0.256888,0.718058,...,0.838754,-0.007334,-0.356743,0.636358,-0.564288,1.464345,-2.986711,0.773578,0.778639,-0.771943
Stockholm_2016_Ross_Adams.STKHLM9161,54.45428,-0.084071,-0.351288,-0.234087,-0.616688,-0.300022,-1.405359,-1.521836,0.442135,2.133783,...,0.816947,-0.640533,-0.966573,1.049117,-0.522911,0.139349,-1.012419,0.657811,1.829945,-0.561905


In [6]:
cols = []
cohorts_pData_list = []
cohorts_exprs_list = []
cohorts_summary_list = []
df_null = pd.DataFrame()

for c, pData in dl.pdata_original.items(): 
    counts_bcr = pData['BCR_STATUS'].value_counts()
    # print(pData['SURGICAL_PROCEDURE'].value_counts())
    # print(null_vals)
    null_vals = pData.isnull().sum().to_frame().T
    df_null = pd.concat([df_null, null_vals])
    summary = {
        'cohort' : c,
        'ratio_BCR_01': counts_bcr[1].item()/counts_bcr[0].item(), 
        'BCR_0' : counts_bcr[0].item(), 
        'BCR_1' : counts_bcr[1].item(), 
        'MONTH_BCR_mean' : np.mean(pData['MONTH_TO_BCR']),
        'MONTH_BCR_0' : np.mean(pData[pData['BCR_STATUS'] == 0]['MONTH_TO_BCR']).item(), 
        'MONTH_BCR_1' : np.mean(pData[pData['BCR_STATUS'] == 1]['MONTH_TO_BCR']).item(), 
        'gleason_mean' : np.mean(pData['GLEASON_SCORE'])
    }
    cohorts_pData_list.append(pData)
    cohorts_summary_list.append(summary)

In [18]:
df_null

Unnamed: 0,SAMPLE_ID,GSM_SAMPLE_ID,SRR_SAMPLE_ID,PAPER_SAMPLE_ID,SAMPLE_COUNT,AGE,STUDY,PLATFORM,TISSUE,SAMPLE_CLASS,...,STAGE,TMB_NONSYNONYMOUS,COMPLETE_DATA,COPY_NUMBER_CLUSTER,DFS_STATUS,ERG_FUSION_ACGH,ERG_FUSION_GEX,FRACTION_GENOME_ALTERED,SEQUENCED,SEQUENCING
0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,
0,0,0,248,0,0,0,0,0,0,0,...,,,,,,,,,,
0,0,0,112,0,0,0,0,0,0,0,...,,,,,,,,,,
0,0,0,133,0,0,133,0,0,0,0,...,,,,,,,,,,
0,0,0,73,0,73,0,0,0,0,0,...,,,,,,,,,,
0,0,120,120,0,0,0,0,0,0,0,...,,,,,,,,,,
0,0,82,82,0,0,0,0,0,0,0,...,0.0,0.0,,,,,,,,
0,0,0,131,0,0,0,0,0,0,0,...,,56.0,0.0,0.0,0.0,0.0,0.0,22.0,0.0,0.0
0,0,0,92,0,0,89,0,0,0,0,...,,,,,,,,,,


In [26]:
dl.pdata_original['Atlanta_2014_Long.csv'].info()

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, PT081 to CM.4-0085
Data columns (total 55 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   SAMPLE_ID                    100 non-null    object 
 1   GSM_SAMPLE_ID                100 non-null    object 
 2   SRR_SAMPLE_ID                100 non-null    object 
 3   PAPER_SAMPLE_ID              100 non-null    object 
 4   SAMPLE_COUNT                 100 non-null    int64  
 5   AGE                          100 non-null    float64
 6   STUDY                        100 non-null    object 
 7   PLATFORM                     100 non-null    object 
 8   TISSUE                       100 non-null    object 
 9   SAMPLE_CLASS                 100 non-null    object 
 10  SAMPLE_TYPE                  100 non-null    object 
 11  SURGICAL_PROCEDURE           100 non-null    object 
 12  CLIN_TNM_STAGE               100 non-null    object 
 13  CLIN_T_STAGE   

In [7]:
pd.DataFrame(cohorts_summary_list)

Unnamed: 0,cohort,ratio_BCR_01,BCR_0,BCR_1,MONTH_BCR_mean,MONTH_BCR_0,MONTH_BCR_1,gleason_mean
0,Atlanta_2014_Long.csv,0.960784,51,49,53.4399,82.097843,23.612245,7.06
1,Belfast_2018_Jain.csv,0.291667,192,56,78.266129,83.911458,58.910714,7.487903
2,CamCap_2016_Ross_Adams.csv,0.204301,93,19,30.870902,33.500624,17.999105,6.9375
3,CancerMap_2017_Luca.csv,0.371134,97,36,50.714286,60.381443,24.666667,6.857143
4,CPC_GENE_2017_Fraser.csv,0.280702,57,16,74.19289,86.63307,29.87475,6.780822
5,CPGEA_2020_Li.csv,0.411765,85,35,27.129167,33.094118,12.642857,7.666667
6,DKFZ_2018_Gerhauser.csv,0.28125,64,18,33.05122,39.784375,9.111111,7.109756
7,MSKCC_2010_Taylor.csv,0.259615,104,27,48.189313,54.0675,25.547407,6.853846
8,Stockholm_2016_Ross_Adams.csv,0.957447,47,45,49.887337,75.180957,23.469556,7.044444


In [14]:
mean_list = []
sd_list = []

for c, exprs in dl.exprs_data.items(): 
    mean_exprs = exprs.mean(axis = 0)
    sd_exprs = exprs.std(axis = 0)
    mean_list.append(mean_exprs)
    sd_list.append(sd_exprs)
    

In [15]:
pd.DataFrame(mean_list)

Unnamed: 0,ENSG00000000003,ENSG00000000005,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036,ENSG00000001084,ENSG00000001167,...,ENSG00000271598,ENSG00000285304,ENSG00000273804,ENSG00000255726,ENSG00000260482,ENSG00000270955,ENSG00000233437,ENSG00000259813,ENSG00000285446,ENSG00000288646
0,2.88658e-17,1.565414e-16,-3.363976e-16,3.2196470000000005e-17,-2.273355e-16,-3.08642e-16,-3.58602e-16,6.2228e-16,2.089995e-16,1.632028e-16,...,,,,,,,,,,
1,-4.082756e-16,7.699934e-17,-1.146037e-16,5.30042e-16,-6.016693e-16,-1.432546e-15,-4.011128e-16,-1.86231e-16,5.01391e-17,3.43811e-16,...,,,,,,,,,,
2,-8.247371e-16,-1.062642e-15,2.3790490000000002e-17,2.775558e-16,-3.025358e-15,-6.780291e-16,5.868322e-16,-8.168069e-16,-7.176799e-16,-4.440892e-16,...,,,,,,,,,,
3,-5.342427e-17,1.18869e-15,6.544473e-16,1.429099e-15,1.12191e-15,1.676186e-15,-4.00682e-17,-1.3356070000000002e-17,-1.081841e-15,-1.906578e-15,...,,,,,,,,,,
4,1.134557e-15,-9.003452e-16,3.285043e-16,1.756586e-15,-3.178584e-16,-2.506366e-15,-4.167138e-16,-6.843841e-18,-1.595375e-15,-2.524617e-15,...,,,,,,,,,,
5,-4.440892e-16,-2.516506e-16,3.404684e-16,5.329071e-16,3.256654e-16,1.628327e-16,8.141636e-17,1.036208e-16,2.9605950000000004e-17,3.108624e-16,...,,,,,,,,,,
6,4.940154e-16,9.477514e-18,1.55702e-16,8.436680000000001e-17,-1.624717e-16,1.881963e-16,-8.428217e-17,4.718448e-16,2.355839e-16,6.065609e-16,...,1.814267e-16,3.01588e-16,-5.317562e-16,1.001909e-16,8.123583e-18,3.763927e-16,-1.137302e-16,-2.599547e-16,2.057974e-16,-5.5511150000000004e-17
7,2.305196e-16,-4.881591e-16,5.69519e-16,-1.355998e-16,-2.525546e-15,-1.220398e-15,2.305196e-16,-9.220784e-16,-8.678385e-16,2.162816e-15,...,,,,,,,,,,
8,-2.4135280000000002e-17,6.601e-16,1.586291e-15,2.22648e-16,1.351576e-16,8.447349e-16,1.041136e-15,1.71813e-16,-6.287241e-16,1.2131e-15,...,,,,,,,,,,
