# Calculate event statistics

For example, this notebook calculates what proportion of patients in each cancer type have the event.

In [1]:
import cnvutils
import os
import pandas as pd

In [2]:
# Load parameters
# Three parameter files are read, each from a different directory level
# relative to this notebook: general, chromosome-level, and arm-level.
gen_params = cnvutils.load_params(
    os.path.join("..", "..", "..", "data", "gen_params.json")
)
chr_params = cnvutils.load_params(
    os.path.join("..", "..", "data", "chr_params.json")
)
arm_params = cnvutils.load_params(
    os.path.join("..", "data", "arm_params.json")
)

# Pull out the individual settings used by the rest of the notebook.
PANCAN = gen_params["PANCAN"]
CHROMOSOME = chr_params["CHROMOSOME"]
ARM, CANCER_TYPES = arm_params["ARM"], arm_params["ARM_CANCER_TYPES"]

In [3]:
# Get the total number of samples in each cancer type. We saved this earlier.
# The chromosome-level CNV counts table repeats the per-cancer total on every
# row, so keep just the two relevant columns and collapse the duplicates.
cnv_counts_path = os.path.join(
    "..",
    "..",
    "data",
    f"chr{CHROMOSOME:0>2}_cnv_counts_{'harmonized' if PANCAN else 'AWG'}.tsv",
)

total_samples = (
    pd.read_csv(cnv_counts_path, sep="\t", index_col=0)
    [["cancer", "cancer_type_total_patients"]]
    .drop_duplicates(keep="first")
    .reset_index(drop=True)
)

total_samples

Unnamed: 0,cancer,cancer_type_total_patients
0,brca,122
1,ccrcc,110
2,colon,106
3,endometrial,95
4,gbm,99
5,hnscc,110
6,lscc,110
7,luad,110
8,ovarian,82


In [4]:
# Load arm event table
# Read one has-event table per cancer type, tag each row with its cancer type,
# and stack the tables into a single frame.
# NOTE(review): unlike the chromosome-level counts filename, this filename does
# not zero-pad CHROMOSOME -- confirm this matches how the files were written.
event_tables = []
for cancer_type in CANCER_TYPES:

    cancer_type_event = pd.read_csv(
        os.path.join(
            "..",
            "data",
            f"chr{CHROMOSOME}_{cancer_type}_has_event_{'harmonized' if PANCAN else 'AWG'}.tsv",
        ),
        sep="\t",
        index_col=0,
    ).assign(cancer_type=cancer_type)

    event_tables.append(cancer_type_event)

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and growing
# a frame inside a loop re-copies it every iteration. Concatenate once instead.
# pd.concat raises on an empty list, so fall back to an empty frame to keep the
# previous result when CANCER_TYPES is empty.
has_event = pd.concat(event_tables) if event_tables else pd.DataFrame()

has_event

Unnamed: 0,event,cancer_type
01BR001,True,brca
01BR008,False,brca
01BR009,False,brca
01BR010,False,brca
01BR015,True,brca
01BR017,False,brca
01BR018,False,brca
01BR020,True,brca
01BR023,True,brca
01BR025,False,brca


In [5]:
# Get event counts
# Tally the True/False event calls within each cancer type.
counts_by_type = (
    has_event
    .groupby("cancer_type")["event"]
    .value_counts()
    .rename("count")
    .reset_index()
)

# Attach each cancer type's total patient count. The right-hand key column
# ("cancer") duplicates "cancer_type" after the merge, so drop it.
event_counts = counts_by_type.merge(
    total_samples,
    left_on="cancer_type",
    right_on="cancer",
).drop(columns="cancer")

# Fraction of the cancer type's patients in each event group, rounded to two
# decimals and placed directly after the "count" column.
proportion = (
    event_counts["count"] / event_counts["cancer_type_total_patients"]
).round(2)
event_counts.insert(loc=3, column="proportion", value=proportion)

event_counts

Unnamed: 0,cancer_type,event,count,proportion,cancer_type_total_patients
0,brca,False,86,0.7,122
1,brca,True,36,0.3,122
2,colon,False,68,0.64,106
3,colon,True,38,0.36,106
4,hnscc,False,68,0.62,110
5,hnscc,True,42,0.38,110
6,lscc,False,85,0.77,110
7,lscc,True,25,0.23,110
8,luad,False,81,0.74,110
9,luad,True,29,0.26,110


In [6]:
# Rows counting the patients that have the event.
has_event_mask = event_counts["event"]
true_counts = event_counts.loc[has_event_mask]
true_counts

Unnamed: 0,cancer_type,event,count,proportion,cancer_type_total_patients
1,brca,True,36,0.3,122
3,colon,True,38,0.36,106
5,hnscc,True,42,0.38,110
7,lscc,True,25,0.23,110
9,luad,True,29,0.26,110
11,ovarian,True,26,0.32,82


In [7]:
# Rows counting the patients that do not have the event.
no_event_mask = ~event_counts["event"]
false_counts = event_counts.loc[no_event_mask]
false_counts

Unnamed: 0,cancer_type,event,count,proportion,cancer_type_total_patients
0,brca,False,86,0.7,122
2,colon,False,68,0.64,106
4,hnscc,False,68,0.62,110
6,lscc,False,85,0.77,110
8,luad,False,81,0.74,110
10,ovarian,False,56,0.68,82
