# Calculate event statistics

For example, the proportion of patients in each cancer type that have the event.

In [1]:
import altair as alt
import cnvutils
import os
import pandas as pd

In [2]:
# Read the gen, chr, and arm parameter files and pull out the settings that
# the rest of this notebook refers to.
gen_params_path = os.path.join("..", "..", "..", "data", "gen_params.json")
gen_params = cnvutils.load_params(gen_params_path)
PANCAN = gen_params["PANCAN"]

chr_params_path = os.path.join("..", "..", "data", "chr_params.json")
chr_params = cnvutils.load_params(chr_params_path)
CHROMOSOME = chr_params["CHROMOSOME"]

arm_params_path = os.path.join("..", "data", "arm_params.json")
arm_params = cnvutils.load_params(arm_params_path)
ARM, EVENT_TYPE, CANCER_TYPES = (
    arm_params[key] for key in ("ARM", "TYPE", "ARM_CANCER_TYPES")
)

In [3]:
# Total number of patients per cancer type, read back from the chromosome-level
# CNV counts table that was saved earlier.
cnv_counts_path = os.path.join(
    "..",
    "..",
    "data",
    f"chr{CHROMOSOME:0>2}_cnv_counts_{'harmonized' if PANCAN else 'AWG'}.tsv",
)
cnv_counts = pd.read_csv(cnv_counts_path, sep="\t", index_col=0)

# Keep one (cancer, total) row per distinct pair and renumber the rows from 0.
total_samples = (
    cnv_counts[["cancer", "cancer_type_total_patients"]]
    .drop_duplicates(keep="first")
    .reset_index(drop=True)
)

total_samples

Unnamed: 0,cancer,cancer_type_total_patients
0,brca,122
1,ccrcc,110
2,coad,106
3,gbm,99
4,hnscc,110
5,lscc,110
6,luad,110
7,ov,82
8,pdac,140
9,ucec,95


In [4]:
# Load the arm event table for every cancer type and stack them into one frame.
# The per-type tables are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copied the accumulated frame on every iteration.
event_tables = []
for cancer_type in CANCER_TYPES:

    cancer_type_event = pd.read_csv(
        os.path.join(
            "..",
            "data",
            f"chr{CHROMOSOME:0>2}{ARM}_{cancer_type}_has_event_{'harmonized' if PANCAN else 'AWG'}.tsv"
        ),
        sep='\t',
        index_col=0,
    ).assign(cancer_type=cancer_type)  # tag each row with the cancer type it came from

    event_tables.append(cancer_type_event)

# pd.concat raises on an empty list, so fall back to an empty frame (what the
# original append loop produced) when there are no cancer types to load.
has_event = pd.concat(event_tables) if event_tables else pd.DataFrame()

has_event

Unnamed: 0,event,proportion,cancer_type
01BR001,True,1.000000,brca
01BR008,False,0.000000,brca
01BR009,False,0.221428,brca
01BR010,False,0.000000,brca
01BR015,True,1.000000,brca
01BR017,False,0.000000,brca
01BR018,True,1.000000,brca
01BR020,True,1.000000,brca
01BR023,True,0.988175,brca
01BR025,False,0.000000,brca


In [5]:
# Count, within each cancer type, how many rows have each value of "event"
per_type_counts = (
    has_event
    .groupby("cancer_type")["event"]
    .value_counts()
    .rename("count")
    .reset_index()
)

# Attach each cancer type's patient total, then drop the duplicate key column
event_counts = per_type_counts.merge(
    total_samples,
    left_on="cancer_type",
    right_on="cancer",
).drop(columns="cancer")

# Share of the cancer type's total patients, rounded to two decimals and
# placed as the fourth column (right after "count")
count_share = (
    event_counts["count"] / event_counts["cancer_type_total_patients"]
).round(2)
event_counts.insert(3, "proportion", count_share)

event_counts

Unnamed: 0,cancer_type,event,count,proportion,cancer_type_total_patients
0,brca,False,77,0.63,122
1,brca,True,45,0.37,122
2,coad,False,72,0.68,106
3,coad,True,34,0.32,106
4,hnscc,False,80,0.73,110
5,hnscc,True,30,0.27,110
6,lscc,False,67,0.61,110
7,lscc,True,43,0.39,110
8,luad,False,76,0.69,110
9,luad,True,34,0.31,110


In [6]:
# Rows of the counts table where "event" is True
event_mask = event_counts["event"]
true_counts = event_counts.loc[event_mask]
true_counts

Unnamed: 0,cancer_type,event,count,proportion,cancer_type_total_patients
1,brca,True,45,0.37,122
3,coad,True,34,0.32,106
5,hnscc,True,30,0.27,110
7,lscc,True,43,0.39,110
9,luad,True,34,0.31,110
11,ov,True,32,0.39,82


In [7]:
# Rows of the counts table where "event" is False
no_event_mask = ~event_counts["event"]
false_counts = event_counts.loc[no_event_mask]
false_counts

Unnamed: 0,cancer_type,event,count,proportion,cancer_type_total_patients
0,brca,False,77,0.63,122
2,coad,False,72,0.68,106
4,hnscc,False,80,0.73,110
6,lscc,False,67,0.61,110
8,luad,False,76,0.69,110
10,ov,False,50,0.61,82


### Plot the results

In [8]:
# Stack the event rows followed by the no-event rows for plotting.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
joined_counts = pd.concat([true_counts, false_counts])

# Bar chart of counts by event value, with one column facet per cancer type
has_event_chart = alt.Chart(joined_counts).mark_bar().encode(
    x="event",
    y="count",
    color="event",
    column="cancer_type"
)

has_event_chart

In [9]:
# Save the chart
chart_fmt = gen_params["CHART_FORMAT"]
chart_scale = gen_params["CHART_SCALE"]

chart_dir = os.path.join("..", "data", "charts_img")
has_event_chart_path = os.path.join(
    chart_dir,
    f"chr{CHROMOSOME}{ARM}_{EVENT_TYPE}_has_event_chart_{'harmonized' if PANCAN else 'AWG'}_altair.{chart_fmt}"
)

has_event_chart.save(has_event_chart_path, scale_factor=chart_scale)

INFO:tornado.access:200 GET / (::1) 11.87ms
INFO:tornado.access:200 GET / (::1) 11.87ms
INFO:tornado.access:200 GET /vega.js (::1) 12.73ms
INFO:tornado.access:200 GET /vega.js (::1) 12.73ms
INFO:tornado.access:200 GET /vega-lite.js (::1) 4.74ms
INFO:tornado.access:200 GET /vega-lite.js (::1) 4.74ms
INFO:tornado.access:200 GET /vega-embed.js (::1) 5.63ms
INFO:tornado.access:200 GET /vega-embed.js (::1) 5.63ms


# Make a histogram of how well patients divide between having and not having the event

In [10]:
# Show the event table loaded above again; its "proportion" column is what the
# histograms below bin.
has_event

Unnamed: 0,event,proportion,cancer_type
01BR001,True,1.000000,brca
01BR008,False,0.000000,brca
01BR009,False,0.221428,brca
01BR010,False,0.000000,brca
01BR015,True,1.000000,brca
01BR017,False,0.000000,brca
01BR018,True,1.000000,brca
01BR020,True,1.000000,brca
01BR023,True,0.988175,brca
01BR025,False,0.000000,brca


In [11]:
# All together
all_cancers_histo = alt.Chart(has_event).mark_bar().encode(
    x=alt.X(
        "proportion",
        bin=True
    ),
    y="count()",
).properties(
    title=f"Frequency of {EVENT_TYPE} of different proportions of chromosome {CHROMOSOME}{ARM}"
)

all_cancers_histo

In [12]:
# Save the chart
all_cancers_histo_path = os.path.join(
    chart_dir,
    f"chr{CHROMOSOME}{ARM}_{EVENT_TYPE}_arm_proportion_histogram_all_cancers_{'harmonized' if PANCAN else 'AWG'}_altair.{chart_fmt}"
)

all_cancers_histo.save(all_cancers_histo_path, scale_factor=chart_scale)

INFO:tornado.access:304 GET / (::1) 0.75ms
INFO:tornado.access:304 GET / (::1) 0.75ms
INFO:tornado.access:304 GET /vega.js (::1) 2.20ms
INFO:tornado.access:304 GET /vega.js (::1) 2.20ms
INFO:tornado.access:304 GET /vega-lite.js (::1) 1.39ms
INFO:tornado.access:304 GET /vega-lite.js (::1) 1.39ms
INFO:tornado.access:304 GET /vega-embed.js (::1) 2.95ms
INFO:tornado.access:304 GET /vega-embed.js (::1) 2.95ms


In [13]:
# By cancer type
split_cancers_histo = alt.Chart(has_event).mark_bar().encode(
    x=alt.X(
        "proportion",
        bin=True
    ),
    y="count()",
    row="cancer_type",
).properties(
    title=[
        f"Frequency of {EVENT_TYPE} of different proportions of chromosome {CHROMOSOME}{ARM}",
        "Separated by cancer type"
    ]
).configure_title(
    anchor="middle"
)

split_cancers_histo

In [14]:
# Save the chart
split_cancers_histo_path = os.path.join(
    chart_dir,
    f"chr{CHROMOSOME}{ARM}_{EVENT_TYPE}_split_cancers_arm_proportion_histogram_split_cancers_{'harmonized' if PANCAN else 'AWG'}_altair.{chart_fmt}"
)

split_cancers_histo.save(split_cancers_histo_path, scale_factor=chart_scale)

INFO:tornado.access:304 GET / (::1) 0.39ms
INFO:tornado.access:304 GET / (::1) 0.39ms
INFO:tornado.access:304 GET /vega.js (::1) 1.72ms
INFO:tornado.access:304 GET /vega.js (::1) 1.72ms
INFO:tornado.access:304 GET /vega-lite.js (::1) 3.33ms
INFO:tornado.access:304 GET /vega-lite.js (::1) 3.33ms
INFO:tornado.access:304 GET /vega-embed.js (::1) 4.49ms
INFO:tornado.access:304 GET /vega-embed.js (::1) 4.49ms
