In [1]:
import altair as alt
import os
import pandas as pd

In [2]:
# Load the AWG CNV counts, keep only the EGFR rows, and express each count as
# a percentage of the patients in its cancer type.
awg = (
    pd.read_csv(os.path.join("data", "cnv_counts.tsv"), sep='\t')
    # Without Database_ID, rows that differ only by that ID become exact
    # duplicates and are collapsed by drop_duplicates below.
    .drop(columns=["Database_ID"])
    .loc[lambda counts: counts["Name"] == "EGFR"]
    .drop_duplicates(keep="first")
    .assign(
        value_prop=lambda counts: (
            counts["value"] / counts["cancer_type_total_patients"] * 100
        ),
        dataset="awg",
    )
)
awg

Unnamed: 0,Name,start_bp,end_bp,variable,value,cancer_type_total_patients,cancer,value_prop,dataset
644,EGFR,55019017.0,55211628.0,gain,27,122,brca,22.131148,awg
645,EGFR,55019017.0,55211628.0,loss,9,122,brca,7.377049,awg
2570,EGFR,55019017.0,55211628.0,gain,28,110,ccrcc,25.454545,awg
2571,EGFR,55019017.0,55211628.0,loss,2,110,ccrcc,1.818182,awg
5414,EGFR,55019017.0,55211628.0,gain,66,105,colon,62.857143,awg
5417,EGFR,55019017.0,55211628.0,loss,1,105,colon,0.952381,awg
10816,EGFR,55019017.0,55211628.0,gain,9,95,endometrial,9.473684,awg
10819,EGFR,55019017.0,55211628.0,loss,4,95,endometrial,4.210526,awg
16202,EGFR,55019017.0,55211628.0,gain,81,98,gbm,82.653061,awg
16205,EGFR,55019017.0,55211628.0,loss,0,98,gbm,0.0,awg


In [3]:
# Load the harmonized (pancan) CNV counts, keep only the EGFR rows, and express
# each count as a percentage of the patients in its cancer type.
harmonized = (
    pd.read_csv(os.path.join("data", "cnv_counts_pancan.tsv"), sep='\t')
    # Without Database_ID, rows that differ only by that ID become exact
    # duplicates and are collapsed by drop_duplicates below.
    .drop(columns=["Database_ID"])
    .loc[lambda counts: counts["Name"] == "EGFR"]
    .drop_duplicates(keep="first")
    .assign(
        value_prop=lambda counts: (
            counts["value"] / counts["cancer_type_total_patients"] * 100
        ),
        dataset="harmonized",
    )
)
harmonized

Unnamed: 0,Name,start_bp,end_bp,variable,value,cancer_type_total_patients,cancer,value_prop,dataset
514,EGFR,55019017.0,55211628.0,gain,12,122,brca,9.836066,harmonized
515,EGFR,55019017.0,55211628.0,loss,9,122,brca,7.377049,harmonized
2202,EGFR,55019017.0,55211628.0,gain,28,110,ccrcc,25.454545,harmonized
2203,EGFR,55019017.0,55211628.0,loss,1,110,ccrcc,0.909091,harmonized
3890,EGFR,55019017.0,55211628.0,gain,46,106,colon,43.396226,harmonized
3891,EGFR,55019017.0,55211628.0,loss,0,106,colon,0.0,harmonized
5578,EGFR,55019017.0,55211628.0,gain,13,95,endometrial,13.684211,harmonized
5579,EGFR,55019017.0,55211628.0,loss,3,95,endometrial,3.157895,harmonized
7266,EGFR,55019017.0,55211628.0,gain,75,99,gbm,75.757576,harmonized
7267,EGFR,55019017.0,55211628.0,loss,0,99,gbm,0.0,harmonized


In [4]:
# Stack the AWG and harmonized rows into one long table for plotting.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; row labels are preserved, as append did by default.
df = pd.concat([awg, harmonized])

In [13]:
def make_plot(df, event_type):
    """Build a bar chart of ``value_prop`` per dataset for one event type.

    Only the rows of ``df`` whose ``variable`` column equals ``event_type``
    are plotted. Bars are placed along x by ``dataset``, coloured by
    ``dataset``, and faceted into one column per ``cancer`` value. The y axis
    is fixed to the 0-100 range.

    Parameters
    ----------
    df
        Table with ``variable``, ``dataset``, ``value_prop`` and ``cancer``
        columns.
    event_type
        Value to match in the ``variable`` column; it is also interpolated
        into the y-axis title.

    Returns
    -------
    The Altair chart built from the filtered rows.
    """
    events = df[df["variable"] == event_type]

    x_encoding = alt.X(
        "dataset:N",
        axis=alt.Axis(title=None, labelAngle=50),
    )
    y_encoding = alt.Y(
        "value_prop",
        axis=alt.Axis(title=f"% patients w/ {event_type} event"),
        # Same fixed scale for every panel so charts are directly comparable.
        scale=alt.Scale(domain=(0, 100)),
    )
    facet_encoding = alt.Column("cancer", title=None)
    color_encoding = alt.Color("dataset", scale=alt.Scale(scheme="paired"))

    bars = alt.Chart(events).mark_bar()
    bars = bars.encode(
        x=x_encoding,
        y=y_encoding,
        column=facet_encoding,
        color=color_encoding,
    )
    return bars.properties(title="")

# Stack one chart per event type (gain on top, loss below) under a single
# centred title. The expression is left as the cell's last value so the
# notebook renders it.
alt.vconcat(
    *(make_plot(df, event) for event in ("gain", "loss"))
).properties(
    title="EGFR CNV counts in AWG vs harmonized datasets"
).configure_title(
    anchor="middle"
)