# Adventure is out there!

Many of these look like gene-level tables, with all patients aggregated together, but we want to look at each patient individually.

In [1]:
import pcprutils as ut
import altair as alt

# Lift Altair's default row-count limit so that large DataFrames can be charted.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

## Load formatted proteomics and transcriptomics tables

In [2]:
# Load the table(s) for the "hnscc" dataset through the project helper. The
# result is iterated with .items() further down, so it behaves like a
# name -> DataFrame mapping.
tables = ut.load_prot_trans(["hnscc"])

                                          

In [3]:
# Print each loaded table's name together with its (rows, columns) shape.
# NOTE: `df` stays bound to the last table once the loop has finished.
for name, df in tables.items():
    print(f"{name}: {df.shape}")

hnscc: (1454346, 5)


## Graphs

In [4]:
# Select the HNSCC table by key rather than relying on whatever `df` happened
# to be bound to by an earlier loop, then strip the ".N" suffix from patient
# IDs (presumably the marker for normal-tissue samples -- confirm) so that all
# rows belonging to one patient share a single Patient_ID.
df = tables["hnscc"].assign(
    Patient_ID=lambda t: t["Patient_ID"].str.replace(".N", "", regex=False)
)

In [5]:
def make_regplot(pdf):
    """Build a faceted Proteomics-vs-Transcriptomics scatter plot with a fit line.

    Args:
        pdf: Data accepted by ``alt.Chart`` that provides the columns
            ``Transcriptomics``, ``Proteomics``, ``Tissue`` and ``Gene``.

    Returns:
        The Altair chart: points layered with a regression line, faceted by
        ``Tissue`` in two columns, with independent x and y scales per facet.
    """
    # Scatter layer: one circle per row, coloured by tissue, gene name on hover.
    points = (
        alt.Chart(pdf)
        .mark_circle()
        .encode(x="Transcriptomics", y="Proteomics", color="Tissue", tooltip=["Gene"])
    )

    # Regression layer derived from the scatter layer.
    trend = points.transform_regression("Transcriptomics", "Proteomics").mark_line()
    # The fold adds a "regression" field that the colour encoding below uses;
    # this looks like the usual trick for giving the line its own legend entry.
    trend = trend.transform_fold(["regression line"], as_=["regression", "y"])
    trend = trend.encode(alt.Color("regression:N"))

    layered = points + trend
    faceted = layered.facet(columns=2, facet=alt.Facet("Tissue"))
    return faceted.resolve_scale(x="independent", y="independent")

In [6]:
# Unique IDs of the patients that have at least one "Normal" tissue row.
norms = df.loc[df["Tissue"] == "Normal", "Patient_ID"].drop_duplicates()

In [7]:
# One regression plot for each of the first ten patients in `norms`,
# stacked vertically into a single chart.
charts = [
    make_regplot(df[df["Patient_ID"] == patient_id])
    for patient_id in norms.head(10)
]

a = alt.vconcat(*charts)

In [12]:
# Write the vertically concatenated charts held in `a` to test.html.
a.save("test.html")

In [8]:
# Keep only the rows whose patient also appears in `norms`.
ndf = df.loc[df["Patient_ID"].isin(norms)]

In [9]:
# Display the filtered table.
ndf

Unnamed: 0,Patient_ID,Gene,Proteomics,Tissue,Transcriptomics
2,C3L-00994,A1BG,28.348186,Tumor,5.14
3,C3L-00995,A1BG,28.004445,Tumor,5.69
4,C3L-00997,A1BG,27.735214,Tumor,4.54
5,C3L-00999,A1BG,27.949122,Tumor,4.89
7,C3L-01237,A1BG,28.216073,Tumor,5.42
8,C3L-02617,A1BG,27.452281,Tumor,4.26
9,C3L-02621,A1BG,27.701040,Tumor,6.21
10,C3L-02651,A1BG,27.616140,Tumor,4.97
11,C3L-03378,A1BG,27.756237,Tumor,4.82
17,C3N-00204,A1BG,27.827772,Tumor,7.49


In [10]:
# Minimum number of paired observations a group needs before a correlation is
# reported; groups with fewer pairs yield NaN.
cutoff = 15
# if cancer_name == 'Endometrial':
#     cutoff = 10

def get_corr(group):
    """Return the Spearman correlation of Proteomics vs. Transcriptomics.

    Args:
        group: DataFrame containing at least the numeric columns
            ``Proteomics`` and ``Transcriptomics``; any other columns
            (IDs, gene names, tissue labels) are ignored.

    Returns:
        The Spearman correlation coefficient between the two columns, or NaN
        when fewer than ``cutoff`` paired observations are available.

    Raises:
        KeyError: If either required column is missing from ``group``.
    """
    # Select the two measurement columns explicitly: calling .corr() on the
    # whole group trips over the string columns on pandas >= 2.0, and made the
    # positional lookup below depend on the column order.
    pair = group[["Proteomics", "Transcriptomics"]]
    res = pair.corr(method="spearman", min_periods=cutoff)
    # Off-diagonal entry of the 2x2 correlation matrix.
    return res.iloc[0, 1]

# Correlation for every (patient, tissue) pair, reshaped to one row per
# patient with one column per tissue.
corr = (
    ndf.groupby(["Patient_ID", "Tissue"])
    .apply(get_corr)
    .to_frame()
    .reset_index()
    .pivot(index="Patient_ID", columns="Tissue", values=0)
)

# Tumor-minus-normal correlation for each patient.
corr = corr.assign(diff=lambda wide: wide["Tumor"] - wide["Normal"])

corr

Tissue,Normal,Tumor,diff
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C3L-00994,0.449391,0.465987,0.016597
C3L-00995,0.435475,0.434951,-0.000524
C3L-00997,0.434359,0.471041,0.036683
C3L-00999,0.434085,0.434559,0.000474
C3L-01237,0.453892,0.451955,-0.001937
C3L-02617,0.429887,0.422181,-0.007706
C3L-02621,0.436097,0.43191,-0.004187
C3L-02651,0.427349,0.448294,0.020945
C3L-03378,0.435145,0.452945,0.0178
C3N-00204,0.452002,0.453287,0.001285


In [11]:
# Box plot of the per-patient `diff` column of `corr`.
(
    alt.Chart(corr)
    .mark_boxplot()
    .encode(y="diff")
    .properties(width=100, height=500)
)