In [1]:
import pandas as pd
import pcprutils as ut
import numpy as np
import altair as alt

In [2]:
def get_resid(reg_df, prot_trans_df):
    """Compute each sample's orthogonal residual to its regression line.

    The two frames are inner-joined on "Gene" and "Tissue", so every sample
    row is paired with the slope ``m`` and intercept ``b`` of the line
    ``y = m * x + b`` fitted for that gene/tissue combination. Rows without a
    matching regression entry are dropped by the inner join.

    Parameters
    ----------
    reg_df : pandas.DataFrame
        Must contain the columns "Gene", "Tissue", "m" and "b".
    prot_trans_df : pandas.DataFrame
        Must contain the columns "Gene", "Tissue", "Transcriptomics" (used as
        x) and "Proteomics" (used as y).

    Returns
    -------
    pandas.DataFrame
        The merged frame with four extra columns:

        * ``orth_resid`` - Euclidean distance from the sample to the line.
        * ``intersect_x`` / ``intersect_y`` - the point on the line closest
          to the sample (foot of the perpendicular).
        * ``above_reg_line`` - True where the sample's y value is greater
          than ``intersect_y``.
    """
    merged = prot_trans_df.merge(
        reg_df,
        on=["Gene", "Tissue"],
        how="inner",
    )

    x = merged["Transcriptomics"]
    y = merged["Proteomics"]
    m = merged["m"]
    b = merged["b"]

    # Foot of the perpendicular from (x, y) onto y = m*x + b, in closed form.
    # This is what intersecting with the line of slope -1/m simplifies to,
    # but it never divides by m, so a horizontal regression line (m == 0)
    # yields finite values instead of inf/NaN.
    int_x = (x + m * (y - b)) / (1 + m ** 2)
    int_y = m * int_x + b

    d = np.sqrt((int_x - x) ** 2 + (int_y - y) ** 2)

    above_line = y > int_y

    return merged.assign(
        orth_resid=d,
        intersect_x=int_x,
        intersect_y=int_y,
        above_reg_line=above_line,
    )



In [3]:
def calculate_residuals(cancer_types):
    """Write an orthogonal-residual table for every requested cancer type.

    For each entry of ``cancer_types`` the regression table
    ``<cancer_type>_regression.tsv`` is read from the working directory,
    combined with that cancer type's abundance data via ``get_resid``, and the
    result is saved as the gzip-compressed, tab-separated file
    ``<cancer_type>_residuals.tsv.gz`` (without the index column).

    Parameters
    ----------
    cancer_types : iterable of str
        Names used both as keys into the loaded abundance data and as file
        name prefixes.

    Returns
    -------
    None
    """
    # NOTE(review): ut.load_prot_trans is assumed to return a mapping from
    # cancer type to a sample-level DataFrame - confirm against pcprutils.
    abundance_by_type = ut.load_prot_trans(cancer_types)

    for cancer_type in cancer_types:
        regression_path = f'{cancer_type}_regression.tsv'
        output_path = f'{cancer_type}_residuals.tsv.gz'

        fitted_lines = pd.read_csv(regression_path, sep="\t")
        residual_table = get_resid(fitted_lines, abundance_by_type[cancer_type])

        residual_table.to_csv(
            output_path,
            sep='\t',
            compression='gzip',
            index=False,
        )

In [4]:
# Generate one <cancer_type>_residuals.tsv.gz file per listed cancer type.
calculate_residuals(["ccrcc", "endometrial", "hnscc", "lscc", "luad"])

                                                

### Plot residuals as a sanity check

In [7]:
def plot_tissue_residuals(resid_df, gene, tissue):
    """Plot one gene's regression line and per-patient residuals for a tissue.

    The chart contains the regression line ``y = m * x + b`` plus, for every
    patient, the sample point, its closest point on the line, and a segment
    joining the two.

    Parameters
    ----------
    resid_df : pandas.DataFrame
        Must contain the columns "Gene", "Tissue", "Patient_ID",
        "Transcriptomics", "Proteomics", "m", "b", "intersect_x",
        "intersect_y" and "orth_resid".
    gene : str
        Value of the "Gene" column to plot.
    tissue : str
        Value of the "Tissue" column to plot. "Normal" is drawn in blue
        (#5778a4); any other value is drawn in orange (#e49444).

    Returns
    -------
    tuple
        ``(chart, tran_rng, prot_rng)``: the layered Altair chart, the span
        (max - min) of "Transcriptomics" and the span of "Proteomics" among
        the selected rows.

    Raises
    ------
    ValueError
        If no row matches ``gene`` and ``tissue``.
    AssertionError
        If the selected rows carry more than one distinct slope or intercept.
    """
    df = resid_df[(resid_df["Gene"] == gene) & (resid_df["Tissue"] == tissue)]
    if df.empty:
        raise ValueError(
            f"No residual rows found for gene {gene!r} in tissue {tissue!r}"
        )

    tran_min = df["Transcriptomics"].min()
    tran_max = df["Transcriptomics"].max()
    tran_rng = tran_max - tran_min

    prot_min = df["Proteomics"].min()
    prot_max = df["Proteomics"].max()
    prot_rng = prot_max - prot_min

    color_hex = "#5778a4" if tissue == "Normal" else "#e49444"

    # All rows of one gene/tissue pair must share a single fitted line.
    all_m = df["m"].drop_duplicates(keep="first")
    assert all_m.shape[0] == 1, "expected exactly one slope for the gene/tissue pair"
    m = all_m.iloc[0]

    all_b = df["b"].drop_duplicates(keep="first")
    assert all_b.shape[0] == 1, "expected exactly one intercept for the gene/tissue pair"
    b = all_b.iloc[0]

    # A straight line is fully described by its two endpoints; using exactly
    # the observed x-extent keeps the line inside the declared scale domain.
    xs = np.array([tran_min, tran_max])
    reg_line_df = pd.DataFrame({
        "x": xs,
        "y": m * xs + b,
    })

    x_scale = alt.Scale(domain=[tran_min, tran_max])
    y_scale = alt.Scale(domain=[prot_min, prot_max])

    reg_line = alt.Chart(reg_line_df).mark_line(color=color_hex).encode(
        x=alt.X("x", scale=x_scale),
        y=alt.Y("y", scale=y_scale),
    )

    # One row per patient (the first occurrence), fetched in a single pass
    # instead of re-filtering the whole frame for every plotted value.
    needed_cols = [
        "Patient_ID",
        "Transcriptomics",
        "Proteomics",
        "intersect_x",
        "intersect_y",
        "orth_resid",
    ]
    first_rows = df.drop_duplicates(subset="Patient_ID", keep="first")[needed_cols]

    pt_data = []
    for pid, x, y, int_x, int_y, resid in first_rows.itertuples(index=False, name=None):
        # Two rows per patient: the sample itself and its foot on the line.
        pt_df = pd.DataFrame({
            "x": [x, int_x],
            "y": [y, int_y],
            "resid": [resid] * 2,
            "Patient_ID": [pid] * 2,
        })

        pt_base = alt.Chart(pt_df).encode(
            x=alt.X("x", title="RNA abundance", scale=x_scale),
            y=alt.Y("y", title="Protein abundance", scale=y_scale),
            tooltip=["Patient_ID", "resid"],
        )

        pt_data.append(pt_base.mark_line(color=color_hex))
        pt_data.append(pt_base.mark_point(color=color_hex))

    return alt.layer(reg_line, *pt_data), tran_rng, prot_rng
    

def plot_residuals(resid_df, gene):
    """Overlay the tumor and normal residual plots for a single gene.

    Builds one chart per tissue with ``plot_tissue_residuals`` ("Tumor" and
    "Normal"), layers them with shared x and y scales, and sizes the result
    proportionally to the data ranges.

    Parameters
    ----------
    resid_df : pandas.DataFrame
        Residual table passed through to ``plot_tissue_residuals``.
    gene : str
        Gene to plot; also shown in the chart title.

    Returns
    -------
    The layered Altair chart.
    """
    px_per_unit = 75  # chart pixels per unit of abundance range

    tchart, txr, tyr = plot_tissue_residuals(resid_df, gene, "Tumor")
    nchart, nxr, nyr = plot_tissue_residuals(resid_df, gene, "Normal")

    # Size each axis from the wider of the two tissues so neither layer is
    # squeezed when its range exceeds the other's.
    x_rng = max(txr, nxr)
    y_rng = max(tyr, nyr)

    return alt.layer(tchart, nchart).properties(
        width=x_rng * px_per_unit,
        height=y_rng * px_per_unit,
        title=["Tumor and normal RNA-protein correlations", "Protein: " + gene],
    ).resolve_scale(
        x="shared",
        y="shared",
    )

# Sanity check: draw the residual plot for gene AADAC from the HNSCC table.
plot_residuals(
    pd.read_csv("hnscc_residuals.tsv.gz", sep="\t"),
    "AADAC",
)