# SCNA analysis step 4: Classify and plot events

1. Calculate the proportion of its chromosome that each event covers
    1. Get the length of each chromosome in base pairs. 
        1. Since our CNA data is just for genes, should we only count the length of each chromosome that is included in genes?
    2. For each event, subtract the start from the end, then divide by the length of its chromosome.
2. Classify each event as arm-level or not, based on whether the proportion of its chromosome that it covers passes a cutoff. Should the cutoff be 0.80 or 0.98? Plot the distribution of proportions to help decide.

## Setup

In [1]:
import cptac
import pandas as pd
import numpy as np
import datetime
import os
import altair as alt
from toolz.curried import pipe

# Timestamp (YYYYmmdd_HHMMSS) taken when this cell runs; it is embedded in the
# step 4 output file name below.
TIME_START = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')

# Input file for this notebook (read further down with pd.read_csv).
STEP3_DIR = "03_outputs"
STEP3_FILE_NAME = "summary_20200706_142325_from_locations_scna_cutoff_0.2_20200706_092210.tsv.gz"
STEP3_FILE_PATH = os.path.join(STEP3_DIR, STEP3_FILE_NAME)

# Output directory. makedirs(exist_ok=True) replaces the isdir()/mkdir() pair so
# there is no window between the check and the creation, and it matches how
# json_dir() creates its directory.
STEP4_DIR = "04_outputs"
os.makedirs(STEP4_DIR, exist_ok=True)

STEP4_FILE_PATH = os.path.join(STEP4_DIR, f"summary_{TIME_START}_from_{STEP3_FILE_NAME}")

In [2]:
# Altair options
# Turn off Altair's maximum-row check for the active data transformer.
alt.data_transformers.disable_max_rows()


def json_dir(data, data_dir):
    """Altair data transformer that stores chart data as JSON files in data_dir.

    The directory is created if it does not exist. The file name is built from
    the "{prefix}-{hash}.{extension}" template, which is passed to alt.to_json
    to fill in. Returns whatever alt.to_json returns for `data`.
    """
    os.makedirs(data_dir, exist_ok=True)
    filename_template = os.path.join(data_dir, "{prefix}-{hash}.{extension}")
    return pipe(data, alt.to_json(filename=filename_template))


# Register the transformer under the name "json_dir" and make it the active one,
# writing its files into the step 4 output directory.
alt.data_transformers.register("json_dir", json_dir)
alt.data_transformers.enable("json_dir", data_dir=STEP4_DIR)

DataTransformerRegistry.enable('json_dir')

In [3]:
# Read the chromosome column as text. It holds numeric labels alongside "X", "Y"
# and contig names, and later cells compare it to strings, call .isdigit() on its
# values, and merge it against a string-keyed table. Without an explicit dtype
# pandas may infer numbers for some values.
summary = pd.read_csv(STEP3_FILE_PATH, sep="\t", dtype={"chromosome": str})

In [4]:
# Keep only the events whose cancer_type code is "br".
is_br = summary["cancer_type"] == "br"
brca = summary.loc[is_br]

In [5]:
# Inspect the "br" subset of the event summary.
brca

Unnamed: 0,chromosome,cancer_type,Patient_ID,genes,start,end,num_genes,avg_cna
0,1,br,CPT000814,"['DDX11L1', 'FAM138A', 'OR4F5', 'OR4F29', 'OR4...",11869.0,3438621.0,77,-0.270000
1,1,br,CPT000814,['RN7SL371P'],30918469.0,30918735.0,1,-0.471000
2,1,br,CPT000814,"['PUM1', 'NKAIN1', 'SNRNP40', 'ZCCHC17', 'FABP...",30931506.0,31632518.0,8,1.259000
3,1,br,CPT000814,"['CSF3R', 'GRIK3', 'MIR4255', 'RNA5SP43', 'ZC3...",36466043.0,38859772.0,26,1.930692
4,1,br,CPT000814,"['NOTCH2', 'SEC22B', 'PPIAL4A', 'LINC00623', '...",119911553.0,155934413.0,279,1.759244
5,1,br,CPT000814,"['RXFP4', 'ARHGEF2', 'SSR2', 'UBQLN4', 'LAMTOR...",155941710.0,203744081.0,452,1.680850
6,1,br,CPT000814,['SNORA77'],203729581.0,203729705.0,1,-0.484000
7,1,br,CPT000814,"['SNORA77', 'LAX1', 'ZC3H11A', 'ZBED6', 'SNRPE...",203729581.0,228555901.0,219,0.544452
8,1,br,CPT000814,['RNA5SP162'],228558296.0,228558339.0,1,-0.500000
9,1,br,CPT000814,"['RNA5S1', 'RNA5S2', 'RNA5S3', 'RNA5S4', 'RNA5...",228610268.0,228746664.0,20,0.585000


In [51]:
def plot_patients_facet(cancer_table, chromosome, return_chart=False):
    """Tabulate, and optionally plot, each patient's CNA event boundaries on one chromosome.

    Parameters
    ----------
    cancer_table : pandas.DataFrame
        Event table with at least the columns "chromosome", "Patient_ID",
        "avg_cna", "start" and "end".
    chromosome : str
        Value matched against the "chromosome" column, e.g. "8".
    return_chart : bool, default False
        If False, return the long-format boundary table (the behaviour this
        function had before, which existing callers rely on). If True, return
        an Altair chart with one row per patient instead.

    Returns
    -------
    pandas.DataFrame or Altair chart
        By default a frame with columns "avg_cna", "location", "Patient_ID"
        holding two rows per event (its start and its end), sorted by patient
        and then location. With return_chart=True, a faceted line chart of
        avg_cna against location with a red rule at zero.
    """
    events = cancer_table.loc[
        cancer_table["chromosome"] == chromosome,
        ["Patient_ID", "avg_cna", "start", "end"],
    ]

    # Long format: one row for each event's start and one for its end.
    table = events.melt(id_vars=["Patient_ID", "avg_cna"], value_name="location")
    table = table[["avg_cna", "location", "Patient_ID"]].sort_values(
        by=["Patient_ID", "location"]
    )

    if not return_chart:
        return table

    # Add a zero-valued point just outside each end of every event so the line
    # falls back to zero between events. The points are derived from each
    # event's own start/end, so they do not depend on row order, on neighbouring
    # events having different avg_cna values, or on where one patient's rows
    # stop and the next patient's begin.
    zeros = pd.DataFrame({
        "avg_cna": 0.0,
        "location": pd.concat(
            [events["start"] - 0.01, events["end"] + 0.01], ignore_index=True
        ),
        "Patient_ID": pd.concat(
            [events["Patient_ID"], events["Patient_ID"]], ignore_index=True
        ),
    })

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    chart_data = pd.concat([table, zeros], ignore_index=True).sort_values(
        by=["Patient_ID", "location"]
    )

    # Encoding types are spelled out so they do not have to be inferred from
    # data that is only attached at the layer level.
    cna_line = alt.Chart().mark_line().encode(
        x=alt.X("location:Q"),
        y=alt.Y("avg_cna:Q", scale=alt.Scale(domain=(-3, 3))),
    )

    # Horizontal reference line at zero, driven by the calculated field "a".
    horiz_line = alt.Chart().mark_rule(color="#d10000").encode(
        y="a:Q",
    )

    chart = alt.layer(
        cna_line,
        horiz_line,
        data=chart_data
    ).transform_calculate(
        a="0"
    ).properties(
        height=100
    ).facet(
        row="Patient_ID:N",
        spacing=-5
    )

    return chart

# Display floats with thousands separators and two decimals. This is a global
# pandas option, so it also affects every DataFrame displayed after this cell.
pd.options.display.float_format = '{:,.2f}'.format
# Show the first rows of the chromosome 8 boundary table for the "br" subset.
plot_patients_facet(brca, "8").head(16)

Unnamed: 0,avg_cna,location,Patient_ID
0,-0.55,166049.0,CPT000814
418,-0.55,36936125.0,CPT000814
1,2.22,37695782.0,CPT000814
419,2.22,47068323.0,CPT000814
2,-0.49,47260878.0,CPT000814
420,-0.49,47960178.0,CPT000814
3,0.46,47960185.0,CPT000814
421,0.46,60281400.0,CPT000814
4,0.46,60678740.0,CPT000814
422,0.46,85449040.0,CPT000814


In [None]:
def plot_patients_overlaid(cancer_table, chromosome):
    """Draw every patient's CNA events on one chromosome as overlaid lines.

    Rows of cancer_table whose "chromosome" equals `chromosome` are reshaped so
    that each event contributes one point at its start and one at its end, both
    at the event's avg_cna. The returned Altair chart plots avg_cna against
    location, with one colour per Patient_ID.
    """
    on_chromosome = cancer_table["chromosome"] == chromosome
    events = cancer_table.loc[on_chromosome, ["Patient_ID", "start", "end", "avg_cna"]]

    # Long format: the start and end columns collapse into a single "location" column.
    boundaries = events.melt(id_vars=["Patient_ID", "avg_cna"], value_name="location")
    boundaries = boundaries[["Patient_ID", "avg_cna", "location"]]

    lines = alt.Chart(boundaries).mark_line().encode(
        x="location",
        y="avg_cna",
        color="Patient_ID",
        stroke="Patient_ID",
    )

    return lines.properties(width=900, height=700)

# TODO: Sort samples by their average CNV value across the whole sample
# TODO: Add a vertical line
# TODO: Turn this into a reusable function

In [None]:
# Overlay the chromosome 8 events of every patient in the "br" subset on one chart.
plot_patients_overlaid(brca, "8")

## Get chromosome lengths

In [None]:
# Length of each chromosome in base pairs, one (chromosome, length) pair per row.
# NOTE(review): the values look like GRCh38/hg38 assembly lengths — confirm the
# assembly matches the one the event coordinates use.
chrs = pd.DataFrame(
    [
        ("1", 248956422),
        ("2", 242193529),
        ("3", 198295559),
        ("4", 190214555),
        ("5", 181538259),
        ("6", 170805979),
        ("7", 159345973),
        ("8", 145138636),
        ("9", 138394717),
        ("10", 133797422),
        ("11", 135086622),
        ("12", 133275309),
        ("13", 114364328),
        ("14", 107043718),
        ("15", 101991189),
        ("16", 90338345),
        ("17", 83257441),
        ("18", 80373285),
        ("19", 58617616),
        ("20", 64444167),
        ("21", 46709983),
        ("22", 50818468),
        ("X", 156040895),
        ("Y", 57227415),
    ],
    columns=["chromosome", "chrom_len_bp"],
)

In [None]:
# Load a UCSC Table Browser export; per the URL below it is the cytoBand table for hg38.
# NOTE(review): the URL embeds a session id (hgsid) and may stop working — confirm
# or record the Table Browser settings instead.
# File downloaded from https://genome.ucsc.edu/cgi-bin/hgTables?hgsid=856734045_VsAQvZHdBDGH2PuUBBiicYCEoYFm&clade=mammal&org=Human&db=hg38&hgta_group=allTables&hgta_track=hg38&hgta_table=cytoBand&hgta_regionType=genome&position=chrX%3A15%2C560%2C138-15%2C602%2C945&hgta_outputType=primaryTable&hgta_outFileName=
nchrs = pd.read_csv("ucsc_cytoband.tsv", sep="\t")

In [None]:
# Inspect the table loaded from ucsc_cytoband.tsv.
nchrs

In [None]:
# Attach each event's chromosome length.
#
# - how="left" keeps exactly the events already in summary. An outer join would
#   add an all-NaN row for every chromosome in chrs that has no events, and the
#   later per-chromosome counts (agg(len)) would count that row as an event.
#   Events on chromosomes missing from chrs get NaN for chrom_len_bp and
#   therefore NaN for prop_chromosome.
# - Dropping any earlier chrom_len_bp / prop_chromosome columns lets this cell be
#   re-run without the merge producing chrom_len_bp_x / chrom_len_bp_y.
# - Selecting the two columns from chrs raises a clear KeyError if chrs no longer
#   holds the chromosome lengths.
summary = summary.drop(
    columns=["chrom_len_bp", "prop_chromosome"],
    errors="ignore"
).merge(
    right=chrs[["chromosome", "chrom_len_bp"]],
    on="chromosome",
    how="left",
    validate="many_to_one"
)

# Fraction of its chromosome that each event spans.
summary = summary.assign(
    prop_chromosome=(summary["end"] - summary["start"]) / summary["chrom_len_bp"]
)

In [None]:
# Histogram of event sizes in bins of 0.01 of a chromosome.
# Only the plotted column is handed to Altair. Passing the whole summary frame
# would serialise every column, including the long gene-list strings, into the
# chart's data.
alt.Chart(summary[["prop_chromosome"]]).mark_bar().encode(
    x=alt.X(
        "prop_chromosome:Q", 
        title="Proportion of chromosome covered",
        bin=alt.Bin(step=0.01)
    ),
    y=alt.Y(
        "count()",
        title="Number of events"
    )
).properties(
    width=700,
    height=500,
    title="Distribution of CNA event sizes"
)

In [None]:
# For each patient, count the events covering more than 98% of their chromosome.
# The count ends up in the "chromosome" column; patients with the most such
# events are listed first.
(
    summary
    .loc[summary["prop_chromosome"] > 0.98]
    .groupby(["Patient_ID", "cancer_type"])[["chromosome"]]
    .agg(len)
    .sort_values(
        by=["chromosome", "cancer_type", "Patient_ID"],
        ascending=[False, True, True],
    )
)

In [None]:
# List the events covering more than 98% of their chromosome, ordered by patient.
(
    summary
    .loc[summary["prop_chromosome"] > 0.98]
    .sort_values(by=["Patient_ID", "chromosome", "cancer_type"])
)

In [None]:
# Number of events per cancer type. The count is stored in the "start" column
# because that is the column selected before aggregating with len. The two-letter
# cancer codes are then replaced with display names.
cancers = (
    summary
    .groupby("cancer_type")[["start"]]
    .agg(len)
    .reset_index()
    .assign(
        cancer_type=lambda counts: counts["cancer_type"].replace({
            "br": "BRCA",
            "cc": "CCRCC",
            "co": "Colon",
            "en": "Endometrial",
            "gb": "GBM",
            "hn": "HNSCC",
            "ls": "LSCC",
            "lu": "LUAD",
            "ov": "Ovarian"
        })
    )
)

In [None]:
# Bar chart of the per-cancer-type event counts (held in the "start" column of cancers).
alt.Chart(cancers).mark_bar().encode(
    x=alt.X("cancer_type", title="Cancer type"),
    y=alt.Y("start", title="Number of CNA events"),
).properties(title="Events per cancer type")

In [None]:
# Number of events per chromosome; the count is stored in the "start" column.
# NOTE(review): this rebinds chrs, which earlier held the chromosome-length table
# used by the merge into summary. Re-running that merge cell after this one will
# fail; consider a distinct name here and in the cells that follow.
chrs = (
    summary
    .groupby("chromosome")[["start"]]
    .agg(len)
    .reset_index()
)

# What are the weird chromosomes?

In [None]:
# Drop the two non-chromosome sequence names from the per-chromosome counts.
is_extra_contig = chrs["chromosome"].isin(["GL000220.1", "KI270733.1"])
chrs = chrs.loc[~is_extra_contig]

In [None]:
# Axis order for the chart: numeric chromosome labels in numeric (not
# lexicographic) order, followed by "X" and "Y".
chrs_sort = [
    str(number)
    for number in sorted(int(label) for label in chrs["chromosome"].values if label.isdigit())
] + ["X", "Y"]

In [None]:
# Bar chart of the per-chromosome event counts (held in the "start" column of
# chrs), with the x axis in the order given by chrs_sort.
alt.Chart(chrs).mark_bar().encode(
    x=alt.X("chromosome", title="Chromosome", sort=chrs_sort),
    y=alt.Y("start", title="Number of CNA events"),
).properties(title="Events per chromosome across all cancer types")