# SCNA analysis step 4: Classify events

1. Calculate the proportion of its chromosome that each event covers
    1. Get the length of each chromosome in base pairs. 
        1. Since our CNA data is just for genes, should we only count the length of each chromosome that is included in genes?
    2. For each event, subtract the start from the end, then divide by the length of its chromosome.
2. Classify each event as arm-level or not, based on whether the proportion of its chromosome that it covers passes a cutoff. Candidate cutoffs are 0.80 and 0.98; plot the distribution to help decide.

## Setup

In [1]:
import cptac
import pandas as pd
import numpy as np
import datetime
import os
import altair as alt
from toolz.curried import pipe

# Timestamp taken when this cell runs; it is embedded in the step 4 output file name.
TIME_START = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')

# Input table for this step (the names suggest it is the step 3 event summary).
STEP3_DIR = "03_outputs"
STEP3_FILE_NAME = "summary_20200706_142325_from_locations_scna_cutoff_0.2_20200706_092210.tsv.gz"
STEP3_FILE_PATH = os.path.join(STEP3_DIR, STEP3_FILE_NAME)

# Output directory for this step; created if it does not exist yet.
STEP4_DIR = "04_outputs"
os.makedirs(STEP4_DIR, exist_ok=True)

# Output path: records both this run's timestamp and the input file it came from.
STEP4_FILE_PATH = os.path.join(STEP4_DIR, f"summary_{TIME_START}_from_{STEP3_FILE_NAME}")

In [2]:
# Altair options
# Remove Altair's limit on the number of rows a chart's dataset may contain.
alt.data_transformers.disable_max_rows()


def json_dir(data, data_dir):
    """Altair data transformer that stores chart data as JSON files in `data_dir`.

    Creates `data_dir` if it is missing, then hands `data` to Altair's `to_json`
    transformer with a filename template located inside that directory.
    """
    os.makedirs(data_dir, exist_ok=True)
    filename_template = os.path.join(data_dir, "{prefix}-{hash}.{extension}")
    return pipe(data, alt.to_json(filename=filename_template))


# Register the transformer under the name "json_dir" and switch it on, pointing
# it at this step's output directory.
alt.data_transformers.register("json_dir", json_dir)
alt.data_transformers.enable("json_dir", data_dir=STEP4_DIR)

DataTransformerRegistry.enable('json_dir')

In [3]:
# Load the event summary table (tab-separated).
# The chromosome column is read explicitly as text: it mixes numeric names with
# "X"/"Y" and contig IDs such as "GL000220.1", and it has to match the string
# chromosome keys used when chromosome lengths are merged in. Leaving the dtype
# to inference risks integer values that silently fail to match those keys.
summary = pd.read_csv(STEP3_FILE_PATH, sep="\t", dtype={"chromosome": str})

## Get chromosome lengths

In [4]:
# Reference table of chromosome lengths in base pairs, one row per chromosome.
# NOTE(review): the values look like GRCh38 assembly lengths; confirm the
# assembly matches the one used for the event coordinates.
chrs = pd.DataFrame(
    [
        ("1", 248956422),
        ("2", 242193529),
        ("3", 198295559),
        ("4", 190214555),
        ("5", 181538259),
        ("6", 170805979),
        ("7", 159345973),
        ("8", 145138636),
        ("9", 138394717),
        ("10", 133797422),
        ("11", 135086622),
        ("12", 133275309),
        ("13", 114364328),
        ("14", 107043718),
        ("15", 101991189),
        ("16", 90338345),
        ("17", 83257441),
        ("18", 80373285),
        ("19", 58617616),
        ("20", 64444167),
        ("21", 46709983),
        ("22", 50818468),
        ("X", 156040895),
        ("Y", 57227415),
    ],
    columns=["chromosome", "chrom_len_bp"],
)

In [5]:
# Attach each event's chromosome length.
# A left join keeps exactly one row per event. An outer join would also append
# an all-NaN placeholder row for every reference chromosome that has no events,
# and the later per-chromosome `agg(len)` counts would report it as an event.
# `validate` guards against duplicate chromosome names in the length table.
summary = summary.merge(
    right=chrs,
    on="chromosome",
    how="left",
    validate="many_to_one"
)

# Proportion of its chromosome that each event spans: (end - start) / length.
# Events on chromosomes missing from the length table get NaN here.
summary = summary.assign(
    prop_chromosome=(summary["end"] - summary["start"]) / summary["chrom_len_bp"]
)

In [6]:
# Histogram of event sizes: events are binned by the proportion of their
# chromosome that they cover (bin width 0.01) and counted per bin.
alt.Chart(summary).mark_bar().encode(
    x=alt.X("prop_chromosome:Q", bin=alt.Bin(step=0.01), title="Proportion of chromosome covered"),
    y=alt.Y("count()", title="Number of events"),
).properties(
    title="Distribution of CNA event sizes",
    width=700,
    height=500,
)

In [7]:
# For events covering more than 98% of their chromosome, count how many each
# patient has; patients with the most such events are listed first, with ties
# ordered by cancer type and then patient ID.
(
    summary
    .loc[summary["prop_chromosome"] > 0.98]
    .groupby(["Patient_ID", "cancer_type"])[["chromosome"]]
    .agg(len)
    .sort_values(by=["chromosome", "cancer_type", "Patient_ID"], ascending=[False, True, True])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,chromosome
Patient_ID,cancer_type,Unnamed: 2_level_1
C3N-00492,cc,14
C3N-01175,cc,12
C3N-01816,gb,10
C3N-00242,cc,9
C3N-01003,en,9
C3L-01286,cc,8
C3N-00337,en,7
C3N-00437,cc,6
C3N-00852,cc,6
C3N-01364,gb,6


In [8]:
# List the individual events covering more than 98% of their chromosome,
# ordered by patient, then chromosome, then cancer type.
(
    summary
    .loc[summary["prop_chromosome"] > 0.98]
    .sort_values(by=["Patient_ID", "chromosome", "cancer_type"])
)

Unnamed: 0,chromosome,cancer_type,Patient_ID,genes,start,end,num_genes,avg_cna,chrom_len_bp,prop_chromosome
17074,12,co,01CO008,"['FAM138D', 'IQSEC3', 'SLC6A12', 'SLC6A13', 'K...",36602.0,133235877.0,1102,0.399014,133275309.0,0.999429
56616,3,co,01CO008,"['CHL1', 'AC090044.1', 'CNTN6', 'CNTN4', 'CNTN...",196763.0,198222513.0,1230,0.414999,198295559.0,0.998639
80851,9,co,01CO008,"['DDX11L5', 'FAM138C', 'FOXD4', 'CBWD1', 'DOCK...",12134.0,138253217.0,853,0.390957,138394717.0,0.998890
38579,18,co,01CO015,"['ROCK1P1', 'MIR8078', 'USP14', 'THOC1', 'COLE...",109065.0,80247514.0,310,-0.530210,80373285.0,0.997078
49965,20,co,01CO015,"['DEFB125', 'DEFB126', 'DEFB127', 'DEFB128', '...",87250.0,64313132.0,614,2.507757,64444167.0,0.996613
39738,18,ov,01OV010,"['USP14', 'THOC1', 'CLUL1', 'TYMS', 'ENOSF1', ...",158383.0,80247514.0,236,-0.502952,80373285.0,0.996465
51286,20,ov,01OV010,"['DEFB125', 'DEFB126', 'DEFB127', 'DEFB128', '...",87250.0,64287821.0,495,0.325728,64444167.0,0.996220
63571,4,ov,01OV013,"['ZNF721', 'PIGG', 'MYL5', 'PCGF3', 'CPLX1', '...",425815.0,188147743.0,636,-0.259557,190214555.0,0.986896
51309,20,ov,01OV026,"['DEFB125', 'DEFB126', 'DEFB127', 'DEFB128', '...",87250.0,64287821.0,495,0.703246,64444167.0,0.996220
39756,18,ov,01OV030,"['USP14', 'THOC1', 'CLUL1', 'TYMS', 'ENOSF1', ...",158383.0,80247514.0,236,-0.266765,80373285.0,0.996465


In [9]:
# Number of events per cancer type (the count ends up in the "start" column),
# with the two-letter cancer codes replaced by display names for plotting.
cancers = (
    summary
    .groupby("cancer_type")[["start"]]
    .agg(len)
    .reset_index()
    .assign(
        cancer_type=lambda counts: counts["cancer_type"].replace({
            "br": "BRCA",
            "cc": "CCRCC",
            "co": "Colon",
            "en": "Endometrial",
            "gb": "GBM",
            "hn": "HNSCC",
            "ls": "LSCC",
            "lu": "LUAD",
            "ov": "Ovarian",
        })
    )
)

In [10]:
# Bar chart of the per-cancer-type event counts held in the "start" column.
alt.Chart(cancers).mark_bar().encode(
    x=alt.X("cancer_type", title="Cancer type"),
    y=alt.Y("start", title="Number of CNA events"),
).properties(title="Events per cancer type")

In [11]:
# Number of events per chromosome (the count ends up in the "start" column).
# NOTE(review): this rebinds `chrs`, which until now held the chromosome-length
# table; the merge cell that relies on that table will not see it after this point.
chrs = (
    summary
    .groupby("chromosome")[["start"]]
    .agg(len)
    .reset_index()
)

# What are the weird chromosomes?

In [12]:
# Exclude the two contig IDs from the per-chromosome counts before plotting.
chrs = chrs.loc[
    ~chrs["chromosome"].isin(["GL000220.1", "KI270733.1"])
]

In [13]:
# Display order for the chromosome axis: numbered chromosomes in numeric (not
# lexical) order, followed by "X" and "Y".
chrs_sort = [
    str(number)
    for number in sorted(int(name) for name in chrs["chromosome"].tolist() if name.isdigit())
] + ["X", "Y"]

In [14]:
# Bar chart of the per-chromosome event counts held in the "start" column,
# with the x axis arranged in the order given by chrs_sort.
alt.Chart(chrs).mark_bar().encode(
    x=alt.X("chromosome", sort=chrs_sort, title="Chromosome"),
    y=alt.Y("start", title="Number of CNA events"),
).properties(title="Events per chromosome across all cancer types")