# SCNA analysis step 1: Mark CNA values

- Input: cptac CNV tables
- Output: The SCNA table--one long table of all the CNV values, each flagged with whether it passes the cutoff
- Steps:
    1. Convert dataframes to long format (i.e., columns are [Patient_ID, gene, cna_val], so one row for each gene for each sample)
    2. Add a column indicating cancer type
    3. Append them all into one long table
    4. Add a column called "passes", defined by:
        - `np.where(abs(cna_val) > cutoff, True, False)`
        - Meaning, if the absolute value is above the cutoff, mark True
        - Based on the distribution plots, I think either 0.1 or 0.2 would be a reasonable cutoff.

In [1]:
import cptac
import pandas as pd
import numpy as np
import datetime
import os

# Threshold on |cna_val| used later to decide whether a value "passes".
CUTOFF = 0.2

# Timestamp taken once per run so each run writes to its own output file.
TIME_START = f"{datetime.datetime.now():%Y%m%d_%H%M%S}"

# Directory that holds this step's outputs; created on first use.
STEP1_DIR = "01_outputs"
os.makedirs(STEP1_DIR, exist_ok=True)

# Destination of the gzip-compressed, tab-separated SCNA table.
FILE_PATH = os.path.join(STEP1_DIR, f"scna_cutoff_{CUTOFF}_{TIME_START}.tsv.gz")

In [8]:
# Show the path that this run's table will be written to.
print(FILE_PATH)

01_outputs/scna_cutoff_0.2_20200706_092210.tsv.gz


In [2]:
# Map each short cancer-type code to the cptac dataset class that loads it.
# The codes are reused as the values of the `cancer_type` column.
dss = dict(
    br=cptac.Brca,
    cc=cptac.Ccrcc,
    co=cptac.Colon,
    en=cptac.Endometrial,
    gb=cptac.Gbm,
    hn=cptac.Hnscc,
    ls=cptac.Lscc,
    lu=cptac.Luad,
    ov=cptac.Ovarian,
)

In [3]:
def load(name, datasets_dict):
    """Load one cancer type's CNV table and reshape it to long format.

    Parameters
    ----------
    name : str
        Key of the dataset to load; also stored in the ``cancer_type`` column.
    datasets_dict : dict
        Maps each key to a zero-argument callable (e.g. a cptac dataset class)
        whose result provides ``get_CNV()``.

    Returns
    -------
    pandas.DataFrame
        Long table with one row per sample/gene pair, including the columns
        ``Patient_ID``, ``gene``, ``Database_ID``, ``cna_val`` and
        ``cancer_type``. ``Database_ID`` is all-NaN when the source table
        carries no database IDs.

    Raises
    ------
    KeyError
        If ``name`` is not a key of ``datasets_dict``.
    """
    # Look the dataset up in the mapping supplied by the caller rather than
    # in a notebook-level global, so the function works with any mapping.
    cna = (
        datasets_dict[name]()
        .get_CNV()
        .reset_index()
        .melt(id_vars="Patient_ID", value_name="cna_val")
        .rename(columns={"Name": "gene"})
        .assign(cancer_type=name)
    )

    if "Database_ID" in cna.columns:
        # Parse the database IDs to remove version numbers from Ensembl IDs, as they interfere with lookup.
        cna = cna.assign(
            Database_ID=cna["Database_ID"].str.rsplit(".", n=1, expand=True)[0]
        )
    else:
        # Keep the column layout uniform across cancer types so the tables
        # can be stacked without introducing misaligned columns.
        cna.insert(2, "Database_ID", np.nan)

    return cna

In [4]:
# Load every cancer type and stack the long tables in a single step.
# pd.concat replaces repeated DataFrame.append calls: append was removed in
# pandas 2.0, and appending inside a loop re-copies the growing table on
# every iteration. Row indexes are kept as-is, matching the old behavior.
cna = pd.concat([load(name, dss) for name in dss])

                                                



                                          



                                         



                                            

In [5]:
# Flag each value whose magnitude is strictly greater than CUTOFF.
cna = cna.assign(
    passes=lambda table: table["cna_val"].abs().gt(CUTOFF).to_numpy()
)

In [6]:
# Write the full table as gzip-compressed TSV, leaving out the row index.
cna.to_csv(
    FILE_PATH,
    sep="\t",
    index=False,
    compression="gzip",
)

In [7]:
# Display the combined table, including the `passes` flag column.
cna

Unnamed: 0,Patient_ID,gene,Database_ID,cna_val,cancer_type,passes
0,CPT000814,7SK,ENSG00000232512,-0.0580,br,False
1,CPT001846,7SK,ENSG00000232512,0.0650,br,False
2,X01BR001,7SK,ENSG00000232512,1.0360,br,True
3,X01BR008,7SK,ENSG00000232512,0.0900,br,False
4,X01BR009,7SK,ENSG00000232512,0.3750,br,True
5,X01BR010,7SK,ENSG00000232512,0.2110,br,True
6,X01BR015,7SK,ENSG00000232512,-0.0860,br,False
7,X01BR017,7SK,ENSG00000232512,0.1920,br,False
8,X01BR018,7SK,ENSG00000232512,-0.0380,br,False
9,X01BR020,7SK,ENSG00000232512,1.1170,br,True
