# Enrichment NB 1: Group samples into those with the arm-level event and those without it

1. Load proteomics tables
2. Read in the has_event tables
3. Join the has_event tables into the proteomics tables
4. Save tables

## Setup

In [1]:
import cptac
import numpy as np
import pandas as pd
import os

In [2]:
# Directory that holds the "<cancer_type>_has_event.tsv" tables.
events_dir = ".."

# Directory where the joined proteomics/event tables are written.
proteomics_dir = "proteomics_tables"

# Create the output directory if needed. exist_ok=True avoids the race
# between a separate existence check and the creation call.
os.makedirs(proteomics_dir, exist_ok=True)

In [3]:
# Create a dictionary of the datasets
# We don't load them yet--we'll do it one at a time to save RAM
# Cancer type name -> cptac dataset class. Only the classes are stored
# here; nothing is instantiated at this point.
#
# Cancer types deliberately left out, and why:
#   brca        (cptac.Brca)        -- no normal samples
#   ccrcc       (cptac.Ccrcc)       -- no event table
#   endometrial (cptac.Endometrial) -- no event table
#   gbm         (cptac.Gbm)         -- no event table
dss = dict(
    colon=cptac.Colon,
    hnscc=cptac.Hnscc,
    lscc=cptac.Lscc,
    luad=cptac.Luad,
    ovarian=cptac.Ovarian,
)

## Load and join tables for each cancer type

In [4]:
# We don't load the dataset until we're inside the function, so that it
# will pass out of scope when the function ends and be garbage collected,
# which will save RAM.

def load_and_join_prot(dataset, event_table_dir, output_dir):
    """Join a cancer type's has_event table onto its proteomics table and save it.

    The dataset is instantiated inside this function so that it goes out of
    scope as soon as the function returns.

    Parameters:
    dataset: Callable that accepts ``no_internet=True`` and returns a dataset
        object providing ``get_proteomics()`` and ``get_cancer_type()``.
    event_table_dir: Directory containing ``<cancer_type>_has_event.tsv``.
    output_dir: Directory in which ``<cancer_type>_prot_event.tsv.gz`` is
        written.

    Returns:
    None. The joined table is written to disk as a gzip-compressed TSV.
    """
    ds = dataset(no_internet=True)
    prot = ds.get_proteomics()
    cancer_type = ds.get_cancer_type()

    event_table_path = os.path.join(event_table_dir, f"{cancer_type}_has_event.tsv")
    event_table = pd.read_csv(event_table_path, sep="\t", index_col=0)

    # When the proteomics columns have two levels, give the event columns a
    # second, all-null "Database_ID" level so both tables have the same column
    # depth for the join. The null level is sized from the table itself, so
    # this works for any number of event columns.
    if prot.columns.nlevels == 2:
        event_table.columns = pd.MultiIndex.from_arrays(
            [event_table.columns, [np.nan] * event_table.shape[1]],
            names=["Name", "Database_ID"],
        )

    # Left join: keep every row of the proteomics table.
    joined = prot.join(other=event_table, how="left")

    joined_file_path = os.path.join(output_dir, f"{cancer_type}_prot_event.tsv.gz")
    joined.to_csv(joined_file_path, sep="\t", compression="gzip")

In [5]:
# Handle the cancer types one after another; each call loads a single
# dataset, joins its event table, and writes the result to disk.
for dataset_cls in dss.values():
    load_and_join_prot(dataset_cls, events_dir, proteomics_dir)

                               



                            



                               