# Enrichment NB 03: Look for pathways enriched with proteins that are differentially expressed in only one group
- Load saved table
- For each cancer type:
    - Separately select proteins that only have a significant difference in samples with the event, and proteins that only have a significant difference in samples without the event.
    - Also select proteins that are significantly upregulated in samples with the event but downregulated in samples without the event, and vice versa
    - For each group, run an enrichment analysis
        - Which algorithm?
        - Which gene set library? Probably either GO or Reactome.
        - Rank by difference in expression (should we take absolute value? Check)

## Setup

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Significance cutoff applied to the adjusted p-values
ALPHA = 0.05

# Input: table of differential expression results
diff_expr_file = os.path.join("diff_expr", "all_results.tsv.gz")

# Output: directory and file for the selected protein groups
groups_dir = "protein_groups"
groups_file = os.path.join(groups_dir, "groups_proteins.tsv.gz")

# Create the output directory if needed. exist_ok=True avoids the race between
# checking for the directory and creating it, and is a no-op when it already exists.
os.makedirs(groups_dir, exist_ok=True)

## Load differential expression results and keep only rows with an adjusted p-value at or below the cutoff

In [3]:
# Read the tab-separated results table, forcing both p-value columns to float
diff_expr = pd.read_csv(
    diff_expr_file,
    sep="\t",
    dtype={"p_val": float, "adj_p": float},
)

# Keep only the rows whose adjusted p-value is at or below the cutoff
passes_cutoff = diff_expr["adj_p"] <= ALPHA
diff_expr = diff_expr[passes_cutoff]

## Find groups of proteins to look for enrichment in

In [4]:
# For every cancer type and every event type, select the proteins that appear in
# the "with_<event>" group but not in the matching "without_<event>" group, and
# vice versa. diff_expr holds only rows that passed the significance cutoff, so a
# protein appearing in just one group of a pair is significant in that group only.
#
# NOTE: selection is based purely on whether a protein is present in each group;
# the direction of the expression change is not considered here.
#
# The subsets are collected in a list and concatenated once at the end.
# DataFrame.append was removed in pandas 2.0, and appending inside a loop
# re-copies the accumulated table on every call.
selected_frames = []

# Each event type has a pair of group labels: "with_<event>" and "without_<event>"
event_types = ("gain", "loss", "both")

for cancer_type in diff_expr["cancer_type"].unique():

    # Get data for this cancer type
    cancer_df = diff_expr[diff_expr["cancer_type"] == cancer_type]

    for event in event_types:

        # Separate the two groups for this event
        with_event = cancer_df[cancer_df["group"] == "with_" + event]
        without_event = cancer_df[cancer_df["group"] == "without_" + event]

        # Proteins significant in samples with the event, but not in samples without it
        only_with = with_event[~with_event["protein"].isin(without_event["protein"])]
        selected_frames.append(only_with)

        # Proteins significant in samples without the event, but not in samples with it
        only_without = without_event[~without_event["protein"].isin(with_event["protein"])]
        selected_frames.append(only_without)

# Skip empty subsets; they contribute no rows to the result
non_empty_frames = [frame for frame in selected_frames if not frame.empty]

if non_empty_frames:
    groups = pd.concat(non_empty_frames)
else:
    # Nothing was selected: keep an empty table that still has the input columns,
    # so that later column lookups on `groups` do not fail
    groups = diff_expr.iloc[0:0].copy()

## Save results

In [5]:
# Build a combined "<cancer_type>_<group>" label for every row
combined_label = groups["cancer_type"] + "_" + groups["group"]

# Replace the two source columns with the combined label as the first column
groups = groups.drop(columns=["cancer_type", "group"])
groups.insert(0, "cancer_type_group", combined_label)

In [6]:
# Write the selected groups as a gzip-compressed, tab-separated file without the index
groups.to_csv(
    groups_file,
    sep="\t",
    index=False,
    compression="gzip",
)