In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cptac
import binarization_functions as bf
import gseapy as gp
from gseapy.plot import barplot, heatmap, dotplot
import json
import requests
import random
import deva

In [2]:
cptac.download(dataset='Ovarian')

ov = cptac.Ovarian()
proteomics = ov.get_proteomics()
transcriptomics = ov.get_transcriptomics()
clinical = ov.get_clinical()

                                    

## Step 2: Determine what attributes you would like to A/B test. 
For this analysis, we will iteratively go through the various columns in the clinical dataset, to determine if any of them have trends within them for protein enrichment.

In [3]:
columns_to_explore = ['Participant_History_Malignancy', 
                      'Sample_Tumor_Normal']

#Create a subsetted copy of the original Clinical DataFrame
annotations = pd.DataFrame(clinical[columns_to_explore].copy())

## Step 3: Perform Outlier Analysis
Using the Blacksheep-outliers package, deva, we will create two tables: one of all the significant up-regulated genes per patient (which we will call outliers), and one of the q-values for significantly enriched genes based on our binarized clinical attributes (which we will call qvalues).

In [4]:
outliers, qvalues = deva.run_outliers(proteomics.transpose(), 
                                      annotations,
                                      up_or_down='up',
                                      aggregate=False)



In [5]:
outliers_down, qvalues_down = deva.run_outliers(proteomics.transpose(), 
                                                annotations, 
                                                up_or_down='down', 
                                                aggregate=False)



In [6]:
length = int(len(outliers.df.columns) / 2)
temp_outliers = outliers.df.transpose()
only_outliers_up = temp_outliers[:length]
outliers_up_dict = {}
for i in range(length):
    key = proteomics.index[i]
    my_filter = only_outliers_up.iloc[i] == 1.0
    value = list(only_outliers_up.iloc[i][my_filter].index.get_level_values('Name'))
    outliers_up_dict[key] = value

In [7]:
length = int(len(outliers_down.df.columns) / 2)
temp_outliers_down = outliers_down.df.transpose()
only_outliers_down = temp_outliers_down[:length]
outliers_down_dict = {}
for i in range(length):
    key = proteomics.index[i]
    my_filter = only_outliers_down.iloc[i] == 1.0
    value = list(only_outliers_down.iloc[i][my_filter].index.get_level_values('Name'))
    outliers_down_dict[key] = value

## Step 4: Visualize these enrichments

First, we will determine which clinical attributes have significant enrichments, and save them to a list for both up-regulated outliers, and down-regulated outliers.

In [30]:
(qvalues.df['fisherFDR_Sample_Tumor_Normal_Normal']) < 0.05

Name     Database_ID 
A1BG     NP_570602        True
ABCA8    NP_001275914     True
ABCC1    NP_004987       False
ABHD5    NP_057090        True
ABI3BP   NP_056244        True
                         ...  
WISP2    NP_003872        True
ZBTB47   NP_660149        True
ZC2HC1A  NP_057094       False
ZNF512   NP_115810       False
ZNF787   NP_001002836     True
Name: fisherFDR_Sample_Tumor_Normal_Normal, Length: 671, dtype: bool

In [31]:
#Drop Columns with less than 4 significant up-regulated enrichments
sig_cols_up = []
for col in qvalues.df.columns:
    sig_col = bf.significantEnrichments(qvalues.df, col)
    if sig_col is not None and len(sig_col) > 4:
        sig_cols_up.append(sig_col)
    else:
        results_up = qvalues.df.drop(col, axis=1)

In [32]:
results_up

NameError: name 'results_up' is not defined

In [15]:
#Drop Columns with less than 4 significant down-regulated enrichments
sig_cols_down = []
for col in qvalues_down.df.columns:
    sig_col = bf.significantEnrichments(qvalues_down.df, col)
    if sig_col is not None:
        sig_cols_down.append(sig_col)
    else:
        results_down = qvalues_down.df.drop(col, axis=1)

In [16]:
#Here we will link clinical attributes with significantly up-regulated genes
sig_genes_up = {}
for i, col in enumerate(sig_cols_up):
    list_of_genes = list(col.index.get_level_values('Name'))
    sig_genes_up[sig_cols_up[i].columns[0][:-9]] = list_of_genes   

In [17]:
#And then down-regulated genes
sig_genes_down = {}
for i, col in enumerate(sig_cols_down):
    list_of_genes = list(col.index.get_level_values('Name'))
    sig_genes_down[sig_cols_down[i].columns[0][:-9]] = list_of_genes

In [18]:
#Simple visualization
ax1 = plt.axes()
sns.heatmap(results_up, ax = ax1)
ax1.set_title('Up-Regulated Protein Enrichments for Renal Cancer Tumors')
plt.show()

  


NameError: name 'results_up' is not defined

In [None]:
#Deva package visualization
for col in qvalues.df.columns:
    deva.plot_heatmap(
    annotations=annotations,
    qvals=qvalues.df,
    col_of_interest=col,
    vis_table=outliers.frac_table)
    
    plt.show()
    plt.close()

In [None]:
ax2 = plt.axes()
sns.heatmap(results_down, ax = ax2)
ax2.set_title('Down-Regulated Protein Enrichments for Renal Cancer Tumors')
plt.show()

In [None]:
#Deva package visualization
for col in qvalues_down.df.columns:
    deva.plot_heatmap(
    annotations=annotations,
    qvals=qvalues_down.df,
    col_of_interest=col,
    vis_table=outliers_down.frac_table)
    
    plt.show()
    plt.close()

## Step 5: Perform Gene Set Enrichment Analysis

Part A: Up-regulated Genes

In [None]:
gsea_up = {}
for col in sig_cols_up:
    col_name = col.columns[0]
    col_name = "_".join(col_name.split(" "))
    print(col_name[:-9])
    gene_name_list = list(col.index.get_level_values('Name'))
    print('Gene List:', gene_name_list, '\n')
    enrichment = gp.enrichr(gene_list = gene_name_list, 
                            description=col_name, 
                            gene_sets='KEGG_2019_Human', 
                            outdir='test/renal')
    gsea_up[col_name] = enrichment
    barplot(enrichment.res2d, title=col_name, ofname=col_name)

Determine significant overlap for significantly up-regulated genes, first for patients with a history of cancer, adn then for patients with invasive carcinoma.

In [None]:
#Significant overlap for patients with a history of cancer
attribute = 'fisherFDR_history_of_cancer_Yes_P_values'
cancer_history_yes = gsea_up[attribute].res2d
sig_filter = cancer_history_yes['Adjusted P-value'] < 0.05
cancer_history_sig = cancer_history_yes[sig_filter]
cancer_history_sig

In [None]:
genes = cancer_history_sig['Genes']
terms = cancer_history_sig['Term']
cancer_history_dict = {}
for i, gene in enumerate(genes):
    cancer_history_dict[terms[i]] = gene.split(';')
cancer_history_dict

In [None]:
#Determine significant overlap for patients with invasive carcinoma
attribute = 'fisherFDR_margin_status_Margin(s)_involved_by_invasive_carcinoma_P_values'
invasive_cancer = gsea_up[attribute].res2d
sig_filter = invasive_cancer['Adjusted P-value'] < 0.05
invasive_sig = invasive_cancer[sig_filter]
invasive_sig

In [None]:
genes = invasive_sig['Genes']
terms = invasive_sig['Term']
invasive_dict = {}
for i, gene in enumerate(genes):
    invasive_dict[terms[i]] = gene.split(';')
invasive_dict

## Step 6: Connect Findings with Druggable Genome Interactive Database (DGIdb)

In [None]:
cancer_history_yes_requests = {}
cancer_history_requests_parsed = {}
for term, gene_list in cancer_history_dict.items():
    cancer_history_yes_requests[term] = bf.dgidb_get_request(gene_list)
    cancer_history_requests_parsed[term] = bf.dgidb_json_parse(cancer_history_yes_requests[term], genes=True)

print(json.dumps(cancer_history_requests_parsed, indent=4))

In [None]:
invasive_cancer_requests = {}
invasive_requests_parsed = {}
for term, gene_list in invasive_dict.items():
    invasive_cancer_requests[term] = bf.dgidb_get_request(gene_list)#, anti_neoplastic=True)
    invasive_requests_parsed[term] = bf.dgidb_json_parse(invasive_cancer_requests[term], genes=True)

print("Parsed Requests:\n")
print(json.dumps(invasive_requests_parsed, indent=4))

## Step 7: Personalized Medicine

In [None]:
# To perform a request specifically for inhibitors, you may opt to use a loop
# While this option is slower than the map, it is more specific, and may be worth
# waiting a few more seconds to reduce manual sifting through interaction types
inhibitors = {}
for attribute, genes in sig_genes_up.items():
    inhibitors[attribute] = bf.dgidb_get_request(genes, interaction_types=['inhibitor'])

In [None]:
print('UP-REGULATED INHIBITOR REQUEST:\n')
inhibitors_parsed = {}
for attribute, request in inhibitors.items():
    inhibitors_parsed[attribute] = bf.dgidb_json_parse(request, genes=True)
print(json.dumps(inhibitors_parsed, indent=4))

In [None]:
patient_drugs_genes_up = bf.compare_enrichments_with_drugs(outliers_up_dict, clinical)

In [None]:
random.seed(8)
patients_to_check_up = []
for i in range(3):
    index = random.randrange(0, len(only_outliers_up))
    while clinical.index[index] not in outliers_up_dict.keys() and index < len(outliers_up_dict.keys()):
        index += 1
    patients_to_check_up.append(clinical.index[index])

In [None]:
personalized_up = {}
for patient in patients_to_check_up:
    patients_up_dict = bf.dgidb_get_request(outliers_up_dict[patient], 
                                            interaction_types = ['inhibitor'])
    parsed = bf.dgidb_json_parse(patients_up_dict, genes=True)
    personalized_up[patient] = parsed
print(json.dumps(personalized_up, indent = 4))