In [7]:
import pandas as pd
import numpy as np
import re
import scipy.stats
import statsmodels.stats.multitest
import matplotlib.pyplot as plt
from matplotlib import pylab
import seaborn as sns
import cptac
import collections

In [8]:
cluster_df = pd.read_csv(r'C:\Users\joncj\OneDrive\Documents\Research\GitHub\ccrcc.somatic.consensus.gdc.umichigan.wu.112918.maf.3D_Proximity.pairwise.recurrence.l0.r10.clusters', sep = '\t')
colon_clusters = pd.read_csv('DanHeader_MyData.maf.3D_Proximity.pairwise.recurrence.l0.r10.clusters', sep = '\t')

In [9]:
cptac.download(dataset='Colon')
colon = cptac.Colon()
proteomics = colon.get_proteomics()
transcriptomics = colon.get_transcriptomics()
clinical = colon.get_clinical()
mutations = colon.get_mutations()

All files already downloaded and correct.
                                    

In [11]:
def parse_hotspot_1(cluster_df, mut_df):
    '''
    @Param cluster_df:
        The dataframe containing hotspot cluster information for the desired cancer type
                - gathered as a csv from the supercomputer hotspot output
                - imported into the desired notebook as a dataframe using pd.read_csv()

    @Param mut_df:
        The dataframe containing the mutations that are present in the corresponding cancer cohort
                - gathered from cptac.cancer_type.get_mutations()

    @Return:


    This function will take two dataframes as parameters to parse the hotspot. It uses the cluster dataframe
    in conjuction with the mutations dataframe to determine, for each patient, if they contain mutations within 
    the identified hotspots. It will return two dataframes and a dictionary that can be used to further explore the 
    effects of hotspot mutations. It will return a small dataframe containing two columns: 'Hotspot_ID' and 
    'Patients_Within.' This will allow a quick user visualiztion of which hotspots would be of value to pursue. 
    The second dataframe is larger and contains a 'Patient_ID' column as well as a column for each identified hotspot. 
    It is populated with 'No', 'Yes', or 'Yes_HS' values to denote the patient's relationship with every hotspot.
    '''
    check = 'no'
    # Ensuring that the cluster_df has been properly imported
    if isinstance(cluster_df, pd.DataFrame):
        check = 'yes'
    
    if check == 'no':
        print ('Incorrect parameter! Please pass through a dataframe.')
        return None
    
    check = 'no'
    
    column_names = cluster_df.columns.tolist()
    if 'Cluster' in column_names:
        check = 'yes'
        
    if check == 'no':
        print ('Incorrectly formatted dataframe! Dataframe must contain a Cluster column.')
        return None
    
    cluster_list_initial = (cluster_df.Cluster.unique()).tolist()
    cluster_list = list()
    
    for value in cluster_list_initial:
        length = len(cluster_df[cluster_df['Cluster'] == value])
        if length >= 2:
            cluster_list.append(value)
        
    cluster_list.sort()

    if len(cluster_list) == 0:
        print('There are no hotspot clusters that contain more than one mutation.')
        return None

    gene_dict = {}
    mut_dict = {}
    
    for value in cluster_list:
        gene_dict[value] = cluster_df.loc[cluster_df['Cluster'] == value, 'Gene/Drug'].values[0]
        mut_list = cluster_df[cluster_df['Cluster'] == value]['Mutation/Gene'].values.tolist()
        if str(value).endswith('0'):
            mut_dict[gene_dict[value]] = mut_list
        else:
            mut_dict[str(gene_dict[value]) + '_' + str(value)[-1]] = mut_list
   
    #Creating dataframe 1 of 2 that will be returned at the end of the function
    vis_hs_df = pd.DataFrame()
    vis_hs_df['Hotspot_ID'] = mut_dict.keys()
    
    #Creating dataframe 2 of 2 that will be returned at the end of the function
    bin_hs_df = pd.DataFrame()
    bin_hs_df['Sample_ID'] = mut_df.index.unique()
    
    for hs in mut_dict.keys():
        #Defaulting each patient to not in the hostpot
        bin_hs_df[hs] = 'No'
        #Creating the list of mutations in the hotspot
        hs_locations = mut_dict[hs]
        #Creating a list to be populated with Sample IDs of patients in the hotspot
        hs_patients = list()
        
        #Removes the '_#' that was added to the hotspot if multiple hotspots in the same gene'
        if hs[-2] == '_':
            hs_gene = hs[:-2]
        else:
            hs_gene = hs
            
        for row in mut_df.iterrows():
            #Creating variables containing the following: gene, mutation location, and patient ID
            info = list(row[1])
            gene = info[0]
            location = info[2]
            Sample_ID = row[0]
            
            if gene == hs_gene:
                #Checking if the location of the mutation is within the hotspot
                if location in hs_locations:
                    #if the location is in the hotspot, the patients gets labeled yes for mutation and _HS for hotspot
                    bin_hs_df.loc[bin_hs_df['Sample_ID'] == Sample_ID, hs] = 'Yes_HS'
                    hs_patients.append(Sample_ID)
                else:
                    #The mutation is on a gene that contains a hotspot, but it is not within the hotspot, so the patient only gets labeled yes
                    bin_hs_df.loc[bin_hs_df['Sample_ID'] == Sample_ID, hs] = 'Yes'
        
        hs_count = len(set(hs_patients))
        
        vis_hs_df.loc[vis_hs_df['Hotspot_ID'] == hs, 'Patients_Within'] = hs_count
        
    return(vis_hs_df, bin_hs_df, mut_dict)

In [56]:
def parse_hotspot(path, cancer_type):
    '''
    @Param path:
        (String) The path to the cluster output file that is on your computer after running the Hotspot analysis
    
    @Param cancer_type:
        (String) The cancer type that the hotspot analysis was run on. Please use one of the following: 'Ovarian', 'Endometrial', 'Colon', or 'Renal'.
        
    @Return:
        There will be four outputs for this function:
        
        vis_hs_df: 
            visualize hotspot dataframe
            
            A small dataframe which will allow quick visualization regarding the number of cancer patients that contain hotspot mutations
        
        bin_hs_df: 
            binary hotspot dataframe
            
            A larger dataframe that contains boolean values for each patient and their relationship with the hotspot(True = patient has a hotspot mutation, False = patient does not have a hotspot mutation)
        
        det_hs_df: 
            detailed hotspot dataframe
            
            A larger dataframe that contains nonbinary values for each patient and their relationship with the hotspot(No = no mutation, Yes = mutation but not in the hotspot, Yes_HS = mutation in the hotspot)
        
        mut_dict:
            mutations dictionary
            
            A dictionary that contains the hotspot gene as the key, and a list of mutations that make up that hotspot
            
    This function will take two parameters (cluster file path and cancer type) and use them to parse the Hotspot3D program output. It creates a cluster dataframe from the Hotspot3D output, and identifies the patients who contain hotspot mutations. The outputs of this function can be used to run further statistical analysis and exploration on the cancer datasets.
    '''
    #Importing the desired cluster file from the specified path on the computer
    cluster_df = pd.read_csv(path, sep = '\t')
    
    #Importing the desired cancer type mutation dataframe from cptac
    cptac.download(dataset=cancer_type)
    
    #Checking what cancer_type variable was passed into the function to get the corresponding cancer data. 
    if cancer_type == 'Colon':
        cancer = cptac.Colon()
    if cancer_type == 'Renal':
        cancer = cptac.RenalCcrcc()
    if cancer_type == 'Endometrial':
        cancer = cptac.Endometrial()
    if cancer_type == 'Ovarian':
        cancer = cptac.Ovarian()

    #Importing the somatic mutations data for the correct cancer
    mut_df = cancer.get_mutations()
    
    #Creating a list of all the identified hotspot clusters
    cluster_list_initial = (cluster_df.Cluster.unique()).tolist()
    cluster_list = list()
    
    #Checking each cluster to make sure that only clusters containing 2 or more mutations are looked at ('clusters' with only 1 mutation are technically just frequently mutated)
    for value in cluster_list_initial:
        length = len(cluster_df[cluster_df['Cluster'] == value])
        if length >= 2:
            cluster_list.append(value)
    
    #Sorting the list numerically
    cluster_list.sort()
    
    #If there are no clusters that have more than one mutation, the function ends and returns the statement below
    if len(cluster_list) == 0:
        print('There are no hotspot clusters that contain more than one mutation.')
        return None
    
    #creating the multiple dictionaries that are used to compile hotspots and corresponding mutations
    gene_dict = {}
    mut_dict = {}
    rev_mut_dict = {}
    hs_count = {}
    
    #This loop contructs a reverse dictionary to be used to classify patients' mutations as well as the mutation dictionary output
    for value in cluster_list:
        gene_dict[value] = cluster_df.loc[cluster_df['Cluster'] == value, 'Gene/Drug'].values[0]
        mut_list = cluster_df[cluster_df['Cluster'] == value]['Mutation/Gene'].values.tolist()
        if str(value).endswith('0'):
            mut_dict[gene_dict[value]] = mut_list
            hs_count[gene_dict[value]] = 0
        else:
            mut_dict[str(gene_dict[value]) + '_' + str(value)[-1]] = mut_list
            hs_count[str(gene_dict[value]) + '_' + str(value)[-1]] = 0
    
    #This loop finalizes the reverse dictionary
    for hs in mut_dict.keys():
        for mutation in mut_dict[hs]:
            rev_mut_dict[mutation] = hs
    
    #The three dataframe outputs are initialized
    vis_hs_df = pd.DataFrame()
    vis_hs_df['hotspot_id'] = mut_dict.keys()
    
    bin_hs_df = pd.DataFrame()
    bin_hs_df['sample_id'] = mut_df.index.unique()
    
    det_hs_df = pd.DataFrame()
    det_hs_df['sample_id'] = mut_df.index.unique()
    
    #This loop populates default values for each patient and hotspot
    for hs in mut_dict.keys():
        bin_hs_df[hs] = False
        det_hs_df[hs] = 'No'
    
    #This loop iterates through each individual mutation and then properly identifies the mutation in the different dataframes
    for row in mut_df.iterrows():
        info = list(row[1])
        gene = info[0]
        location = info[2]
        location = 'p.'+str(location)
        sample_id = row[0]
        
        #This statement checks to see if the mutation is one of the hotspot mutations
        if location in rev_mut_dict.keys():
            hs = rev_mut_dict[location]
            hs_count[hs] += 1
            
            bin_hs_df.loc[bin_hs_df['sample_id'] == sample_id, hs] = True
            det_hs_df.loc[det_hs_df['sample_id'] == sample_id, hs] = 'Yes_HS'
        
        #This statement is used if the mutation is not a hotspot mutation, but if it still on one of the proteins that contains a hotspot
        elif gene in mut_dict.keys():
            det_hs_df.loc[det_hs_df['sample_id'] == sample_id, hs] = 'Yes'
    
    #This loop adds the patient count for each hotspot to the small visualize hotspot dataframe
    for hs in hs_count.keys():
        vis_hs_df.loc[vis_hs_df['hotspot_id'] == hs, 'patients_within'] = hs_count[hs]
    
    #Return of the three dataframes and mutation dictionary
    return (vis_hs_df, bin_hs_df, det_hs_df, mut_dict)
    

In [54]:
(vis_hs_df, bin_hs_df, det_hs_df, mut_dict) = parse_hotspot_2('DanHeader_MyData.maf.3D_Proximity.pairwise.recurrence.l0.r10.clusters', 'Colon')

All files already downloaded and correct.
                                    

In [None]:
cancer_type = 'Colon'
path = 'DanHeader_MyData.maf.3D_Proximity.pairwise.recurrence.l0.r10.clusters'
#Importing the desired cluster file from the specified path on the computer
cluster_df = pd.read_csv(path, sep = '\t')

#Importing the desired cancer type mutation dataframe from the cptac
cptac.download(dataset=cancer_type)
if cancer_type == 'Colon':
    cancer = cptac.Colon()
mut_df = cancer.get_mutations()

In [38]:


cluster_list_initial = (cluster_df.Cluster.unique()).tolist()
cluster_list = list()

for value in cluster_list_initial:
    length = len(cluster_df[cluster_df['Cluster'] == value])
    if length >= 2:
        cluster_list.append(value)

cluster_list.sort()

if len(cluster_list) == 0:
    print('There are no hotspot clusters that contain more than one mutation.')
    #return None

#creating the two dictionaries to be used
gene_dict = {}
mut_dict = {}
rev_mut_dict = {}
hs_count = {}

for value in cluster_list:
    gene_dict[value] = cluster_df.loc[cluster_df['Cluster'] == value, 'Gene/Drug'].values[0]
    mut_list = cluster_df[cluster_df['Cluster'] == value]['Mutation/Gene'].values.tolist()
    if str(value).endswith('0'):
        mut_dict[gene_dict[value]] = mut_list
        hs_count[gene_dict[value]] = 0
    else:
        mut_dict[str(gene_dict[value]) + '_' + str(value)[-1]] = mut_list
        hs_count[str(gene_dict[value]) + '_' + str(value)[-1]] = 0

for hs in mut_dict.keys():
    for mutation in mut_dict[hs]:
        rev_mut_dict[mutation] = hs

vis_hs_df = pd.DataFrame()
vis_hs_df['hotspot_id'] = mut_dict.keys()

bin_hs_df = pd.DataFrame()
bin_hs_df['sample_id'] = mut_df.index.unique()

det_hs_df = pd.DataFrame()
det_hs_df['sample_id'] = mut_df.index.unique()

for hs in mut_dict.keys():
    bin_hs_df[hs] = False
    det_hs_df[hs] = 'No'

for row in mut_df.iterrows():
    info = list(row[1])
    gene = info[0]
    location = info[2]
    location = 'p.'+str(location)
    sample_id = row[0]

    if location in rev_mut_dict.keys():
        hs = rev_mut_dict[location]
        hs_count[hs] += 1

        bin_hs_df.loc[bin_hs_df['sample_id'] == sample_id, hs] = True
        det_hs_df.loc[det_hs_df['sample_id'] == sample_id, hs] = 'Yes_HS'

    elif gene in mut_dict.keys():
        det_hs_df.loc[det_hs_df['sample_id'] == sample_id, hs] = 'Yes'

for hs in hs_count.keys():
    vis_hs_df.loc[vis_hs_df['hotspot_id'] == hs, 'patients_within'] = hs_count[hs]

In [43]:
vis_hs_df

Unnamed: 0,hotspot_id,patients_within
0,A2M,6.0
1,ABCB6,6.0
2,ACAA1,11.0
3,ACACB,2.0
4,ACADSB,5.0
5,ACAP1,4.0
6,ACBD3,3.0
7,ACLY,6.0
8,ACP5,20.0
9,ACVRL1,19.0


In [45]:
vis_hs_df.loc[vis_hs_df['patients_within'] > 10]

Unnamed: 0,hotspot_id,patients_within
2,ACAA1,11.0
8,ACP5,20.0
9,ACVRL1,19.0
10,ADAMTS5,13.0
13,ADIPOR2,11.0
14,AGO1,16.0
20,APC,18.0
25,ARHGEF11,11.0
27,ARSA,17.0
31,BCORL1,12.0
