In [1]:
import pandas as pd
import numpy as np
import re
import scipy.stats
import statsmodels.stats.multitest
import matplotlib.pyplot as plt
from matplotlib import pylab
import seaborn as sns
import cptac

In [2]:
'''
@Param cluster_df:
    The dataframe containing hotspot cluster information for the desired cancer type
            - gathered as a csv from the supercomputer hotspot output
            - imported into the desired notebook as a dataframe using pd.read_csv

@Param mut_df:
    The dataframe containing the mutations that are present in the corresponding cancer cohort
            - gathered from cptac.cancer_type.get_mutations()
            
This function takes two dataframes as parameters to parse the hotspots. It uses the cluster dataframe
in conjunction with the mutations dataframe to determine, for each sample, whether it contains mutations within
the identified hotspots. It returns two dataframes that can be used to further explore the effects of
hotspot mutations. The first is a small dataframe containing three columns: 'Hotspot_ID', 'Patients_Within',
and 'Mutation_List'. This allows a quick visualization of which hotspots would be of value to pursue. The
second dataframe is larger and contains a 'Sample_ID' column as well as a column for each identified hotspot.
It is populated with 'No' (no mutation in the hotspot's gene), 'Yes' (mutation in the gene but outside the
hotspot), or 'Yes_HS' (mutation inside the hotspot) to denote the sample's relationship with every hotspot.
'''




def identify_hotspots(cluster_df, mut_df):
    """Label every sample in a cohort by its relationship to each hotspot cluster.

    Parameters
    ----------
    cluster_df : pandas.DataFrame
        Hotspot cluster table. Must contain the columns 'Cluster',
        'Gene/Drug' and 'Mutation/Gene'. Only clusters that appear on at
        least two rows are treated as hotspots.
    mut_df : pandas.DataFrame
        Mutation table indexed by sample ID. The first column is read as the
        mutated gene and the third column as the mutation location.
        # NOTE(review): columns are read by position, not by name -- confirm
        # the caller always passes the gene in column 0 and the location in
        # column 2.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame), or None
        ``vis_hs_df`` has one row per hotspot with the columns 'Hotspot_ID',
        'Patients_Within' (integer number of distinct samples with a mutation
        inside the hotspot) and 'Mutation_List' (string form of the hotspot's
        mutation list).

        ``bin_hs_df`` has one row per unique sample with a 'Sample_ID' column
        plus one column per hotspot holding 'No' (no mutation in the
        hotspot's gene), 'Yes' (mutation in the gene but outside the hotspot)
        or 'Yes_HS' (at least one mutation inside the hotspot).

        ``None`` is returned, after printing a message, when the inputs are
        not usable or when no cluster holds two or more mutations. Callers
        that unpack the result should check for ``None`` first.
    """
    # Both inputs have to be dataframes; anything else cannot be parsed.
    if not isinstance(cluster_df, pd.DataFrame) or not isinstance(mut_df, pd.DataFrame):
        print ('Incorrect parameter! Please pass through a dataframe.')
        return None

    if 'Cluster' not in cluster_df.columns:
        print ('Incorrectly formatted dataframe! Dataframe must contain a Cluster column.')
        return None

    # The gene and location are read from columns 0 and 2 below.
    if mut_df.shape[1] < 3:
        print ('Incorrectly formatted dataframe! The mutations dataframe must contain at least three columns.')
        return None

    # Keep only the clusters that hold two or more mutations.
    cluster_sizes = cluster_df['Cluster'].value_counts()
    cluster_list = cluster_sizes[cluster_sizes >= 2].index.tolist()
    cluster_list.sort()

    if not cluster_list:
        print('There are no hotspot clusters that contain more than one mutation.')
        return None

    mut_dict = {}      # hotspot ID -> list of mutation locations in the hotspot
    hotspot_gene = {}  # hotspot ID -> gene the hotspot lies on

    for cluster_id in cluster_list:
        members = cluster_df[cluster_df['Cluster'] == cluster_id]
        # The gene on the cluster's first row names the whole cluster.
        gene = members['Gene/Drug'].values[0]
        cluster_label = str(cluster_id)
        # NOTE(review): only the last character of the cluster ID is used, so
        # two clusters on one gene that share a final character get the same
        # hotspot ID and the later one replaces the earlier -- confirm that
        # cluster IDs never collide this way.
        if cluster_label.endswith('0'):
            hotspot_id = gene
        else:
            hotspot_id = str(gene) + '_' + cluster_label[-1]
        mut_dict[hotspot_id] = members['Mutation/Gene'].values.tolist()
        # Remember the gene explicitly instead of re-deriving it from the
        # hotspot ID text, which breaks for gene names such as 'HLA_A'.
        hotspot_gene[hotspot_id] = gene

    # Group the cohort's mutations by gene once so each hotspot only has to
    # look at the mutations on its own gene.
    mutations_by_gene = {}
    for sample_id, gene, location in zip(mut_df.index, mut_df.iloc[:, 0], mut_df.iloc[:, 2]):
        mutations_by_gene.setdefault(gene, []).append((sample_id, location))

    sample_ids = pd.Series(mut_df.index.unique())
    bin_columns = {'Sample_ID': sample_ids}
    vis_records = []

    for hs, hs_locations in mut_dict.items():
        inside_samples = set()   # samples with a mutation inside the hotspot
        outside_samples = set()  # samples mutated in the gene but outside it
        for sample_id, location in mutations_by_gene.get(hotspot_gene[hs], ()):
            if location in hs_locations:
                inside_samples.add(sample_id)
            else:
                outside_samples.add(sample_id)

        # Every sample defaults to 'No'. 'Yes_HS' is written last so that a
        # hotspot mutation always wins over a non-hotspot mutation in the
        # same gene, whatever order the rows of mut_df are in.
        labels = pd.Series('No', index=sample_ids.index)
        labels[sample_ids.isin(outside_samples)] = 'Yes'
        labels[sample_ids.isin(inside_samples)] = 'Yes_HS'
        bin_columns[hs] = labels

        vis_records.append({
            'Hotspot_ID': hs,
            'Patients_Within': len(inside_samples),
            'Mutation_List': str(hs_locations),
        })

    # Dataframe 1 of 2: one summary row per hotspot.
    vis_hs_df = pd.DataFrame(vis_records, columns=['Hotspot_ID', 'Patients_Within', 'Mutation_List'])
    # Dataframe 2 of 2: one row per sample, one label column per hotspot.
    bin_hs_df = pd.DataFrame(bin_columns)

    return(vis_hs_df, bin_hs_df)

Incorrect parameter! Please pass through a dataframe.


TypeError: cannot unpack non-iterable NoneType object