In [2]:
import pandas as pd
import numpy as np
import re
import scipy.stats
import statsmodels.stats.multitest
import matplotlib.pyplot as plt
from matplotlib import pylab
import seaborn as sns
import cptac

In [11]:
#Location of the tab-separated cluster file; later cells read its 'Cluster', 'Chromosome', 'Gene/Drug' and 'Mutation/Gene' columns
#NOTE(review): this is an absolute, machine-specific path - point it at your own copy of the file before running
cluster_file_path = r'C:\Users\joncj\OneDrive\Documents\Research\GitHub\ccrcc.somatic.consensus.gdc.umichigan.wu.112918.maf.3D_Proximity.pairwise.recurrence.l0.r10.clusters'

#Reading the cluster file into a dataframe, one row per clustered mutation
cluster_df = pd.read_csv(cluster_file_path, sep = '\t', engine = 'python')

In [4]:
#Summarising every hotspot cluster from cluster_df that contains at least MIN_CLUSTER_SIZE mutations
#Names produced for the following cells: visualize_df (summary table), cluster_list, the four cluster_*_dict
#lookups (keyed by cluster ID) and comparison_dict (hotspot name -> list of mutations in the hotspot)

#Smallest number of mutations a cluster must contain to be kept as a hotspot
MIN_CLUSTER_SIZE = 2

#Creating a list containing each unique hotspot cluster from the csv file
cluster_list_initial = cluster_df['Cluster'].unique().tolist()

#Counting the mutations of every cluster in one pass (rows without a cluster ID are not counted)
cluster_sizes = cluster_df['Cluster'].value_counts()

#Keeping only the clusters with enough mutations
#Sorting the IDs keeps the summary table in cluster-ID order and keeps every column below aligned
cluster_list = sorted(cluster_sizes[cluster_sizes >= MIN_CLUSTER_SIZE].index.tolist())

#Defining the dictionaries that will be built in the following For Loop, each one gathers different information
cluster_len_dict = {}
cluster_chrom_dict = {}
cluster_gene_dict = {}
cluster_mut_dict = {}
comparison_dict = {}

#This For Loop iterates through each of the clusters of interest, selecting its rows only once
for cluster_id in cluster_list:
    cluster_rows = cluster_df[cluster_df['Cluster'] == cluster_id]

    cluster_len_dict[cluster_id] = len(cluster_rows)
    #The chromosome and gene are taken from the first row of the cluster
    cluster_chrom_dict[cluster_id] = cluster_rows['Chromosome'].values[0]
    cluster_gene_dict[cluster_id] = cluster_rows['Gene/Drug'].values[0]
    #The list of the different mutations that make up the cluster
    cluster_mut_dict[cluster_id] = cluster_rows['Mutation/Gene'].values.tolist()

#Constructing the summary table in one step from explicit lists, so every column is built in cluster_list order
#and the table has the same five columns even when no cluster passes the size filter
visualize_df = pd.DataFrame(
    {
        'Cluster_ID': cluster_list,
        'Mutation_List_Name': ['cluster_mut_dict[' + str(cluster_id) + ']' for cluster_id in cluster_list],
        'Cluster_Size': [cluster_len_dict[cluster_id] for cluster_id in cluster_list],
        'Cluster_Chromosome': [cluster_chrom_dict[cluster_id] for cluster_id in cluster_list],
        'Cluster_Gene/Protein': [cluster_gene_dict[cluster_id] for cluster_id in cluster_list],
    },
    columns = ['Cluster_ID', 'Mutation_List_Name', 'Cluster_Size', 'Cluster_Chromosome', 'Cluster_Gene/Protein'],
)

#This loop names each hotspot/cluster based on the gene that is mutated
#A cluster whose ID has no decimal part (e.g. 20.0, or an integer ID) is named after the gene alone
#Any other cluster is named '<gene>_<decimal digits of the cluster ID>' (e.g. 20.1 -> 'VHL_1')
#The whole decimal part is used, so 20.12 becomes '<gene>_12' instead of colliding with 20.2
#NOTE(review): code that recovers the gene from a hotspot name has to strip the whole '_<digits>' suffix
#NOTE(review): two clusters that produce the same name (e.g. 13.0 and 14.0 on one gene) still share a key; the last one wins
for cluster_id, gene_name in cluster_gene_dict.items():
    decimal_digits = str(cluster_id).partition('.')[2]
    if decimal_digits.strip('0') == '':
        hotspot_name = gene_name
    else:
        hotspot_name = str(gene_name) + '_' + decimal_digits
    comparison_dict[hotspot_name] = cluster_mut_dict[cluster_id]

In [5]:
#Displaying the per-cluster summary table: one row per cluster that contains 2+ mutations
visualize_df

Unnamed: 0,Cluster_ID,Mutation_List_Name,Cluster_Size,Cluster_Chromosome,Cluster_Gene/Protein
0,0.0,cluster_mut_dict[0.0],2,chr1,AMY2A
1,3.0,cluster_mut_dict[3.0],2,chr2,DUSP2
2,10.0,cluster_mut_dict[10.0],2,chr16,KIF22
3,13.0,cluster_mut_dict[13.0],2,chr3,PBRM1
4,15.0,cluster_mut_dict[15.0],3,chr3,PIK3CA
5,17.0,cluster_mut_dict[17.0],2,chr18,SMAD2
6,20.0,cluster_mut_dict[20.0],11,chr3,VHL
7,20.1,cluster_mut_dict[20.1],2,chr3,VHL


### Downloading the proper data for the cancer of interest from the CPTAC repository

In [6]:
#In this case, the cancer of interest is renal, and the dataset is referred to as 'renalccrcc'
cptac.download(dataset='renalccrcc')

All files already downloaded and correct.


True

### Creating variables for each of the different data sets

In [7]:
#Creating the renal (ccRCC) dataset object; every table below is pulled from it
renal = cptac.RenalCcrcc()
#NOTE(review): presumably each getter returns a pandas DataFrame indexed by sample - confirm against the cptac documentation
proteomics = renal.get_proteomics()
transcriptomics = renal.get_transcriptomics()
clinical = renal.get_clinical()
#Mutation table: later cells use its index as the sample IDs and read its first and third columns as the mutated gene and the mutation location
mutations = renal.get_mutations()

                                    

### Creating a dataframe that identifies patients with mutations within the previously found hotspots

In [8]:
#Making a new dataframe to visualize the hotspot patients in: one row per sample, one column per hotspot
#Each hotspot column holds one of three labels:
#   'No'     - the sample has no mutation in the hotspot's gene
#   'Yes'    - the sample has a mutation in the gene, but none of them is inside this hotspot
#   'Yes_HS' - the sample has at least one mutation inside this hotspot
#'Yes_HS' always takes precedence over 'Yes', so the label does not depend on the order of the rows in mutations
vis_df = pd.DataFrame()

#Adding all of the patient ID's to be utilized later to merge dataframes
vis_df['Sample_ID'] = mutations.index.unique()

#The mutated gene and the mutation location are read by column position (first and third column)
#NOTE(review): presumably these are the gene and location columns of the cptac mutations table - confirm
mutated_gene = mutations.iloc[:, 0]
mutation_location = mutations.iloc[:, 2]

#Looping through each hotspot at a time identifying which patients are part of it
for key, hotspot_locations in comparison_dict.items():
    #Removing the '_(number)' suffix that is added when a protein has multiple hotspots
    #The whole numeric suffix is removed, and keys that are too short to carry one are left untouched
    gene_part, separator, suffix = key.rpartition('_')
    if separator and gene_part and suffix.isdigit():
        hotspot_gene = gene_part
    else:
        hotspot_gene = key

    #One boolean flag per row of mutations: is the mutation in this gene / inside this hotspot
    in_gene = mutated_gene.eq(hotspot_gene).values
    in_hotspot = in_gene & mutation_location.isin(hotspot_locations).values

    #The samples that own at least one flagged mutation
    samples_with_gene_mutation = mutations.index[in_gene].unique()
    samples_with_hotspot_mutation = mutations.index[in_hotspot].unique()

    #Defaulting each patient to not in the hotspot, then upgrading the label
    #'Yes_HS' is written last so a non-hotspot mutation in the same gene can never overwrite it
    vis_df[key] = 'No'
    vis_df.loc[vis_df['Sample_ID'].isin(samples_with_gene_mutation), key] = 'Yes'
    vis_df.loc[vis_df['Sample_ID'].isin(samples_with_hotspot_mutation), key] = 'Yes_HS'


### Filling a dataframe for each hotspot cluster with patient's mutation location

In [9]:
#Displaying the per-sample hotspot labels ('No', 'Yes' or 'Yes_HS'), one column per hotspot
vis_df

Unnamed: 0,Sample_ID,AMY2A,DUSP2,KIF22,PBRM1,PIK3CA,SMAD2,VHL,VHL_1
0,S001,No,No,No,Yes,No,No,Yes,Yes
1,S002,No,No,No,No,No,No,Yes,Yes
2,S003,No,No,No,No,No,No,Yes,Yes_HS
3,S004,No,No,No,No,No,No,Yes,Yes
4,S005,No,No,No,Yes,No,No,Yes,Yes
5,S006,No,No,No,No,No,No,Yes_HS,Yes
6,S007,No,No,No,No,No,No,Yes,Yes
7,S008,No,No,No,No,No,No,Yes,Yes
8,S009,No,No,No,No,No,No,Yes,Yes
9,S010,No,No,No,Yes,No,No,No,No
