In [2]:
import pandas as pd
import numpy as np
import re
import scipy.stats
import statsmodels.stats.multitest
import matplotlib.pyplot as plt
from matplotlib import pylab
import seaborn as sns
import cptac

In [3]:
#Location of the tab-separated cluster file that the rest of the notebook analyses
#NOTE(review): this is an absolute path on one specific machine - repoint it before running elsewhere
cluster_file_path = r'C:\Users\joncj\OneDrive\Documents\Research\GitHub\ccrcc.somatic.consensus.gdc.umichigan.wu.112918.maf.3D_Proximity.pairwise.recurrence.l0.r10.clusters'
cluster_df = pd.read_csv(cluster_file_path, sep = '\t', engine = 'python')

In [5]:
#Show the 'Mutation/Gene' entries of every cluster row whose 'Gene/Drug' is VHL
is_vhl_row = cluster_df['Gene/Drug'] == 'VHL'
cluster_df.loc[is_vhl_row, 'Mutation/Gene']

20     p.G93V
21    p.H115Q
22    p.H115Y
23     p.L89P
24     p.N90I
25    p.S111I
26    p.S111N
27     p.S65L
28     p.W88R
29     p.W88S
30     p.Y98N
31    p.V155L
32    p.V155M
Name: Mutation/Gene, dtype: object

In [3]:
#Defining the dataframe that will be used to present the information gathered from the cluster dataframe
visualize_df = pd.DataFrame()

#Creating a list containing each unique hotspot cluster from the csv file
cluster_list_initial = cluster_df['Cluster'].unique().tolist()

#Identifying the clusters that contain more than one mutation in the hotspot
#value_counts() tallies the rows of every cluster in one pass instead of re-filtering the dataframe once per cluster ID
#(rows with a missing cluster ID are not counted, just as an equality filter on the ID never matches them)
cluster_sizes = cluster_df['Cluster'].value_counts()
cluster_list = cluster_sizes[cluster_sizes >= 2].index.tolist()

#These two lines are the first step in creating the dataframe that will present the gathered information
#Sorting the list of clusters keeps the rows in cluster ID order, which every column built below relies on
cluster_list.sort()
visualize_df['Cluster_ID'] = cluster_list

#Defining the dictionaries that are built in the following For Loop, each one gathering different information
#All of them are keyed by cluster ID and filled in the (sorted) order of cluster_list
cluster_len_dict = {}
cluster_chrom_dict = {}
cluster_gene_dict = {}
cluster_mut_dict = {}
comparison_dict = {}

#This For Loop iterates through each of the previously identified clusters that are going to be of interest
for value in cluster_list:
    #Select the rows of this cluster once and reuse the selection for every lookup below
    cluster_rows = cluster_df[cluster_df['Cluster'] == value]

    #Size of the cluster, plus the chromosome and gene taken from the first row of the cluster
    cluster_len_dict[value] = len(cluster_rows)
    cluster_chrom_dict[value] = cluster_rows['Chromosome'].values[0]
    cluster_gene_dict[value] = cluster_rows['Gene/Drug'].values[0]

    #The cluster ID is the key, and the value is the list of the different mutations that make up the cluster
    cluster_mut_dict[value] = cluster_rows['Mutation/Gene'].values.tolist()

#Each row records the expression that retrieves its mutation list from cluster_mut_dict
visualize_df['Mutation_List_Name'] = ['cluster_mut_dict[' + str(cluster_id) + ']' for cluster_id in cluster_list]

#This is where I construct the different columns of the visualize dataframe with the corresponding parsed information
#The dictionary views are turned into plain lists so the columns are built from ordinary sequences
visualize_df['Cluster_Size'] = list(cluster_len_dict.values())
visualize_df['Cluster_Chromosome'] = list(cluster_chrom_dict.values())
visualize_df['Cluster_Gene/Protein'] = list(cluster_gene_dict.values())

#This loop names each hotspot/cluster based on the gene that is mutated
#If a gene has more than one hotspot in it, then each subsequent hotspot is numbered based on the decimal values of the original cluster ID
#NOTE(review): only the LAST character of the ID is used as the suffix, so IDs with more than one decimal digit can collide,
#and two IDs ending in '0' on the same gene overwrite each other - confirm the cluster file never produces either case
for key, gene in cluster_gene_dict.items():
    key_string = str(key)
    if key_string.endswith('0'):
        comparison_dict[gene] = cluster_mut_dict[key]
    else:
        comparison_dict[str(gene) + '_' + key_string[-1]] = cluster_mut_dict[key]

### Downloading the proper data for the cancer of interest from the CPTAC repository

In [5]:
#In this case, the cancer of interest is renal, which is referred to as 'renalccrcc'
dataset_name = 'renalccrcc'
cptac.download(dataset=dataset_name)

All files already downloaded and correct.


True

### Creating variables for each of the different data sets

In [6]:
#Create the renal dataset object that all of the tables below are pulled from
renal = cptac.RenalCcrcc()
#One variable per data table provided by the dataset object
#NOTE(review): presumably each getter returns a pandas DataFrame - only `mutations` is visibly used that way (index, iterrows) later on
proteomics = renal.get_proteomics()
transcriptomics = renal.get_transcriptomics()
clinical = renal.get_clinical()
mutations = renal.get_mutations()

                                    

### Creating a dataframe that identifies patients with mutations within the previously found hotspots

In [7]:
#Making a new dataframe to visualize the hotspot patients in
vis_df = pd.DataFrame()

#Adding all of the patient IDs to be utilized later to merge dataframes
vis_df['Sample_ID'] = mutations.index.unique()

#The mutations dataframe is read by position: column 0 holds the mutated gene and column 2 the mutation location,
#while its index holds the sample ID of each mutation
mutated_gene = mutations.iloc[:, 0]
mutation_location = mutations.iloc[:, 2]
mutation_sample = mutations.index

#Looping through each hotspot at a time identifying which patients are part of it
for key, hotspot_locations in comparison_dict.items():
    #This statement will remove the last two characters '_(number)' that were added if there were multiple hotspots on a protein
    #The length check keeps a one-character key from raising an IndexError
    if len(key) >= 2 and key[-2] == '_':
        hotspot_gene = key[:-2]
    else:
        hotspot_gene = key

    #Flag every mutation on the hotspot gene, and separately those whose location is inside the hotspot
    on_hotspot_gene = (mutated_gene == hotspot_gene).values
    inside_hotspot = on_hotspot_gene & mutation_location.isin(hotspot_locations).values

    samples_with_gene_mutation = mutation_sample[on_hotspot_gene]
    samples_with_hotspot_mutation = mutation_sample[inside_hotspot]

    #Defaulting each patient to not in the hotspot
    vis_df[key] = 'No'
    #The mutation is on a gene that contains a hotspot, so the patient gets labeled yes
    vis_df.loc[vis_df['Sample_ID'].isin(samples_with_gene_mutation), key] = 'Yes'
    #If any of the patient's mutations is in the hotspot, the patient gets labeled yes for mutation and _HS for hotspot
    #This is written last so that it always takes precedence: a patient with one mutation inside the hotspot and
    #another one elsewhere on the same gene stays 'Yes_HS' regardless of the order of the rows
    vis_df.loc[vis_df['Sample_ID'].isin(samples_with_hotspot_mutation), key] = 'Yes_HS'


### Filling a dataframe for each hotspot cluster with the patients' mutation locations

In [21]:
#Tab-separated MAF file, looked up relative to the notebook's working directory
maf_file_path = 'ccrcc.somatic.consensus.gdc.umichigan.wu.112918.maf'
ren = pd.read_csv(maf_file_path, sep = '\t')

In [22]:
#Preview the first rows of the short protein-change column
protein_change_column = 'HGVSp_Short'
ren[protein_change_column].head()

0        p.I176I
1        p.Q315K
2    p.Q364Tfs*5
3         p.S15F
4        p.D418N
Name: HGVSp_Short, dtype: object

In [23]:
#Preview the first rows of the transcript ID column
transcript_column = 'Transcript_ID'
ren[transcript_column].head()

0    ENST00000377939
1    ENST00000332296
2    ENST00000373548
3    ENST00000309276
4    ENST00000538576
Name: Transcript_ID, dtype: object

In [1]:
#Column headers, in order, that the MAF-style dataframe built below must carry
maf_columns = (
    'Hugo_Symbol',
    'Entrez_Gene_Id',
    'Center',
    'NCBI_Build',
    'Chromosome',
    'Start_Position',
    'End_Position',
    'Strand',
    'Variant_Classification',
    'Variant_Type',
    'Reference_Allele',
    'Tumor_Seq_Allele1',
    'Tumor_Seq_Allele2',
    'dbSNP_RS',
    'dbSNP_Val_Status',
    'Tumor_Sample_Barcode',
    'Matched_Norm_Sample_Barcode',
    'Match_Norm_Seq_Allele1',
    'Match_Norm_Seq_Allele2',
    'Tumor_Validation_Allele1',
    'Tumor_Validation_Allele2',
    'Match_Norm_Validation_Allele1',
    'Match_Norm_Validation_Allele2',
    'Verification_Status',
    'Validation_Status',
    'Mutation_Status',
    'Sequencing_Phase',
    'Sequence_Source',
    'Validation_Method',
    'Score',
    'BAM_File',
    'Sequencer',
    'Tumor_Sample_UUID',
    'Matched_Norm_Sample_UUID',
    'HGVSc',
    'HGVSp',
    'HGVSp_Short',
    'Transcript_ID',
    'Exon_Number',
    't_depth',
    't_ref_count',
    't_alt_count',
    'n_depth',
    'n_ref_count',
    'n_alt_count',
    'callers',
    'all_effects',
    'Allele',
    'Gene',
    'Feature',
    'Feature_type',
    'Consequence',
    'cDNA_position',
    'CDS_position',
    'Protein_position',
    'Amino_acids',
    'Codons',
    'Existing_variation',
    'ALLELE_NUM',
    'DISTANCE',
    'STRAND_VEP',
    'SYMBOL',
    'SYMBOL_SOURCE',
    'HGNC_ID',
    'BIOTYPE',
    'CANONICAL',
    'CCDS',
    'ENSP',
    'SWISSPROT',
    'TREMBL',
    'UNIPARC',
    'RefSeq',
    'SIFT',
    'PolyPhen',
    'EXON',
    'INTRON',
    'DOMAINS',
    'GMAF',
    'AFR_MAF',
    'AMR_MAF',
    'ASN_MAF',
    'EAS_MAF',
    'EUR_MAF',
    'SAS_MAF',
    'AA_MAF',
    'EA_MAF',
    'CLIN_SIG',
    'SOMATIC',
    'PUBMED',
    'MOTIF_NAME',
    'MOTIF_POS',
    'HIGH_INF_POS',
    'MOTIF_SCORE_CHANGE',
    'IMPACT',
    'PICK',
    'VARIANT_CLASS',
    'TSL',
    'HGVS_OFFSET',
    'PHENO',
    'MINIMISED',
    'ExAC_AF',
    'ExAC_AF_AFR',
    'ExAC_AF_AMR',
    'ExAC_AF_EAS',
    'ExAC_AF_FIN',
    'ExAC_AF_NFE',
    'ExAC_AF_OTH',
    'ExAC_AF_SAS',
    'GENE_PHENO',
    'FILTER',
    'flanking_bps',
    'variant_id',
    'variant_qual',
    'ExAC_AF_Adj',
    'ExAC_AC_AN_Adj',
    'ExAC_AC_AN',
    'ExAC_AC_AN_AFR',
    'ExAC_AC_AN_AMR',
    'ExAC_AC_AN_EAS',
    'ExAC_AC_AN_FIN',
    'ExAC_AC_AN_NFE',
    'ExAC_AC_AN_OTH',
    'ExAC_AC_AN_SAS',
    'ExAC_FILTER',
)

#Empty dataframe that already carries every MAF column, in the order listed above
#(the class is spelled pd.DataFrame; looking a column up on an empty frame raises a KeyError instead of creating it,
#so the columns are declared through the constructor)
final_colon = pd.DataFrame(columns=list(maf_columns))

In [18]:
#Tab-separated mutation table, looked up relative to the notebook's working directory
colon_mutation_path = 'mutation.txt'
colon = pd.read_csv(colon_mutation_path, sep = '\t')

In [20]:
#List the column headers of the colon mutation table
colon.columns.tolist()

['Chr',
 'Start',
 'End',
 'Ref',
 'Alt',
 'Variant_Type',
 'Variant_Function',
 'Gene',
 'mRNA',
 'mRNA_Change',
 'Protein_Change',
 'AA_Ref',
 'AA_Pos',
 'AA_Var',
 'SampleID']

In [2]:
#Empty dataframe that already carries every header listed in maf_columns, in the same order
#(the class is spelled pd.DataFrame; looking a column up on an empty frame raises a KeyError instead of creating it,
#so the columns are declared through the constructor)
final_colon = pd.DataFrame(columns=list(maf_columns))

NameError: name 'pd' is not defined

In [28]:
#Preview only the first rows instead of rendering the whole mutation table
colon.head(10)

Unnamed: 0,Chr,Start,End,Ref,Alt,Variant_Type,Variant_Function,Gene,mRNA,mRNA_Change,Protein_Change,AA_Ref,AA_Pos,AA_Var,SampleID
0,19,58864357,58864357,A,G,nonsynonymous SNV,protein-altering,A1BG,NM_130786,T277C,C93R,C,93,R,05CO044
1,19,58864307,58864307,C,G,nonsynonymous SNV,protein-altering,A1BG,NM_130786,G327C,E109D,E,109,D,01CO005
2,19,58858758,58858758,C,T,nonsynonymous SNV,protein-altering,A1BG,NM_130786,G1441A,E481K,E,481,K,11CO070
3,19,58858859,58858859,C,T,nonsynonymous SNV,protein-altering,A1BG,NM_130786,G1340A,R447H,R,447,H,11CO059
4,19,58864354,58864354,G,A,nonsynonymous SNV,protein-altering,A1BG,NM_130786,C280T,R94C,R,94,C,05CO028
5,10,52595911,52595911,G,T,nonsynonymous SNV,protein-altering,A1CF,NM_014576,C527A,T176N,T,176,N,09CO013
6,10,52595911,52595911,G,T,nonsynonymous SNV,protein-altering,A1CF,NM_138932,C527A,T176N,T,176,N,09CO013
7,10,52595911,52595911,G,T,nonsynonymous SNV,protein-altering,A1CF,NM_001198818,C527A,T176N,T,176,N,09CO013
8,10,52595911,52595911,G,T,nonsynonymous SNV,protein-altering,A1CF,NM_138933,C551A,T184N,T,184,N,09CO013
9,10,52595911,52595911,G,T,nonsynonymous SNV,protein-altering,A1CF,NM_001198820,C551A,T184N,T,184,N,09CO013
