# Create Training Feature Vector

This notebook takes cancer-mutation data from the COSMIC database and builds per-mutation feature vectors that are used for both training and validation.

In [77]:
%pylab inline
import pandas as pd
import numpy as np
import os
import sklearn as sk
from sklearn.cluster import AffinityPropagation
from sklearn import datasets
from sklearn import metrics
from multiprocessing import Pool
import pickle

Populating the interactive namespace from numpy and matplotlib


## Import and clean up whole data set - Use COSMIC

Helper methods.

In [78]:
def simplify(x):
    """Return the part of ``x`` that precedes the first underscore.

    ``x`` must be a string; a string without an underscore is returned
    unchanged. Applied to the 'Gene name' column to strip whatever suffix
    follows the gene symbol (presumably a transcript id -- confirm).
    """
    prefix, _, _ = x.partition("_")
    return prefix
def get_loc(x):
    """Return the start coordinate of a genome-position string.

    Takes the segment after the first ':' and returns the text before its
    first '-'. The input presumably looks like 'chrom:start-end' -- confirm
    against the raw COSMIC export. The result is a string, not an int.

    Raises IndexError if ``x`` contains no ':' and AttributeError if ``x``
    is not a string (e.g. a NaN from a missing value).
    """
    position_range = x.split(":")[1]
    start, _, _ = position_range.partition("-")
    return start
def combine(x):
    """Build a '<gene id>_<location>' key from a two-element row.

    ``x`` is any iterable whose first element is the gene id and whose
    second element is the location: a list, a tuple, or the row Series that
    ``DataFrame.apply(..., axis=1)`` passes in. The gene id is truncated to
    an int, so 673.0 becomes '673'; the location is stringified as-is.

    The elements are taken by position from ``list(x)`` rather than with
    ``x[0]`` / ``x[1]``: on a label-indexed Series, integer keys in ``[]``
    rely on a positional fallback that newer pandas deprecates.

    Raises IndexError if fewer than two elements are supplied and
    ValueError if the gene id cannot be converted to int (e.g. NaN).
    """
    values = list(x)
    gene = values[0]
    loc = values[1]
    return str(int(gene)) + "_" + str(loc)

Import COSMIC data (excluding TCGA mutations), then merge with the Hugo-symbol/Entrez table to add Entrez gene ids. This may be unnecessary?

In [79]:
# Load the thyroid-only, non-TCGA COSMIC export. The path is passed straight
# to read_csv so pandas opens and closes the file itself; wrapping it in
# open() left the handle unclosed.
cosmic_all = pd.read_csv('../raw_data/THYROIDonly_noTCGA_uniqMutationID.csv')

# Drop rows without a genome position *before* parsing it: get_loc calls
# str.split and would raise on the float NaN pandas uses for missing values.
# .copy() gives an independent frame so the column assignments below do not
# write into a view of the raw table.
cosmic_all = cosmic_all[pd.notnull(cosmic_all['Mutation genome position'])].copy()

# Keep only the text before the first '_' in the gene name, and extract the
# start coordinate (as a string) from the position.
cosmic_all['Gene name'] = cosmic_all['Gene name'].map(simplify)
cosmic_all['Start_Position'] = cosmic_all['Mutation genome position'].map(get_loc)

In [80]:
# Keep only the mutations that have a recorded genome position.
cosmic_all = cosmic_all[cosmic_all['Mutation genome position'].notnull()]

In [81]:
# Build the lookup table that maps a gene symbol ('Gene name', matching the
# column name used by the COSMIC frames) to its Entrez gene id, with one row
# per distinct (symbol, id) pair.
mr = pd.read_csv('../data_frames/mr.csv', index_col=0)
hugo_entrez = (
    mr[['Hugo_Symbol', 'Entrez_Gene_Id']]
    .rename(columns={'Hugo_Symbol': 'Gene name'})
    .drop_duplicates()
)

In [82]:
# Attach the Entrez gene id to every COSMIC mutation. A left join keeps
# exactly the COSMIC rows (Entrez_Gene_Id is NaN where the symbol is
# unknown). The previous outer join also appended one all-NaN row for every
# gene in hugo_entrez that has no COSMIC mutation, so those genes showed up
# with a count of 1 in the per-gene value_counts computed from this frame.
cosmic_all = pd.merge(cosmic_all, hugo_entrez, on='Gene name', how='left')

In [83]:
# Number of rows per gene, as a two-column frame (Gene_Name, counts).
cosmic = pd.DataFrame(cosmic_all["Gene name"].value_counts()).reset_index()
cosmic.columns = ['Gene_Name', 'counts']

# Only the last expression of a cell is displayed: the two expressions
# below are evaluated and their results discarded; the zygosity breakdown
# is what the cell shows.
cosmic.head()
pd.isnull(cosmic_all['Mutation zygosity']).sum()
cosmic_all['Mutation zygosity'].value_counts()

het    1372
hom      87
dtype: int64

Create a table with the mutation information and the COSMIC mutation id. This is necessary for mapping to the TCGA data.

In [85]:
# Load the COSMIC mutation-id table; the first CSV column becomes the index.
cosmic_raw = pd.read_csv('../data_frames/cosmic_raw_mut_id.csv', index_col=0)
# Relabel the three remaining columns by position. This raises if the file
# does not have exactly three data columns.
# NOTE(review): the original header names are not visible here -- confirm the
# column order really is (mutation id, gene name, location).
cosmic_raw.columns = ["Mutation ID", "Gene name", "Location"]

In [86]:
# Add the Entrez gene id to each mutation. The inner join drops mutations
# whose gene symbol is missing from hugo_entrez. The id is the first half of
# the SNP key that is built from this frame.
cosmic_raw = cosmic_raw.merge(hugo_entrez, how='inner', on='Gene name')
cosmic_raw.head()

Unnamed: 0,Mutation ID,Gene name,Location,Entrez_Gene_Id
0,COSM318402,A1CF,52566499,29974
1,COSM1702376,A1CF,52566505,29974
2,COSM191228,A1CF,52566514,29974
3,COSM191228,A1CF,52566514,29974
4,COSM918820,A1CF,52566549,29974


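`SNP2` is the `<Entrez gene id>_<location>` key built from the COSMIC mutation-id table (i.e. the COSMIC reference version of the SNP key?). TODO: confirm.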

In [87]:
# Row-wise key '<Entrez gene id>_<location>'; combine() receives each row of
# the two selected columns in this order.
cosmic_raw['SNP2'] = cosmic_raw.loc[:, ['Entrez_Gene_Id', 'Location']].apply(combine, axis=1)

In [88]:
# Keep only the mapping COSMIC mutation id -> SNP key and persist it.
# The table contains repeated rows (the same mutation id / gene / location
# can appear more than once), and every repeat would multiply rows in a
# later merge on 'Mutation ID', so exact duplicates are removed first.
cosmic_raw = cosmic_raw[['Mutation ID', 'SNP2']].drop_duplicates()
cosmic_raw.to_csv('../data_frames/mut_id.csv')

## Using COSMIC Mutation Ids to Map between TCGA and COSMIC data

In [89]:
# Load the cancer SNP list. The path goes straight to read_csv: the former
# `with open(...) as cosmic` rebound the name `cosmic`, which an earlier cell
# uses for the per-gene count table, to a file handle.
cancer = pd.read_csv('../raw_data/thyr.uniq_cancer_SNPs.abrvCosmic.csv', sep=',')
cancer['Gene name'] = cancer['Gene name'].map(simplify)
cancer['Start_Position'] = cancer['Mutation genome position'].map(get_loc)

# Left join + not-null filter keeps exactly the cancer rows whose gene has a
# known Entrez id. The previous outer join also kept one row per gene that
# exists only in hugo_entrez; those rows have no position and produced bogus
# '<id>_nan' SNP keys.
cancer = pd.merge(cancer, hugo_entrez, on='Gene name', how='left')
cancer = cancer[pd.notnull(cancer['Entrez_Gene_Id'])].copy()

# SNP key: '<Entrez gene id>_<start position>'.
cancer['SNP'] = cancer[['Entrez_Gene_Id', 'Start_Position']].apply(combine, axis=1)
cancer = cancer.drop_duplicates()

In [90]:
# Load the neutral SNP list. The path goes straight to read_csv: the former
# `with open(...) as cosmic` rebound the name `cosmic`, which an earlier cell
# uses for the per-gene count table, to a file handle.
neutral = pd.read_csv('../raw_data/thyr.uniq_neutral_SNPs.abrvCosmic.csv', sep=',')
neutral['Gene name'] = neutral['Gene name'].map(simplify)
neutral['Start_Position'] = neutral['Mutation genome position'].map(get_loc)

# Left join + not-null filter keeps exactly the neutral rows whose gene has a
# known Entrez id. The previous outer join also kept one row per gene that
# exists only in hugo_entrez; those rows have no position and produced bogus
# '<id>_nan' SNP keys that were counted below.
neutral = pd.merge(neutral, hugo_entrez, on='Gene name', how='left')
neutral = neutral[pd.notnull(neutral['Entrez_Gene_Id'])].copy()

# SNP key: '<Entrez gene id>_<start position>'.
neutral['SNP'] = neutral[['Entrez_Gene_Id', 'Start_Position']].apply(combine, axis=1)

# Occurrences per SNP key -- this is where you would add constraints (e.g. a
# minimum count). No constraint is applied, so every SNP is kept and the
# final filter is currently a no-op.
# NOTE(review): an earlier comment reported 826 entries here; that figure was
# obtained with the outer join and may differ now.
snp_counts = neutral['SNP'].value_counts()
# Built explicitly so the column names ('index' = SNP key, 'SNP' = count) do
# not depend on how the installed pandas names value_counts()/reset_index()
# output.
neutral_df = pd.DataFrame(
    {'index': snp_counts.index, 'SNP': snp_counts.values},
    columns=['index', 'SNP'],
)
neutral_to_be_used = list(neutral_df['index'])
neutral = neutral[neutral['SNP'].isin(neutral_to_be_used)]

In [91]:
# Read the 'Mutation ID' -> 'SNP2' mapping table back from disk; the first
# CSV column is used as the index.
mut_id = pd.read_csv('../data_frames/mut_id.csv', index_col=0)

In [92]:
# Attach the SNP2 key to the cancer rows; rows whose 'Mutation ID' is not in
# mut_id are dropped by the inner join.
cancer = cancer.merge(mut_id, how="inner", on="Mutation ID")

In [93]:
# Attach the SNP2 key to the neutral rows; rows whose 'Mutation ID' is not in
# mut_id are dropped by the inner join.
neutral = neutral.merge(mut_id, how="inner", on="Mutation ID")

In [94]:
# Distinct SNP2 keys per class. Despite the names, these arrays hold
# '<Entrez gene id>_<location>' SNP keys, not gene identifiers.
# print() with a single argument behaves the same on Python 2 and Python 3;
# the bare print statement is a SyntaxError on Python 3.
cancer_genes = cancer['SNP2'].unique()
print(len(cancer_genes))
neutral_genes = neutral['SNP2'].unique()
print(len(neutral_genes))
print(cancer_genes[:10])

259
889
['673_140453136' '673_140453134' '673_140453142' '673_140481403'
 '673_140453143' '673_140453157' '5290_178936107' '5290_178936082'
 '5290_178952090' '5290_178952085']


In [95]:
# Load the TCGA feature table here, at its first use, so that this cell and
# the set-building cell after it work on a fresh kernel (Restart & Run All)
# instead of relying on a variable left over from an earlier session.
tcga_feature_vector = pd.read_csv('../data_frames/tcga_feature_vector.csv', index_col=0)
tcga_feature_vector.head()

Unnamed: 0,SNP,counts,Perc_Bi_Allelic,Density
0,673_140453136,480,0.508333,1298.9
248,673_140453135,3,0.333333,1298.9
249,673_140477840,2,0.5,1298.9
250,673_140453193,2,0.5,1298.9
251,673_140477827,2,0.5,1298.9


In [96]:
# SNP keys observed in TCGA that are also labelled cancer / neutral in COSMIC.
# The COSMIC keys are turned into sets once, so each membership test is a
# hash lookup instead of a scan over a numpy array. The results keep the
# order of all_genes and remain plain lists.
all_genes = tcga_feature_vector.SNP.unique()
cancer_lookup = set(cancer_genes)
neutral_lookup = set(neutral_genes)
cancer_set = [gene for gene in all_genes if gene in cancer_lookup]
neutral_set = [gene for gene in all_genes if gene in neutral_lookup]

In [97]:
# Show the matched SNP keys. print() with a single argument behaves the same
# on Python 2 and Python 3; the bare print statement is a SyntaxError on
# Python 3.
print(cancer_set)
print(neutral_set)

['673_140453136', '673_140453134', '4893_115256529', '4893_115256530', '3265_533874', '3265_533875', '3845_25380277', '3845_25380276', '3845_25398284', '3845_25398228', '207_105246551', '5290_178952074', '5728_89692941', '23405_95557629']
['2312_152283862', '285464_1388508', '285464_1388757', '996_45219311', '85301_116931099', '4261_11000848', '113146_105411781', '113146_105406238', '113146_105411700', '113146_105418344', '113146_105418391', '143_25021323', '388646_89599084', '5545_11461549', '58508_151970877', '128372_158735740', '1114_5903067', '3566_27374180', '10395_12957475', '84951_38645125', '393046_143748257', '100132386_39261693', '2074_50667105', '284348_44302624', '23177_65296798', '4666_57111948', '119180_30918549', '3683_30518041', '135114_126278230', '3911_60899206', '9611_16068463', '176_89415247', '114770_15587185', '84432_110998854', '1816_9784542', '93035_110455184', '343169_247875415', '9790_43318589', '653247_11546314', '219417_56143125', '219417_56143592', '675_329

## Complete feature vectors

In [98]:
# Load the precomputed TCGA feature table; the first CSV column is used as
# the index.
# NOTE(review): cells earlier in the notebook also use the name
# tcga_feature_vector.
tcga_feature_vector = pd.read_csv('../data_frames/tcga_feature_vector.csv', index_col=0)

In [99]:
# Preview the first five rows of the TCGA feature table.
tcga_feature_vector.head(5)

Unnamed: 0,SNP,counts,Perc_Bi_Allelic,Density
0,673_140453136,480,0.508333,1298.9
248,673_140453135,3,0.333333,1298.9
249,673_140477840,2,0.5,1298.9
250,673_140453193,2,0.5,1298.9
251,673_140477827,2,0.5,1298.9


In [100]:
# Split the TCGA feature table by label: keep the rows whose SNP key is in
# the cancer set, and separately those whose key is in the neutral set.
cancer_feature_vector = tcga_feature_vector.loc[tcga_feature_vector["SNP"].isin(cancer_set)]
neutral_feature_vector = tcga_feature_vector.loc[tcga_feature_vector["SNP"].isin(neutral_set)]

In [101]:
# Non-null values per column for each class, followed by a preview of the
# cancer rows. print() with a single argument behaves the same on Python 2
# and Python 3; the bare print statement is a SyntaxError on Python 3.
print(cancer_feature_vector.count())
print(neutral_feature_vector.count())
cancer_feature_vector.head()

SNP                14
counts             14
Perc_Bi_Allelic    14
Density            14
dtype: int64
SNP                113
counts             113
Perc_Bi_Allelic    113
Density             45
dtype: int64


Unnamed: 0,SNP,counts,Perc_Bi_Allelic,Density
0,673_140453136,480,0.508333,1298.9
252,673_140453134,2,0.5,1298.9
259,4893_115256529,54,0.5,1736.6
290,4893_115256530,14,0.5,1736.6
297,3265_533874,24,0.541667,1824.5


In [102]:
# restrict the cancer and neutral feature vectors
cancer_feature_vector = cancer_feature_vector[['SNP', 'counts', 'Perc_Bi_Allelic', 'Density']]
neutral_feature_vector = neutral_feature_vector[['SNP', 'counts', 'Perc_Bi_Allelic', 'Density']]

In [103]:
# Persist the complete (unsplit) cancer and neutral feature tables; the frame
# index is written as the first CSV column.
cancer_feature_vector.to_csv('../data_frames/cancer_feature_vector_tcga.csv')
neutral_feature_vector.to_csv('../data_frames/neutral_feature_vector_tcga.csv')

In [106]:
# Validation split: every row after the first 10 of each class (the first 10
# rows form the training split). The slices are open-ended so that no row is
# left out of both splits: the previous hard-coded neutral slice [10:103]
# silently discarded the last 10 of the 113 neutral rows. For the 14 cancer
# rows, [10:] selects the same rows as the former [10:14].
cancer_feature_vector.iloc[10:].to_csv('../data_frames/cancer_feature_vector_tcga_validation.csv')
neutral_feature_vector.iloc[10:].to_csv('../data_frames/neutral_feature_vector_tcga_validation.csv')

In [107]:
# Training split: the first 10 rows of each class, in table order.
cancer_feature_vector.head(10).to_csv('../data_frames/cancer_feature_vector_tcga_training.csv')
neutral_feature_vector.head(10).to_csv('../data_frames/neutral_feature_vector_tcga_training.csv')