# Create Training Feature Vector

This notebook takes cancer-mutation data from the COSMIC database and builds feature vectors for the mutations; these vectors are used for both training and validation.

In [1]:
%pylab inline
import pandas as pd
import numpy as np
import os
import sklearn as sk
from sklearn.cluster import AffinityPropagation
from sklearn import datasets
from sklearn import metrics
from multiprocessing import Pool
import pickle

Populating the interactive namespace from numpy and matplotlib


## Import and clean up the whole data set - use COSMIC

In [2]:
def simplify(x):
    """Return the part of ``x`` that precedes the first underscore.

    If ``x`` contains no underscore, the whole string is returned.
    """
    return x.split("_")[0]
def get_loc(x):
    """Extract the start coordinate from a ``"<chrom>:<start>-<end>"`` string.

    Returns the text between the first ``:`` and the following ``-`` as a
    string. Raises ``IndexError`` when ``x`` contains no ``:``.
    """
    position_range = x.split(":")[1]
    return position_range.split("-")[0]
def combine(x):
    """Build an ``"<id>_<location>"`` key from a two-element row.

    ``x[0]`` is converted to ``int`` (so ``673.0`` becomes ``673``) and
    ``x[1]`` is converted with ``str``; the two are joined with ``_``.
    """
    entrez_id, position = x[0], x[1]
    return "_".join((str(int(entrez_id)), str(position)))

In [82]:
# Load the thyroid-only COSMIC export. The path is handed straight to pandas so
# the file is opened and closed for us (the previous bare open() was never closed).
cosmic_all = pd.read_csv('THYROIDonly_noTCGA_uniqMutationID.csv')
# Drop rows without a genome position *before* parsing it: get_loc() calls
# str.split and would raise AttributeError on a NaN value.
cosmic_all = cosmic_all[pd.notnull(cosmic_all['Mutation genome position'])].copy()
# Keep only the part of the gene name before the first underscore.
cosmic_all['Gene name'] = cosmic_all['Gene name'].map(simplify)
# Start coordinate taken from the "<chrom>:<start>-<end>" position string.
cosmic_all['Start_Position'] = cosmic_all['Mutation genome position'].map(get_loc)
#cosmic_all.head()

In [83]:
# Keep only the rows that have a genome position.
cosmic_all = cosmic_all[pd.notnull(cosmic_all['Mutation genome position'])]

In [3]:
# mr.csv is read with its first column as the index.
mr = pd.read_csv('mr.csv', index_col=0)
# Gene-symbol -> Entrez-id lookup table: one row per distinct pair, with the
# symbol column renamed to 'Gene name' so it can be joined to the COSMIC tables.
hugo_entrez = (
    mr[['Hugo_Symbol', 'Entrez_Gene_Id']]
    .rename(columns={'Hugo_Symbol': 'Gene name'})
    .drop_duplicates()
)

In [85]:
# Attach Entrez ids by gene name. The outer join keeps unmatched rows from
# both tables.
cosmic_all = cosmic_all.merge(hugo_entrez, on='Gene name', how='outer')
#cosmic_all.head()

In [146]:
# Number of rows per gene name, as a two-column table.
cosmic = cosmic_all["Gene name"].value_counts().to_frame().reset_index()
cosmic.columns = ['Gene_Name', 'counts']
# Only the last expression of the cell is rendered; the two below are evaluated
# but not displayed.
cosmic.head()
cosmic_all['Mutation zygosity'].isnull().sum()
# Distribution of the zygosity labels (missing values are not counted).
cosmic_all['Mutation zygosity'].value_counts()

het    1372
hom      87
dtype: int64

**TODO:** The number of BRAF mutations makes me skeptical; we should probably add a column flagging nonsense mutations.

In [86]:
# Import the new COSMIC data set and create a table with the SNP key (entrez_loc) and the COSMIC mutation ID.
# The COSMIC ID is then used to superimpose SNP2 onto the training/validation sets.

In [110]:
# Load the raw COSMIC mutation table (first CSV column becomes the index) and
# give its three remaining columns the names used by the rest of the notebook.
cosmic_raw = pd.read_csv('cosmic_raw_mut_id.csv', index_col=0)
cosmic_raw.columns = ["Mutation ID", "Gene name", "Location"]

In [111]:
# Attach Entrez ids by gene name; the inner join drops genes that are missing
# from either table.
cosmic_raw = cosmic_raw.merge(hugo_entrez, on='Gene name', how='inner')
cosmic_raw.head()

Unnamed: 0,Mutation ID,Gene name,Location,Entrez_Gene_Id
0,COSM318402,A1CF,52566499,29974
1,COSM1702376,A1CF,52566505,29974
2,COSM191228,A1CF,52566514,29974
3,COSM191228,A1CF,52566514,29974
4,COSM918820,A1CF,52566549,29974


In [112]:
# Build the SNP2 key row by row: combine() receives (Entrez_Gene_Id, Location)
# and joins them as "<int Entrez id>_<location>".
cosmic_raw['SNP2'] = cosmic_raw[['Entrez_Gene_Id', 'Location']].apply(combine, axis=1)

In [113]:
# Keep just the mutation id and its SNP2 key, then persist the mapping.
cosmic_raw = cosmic_raw.loc[:, ['Mutation ID', 'SNP2']]
cosmic_raw.to_csv('mut_id.csv')

## Import positive and negative sets - use TCGA

Create two training sets: one with features based on COSMIC data and the other with features based on TCGA data.

In [57]:
# Load the cancer (positive) SNP set. The path is passed straight to read_csv:
# the old `with open(...) as cosmic` rebound the name `cosmic` (the gene-count
# table built earlier in this notebook) to a closed file handle.
cancer = pd.read_csv('thyr.uniq_cancer_SNPs.abrvCosmic.csv', sep=',')
# Gene name without its "_<suffix>", and the start coordinate of the position.
cancer['Gene name'] = cancer['Gene name'].map(simplify)
cancer['Start_Position'] = cancer['Mutation genome position'].map(get_loc)
# NOTE(review): the outer join also emits a row for every hugo_entrez gene that
# has no mutation in this file; those rows pass the Entrez filter below and get
# an SNP of "<entrez>_nan". how='inner' may be the intent -- confirm.
cancer = pd.merge(cancer, hugo_entrez, on='Gene name', how='outer')
# .copy() so the column assignment below writes to an independent frame rather
# than to a filtered view.
cancer = cancer[pd.notnull(cancer['Entrez_Gene_Id'])].copy()
cancer['SNP'] = cancer[['Entrez_Gene_Id', 'Start_Position']].apply(combine, axis=1)
cancer = cancer.drop_duplicates()

In [58]:
# adding a new column with snp2 information (other reference)
pc = pd.read_csv('pc.csv', index_col=0)
pc.columns = ["SNP2", "Mutation AA"]
# Drop incomplete rows *before* deriving the Entrez id: simplify() calls
# str.split and would raise on a NaN SNP2.
pc = pc.dropna(subset=["SNP2", "Mutation AA"])
# The Entrez id is the prefix of the "<entrez>_<position>" SNP2 key; it is
# still a string at this point.
pc['Entrez_Gene_Id'] = pc['SNP2'].apply(simplify)
# Sanity check: first few rows for Entrez id 673.
pc[pc.Entrez_Gene_Id == '673'][:5]

Unnamed: 0,SNP2,Mutation AA,Entrez_Gene_Id
36,673_140453136,p.V600E,673
50,673_140453136,p.V600E,673
78,673_140453136,p.V600E,673
110,673_140453136,p.V600E,673
147,673_140453136,p.V600E,673


In [61]:
# Cast the Entrez ids (strings produced by simplify) to float.
# NOTE(review): presumably to match the dtype of Entrez_Gene_Id in the frames
# pc is merged with -- confirm.
pc['Entrez_Gene_Id'] = pc['Entrez_Gene_Id'].astype(float) 

In [63]:
# Bring SNP2 onto the cancer rows by matching gene id and amino-acid change,
# then remove exact duplicate rows.
cancer = cancer.merge(pc, on=["Entrez_Gene_Id", "Mutation AA"], how='inner').drop_duplicates()

In [64]:
# Preview the key columns so SNP and SNP2 can be compared side by side.
cancer[['Gene name', 'Mutation AA', 'SNP', 'SNP2', 'Entrez_Gene_Id']].head()

Unnamed: 0,Gene name,Mutation AA,SNP,SNP2,Entrez_Gene_Id
0,BRAF,p.V600E,673_140753336,673_140453136,673
89,BRAF,p.V600E,673_140753336,673_140453135,673
249,BRAF,p.K601E,673_140753334,673_140453134,673
250,PIK3CA,p.M1043I,5290_179234286,5290_178952074,5290
251,NRAS,p.Q61R,4893_114713908,4893_115256529,4893


In [67]:
# First rows of the cancer set that have an SNP2 value.
cancer[pd.notnull(cancer['SNP2'])].head()

Unnamed: 0,Gene name,Accession Number,Primary site,Mutation ID,Mutation CDS,Mutation AA,Mutation Description,Mutation zygosity,Mutation genome position,Mutation strand,SNP,FATHMM prediction,Mutation somatic status,Pubmed_PMID,ID_STUDY,Start_Position,Entrez_Gene_Id,SNP2
0,BRAF,1559390,thyroid,COSM476,c.1799T>A,p.V600E,Substitution - Missense,,7:140753336-140753336,-,673_140753336,CANCER,Reported in another cancer sample as somatic,21175381,,140753336,673,673_140453136
89,BRAF,1559390,thyroid,COSM476,c.1799T>A,p.V600E,Substitution - Missense,,7:140753336-140753336,-,673_140753336,CANCER,Reported in another cancer sample as somatic,21175381,,140753336,673,673_140453135
249,BRAF,2230480,thyroid,COSM478,c.1801A>G,p.K601E,Substitution - Missense,,7:140753334-140753334,-,673_140753334,CANCER,Reported in another cancer sample as somatic,25120313,,140753334,673,673_140453134
250,PIK3CA,227,thyroid,COSM773,c.3129G>T,p.M1043I,Substitution - Missense,het,3:179234286-179234286,+,5290_179234286,CANCER,Confirmed somatic variant,16288007,,179234286,5290,5290_178952074
251,NRAS,S61918,thyroid,COSM584,c.182A>G,p.Q61R,Substitution - Missense,,1:114713908-114713908,-,4893_114713908,CANCER,Reported in another cancer sample as somatic,12727991,,114713908,4893,4893_115256529


In [66]:
# Non-null count per column.
cancer.count()

Gene name                   16
Accession Number            16
Primary site                16
Mutation ID                 16
Mutation CDS                16
Mutation AA                 16
Mutation Description        16
Mutation zygosity            2
Mutation genome position    16
Mutation strand             16
SNP                         16
FATHMM prediction           16
Mutation somatic status     16
Pubmed_PMID                 16
ID_STUDY                     0
Start_Position              16
Entrez_Gene_Id              16
SNP2                        16
dtype: int64

In [73]:
# Load the neutral (negative) SNP set. The path is passed straight to read_csv:
# the old `with open(...) as cosmic` rebound the name `cosmic` (the gene-count
# table built earlier in this notebook) to a closed file handle.
neutral = pd.read_csv('thyr.uniq_neutral_SNPs.abrvCosmic.csv', sep=',')
neutral['Gene name'] = neutral['Gene name'].map(simplify)
neutral['Start_Position'] = neutral['Mutation genome position'].map(get_loc)
# NOTE(review): the outer join also emits a row for every hugo_entrez gene that
# has no mutation in this file; those rows pass the Entrez filter below and get
# an SNP of "<entrez>_nan". how='inner' may be the intent -- confirm.
neutral = pd.merge(neutral, hugo_entrez, on='Gene name', how='outer')
# .copy() so the column assignment below writes to an independent frame.
neutral = neutral[pd.notnull(neutral['Entrez_Gene_Id'])].copy()
neutral['SNP'] = neutral[['Entrez_Gene_Id', 'Start_Position']].apply(combine, axis=1)
# total of 826 neutral genes
neutral_df = pd.DataFrame(neutral['SNP'].value_counts()) #where you would add constraints
# Reassign instead of mutating in place so re-running the cell is safe.
neutral_df = neutral_df.reset_index(level=0)
# The SNP keys are the first column after reset_index; selecting by position
# avoids depending on whether pandas names that column 'index' or 'SNP'.
neutral_to_be_used = list(neutral_df.iloc[:, 0])
neutral = neutral[neutral['SNP'].isin(neutral_to_be_used)]
neutral.count()

Gene name                   10875
Accession Number             1067
Primary site                 1067
Mutation ID                  1067
Mutation CDS                 1067
Mutation AA                  1067
Mutation Description         1067
Mutation zygosity              14
Mutation genome position     1067
Mutation strand              1067
SNP                         10875
FATHMM prediction            1067
Mutation somatic status      1067
Pubmed_PMID                    25
ID_STUDY                     1042
Start_Position               1067
Entrez_Gene_Id              10875
dtype: int64

In [81]:
# Look up the pc rows whose Entrez id equals 7757.
pc[pc["Entrez_Gene_Id"] == 7757]

Unnamed: 0,SNP2,Mutation AA,Entrez_Gene_Id


In [70]:
# Bring SNP2 onto the neutral rows by matching gene id and amino-acid change,
# then remove exact duplicate rows.
neutral = neutral.merge(pc, on=["Entrez_Gene_Id", "Mutation AA"], how='inner').drop_duplicates()

In [72]:
# Preview the merged neutral set.
neutral.head()

Unnamed: 0,Gene name,Accession Number,Primary site,Mutation ID,Mutation CDS,Mutation AA,Mutation Description,Mutation zygosity,Mutation genome position,Mutation strand,SNP,FATHMM prediction,Mutation somatic status,Pubmed_PMID,ID_STUDY,Start_Position,Entrez_Gene_Id,SNP2


In [71]:
# Distinct SNP2 keys in each set. print(...) with a single argument produces the
# same output on Python 2 and also parses on Python 3, unlike the print statement.
cancer_genes = cancer['SNP2'].unique()
print(len(cancer_genes))
print(cancer_genes)
neutral_genes = neutral['SNP2'].unique()
print(len(neutral_genes))

13
['673_140453136' '673_140453135' '673_140453134' '5290_178952074'
 '4893_115256529' '4893_115256530' '3265_533875' '3265_533874'
 '3845_25398284' '3845_25380276' '3845_25380277' '23405_95557629'
 '207_105246551']
0


In [173]:
# Distinct SNP keys present in the mr table.
all_genes = mr.SNP.unique()
# Keep the mr SNP keys that also occur in the cancer / neutral SNP2 arrays.
# Note: `in` on a NumPy array scans the whole array for every key tested.
cancer_set = [gene for gene in all_genes if gene in cancer_genes]
neutral_set = [gene for gene in all_genes if gene in neutral_genes]

In [175]:
# Single-argument print(...) gives the same output on Python 2 and also parses
# on Python 3, unlike the print statement.
print(cancer_set)
print(neutral_set)

['3265_533874', '3265_533875']
['8174_501738', '4588_1027811']


In [89]:
# get entrez ids for cancer set and neutral set
# The set entries are "<entrez id>_<position>" SNP keys, so the Entrez id is the
# text before the underscore. The previous lookup through
# hugo_entrez["Hugo_Symbol"] raised KeyError (that column was renamed to
# 'Gene name') and compared SNP keys against gene symbols.
cancer_set_entrez = [int(snp.split("_")[0]) for snp in cancer_set]
neutral_set_entrez = [int(snp.split("_")[0]) for snp in neutral_set]

## Try number two

In [4]:
# Load the cancer (positive) SNP set. The path is passed straight to read_csv:
# the old `with open(...) as cosmic` rebound the name `cosmic` (the gene-count
# table built earlier in this notebook) to a closed file handle.
cancer = pd.read_csv('thyr.uniq_cancer_SNPs.abrvCosmic.csv', sep=',')
# Gene name without its "_<suffix>", and the start coordinate of the position.
cancer['Gene name'] = cancer['Gene name'].map(simplify)
cancer['Start_Position'] = cancer['Mutation genome position'].map(get_loc)
# NOTE(review): the outer join also emits a row for every hugo_entrez gene that
# has no mutation in this file; those rows pass the Entrez filter below and get
# an SNP of "<entrez>_nan". how='inner' may be the intent -- confirm.
cancer = pd.merge(cancer, hugo_entrez, on='Gene name', how='outer')
# .copy() so the column assignment below writes to an independent frame.
cancer = cancer[pd.notnull(cancer['Entrez_Gene_Id'])].copy()
cancer['SNP'] = cancer[['Entrez_Gene_Id', 'Start_Position']].apply(combine, axis=1)
cancer = cancer.drop_duplicates()

In [5]:
# Load the neutral (negative) SNP set. The path is passed straight to read_csv:
# the old `with open(...) as cosmic` rebound the name `cosmic` (the gene-count
# table built earlier in this notebook) to a closed file handle.
neutral = pd.read_csv('thyr.uniq_neutral_SNPs.abrvCosmic.csv', sep=',')
neutral['Gene name'] = neutral['Gene name'].map(simplify)
neutral['Start_Position'] = neutral['Mutation genome position'].map(get_loc)
# NOTE(review): the outer join also emits a row for every hugo_entrez gene that
# has no mutation in this file; those rows pass the Entrez filter below and get
# an SNP of "<entrez>_nan". how='inner' may be the intent -- confirm.
neutral = pd.merge(neutral, hugo_entrez, on='Gene name', how='outer')
# .copy() so the column assignment below writes to an independent frame.
neutral = neutral[pd.notnull(neutral['Entrez_Gene_Id'])].copy()
neutral['SNP'] = neutral[['Entrez_Gene_Id', 'Start_Position']].apply(combine, axis=1)
# total of 826 neutral genes
neutral_df = pd.DataFrame(neutral['SNP'].value_counts()) #where you would add constraints
# Reassign instead of mutating in place so re-running the cell is safe.
neutral_df = neutral_df.reset_index(level=0)
# The SNP keys are the first column after reset_index; selecting by position
# avoids depending on whether pandas names that column 'index' or 'SNP'.
neutral_to_be_used = list(neutral_df.iloc[:, 0])
neutral = neutral[neutral['SNP'].isin(neutral_to_be_used)]

In [6]:
# Load the Mutation ID -> SNP2 table (the notebook writes mut_id.csv in the
# COSMIC section); the first CSV column becomes the index.
mut_id = pd.read_csv('mut_id.csv', index_col=0)

In [7]:
# Attach SNP2 to the cancer rows via the COSMIC mutation id (matches only).
cancer = cancer.merge(mut_id, on="Mutation ID", how="inner")

In [8]:
# Attach SNP2 to the neutral rows via the COSMIC mutation id (matches only).
neutral = neutral.merge(mut_id, on="Mutation ID", how="inner")

In [9]:
# Distinct SNP2 keys in each set. print(...) with a single argument produces the
# same output on Python 2 and also parses on Python 3, unlike the print statement.
cancer_genes = cancer['SNP2'].unique()
print(len(cancer_genes))
neutral_genes = neutral['SNP2'].unique()
print(len(neutral_genes))

259
889


In [10]:
# Distinct SNP keys present in the mr table.
all_genes = mr.SNP.unique()
# Keep the mr SNP keys that also occur in the cancer / neutral SNP2 arrays.
# Note: `in` on a NumPy array scans the whole array for every key tested.
cancer_set = [gene for gene in all_genes if gene in cancer_genes]
neutral_set = [gene for gene in all_genes if gene in neutral_genes]

In [11]:
# Single-argument print(...) gives the same output on Python 2 and also parses
# on Python 3, unlike the print statement.
print(cancer_set)
print(neutral_set)

['207_105246551', '673_140453134', '673_140453136', '23405_95557629', '3265_533874', '3265_533875', '3845_25380276', '3845_25380277', '3845_25398284', '4893_115256529', '4893_115256530', '5290_178952074', '3845_25398228', '5728_89692941']
['176_89415247', '113146_105406238', '113146_105411700', '113146_105411781', '113146_105418344', '113146_105418391', '364_33385690', '364_33385852', '364_33386510', '83858_1431165', '9790_43318589', '675_32906729', '56673_8942942', '284018_65988049', '765_9009444', '10241_46926615', '339184_20768763', '996_45219311', '23177_65296798', '1114_5903067', '1118_203186947', '4261_11000848', '26047_147183121', '85301_116931099', '285464_1388508', '285464_1388757', '55118_99625319', '115265_101108877', '10395_12957475', '1816_9784542', '2074_50667105', '2312_152283862', '2327_171168584', '2569_89926962', '388646_89599084', '55876_38062217', '341567_48723324', '3105_29911296', '3105_29911306', '3115_33048599', '3123_32557449', '9394_129025860', '3064_3234980',

## Complete feature vectors

In [12]:
# Load the TCGA feature table from CSV; the first CSV column becomes the index.
tcga_feature_vector = pd.read_csv('tcga_feature_vector.csv', index_col=0)

In [13]:
# Preview the feature table.
tcga_feature_vector.head()

Unnamed: 0,SNP,counts,Perc_Bi_Allelic,Density
0,673_140453136,480,0.508333,1298.9
248,673_140453135,3,0.333333,1298.9
249,673_140477840,2,0.5,1298.9
250,673_140453193,2,0.5,1298.9
251,673_140477827,2,0.5,1298.9


In [14]:
# Split the TCGA feature rows into the cancer and neutral groups by SNP key.
cancer_feature_vector = tcga_feature_vector.loc[tcga_feature_vector["SNP"].isin(cancer_set)]
neutral_feature_vector = tcga_feature_vector.loc[tcga_feature_vector["SNP"].isin(neutral_set)]

In [15]:
# Non-null counts per column for both groups. Single-argument print(...) gives
# the same output on Python 2 and also parses on Python 3.
print(cancer_feature_vector.count())
print(neutral_feature_vector.count())
cancer_feature_vector.head()

SNP                14
counts             14
Perc_Bi_Allelic    14
Density            14
dtype: int64
SNP                113
counts             113
Perc_Bi_Allelic    113
Density             45
dtype: int64


Unnamed: 0,SNP,counts,Perc_Bi_Allelic,Density
0,673_140453136,480,0.508333,1298.9
252,673_140453134,2,0.5,1298.9
259,4893_115256529,54,0.5,1736.6
290,4893_115256530,14,0.5,1736.6
297,3265_533874,24,0.541667,1824.5


In [16]:
# restrict the cancer and neutral feature vectors
cancer_feature_vector = cancer_feature_vector[['SNP', 'counts', 'Perc_Bi_Allelic', 'Density']]
neutral_feature_vector = neutral_feature_vector[['SNP', 'counts', 'Perc_Bi_Allelic', 'Density']]

In [17]:
# Write the full cancer and neutral feature tables to CSV.
cancer_feature_vector.to_csv('cancer_feature_vector_tcga.csv')
neutral_feature_vector.to_csv('neutral_feature_vector_tcga.csv')

In [23]:
# Validation split: the first 4 cancer rows and the first 103 neutral rows.
cancer_feature_vector.iloc[:4].to_csv('cancer_feature_vector_tcga_validation.csv')
neutral_feature_vector.iloc[:103].to_csv('neutral_feature_vector_tcga_validation.csv')

In [24]:
# Training split: every cancer row after the first 4 and every neutral row
# after the first 103.
cancer_feature_vector.iloc[4:].to_csv('cancer_feature_vector_tcga_training.csv')
neutral_feature_vector.iloc[103:].to_csv('neutral_feature_vector_tcga_training.csv')

**TODO:** We should have normalized by gene length; this was missed.

In [168]:
# TODO: try adding all of the other neutral SNPs back in; it shouldn't matter for a support vector machine