In [1]:
%pylab inline
import pandas as pd
import numpy as np
import os
import sklearn as sk
from sklearn.cluster import AffinityPropagation
from sklearn import datasets
from sklearn import metrics
from multiprocessing import Pool
import pickle

Populating the interactive namespace from numpy and matplotlib


In [3]:
# Load the mutation records table; the first CSV column becomes the index.
mr = pd.read_csv('mr.csv', index_col=0)
# Open the pickle inside a context manager so the file handle is closed
# as soon as loading finishes (the bare open() call leaked it).
# NOTE(review): pickle.load can execute arbitrary code -- only load
# edges.p if it comes from a trusted source.
with open("edges.p", 'rb') as edges_file:
    edges = pickle.load(edges_file)

In [4]:
# Quick inspection of the loaded records.
# Only the final expression of a cell is rendered, so this head() call
# produces no visible output.
mr.head()
# Single-argument print(...) prints the same text under Python 2 and 3.
print(mr.columns)
# How many records fall into each variant class.
mr['Variant_Classification'].value_counts()

Index([u'Hugo_Symbol', u'Entrez_Gene_Id', u'Patient_Barcode',
       u'Variant_Classification', u'Both_Alleles', u'Start_Position', u'SNP'],
      dtype='object')


Missense_Mutation         15831
Silent                     9067
RNA                         821
Splice_Site                 500
Frame_Shift_Del             404
Nonsense_Mutation           274
Frame_Shift_Ins              97
In_Frame_Del                 88
Translation_Start_Site       21
In_Frame_Ins                  8
Nonstop_Mutation              5
dtype: int64

In [5]:
def combine(x):
    """Build a key of the form ``<first>_<second>`` from an indexable.

    x: any object supporting ``x[0]`` and ``x[1]``; here it is applied to
       rows holding a gene id followed by a start position.
    Returns the ``str`` forms of both elements joined by one underscore.
    """
    key_parts = (str(x[0]), str(x[1]))
    return "_".join(key_parts)

In [6]:
# Build one SNP key per record from its gene id and start position
# (combine is applied row by row because of axis=1).
snp_keys = mr[['Entrez_Gene_Id', 'Start_Position']].apply(combine, axis=1)
mr['SNP'] = snp_keys
# Write the table, now including the SNP column, back over the input file.
mr.to_csv('mr.csv')

In [7]:
# Number of records carrying each SNP key (value_counts sorts descending).
snp_counts = mr['SNP'].value_counts()
# Move the SNP keys out of the index into a regular column, then name both.
mutations = pd.DataFrame(snp_counts).reset_index()
mutations.columns = ['SNP', 'counts']
mutations.head()

Unnamed: 0,SNP,counts
0,673_140453136,480
1,4893_115256529,54
2,3265_533874,24
3,2312_152280782,17
4,4893_115256530,14


In [103]:
# Per-SNP tallies, returned as five lists aligned with the row order of
# mutations.SNP so the next cell can divide them element-wise.
#
# Changes from the original per-SNP loop:
#   * One grouped aggregation replaces five full scans of `mr` for every
#     SNP (quadratic in practice for ~20k SNPs).
#   * The bi-allelic test is parenthesised explicitly. The original
#     `(a) & b == True` parsed as `((a) & b) == True` because `&` binds
#     tighter than `==`.
#   * Rows are counted directly. The original `.count()[0]` counted the
#     non-null values of the first column only, which undercounts whenever
#     that column has a missing value.
snp_flags = pd.DataFrame({
    'SNP': mr['SNP'],
    'bi_all': (mr['Both_Alleles'] == True).astype(int),
    'missense': (mr['Variant_Classification'] == 'Missense_Mutation').astype(int),
    'nonsense': (mr['Variant_Classification'] == 'Nonsense_Mutation').astype(int),
    'silent': (mr['Variant_Classification'] == 'Silent').astype(int),
    'total': 1,  # one per record, so the grouped sum is the record count
})
snp_tallies = (snp_flags.groupby('SNP').sum()
               # Restore the order of mutations.SNP; keys absent from mr get 0.
               .reindex(list(mutations['SNP']))
               .fillna(0)
               .astype(int))

bi_all = snp_tallies['bi_all'].tolist()
missense = snp_tallies['missense'].tolist()
nonsense = snp_tallies['nonsense'].tolist()
silent = snp_tallies['silent'].tolist()
total = snp_tallies['total'].tolist()

In [104]:
# Eyeball the first ten tallies of each kind before using them.
print(bi_all[:10])
print(total[:10])
print(missense[:10])
print(nonsense[:10])

# Each Perc_* column is tally / total for the SNP, i.e. a fraction in
# [0, 1] rather than a percentage. Columns are added in the original order.
total_per_snp = pd.Series(total)
for perc_column, tally in [('Perc_Silent', silent),
                           ('Perc_Nonsense', nonsense),
                           ('Perc_Bi_Allelic', bi_all),
                           ('Perc_Missense', missense)]:
    mutations[perc_column] = pd.Series(tally) / total_per_snp
mutations.head(10)

[244, 27, 13, 17, 7, 12, 11, 11, 10, 6]
[480, 54, 24, 17, 14, 12, 11, 11, 10, 9]
[480, 54, 24, 17, 14, 12, 11, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


Unnamed: 0,SNP,counts,Perc_Silent,Perc_Nonsense,Perc_Bi_Allelic,Perc_Missense
0,673_140453136,480,0,0,0.508333,1
1,4893_115256529,54,0,0,0.5,1
2,3265_533874,24,0,0,0.541667,1
3,2312_152280782,17,0,0,1.0,1
4,4893_115256530,14,0,0,0.5,1
5,5781_112892407,12,0,0,1.0,1
6,2312_152281479,11,0,0,1.0,1
7,2051_142562074,11,1,0,1.0,0
8,7716_56056604,10,1,0,1.0,0
9,1176_115238635,9,1,0,0.666667,0


In [105]:
def get_entrez(x):
    """Return the part of string ``x`` that precedes its first underscore.

    For SNP keys built as ``<gene id>_<position>`` this is the gene id,
    still as a string. A string without an underscore is returned whole.
    """
    entrez_id, _sep, _rest = x.partition("_")
    return entrez_id

In [106]:
# Derive each row's Entrez gene id from the leading part of its SNP key.
# NOTE(review): get_entrez returns strings, so this column holds text here;
# the later merge on Entrez_Gene_Id presumably relies on the CSV round-trip
# re-parsing it as numbers -- confirm.
mutations['Entrez_Gene_Id'] = mutations['SNP'].map(get_entrez)
# Identifier columns first, followed by the per-class fractions.
column_order = ['SNP', 'counts', 'Entrez_Gene_Id',
                'Perc_Bi_Allelic', 'Perc_Missense', 'Perc_Silent', 'Perc_Nonsense']
mutations = mutations[column_order]

In [107]:
# Persist the per-SNP table; later cells read it back from this file.
mutations.to_csv('mutations.csv')

In [8]:
# Reload the saved per-SNP table, using the first CSV column as the index.
mutations = pd.read_csv('mutations.csv', index_col=0)
mutations.head()

Unnamed: 0,SNP,counts,Entrez_Gene_Id,Perc_Bi_Allelic,Perc_Missense,Perc_Silent,Perc_Nonsense
0,673_140453136,480,673,0.508333,1,0,0
1,4893_115256529,54,4893,0.5,1,0,0
2,3265_533874,24,3265,0.541667,1,0,0
3,2312_152280782,17,2312,1.0,1,0,0
4,4893_115256530,14,4893,0.5,1,0,0


In [9]:
# Non-null count per column; equal counts mean no column has missing values.
mutations.count()

SNP                19951
counts             19951
Entrez_Gene_Id     19951
Perc_Bi_Allelic    19951
Perc_Missense      19951
Perc_Silent        19951
Perc_Nonsense      19951
dtype: int64

## Adding densities to complete the TCGA feature set

Let's start by loading everything this section needs.

In [2]:
# Load every input this section uses.
mr = pd.read_csv('mr.csv', index_col=0)
# Open the pickle inside a context manager so the file handle is closed
# as soon as loading finishes (the bare open() call leaked it).
# NOTE(review): pickle.load can execute arbitrary code -- only load
# edges.p if it comes from a trusted source.
with open("edges.p", 'rb') as edges_file:
    edges = pickle.load(edges_file)
mutations = pd.read_csv('mutations.csv', index_col=0)
# density_values.txt is space-separated and has no header row.
densities = pd.read_csv('density_values.txt', sep=' ', header=None)
pc = pd.read_csv('pc.csv', index_col=0)

In [3]:
# The density file was read without a header row, so name its two columns
# explicitly; "Entrez_Gene_Id" matches the join key used on `mutations`.
densities.columns = ["Entrez_Gene_Id", "Density"]
densities.head()

Unnamed: 0,Entrez_Gene_Id,Density
0,673,1298.9
1,2312,135.1
2,4893,1736.6
3,113146,
4,94025,14.2


In [4]:
# Attach each gene's density value to every SNP row of that gene.
# NOTE(review): how='outer' also keeps genes that appear only in
# `densities`, producing rows with no SNP -- confirm that a left join
# (keeping only rows of `mutations`) is not what was intended.
mutations = pd.merge(mutations, densities, on='Entrez_Gene_Id', how='outer')

In [5]:
# Peek at the table loaded from pc.csv before joining it on SNP.
pc.head()

Unnamed: 0,SNP,Protein_Change
0,342926_53740670,p.G437D
1,79892_121612707,p.A144_splice
2,113612_108866315,p.A227V
3,220388_85396623,p.E184G
4,26057_73957562,p.P1928L


In [6]:
# Join the pc table onto the SNP rows (outer join keeps SNPs found on
# either side), then collapse rows that are identical in every column.
mutations = (
    pd.merge(mutations, pc, on='SNP', how='outer')
    .drop_duplicates()
)

In [7]:
# Inspect the first ten rows of the merged table.
mutations.head(10)

Unnamed: 0,SNP,counts,Entrez_Gene_Id,Perc_Bi_Allelic,Perc_Missense,Perc_Silent,Perc_Nonsense,Density,Protein_Change
0,673_140453136,480,673,0.508333,1.0,0.0,0,1298.9,p.V600E
248,673_140453135,3,673,0.333333,0.333333,0.666667,0,1298.9,p.V600E
249,673_140477840,2,673,0.5,1.0,0.0,0,1298.9,
250,673_140453193,2,673,0.5,1.0,0.0,0,1298.9,
251,673_140477827,2,673,0.5,0.0,0.0,0,1298.9,p.PTPQQ490del
252,673_140453134,2,673,0.5,1.0,0.0,0,1298.9,p.K601E
253,673_140453140,2,673,0.0,0.0,0.0,0,1298.9,p.598_599insKIGDFGLA
254,673_140453140,2,673,0.0,0.0,0.0,0,1298.9,p.N581_splice
255,673_140477839,1,673,1.0,1.0,0.0,0,1298.9,
256,673_140481431,1,673,0.0,0.0,1.0,0,1298.9,p.V459V


In [10]:
# Report the non-null count of every column before trimming the table.
print(mutations.count())
# Keep only the feature columns. Rows that differed solely in a dropped
# column (e.g. one SNP listed with two Protein_Change values) become
# identical once that column is gone, so de-duplicate again here; the
# earlier drop_duplicates ran while those rows were still distinct.
# NOTE(review): rows introduced by the outer merges may still have missing
# counts/Perc_Bi_Allelic -- confirm whether they belong in the feature set.
feature_columns = ["SNP", "counts", "Perc_Bi_Allelic", "Density"]
mutations = mutations[feature_columns].drop_duplicates()

SNP                19984
counts             19952
Perc_Bi_Allelic    19952
Density             8684
dtype: int64


TODO: Should the amino acid change (`Protein_Change`) be added as a feature column here, or does it belong elsewhere?

In [11]:
# Write the trimmed feature table to disk (the index is written as the first column).
mutations.to_csv('tcga_feature_vector.csv')