In [1]:
import io
from IPython.nbformat import current
def execute_notebook(nbfile):
    """Execute the code cells of another notebook in the current IPython shell.

    Parameters
    ----------
    nbfile : str
        Path of the notebook file; it is parsed with ``current.read(f, 'json')``.

    Only the cells of the first worksheet are visited, and cells whose
    ``cell_type`` is not ``'code'`` are skipped. Each remaining cell's source
    is passed to ``run_cell`` on the shell returned by ``get_ipython()``.
    """
    # Notebook files are UTF-8 encoded JSON; be explicit instead of relying on
    # the platform's locale-dependent default encoding.
    with io.open(nbfile, encoding='utf-8') as f:
        nb = current.read(f, 'json')
    ip = get_ipython()
    for cell in nb.worksheets[0].cells:
        if cell.cell_type != 'code':
            continue
        ip.run_cell(cell.input)
# Run the shared imports notebook inside this kernel.
# NOTE(review): presumably this is what defines `pd`, `np` and
# `RandomForestClassifier`, which later cells use without importing them —
# confirm against imports.ipynb.
execute_notebook("/cellar/users/ramarty/Projects/hla_ii/bin/imports.ipynb")


- use IPython.nbformat for read/write/validate public API
- use IPython.nbformat.vX directly to composing notebooks of a particular version

  """)


Populating the interactive namespace from numpy and matplotlib


## Optimize prediction of tumor types by somatic mutations

Goals: <br>
1. Determine number of patients without driver mutations <br>
2. Determine increase of driver space if extended to frequent mutations across all tumor types individually <br>
3. Cluster tumor types by patient mutation frequencies <br>
4. Predictions on new clusters

### Current state of patients with driver mutations

In [2]:
# Load patient_mutations.csv; the first CSV column is used as the row index.
patient_mutations = pd.read_csv(
    '/cellar/users/ramarty/Data/hla_ii/presentation/clean_matrices/combined_classes/patient_mutations.csv',
    index_col=0,
)

In [3]:
# Frequency table of the per-row totals of patient_mutations
# (presumably the number of mutations per patient — verify against the CSV layout).
(
    patient_mutations
    .sum(axis=1)
    .value_counts()
)

0     3014
1     2540
2     1020
3      341
4       89
5       37
6       15
7        4
8        4
12       4
16       3
15       2
13       2
9        1
19       1
17       1
11       1
21       1
23       1
10       1
14       1
22       1
27       1
dtype: int64

### Possible increase in driver space <br>
Do not restrict to the known cancer-gene list; select mutations by recurrence frequency alone

In [4]:
# Tissue annotation table; the first CSV column is used as the row index.
patient_tissues = pd.read_csv(
    '/cellar/users/ramarty/Data/hla_ii/presentation/clean_matrices/patient_tissues.csv',
    index_col=0,
)
# Distinct index labels of that table, as a plain list.
all_patients = list(
    patient_tissues.index.unique()
)

In [5]:
# Combined gene list: every stripped line of the oncogene file followed by
# every stripped line of the tumor-suppressor file.
cancer_genes = []
for gene_list_path in ('/cellar/users/ramarty/Data/hla/git_data/data/onco_genes.txt',
                       '/cellar/users/ramarty/Data/hla/git_data/data/tumor_suppressor_genes.txt'):
    # A context manager closes each file handle; the original left both open.
    with open(gene_list_path) as gene_list_file:
        cancer_genes.extend(line.strip() for line in gene_list_file)

In [6]:
# Number of entries in the combined oncogene + tumor-suppressor list.
len(cancer_genes)

202

In [7]:
# Full mutation table; the first CSV column is used as the row index.
df = pd.read_csv(
    '/cellar/users/ramarty/Data/hla/mutations/processed_mutation_files.full_tcga.all_mutation_types.csv',
    index_col=0,
)

In [8]:
# Keep only rows classified as missense; every other variant class is dropped.
df = df.loc[df['Variant_Classification'] == 'Missense_Mutation']

In [9]:
# Keep a single row per (Barcode, combined) pair; the first occurrence wins.
df = df.drop_duplicates(subset=['Barcode', 'combined'])

In [10]:
# Subset of df whose gene symbol appears in the cancer_genes list.
df_cancer_genes = df.loc[df['Hugo_Symbol'].isin(cancer_genes)]

In [11]:
# Preview the first rows of the filtered, de-duplicated mutation table.
df.head()

Unnamed: 0,Tumor_Sample_Barcode,Hugo_Symbol,Variant_Classification,Protein_position,Amino_acids,Chromosome,Start_Position,Strand,Reference_Allele,Tumor_Seq_Allele2,Transcript_ID,Barcode,combined
0,TCGA-FX-A3NJ-01A-11D-A21Q-09,INTS3,Missense_Mutation,500/1042,P/R,chr1,153761659,+,C,G,ENST00000318967,TCGA-FX-A3NJ,INTS3_P500R
3,TCGA-FX-A3NJ-01A-11D-A21Q-09,PLD5,Missense_Mutation,536/536,V/L,chr1,242089859,+,C,G,ENST00000442594,TCGA-FX-A3NJ,PLD5_V536L
7,TCGA-FX-A3NJ-01A-11D-A21Q-09,GC,Missense_Mutation,406/474,L/R,chr4,71754456,+,A,C,ENST00000273951,TCGA-FX-A3NJ,GC_L406R
9,TCGA-FX-A3NJ-01A-11D-A21Q-09,PSMG4,Missense_Mutation,65/123,T/A,chr6,3263702,+,A,G,ENST00000438998,TCGA-FX-A3NJ,PSMG4_T65A
10,TCGA-FX-A3NJ-01A-11D-A21Q-09,BCLAF1,Missense_Mutation,307/920,P/S,chr6,136277962,+,G,A,ENST00000531224,TCGA-FX-A3NJ,BCLAF1_P307S


In [12]:
# Number of distinct Barcode values remaining in df.
len(df.Barcode.unique())

10177

In [13]:
# Mutations (values of `combined`) that occur in more than 3 rows of df,
# regardless of gene. value_counts() is computed once and reused.
combined_counts = df.combined.value_counts()
new_mutations = list(combined_counts[combined_counts > 3].index)

In [14]:
# Mutations in cancer-gene rows that occur more than once.
# value_counts() is computed once and reused.
cancer_combined_counts = df_cancer_genes.combined.value_counts()
oldish_mutations = list(cancer_combined_counts[cancer_combined_counts > 1].index)

In [15]:
# Sizes of the frequency-based list and the cancer-gene-based list, in that order.
len(new_mutations), len(oldish_mutations)

(8459, 3733)

In [16]:
# Distinct barcodes with at least one mutation from each list:
# (frequency-based list, cancer-gene-based list).
(
    len(df.loc[df['combined'].isin(new_mutations), 'Barcode'].unique()),
    len(df.loc[df['combined'].isin(oldish_mutations), 'Barcode'].unique()),
)

(7635, 6571)

#### Proof of concept in breast cancer

In [17]:
# Index labels of every row annotated with Tissue == 'BRCA', then their count.
tissue_specific_patients = list(
    patient_tissues.loc[patient_tissues['Tissue'] == 'BRCA'].index
)
len(tissue_specific_patients)

1097

In [18]:
# Restrict both mutation tables to barcodes in tissue_specific_patients.
tissue_df = df.loc[df['Barcode'].isin(tissue_specific_patients)]
cancer_tissue_df = df_cancer_genes.loc[
    df_cancer_genes['Barcode'].isin(tissue_specific_patients)
]

In [19]:
# Mutations seen more than once within the tissue-specific subset, over all
# genes and over the cancer-gene subset. Each value_counts() is computed once.
tissue_combined_counts = tissue_df.combined.value_counts()
cancer_tissue_combined_counts = cancer_tissue_df.combined.value_counts()
tissue_repeated_mutations = tissue_combined_counts[tissue_combined_counts > 1].index
tissue_repeated_cancer_mutations = cancer_tissue_combined_counts[cancer_tissue_combined_counts > 1].index
# Single-argument print(...) with %-formatting emits the same text under
# Python 2 and Python 3 (the bare print statement is a syntax error on 3).
print('All repeated mutations: %d' % len(tissue_repeated_mutations))
print('Cancer related repeated mutations: %d' % len(tissue_repeated_cancer_mutations))

All repeated mutations: 550
Cancer related repeated mutations: 95


In [20]:
# coverage of breast cancer patients
# Distinct barcodes in tissue_df carrying at least one repeated mutation:
# first over all genes, then over the cancer-gene list only.
# print(...) with a single argument behaves identically on Python 2 and 3.
print(len(tissue_df[tissue_df.combined.isin(tissue_repeated_mutations)].Barcode.unique()))
print(len(tissue_df[tissue_df.combined.isin(tissue_repeated_cancer_mutations)].Barcode.unique()))

725
479


In [21]:
# create features
# Binary matrix with one row per entry of all_patients and one column per entry
# of tissue_repeated_mutations; a cell is 1.0 when df contains a row pairing
# that barcode with that mutation, 0.0 otherwise.
patient_index = pd.Index(all_patients)
mutation_index = pd.Index(tissue_repeated_mutations)
# Keep only rows of df whose barcode and mutation both have a matrix position.
feature_hits = df[df.combined.isin(mutation_index) & df.Barcode.isin(patient_index)]
hit_rows = patient_index.get_indexer(feature_hits.Barcode)
hit_cols = mutation_index.get_indexer(feature_hits.combined)
# One vectorised assignment replaces the per-row iterrows()/set_value() loop,
# which also did a linear list-membership test for every row.
feature_matrix = np.zeros((len(patient_index), len(mutation_index)))
feature_matrix[hit_rows, hit_cols] = 1
tissue_mutation_features = pd.DataFrame(feature_matrix, index=patient_index, columns=mutation_index)

In [22]:
# create limited (cancer) features
# Binary matrix with one row per entry of all_patients and one column per entry
# of tissue_repeated_cancer_mutations; a cell is 1.0 when df contains a row
# pairing that barcode with that mutation, 0.0 otherwise.
cancer_patient_index = pd.Index(all_patients)
cancer_mutation_index = pd.Index(tissue_repeated_cancer_mutations)
# Keep only rows of df whose barcode and mutation both have a matrix position.
cancer_feature_hits = df[df.combined.isin(cancer_mutation_index) & df.Barcode.isin(cancer_patient_index)]
cancer_hit_rows = cancer_patient_index.get_indexer(cancer_feature_hits.Barcode)
cancer_hit_cols = cancer_mutation_index.get_indexer(cancer_feature_hits.combined)
# One vectorised assignment replaces the per-row iterrows()/set_value() loop,
# which also did a linear list-membership test for every row.
cancer_feature_matrix = np.zeros((len(cancer_patient_index), len(cancer_mutation_index)))
cancer_feature_matrix[cancer_hit_rows, cancer_hit_cols] = 1
tissue_mutation_cancer_features = pd.DataFrame(cancer_feature_matrix, index=cancer_patient_index, columns=cancer_mutation_index)

In [35]:
# create labels
def is_tissue(x, tissue='BRCA'):
    """Return 1 when ``x`` equals ``tissue``, otherwise 0.

    Parameters
    ----------
    x : object
        Tissue label to test.
    tissue : str, optional
        Label treated as the positive class. Defaults to ``'BRCA'``, so
        existing single-argument calls keep their behaviour.
    """
    return 1 if x == tissue else 0
# Binary label column: is_tissue applied to every Tissue value.
patient_tissues['is_tissue'] = patient_tissues['Tissue'].apply(is_tissue)
# Ratio of label-0 rows to label-1 rows.
weight_shift = (
    float(len(patient_tissues[patient_tissues.is_tissue == 0]))
    / len(patient_tissues[patient_tissues.is_tissue == 1])
)

Test model with all mutations

In [36]:
# Show the label-0 / label-1 row ratio computed above.
weight_shift

8.989972652689152

In [37]:
# Fit a depth-2 random forest on the feature matrix built from all repeated mutations.
# NOTE(review): X and Y are matched by row position only; this relies on
# tissue_mutation_features and patient_tissues having the same row order.
X = tissue_mutation_features.values  # .values replaces the deprecated DataFrame.as_matrix()
Y = patient_tissues.is_tissue
# Class 1 is weighted by weight_shift (the label-0 / label-1 row ratio).
clf = RandomForestClassifier(max_depth=2, random_state=0, oob_score=True, class_weight={1:weight_shift, 0:1})
clf.fit(X, Y)
# Out-of-bag score of the fitted forest.
clf.oob_score_

0.86841865133680085

In [38]:
# Inspect the raw index labels of patient_tissues.
# NOTE(review): the recorded output ends with the label 'edu_clinical', which does
# not look like a TCGA barcode — possibly a stray row; confirm and drop it if so.
patient_tissues.index.values

array(['TCGA-02-0001', 'TCGA-02-0003', 'TCGA-02-0004', ..., 'TCGA-ZU-A8S4',
       'TCGA-ZX-AA5X', 'edu_clinical'], dtype=object)

In [39]:
# Predicted-label counts for the rows listed in tissue_specific_patients.
# These rows were part of the data the model was fitted on, so this is an
# in-sample check. .loc (label lookup) replaces the deprecated .ix indexer.
pd.Series(clf.predict(tissue_mutation_features.loc[tissue_specific_patients])).value_counts()

0    761
1    336
dtype: int64

Test model with limited (cancer-gene) mutations

In [40]:
# Fit a depth-2 random forest on the feature matrix built from repeated cancer-gene mutations.
# NOTE(review): X and Y are matched by row position only; this relies on
# tissue_mutation_cancer_features and patient_tissues having the same row order.
X = tissue_mutation_cancer_features.values  # .values replaces the deprecated DataFrame.as_matrix()
Y = patient_tissues.is_tissue
# Class 1 is weighted by weight_shift (the label-0 / label-1 row ratio).
clf = RandomForestClassifier(max_depth=2, random_state=0, oob_score=True, class_weight={1:weight_shift, 0:1})
clf.fit(X, Y)
# Out-of-bag score of the fitted forest.
clf.oob_score_

0.85664750433433712

In [42]:
# Predicted-label counts for the rows listed in tissue_specific_patients.
# These rows were part of the data the model was fitted on, so this is an
# in-sample check. .loc (label lookup) replaces the deprecated .ix indexer.
pd.Series(clf.predict(tissue_mutation_cancer_features.loc[tissue_specific_patients])).value_counts()

0    860
1    237
dtype: int64