In [16]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

Read and merge data as in the demonstration notebook (see "Gene Downselect")

In [2]:
genes = pd.read_csv('genes.csv')
gtex_gene_exp = pd.read_csv('gtex_gene_expression.csv')
tox_chem = pd.read_csv('toxicogenomics_chemicals.csv')

genes = genes[["ensembl_id","hgnc_name","cytogenetic_location","gene_biotype"]]
gtex_gene_exp = gtex_gene_exp[["gene_id","chromosome","chromosome_start","chromosome_end","score","strand_type"]]
tox_chem = tox_chem[["gene_id","chemical_id","chemical_name","gene_forms"]]

merge1 = pd.merge(genes, gtex_gene_exp, left_on = 'ensembl_id', right_on = 'gene_id')
merge1 = merge1.drop("ensembl_id",axis=1)
merge_final = merge1.merge(tox_chem, on='gene_id')

Get the 340 most important genes across all organs according to our metric (see report) and join those to our feature table

In [7]:
importantGenes = pd.read_csv('selected_gene_disease.csv')
organGeneRelation = pd.read_csv('organ_gene_relation.csv')

In [14]:
merge_relation = pd.merge(merge_final,organGeneRelation,left_on='gene_id',right_on="Gene Id")
merge_relation = merge_relation.drop("Gene Id",axis=1)
merge_relation.head()

Unnamed: 0,hgnc_name,cytogenetic_location,gene_biotype,gene_id,chromosome,chromosome_start,chromosome_end,score,strand_type,chemical_id,chemical_name,gene_forms,Organ,not_causal,Avg. Fpkm Expression
0,insulin like growth factor binding protein 5,2q35,protein_coding,ENSG00000115461,chr2,216672104,216695525,654,-,C495626,"14-deoxy-11,12-didehydroandrographolide",mRNA,Brain,False,48.99258794
1,insulin like growth factor binding protein 5,2q35,protein_coding,ENSG00000115461,chr2,216672104,216695525,654,-,C495626,"14-deoxy-11,12-didehydroandrographolide",mRNA,Breast,False,215.7425602
2,insulin like growth factor binding protein 5,2q35,protein_coding,ENSG00000115461,chr2,216672104,216695525,654,-,C495626,"14-deoxy-11,12-didehydroandrographolide",mRNA,Kidney,False,271.259913
3,insulin like growth factor binding protein 5,2q35,protein_coding,ENSG00000115461,chr2,216672104,216695525,654,-,C495626,"14-deoxy-11,12-didehydroandrographolide",mRNA,Prostate,False,84.33054034
4,insulin like growth factor binding protein 5,2q35,protein_coding,ENSG00000115461,chr2,216672104,216695525,654,-,C495626,"14-deoxy-11,12-didehydroandrographolide",mRNA,Thyroid,False,375.8365323


Convert non-numerical data to integers

In [17]:
def handle_non_numerical_data(df):
    columns = df.columns.values

    for column in columns:
        text_digit_vals = {}
        def convert_to_int(val):
            return text_digit_vals[val]
        
        if df[column].dtype != np.int64 and df[column].dtype != np.float64:
            column_contents = df[column].values.tolist()
            #finding just the uniques:
            unique_elements = set(column_contents)
            x = 0
            for unique in unique_elements:
                if unique not in text_digit_vals:
                    # creating dict that contains new id per unique string
                    text_digit_vals[unique] = x
                    x+=1
            # map the new "id" value to replace the string
            df[column] = list(map(convert_to_int,df[column]))   
            
    return df

merge_relation_handled = handle_non_numerical_data(merge_relation)

In [19]:
merge_relation_handled.head()

Unnamed: 0,hgnc_name,cytogenetic_location,gene_biotype,gene_id,chromosome,chromosome_start,chromosome_end,score,strand_type,chemical_id,chemical_name,gene_forms,Organ,not_causal,Avg. Fpkm Expression
0,0,8,0,0,7,216672104,216695525,654,1,436,81,4,6,0,46
1,0,8,0,0,7,216672104,216695525,654,1,436,81,4,5,0,7
2,0,8,0,0,7,216672104,216695525,654,1,436,81,4,3,0,51
3,0,8,0,0,7,216672104,216695525,654,1,436,81,4,1,0,13
4,0,8,0,0,7,216672104,216695525,654,1,436,81,4,4,0,14


Create training and testing sets. We seek to predict the organ that is cancerous, given the training features below.

We use the Gene IDs, Chromosome, Score (related to the RPKM), Chemical ID (from the disease), the average FPKM for that Gene ID, and whether or not it passes out "causality" metric (explained in the report).

We use a 80-20 train-test split.

In [33]:
y = merge_relation_handled["Organ"]
X = merge_relation_handled[["gene_id","chromosome","score","chemical_id",'Avg. Fpkm Expression',"not_causal"]]

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

Xtrain, Xtest, ytrain, ytest = train_test_split(X,y,test_size=0.2)

Create a Random Forest Classifier for the predictive modelling.

In [34]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(criterion='gini',
                             n_estimators=50,
                             min_samples_split=8,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True)
clf.fit(Xtrain,ytrain)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=8, min_weight_fraction_leaf=0.0,
            n_estimators=50, n_jobs=1, oob_score=True, random_state=None,
            verbose=0, warm_start=False)

In [36]:
pred = clf.predict(Xtest)

Now we've trained our Random Forest Classifier and used it to predict which organ is cancerous given the testing data. How well did we do?

In [46]:
accuracy = clf.score(Xtest,ytest)
print(accuracy)

0.995652173913


AWESOME! Almost 100% accuracy!! What led to our success?

In [44]:
print(Xtrain.columns)
clf.feature_importances_

Index(['gene_id', 'chromosome', 'score', 'chemical_id', 'Avg. Fpkm Expression',
       'not_causal'],
      dtype='object')


array([ 0.13291542,  0.17129129,  0.26637655,  0.01293843,  0.37804793,
        0.03843037])

It appears that the most important metrics for predicting which organ is cancerous are the FPKM and score (related to RPKM). The chromosome and gene are of secondary importance, followed by our binary causality metric and finally the chemical ID.