# Predict personalised driver genes

In [1]:
# Import dependencies
import os
import pandas as pd
import random
import numpy as np
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import BalancedBaggingClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, precision_score
from sklearn.metrics import recall_score
import pickle
import multiprocessing as mp
import logging
import glob
import sys

In [2]:
# Load dependent modules
# PATH is the project root; DATAPATH is its data directory.
PATH = "/data/malvika/PIVOT"
DATAPATH = "/data/malvika/PIVOT/data"

# The project-local classifier modules live in PATH/code, so that directory
# is made the working directory and added to the import search path before
# they are imported.
os.chdir(PATH + "/code")
sys.path.append(PATH + "/code")
import snv_classifier as snv
import rna_classifier as rna
import multiomic_classifier as moc

# Define variables. To be set for each run
n_threads = 20                                      # number of threads
ctype = "BRCA"                                      # cancer-type
folderpath = "/output/GDC_{}/predict/multiomic".format(ctype)     # output folder path
os.makedirs(PATH + folderpath, exist_ok=True)
# Seed both Python's and NumPy's global random number generators.
# random.seed() alone leaves NumPy's global RNG unseeded, and numpy-based
# code that is not given an explicit seed draws from that generator.
SEED = 3                                            # random seed
random.seed(SEED)
np.random.seed(SEED)
tr_frac = 0.7                                       # train-test fraction
lab_type = "bailey"
feat_num = "some"
model = "BalBag"
# lab_type, feat_num and model are combined into the model/output folder and
# file names used by later cells.
logging.basicConfig(level=logging.INFO)

# Set paths
path_network = DATAPATH + "/network"
path_domains = DATAPATH + "/domains/pfam"


In /home/malvika/anaconda3/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: 
The text.latex.preview rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In /home/malvika/anaconda3/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: 
The mathtext.fallback_to_cm rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In /home/malvika/anaconda3/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: Support for setting the 'mathtext.fallback_to_cm' rcParam is deprecated since 3.3 and will be removed two minor releases later; use 'mathtext.fallback : 'cm' instead.
In /home/malvika/anaconda3/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: 
The validate_bool_maybe_none function was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In /home/malvika/anaconda3/lib/python3.7/site-packages/m

In [3]:
# Load the input tables for the selected cancer type.
# Each table is read by first switching into its folder and then opening the
# file by its bare name, so the working directory changes as this cell runs.

# SNV: tab-separated table; sample barcodes are cut to their first 16 characters
os.chdir(f"{DATAPATH}/GDC_{ctype}/SNV")
fname = f"{ctype}_snv.tsv"
data_snv = pd.read_csv(fname, sep="\t", header=0)
data_snv["Tumor_Sample_Barcode"] = [
    barcode[:16] for barcode in data_snv["Tumor_Sample_Barcode"]
]

# RNA: only the folder path is recorded here; nothing is read in this cell
rnapath = f"{DATAPATH}/GDC_{ctype}/RNA-seq"

# CNV: first column is used as the row index
os.chdir(f"{DATAPATH}/GDC_{ctype}/CNV")
fname = f"{ctype}_cnv.tsv"
data_cnv = pd.read_csv(fname, sep="\t", header=0, index_col=0)

# miRNA: first column is used as the row index
os.chdir(f"{DATAPATH}/GDC_{ctype}/miRNA")
fname = f"{ctype}_miRNA.tsv"
data_miRNA = pd.read_csv(fname, sep="\t", header=0, index_col=0)

# TODO
logging.info("Loaded data")


INFO:root:Loaded data


In [4]:
# Collect the distinct sample barcodes present in the SNV table
snv_barcodes = data_snv["Tumor_Sample_Barcode"]
sample_list = list(snv_barcodes.unique())

In [5]:
%%time
# Build the multi-omic feature table and its metadata for every sample in
# sample_list. Twice the configured thread count is passed to get_multiX,
# and train=False is passed since this notebook only predicts.
data_all, all_meta = moc.get_multiX(
    data_snv, rnapath, data_cnv, data_miRNA, sample_list, DATAPATH,
    ctype, lab_type,
    n_threads=n_threads * 2, train=False,
)
logging.info("Ran get_multiX")
logging.info(feat_num)
X_data, X_meta = moc.dropNa(data_all, all_meta, feat_num)

# Count the unique samples still present in the metadata returned by dropNa
samp_num = len(X_meta["Tumor_Sample_Barcode"].unique())


INFO:root:Got snv labels
INFO:root:Got CNV labels
INFO:root:merged data
INFO:root:Got snv features
INFO:root:Dropped columns and filled na


hsa-mir-190b
hsa-mir-301b
hsa-mir-375
hsa-mir-429
hsa-mir-592


INFO:root:miRNA features
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_i

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = val

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = val

CPU times: user 22min 40s, sys: 11min 2s, total: 33min 43s
Wall time: 43min 24s


In [6]:
# Number of unique samples left in X_meta after moc.dropNa
samp_num

972

In [7]:
# Number of unique tumor sample barcodes in the SNV table
len(sample_list)

984

In [8]:
# Load the pickled model stored for this label type / feature set / classifier
logging.info("Loading model and output")
modelpath = f"/output/GDC_{ctype}/multiomic"     # model folder path, relative to PATH
os.chdir(f"{PATH}{modelpath}/{lab_type}_{feat_num}_{model}")
featFName = f"model_{lab_type}_{feat_num}_{model}.pkl"
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# model files that come from a trusted source.
with open(featFName, "rb") as model_file:
    gs = pickle.load(model_file)

INFO:root:Loading model and output


In [9]:
%%time
# Get predictions
# The log message used to read "Building models", but nothing is fitted in
# this cell: it only applies the already-loaded model `gs` to the features.
logging.info("Predicting labels")
# The "Label" column is removed so only the remaining columns reach the model
X_pred = gs.predict(X_data.drop(["Label"], axis=1))

INFO:root:Building models


CPU times: user 1.71 s, sys: 3.72 s, total: 5.43 s
Wall time: 8min 29s


In [10]:
# Create the output folder for this run's predictions (if needed) and make it
# the working directory, so files written next land there.
pred_dir = PATH + folderpath + "/{}_{}_{}".format(lab_type, feat_num, model)
os.makedirs(pred_dir, exist_ok=True)
os.chdir(pred_dir)

In [11]:
# Save predictions
# Start from a copy of the metadata and attach the predicted label per row
tr = X_meta.copy()
tr["Predicted label"] = X_pred

# Add one probability column per class, in the order given by gs.classes_
class_proba = gs.predict_proba(X_data.drop(columns=["Label"]))
for p_class, proba_column in zip(gs.classes_, class_proba.T):
    tr["Probability_{}".format(p_class)] = proba_column

# Keep only rows whose predicted label is not "Neutral", then write the table
# to the current working directory
is_not_neutral = tr["Predicted label"] != "Neutral"
tr = tr[is_not_neutral]
tr.to_csv("PIVOT_predictions.tsv", sep="\t", header=True, index=True)

In [26]:
# Save feature matrix
# Write the feature table and its metadata as tab-separated files into the
# feature-matrix folder for this cancer type (created if missing).
featurepath = f"/data/GDC_{ctype}/feature_matrix/multiomic"     # output folder path
feature_dir = PATH + featurepath
os.makedirs(feature_dir, exist_ok=True)
os.chdir(feature_dir)

featFName = f"feat_{ctype}_{lab_type}_{feat_num}.tsv"
metaFName = f"meta_{ctype}_{lab_type}_{feat_num}.tsv"
for frame, out_name in ((X_data, featFName), (X_meta, metaFName)):
    frame.to_csv(out_name, sep="\t", header=True, index=True)