In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import notebook
import scipy.stats
from sklearn.metrics import mean_squared_error
%matplotlib inline
import numpy as np

import umap
import umap.plot
# UMAP reducer created once with a fixed random_state.
reducer = umap.UMAP(random_state=42)

# Global matplotlib style: serif family mapped to Arial, and white backgrounds
# for the figure, the axes and any file written by savefig.
plt.rcParams.update({
    'font.family': 'serif',
    'font.serif': 'Arial',
    'figure.facecolor': 'white',
    'axes.facecolor': 'white',
    'savefig.facecolor': 'white',
})

In [2]:
# Expression table; the first CSV column becomes the row index.
counts_file = 'data/Pancreatic/Pancreatic_RNAseq_tpm.csv'
counts_df = pd.read_csv(counts_file, index_col=0)

# Drop gene columns that hold at most one distinct value: they carry no signal.
n_distinct_per_gene = counts_df.nunique()
constant_columns = n_distinct_per_gene[n_distinct_per_gene <= 1].index
counts_df = counts_df.drop(columns=constant_columns)
counts_df.shape

(55, 50651)

In [3]:
# Drug-response table; the first CSV column becomes the row index and every
# remaining column is treated as one drug.
viability_file = 'data/Pancreatic/viability.csv'
viability_df = pd.read_csv(viability_file, index_col = 0)
drugs = viability_df.columns
# Z-score each drug column. The mean/std are taken over every row of the
# viability file, i.e. before restricting to the shared patients below.
viability_df = (viability_df - viability_df.mean()) / viability_df.std()

# Patients present in both tables, listed in the order they appear in counts_df.
# Building the list from a bare set intersection would make the row order depend
# on Python's per-process string hash seed, so it could change on every kernel
# restart and silently reshuffle any downstream split.
viability_patients = set(viability_df.index)
shared_patients = [patient for patient in counts_df.index.unique()
                   if patient in viability_patients]
print(len(shared_patients))
counts_df = counts_df.loc[shared_patients]
viability_df = viability_df.loc[shared_patients]

43


In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer

# The filtered expression table is the feature matrix (same object, not a copy).
features_df = counts_df

# Scaler instances; in this cell they are referenced only by the commented-out
# OPTION 1 / OPTION 2 lines below.
ss = StandardScaler()
pt = PowerTransformer()

# Alternative normalisations kept for reference. All of them are disabled;
# the cell goes on to use OPTION 4.

# OPTION 0 (no normalisation: use the features as they are)
#features_norm_df = features_df

# OPTION 1 (scikit-learn PowerTransformer fitted on all features)
#features_norm_df = pd.DataFrame(pt.fit_transform(features_df), index = features_df.index, columns = features_df.columns)

# OPTION 2 (scikit-learn StandardScaler fitted on all features)
#features_norm_df = pd.DataFrame(ss.fit_transform(features_df), index = features_df.index, columns = features_df.columns)

# OPTION 3 (Seurat-style normalisation: divide each row by its total, scale to 10000, then log2(x + 1))
#features_norm_df = np.log2((features_df.T / features_df.T.sum().values) * 10000 + 1).T 
# OPTION 4
def _handle_zeros_in_scale(scale, copy=True):
    """
    Replace zero entries of a scale with 1.0 so that dividing by it is safe.

    Adapted from `sklearn.preprocessing._data`. A zero scale occurs for
    constant features; substituting 1.0 leaves such features unchanged by the
    division instead of producing inf/NaN.

    Parameters
    ----------
    scale : scalar, numpy.ndarray or array-like
        Per-feature scale (e.g. standard deviation). Anything that is neither
        a scalar nor an ndarray (pandas Series, list, tuple, ...) is converted
        to a new ndarray, so the caller's object is never modified.
    copy : bool, default True
        Only relevant for ndarray input: when False the zeros are overwritten
        in place in the caller's array.

    Returns
    -------
    scalar or numpy.ndarray
        The scale with every exact 0.0 replaced by 1.0.
    """
    # if we are fitting on 1D arrays, scale might be a scalar
    if np.isscalar(scale):
        return 1.0 if scale == 0.0 else scale

    if isinstance(scale, np.ndarray):
        if copy:
            # New array to avoid side-effects
            scale = scale.copy()
    else:
        # Previously this case fell through and returned None, which surfaced
        # later as an opaque TypeError on division. np.array always copies here.
        scale = np.array(scale)

    scale[scale == 0.0] = 1.0
    return scale


def transform_standardize(data, mean, std):
    """
    Standardize `data` as (data - mean) / std, treating a zero std as 1.

    Parameters
    ----------
    data : array-like or pandas.DataFrame
        Values to standardize.
    mean : scalar or array-like
        Value(s) subtracted from `data`; broadcast/aligned by the usual
        numpy/pandas rules.
    std : scalar, numpy.ndarray or array-like
        Value(s) `data - mean` is divided by. Non-scalar input is applied
        positionally as an ndarray. It is never modified.

    Returns
    -------
    Same kind of object as `data - mean`.
    """
    # copy=True: do not overwrite zeros inside the caller's std array.
    return (data - mean) / _handle_zeros_in_scale(std, copy=True)

# OPTION 4 in use: z-score every gene with its own mean and (pandas) standard
# deviation; the std is passed as a plain ndarray.
features_norm_df = transform_standardize(
    features_df,
    features_df.mean(),
    features_df.std().values,
)

# Genes left with at most one distinct value after scaling are uninformative: drop them.
n_distinct_per_feature = features_norm_df.nunique()
constant_columns = n_distinct_per_feature[n_distinct_per_feature <= 1].index
features_norm_df = features_norm_df.drop(columns=constant_columns)
print('Removed constant features', len(constant_columns))

Removed constant features 1000


In [5]:
# Pairwise correlation between all remaining gene columns (pandas' default
# method, Pearson). The result is a square genes x genes DataFrame.
# NOTE(review): with tens of thousands of gene columns this matrix is very
# large in memory; the cell that consumes it below only reads the rows of the
# important genes and the columns of the network genes.
feat_corr = features_norm_df.corr()

In [6]:
# Previously exported per-drug important genes. The cells below read the
# columns 'drug', 'gene', 'lin_reg_coef' and 'shap' from this table.
important_genes = pd.read_csv('feature_importance/Pancreatic/AUC_zscore/SHAP_important_genes.csv')

In [7]:
import decoupler as dc
# Prior-knowledge tables from decoupler.
dorothea = dc.get_dorothea()
progeny = dc.get_progeny()

# Every list below is sorted: a list built straight from a set has an order
# that depends on Python's per-process string hash seed, which would make the
# downstream gene order differ between kernel restarts.
# key=str keeps the sort total even if a stray non-string label is present.

# Genes that appear both as a source and as a target in the DoRothEA table.
dorothea_genes = sorted(set(dorothea.source.unique()) & set(dorothea.target.unique()), key=str)
# All target genes of the PROGENy table.
progeny_genes = sorted(set(progeny.target.unique()), key=str)
# Union of both gene sets.
net_genes = sorted(set(dorothea_genes) | set(progeny_genes), key=str)
# Network genes that are also available as (non-constant) feature columns.
shared_genes = sorted(set(net_genes) & set(features_norm_df.columns), key=str)

In [8]:
# Correlation cutoff: a network gene is attached to an important gene when
# their correlation is >= t (same sign) or <= -t (opposite sign).
t = 0.7

# Rows are collected in a plain list and turned into a DataFrame once at the
# end. `DataFrame.append` was removed in pandas 2.0, and growing a frame
# inside a loop is quadratic anyway.
extra_records = []
for drug, drug_rows in important_genes.groupby('drug'):
    print(drug)
    drug_genes = drug_rows.gene.values
    # Candidate genes: network genes not already important for this drug.
    # A sorted list is used because pandas does not accept a set as an indexer
    # and because it makes the output order reproducible.
    other_genes = sorted(set(shared_genes) - set(drug_genes), key=str)
    # Rows: important genes of this drug; columns: candidate genes.
    rel_genes = feat_corr.loc[drug_genes, other_genes]

    # Coefficient / SHAP value per important gene (first occurrence wins),
    # looked up once instead of re-filtering drug_rows for every gene.
    first_rows = drug_rows.drop_duplicates(subset='gene')
    coef_by_gene = dict(zip(first_rows['gene'], first_rows['lin_reg_coef']))
    shap_by_gene = dict(zip(first_rows['gene'], first_rows['shap']))

    for gene, row in rel_genes.iterrows():
        lin_reg_coef = coef_by_gene[gene]
        shap_value = shap_by_gene[gene]
        # Positively correlated genes inherit the values unchanged ...
        for partner in row[row >= t].index:
            extra_records.append({'drug': drug,
                                  'gene': partner,
                                  'lin_reg_coef': lin_reg_coef,
                                  'shap': shap_value})
        # ... negatively correlated genes inherit them with flipped sign.
        for partner in row[row <= -t].index:
            extra_records.append({'drug': drug,
                                  'gene': partner,
                                  'lin_reg_coef': -lin_reg_coef,
                                  'shap': -shap_value})

# Explicit columns keep the frame well-formed even when nothing passed the cutoff.
extra_important_genes = pd.DataFrame(extra_records,
                                     columns=['drug', 'gene', 'lin_reg_coef', 'shap'])

5-FU
Afatinib
Bortezomib
Celecoxib
Disulfuram
Everolimus
Gemcitabine
K-ras(G12C) Inhibitor 9
KU-55933
LY2874455
Lapatinib
MK-2206
Nutlin-3
OSI-420
Olaparib
Oxaliplatin
Paclitaxel
Ruxolitinib
SB5225334
SF1670
SGI-1776
SN-38
Selumetinib
Sunitinib
TPCA-1
WIKI4


In [9]:
# A (drug, gene) pair that occurs more than once was reached through several
# important genes, so its inherited coefficient/SHAP value is ambiguous.
# Every occurrence of such a pair is removed (none is kept).
# `Series.iteritems` was removed in pandas 2.0; `duplicated(keep=False)` marks
# all members of each repeated pair in one vectorised pass.
if not extra_important_genes.empty:
    is_ambiguous = extra_important_genes.duplicated(subset=['drug', 'gene'], keep=False)
    extra_important_genes = extra_important_genes[~is_ambiguous]

In [10]:
# Original important genes followed by the correlation-derived ones, written
# with the row index as the first CSV column.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the replacement.
pd.concat([important_genes, extra_important_genes]).to_csv('feature_importance/Pancreatic/AUC_zscore/SHAP_important_genes_ext.csv')