# Inference

To run inference with the model, first format your files and place them at the following locations:

- `_database/processed/P1000_adjusted_TPM.csv`: This file will contain your sample IDs and CpG-gene beta values.
- `_database/processed/response_paper.csv`: This file contains the sample IDs and the corresponding ages.
- `_database/splits/`: This directory contains the train/test/validation sets for the datasets; by default it only contains the test set of the pan-tissue dataset (even though it is named as the train set).

# Import the required packages

In [2]:
import sys
from os.path import join, dirname, realpath
#current_dir = dirname(realpath(__file__))
current_dir = "train"
from preprocessing import pre
import subprocess
sys.path.insert(0, dirname(current_dir))
import os
import imp
import logging
import random
import timeit
import datetime
import numpy as np
import tensorflow as tf
from utils.logs import set_logging, DebugFolder
import yaml
from pipeline.train_validate import TrainValidatePipeline
from pipeline.one_split import OneSplitPipeline
from pipeline.crossvalidation_pipeline import CrossvalidationPipeline
from pipeline.LeaveOneOut_pipeline import LeaveOneOutPipeline
import networkx as nx
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import median_absolute_error
#from data.data_access import Data
from data.prostate_paper.data_reader import ProstateDataPaper
from copy import deepcopy
import logging

from sklearn import svm, linear_model
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import Ridge, ElasticNet, Lasso, SGDClassifier, RidgeClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from analysis.figure_3.data_extraction_utils import get_node_importance, get_link_weights_df_, \
    get_data, get_degrees, adjust_coef_with_graph_degree, get_pathway_names
from model.coef_weights_utils import get_deep_explain_scores
from os import makedirs
from os.path import dirname, realpath, exists
import pickle
from model.model_utils import get_coef_importance
from model import nn
from analysis.figure_3.data_extraction_utils import get_node_importance, get_link_weights_df_, \
    get_data, get_degrees, adjust_coef_with_graph_degree, get_pathway_names
from utils.loading_utils import DataModelLoader
from xai_age_utils import *
#from config_path import PROSTATE_LOG_PATH, POSTATE_PARAMS_PATH
LOG_PATH = "_logs/XAI-AGE/RUN1/crossvalidation_average_reg_10_tanh"
POSTATE_PARAMS_PATH = "train/"

Using TensorFlow backend.


# Define the type of run (by default a modified `crossvalidation_average_reg_10_tanh`, based on Elmarakeby et al.)

In [None]:
# Parameter files to process. Each entry is given without the '.py' suffix;
# the suffix and the parameter directory are added when the file is loaded.
params_file_list = ['./crossvalidation_average_reg_10_tanh']

# NOTE(review): presumably set to tolerate duplicate OpenMP runtime
# libraries being loaded -- confirm it is still needed.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

# Seed the Python, NumPy and TensorFlow random number generators with the
# same value.
random_seed = 234
random.seed(random_seed)
np.random.seed(random_seed)
tf.random.set_random_seed(random_seed)

# Suffix of the form '_<Mon>-<day>_<hour>-<minute>' taken from the current
# local time.
timeStamp = '_{0:%b}-{0:%d}_{0:%H}-{0:%M}'.format(datetime.datetime.now())
def elapsed_time(start_time, end_time):
    """Split the interval between two timestamps into minutes and seconds.

    Args:
        start_time: Start of the interval (treated as seconds, given the
            division by 60 below).
        end_time: End of the interval, in the same unit as ``start_time``.

    Returns:
        A ``(minutes, seconds)`` tuple of ints: the whole minutes contained
        in the interval and the whole seconds left over after removing them.
    """
    total = end_time - start_time
    minutes = int(total / 60)
    seconds = int(total - (minutes * 60))
    return minutes, seconds
# Build one pipeline object per parameter file. The run itself is left
# disabled (see the commented-out call at the end of the loop); the pipeline
# object is only used afterwards for its data and model parameters.
for params_file in params_file_list:
    # Each parameter file gets its own log folder below LOG_PATH.
    log_dir = join(LOG_PATH, params_file)
    set_logging(log_dir)
    params_file = join(POSTATE_PARAMS_PATH, params_file)
    logging.info('random seed %d', random_seed)
    params_file_full = params_file + '.py'
    # print() with a single argument behaves the same on Python 2 and 3.
    print(params_file_full)
    # Load the parameter file as a module so its settings are attributes.
    params = imp.load_source(params_file, params_file_full)

    DebugFolder(log_dir)
    pipeline_type = params.pipeline['type']
    if pipeline_type == 'one_split':
        pipeline = OneSplitPipeline(task=params.task, data_params=params.data, model_params=params.models,
                                    pre_params=params.pre, feature_params=params.features,
                                    pipeline_params=params.pipeline,
                                    exp_name=log_dir)

    elif pipeline_type == 'crossvalidation':
        pipeline = CrossvalidationPipeline(task=params.task, data_params=params.data, feature_params=params.features,
                                           model_params=params.models, pre_params=params.pre,
                                           pipeline_params=params.pipeline, exp_name=log_dir)
    elif pipeline_type == 'Train_Validate':
        # This pipeline is constructed without a `task` argument.
        pipeline = TrainValidatePipeline(data_params=params.data, model_params=params.models, pre_params=params.pre,
                                         feature_params=params.features, pipeline_params=params.pipeline,
                                         exp_name=log_dir)

    elif pipeline_type == 'LOOCV':
        pipeline = LeaveOneOutPipeline(task=params.task, data_params=params.data, feature_params=params.features,
                                       model_params=params.models, pre_params=params.pre,
                                       pipeline_params=params.pipeline, exp_name=log_dir)
    else:
        # Fail here instead of leaving `pipeline` undefined (or pointing at
        # the pipeline of a previous parameter file).
        raise ValueError('unknown pipeline type: %r' % (pipeline_type,))
    start = timeit.default_timer()
    #pipeline.run()

# Load the weights

In [1]:


# Read every configured dataset and derive the per-dataset model ids.
# No model is fitted in this cell, despite the 'fitting model ...' messages.
for data_params in pipeline.data_params:
    data_id = data_params['id']
    logging.info('loading data..1..')
    # NOTE(review): the only visible import of `Data` is commented out in the
    # import cell -- confirm the name is provided elsewhere (e.g. through the
    # `xai_age_utils` star import).
    data = Data(**data_params)

    (x_train, x_validate_, x_test_,
     y_train, y_validate_, y_test_,
     info_train, info_validate_, info_test_,
     cols) = data.get_train_validate_test()

    # Stack the train and validation splits along the first axis; these
    # merged arrays are what the later cells refer to as X, y and info.
    X = np.concatenate([x_train, x_validate_], axis=0)
    y = np.concatenate([y_train, y_validate_], axis=0)
    info = np.concatenate([info_train, info_validate_], axis=0)

    logging.info('fitting model ...')

    for model_param in pipeline.model_params:
        # Prefer the explicit id, otherwise fall back to the model type,
        # and suffix it with the dataset id.
        base_name = model_param['id'] if 'id' in model_param else model_param['type']
        model_name = base_name + '_' + data_id

        # Work on a copy so the pipeline's own parameters stay untouched.
        m_param = deepcopy(model_param)
        m_param['id'] = model_name
        logging.info('fitting model ...')
# Load the trained model --------------------
# The run directory is hard-coded here instead of being derived from a
# base directory and model name.
from model import model_factory
model_file = 'XAI_AGE_ALL'
run_dir = "_logs/XAI-AGE/RUN1/crossvalidation_average_reg_10_tanh/"
params_file = join(run_dir, model_file + '_params.yml')
print(params_file)
# The loader is built from the saved parameter file and then asked for the
# model stored under `model_file`.
loader = DataModelLoader(params_file)
nn_model = loader.get_model(model_file)
feature_names = nn_model.feature_names



NameError: name 'pipeline' is not defined

# Make predictions

In [5]:
# Run the loaded model on the merged train+validation arrays.
y_pred_test, y_pred_test_scores = predict(nn_model, X, y)

# One row per sample id: flattened label and prediction, keeping only the
# first row for any id that occurs more than once.
predictions = pd.DataFrame({
    'ID': info,
    'Label': np.concatenate(y).ravel(),
    'Prediction': np.concatenate(y_pred_test).ravel(),
}).drop_duplicates(subset=['ID'])


Save your predictions to a CSV file (set the output path in the call below):

In [8]:
# Destination of the predictions table. An empty path cannot be opened for
# writing, so a concrete default is used; change it to write elsewhere.
predictions_csv = "predictions.csv"
predictions.to_csv(predictions_csv, index=False)

# Extracting features

In [None]:
# Log directory of the run used for feature extraction; the cell below joins
# it into the base directory from which the model parameter file is read.
LOG_PATH = "_logs/XAI-AGE/RUN1/crossvalidation_average_reg_10_tanh"

In [9]:

# --- Extraction settings --------------------
# `__file__` is not defined inside a notebook, so the literal string
# "__file__" is resolved against the working directory instead; current_dir
# therefore ends up being the working directory.
current_dir = os.path.dirname(os.path.abspath("__file__"))
sys.path.insert(0, dirname(current_dir))

os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

samplename = ''

# Output folder for the files this notebook writes through `saving_dir`;
# it is created on first use.
saving_dir = join(current_dir, 'extracted')
if not exists(saving_dir):
    makedirs(saving_dir)

base_dir = join(LOG_PATH, '')
model_name = ''

importance_type = ['deepexplain_deeplift']
target = 'o6'
use_data = 'Test'  # {'All', 'Train', 'Test'}
dropAR = False

layers = ['inputs', 'h0', 'h1', 'h2', 'h3', 'h4', 'h5', 'o_linear6']

# --- Load model and data --------------------
model_file = 'XAI_AGE_ALL'
model_dir = join(base_dir, model_name)
params_file = join(model_dir, model_file + '_params.yml')
print(params_file)
loader = DataModelLoader(params_file)
nn_model = loader.get_model(model_file)
feature_names = nn_model.feature_names
X, Y, info = get_data(loader, use_data, dropAR)

# Store the response values, indexed by the sample info, next to the other
# extracted files.
response = pd.DataFrame(Y, index=info, columns=['response'])
print(response.head())
filename = join(saving_dir, 'response.csv')
response.to_csv(filename)
#
print('saving gradeint importance')
# # gradient importance --------------------
# Inline variant of get_node_importance(): the DeepExplain scores are
# requested directly so the method name can be passed explicitly.
model = nn_model.model

x_train = X
y_train = Y
info_train = info
importance_type_new = importance_type[0]

# 'deepexplain_deeplift' -> 'deeplift': the part after the first underscore
# is handed over as the method name.
method = importance_type_new.split('_')[1]
ret = get_deep_explain_scores(model, x_train, y_train, target, method_name=method, detailed=True)
print(type(ret))
if isinstance(ret, tuple):
    coef, coef_detailed = ret
    print('coef_detailed', len(coef_detailed))
else:
    coef = ret
    # No per-sample scores were returned; fall back to the transposed
    # coefficients.
    # NOTE(review): this builds a list while the loop below indexes
    # coef_detailed with the feature_names keys -- confirm this fallback
    # matches the container type of `coef`.
    coef_detailed = [c.T for c in coef]

node_weights_dfs = {}
node_weights_samples_dfs = {}

# For every entry of feature_names: the absolute coefficients as a
# one-column table, and the per-sample scores with one column per name
# returned by get_pathway_names().
for k, name in nn_model.feature_names.items():
    w = coef[k]
    w_samples = coef_detailed[k]
    features = get_pathway_names(name)
    node_weights_dfs[k] = pd.DataFrame(abs(w.ravel()), index=name, columns=['coef'])
    node_weights_samples_dfs[k] = pd.DataFrame(w_samples, columns=features)

# The cells below refer to the importance tables under this name.
node_weights_ = node_weights_dfs

save_gradient_importance(node_weights_, node_weights_samples_dfs, info_train, str(samplename))
#
print('saving link weights')
# # link weights --------------------
link_weights_df = get_link_weights_df_(nn_model.model, feature_names, layers)
save_link_weights(link_weights_df, layers[1:])
#
# The activation export is currently disabled; only the progress message
# is still printed.
print('saving activation')
# # activation --------------------
#layer_outs_dict = nn_model.get_layer_outputs(X)
#save_activation(layer_outs_dict, feature_names, info_train)
#
print('saving graph stats')
# # graph stats --------------------
stats = get_degrees(link_weights_df, layers[1:])
import numpy as np
# Materialise the keys first: on Python 3 `dict.keys()` is a view, which
# np.sort cannot sort directly. The result is the same sorted array on
# Python 2.
keys = np.sort(list(stats.keys()))
for k in keys:
    filename = join(saving_dir, 'graph_stats_{}.csv'.format(k))
    stats[k].to_csv(filename)
# save_graph_stats(degrees,fan_outs, fan_ins)
#
print('adjust weights with graph stats')

# # graph stats --------------------
# Degree column of every stats table, renamed to 'coef_graph'.
# NOTE(review): `degrees` is not used by the visible code below.
degrees = [stats[k].degree.to_frame(name='coef_graph') for k in keys]

node_importance = adjust_coef_with_graph_degree(node_weights_, stats, layers[1:-1], saving_dir)

# Pickle data is binary, so the file is opened in binary mode; this is
# required on Python 3 and also valid on Python 2. The file is written to
# the working directory, not to saving_dir.
with open('extracted_data.pkl', 'wb') as f:
    pickle.dump([keys, stats, node_weights_, layers, node_importance], f)

filename = join(saving_dir, 'node_importance_graph_adjusted_test.csv')
node_importance.to_csv(filename)


The extracted results (response values, graph statistics and graph-adjusted node importances) will be saved to the **extracted** folder.