In [1]:
import random
import sys

from pipeline import experiment_pipeline
from preprocessing import load_data

random.seed(10)

arguments = sys.argv

arguments = [0, '/home/tristan/Desktop/Repos/DreamHF']
ROOT = arguments[1]

print("Loading the data...")
pheno_df_train, pheno_df_test, readcounts_df_train, readcounts_df_test = load_data(
    ROOT)


  from .autonotebook import tqdm as notebook_tqdm


Loading the data...


In [2]:
import os
import pathlib

import numpy as np
import pandas as pd
import sklearn
from sksurv.metrics import concordance_index_censored

import wandb
from model_evaluation import evaluate_model
from preprocessing import CLINICAL_COVARIATES, Salosensaari_processing, clr_processing
from survival_models import (
    Coxnet,
    CoxPH,
    IPCRidge_sksurv,
    sksurv_gbt,
    sksurv_gbt_optuna,
    xgb_aft,
    xgb_optuna,
    xgbse_weibull,
)

sklearn.set_config(transform_output="pandas")

In [3]:
processing = 'MI_clr'
clinical_covariates=CLINICAL_COVARIATES
n_taxa = 50

if processing == 'Salosensaari':
    X_train, X_test, y_train, y_test, test_sample_ids = Salosensaari_processing(
        pheno_df_train, pheno_df_test, readcounts_df_train, readcounts_df_test, clinical_covariates
    )
elif processing == 'MI_clr':
    ## Feature selection
    X_train, X_test, y_train, y_test, test_sample_ids = clr_processing(
        pheno_df_train, pheno_df_test, readcounts_df_train, readcounts_df_test, clinical_covariates,  n_taxa)


In [9]:
# importing metrics
import numpy as np
from optuna import create_study
from optuna.samplers import TPESampler
from scipy.stats import randint, uniform
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.impute import SimpleImputer
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV, RepeatedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, StandardScaler
from sklearn.utils import estimator_html_repr
from sklearn.utils.validation import check_is_fitted
from sksurv.ensemble import GradientBoostingSurvivalAnalysis
from sksurv.linear_model import CoxnetSurvivalAnalysis, CoxPHSurvivalAnalysis, IPCRidge
from sksurv.metrics import concordance_index_censored
from xgbse import XGBSEStackedWeibull
from xgbse.converters import convert_y
from xgbse.metrics import concordance_index

from xgboost_wrapper import XGBSurvival


def bind(instance, method):
    def binding_scope_fn(*args, **kwargs):
        return method(instance, *args, **kwargs)

    return binding_scope_fn


class EarlyStoppingMonitor:
    def __init__(self, window_size, max_iter_without_improvement):
        self.window_size = window_size
        self.max_iter_without_improvement = max_iter_without_improvement
        self._best_step = -1

    def __call__(self, iteration, estimator, args):
        # continue training for first self.window_size iterations
        if iteration < self.window_size:
            return False

        # compute average improvement in last self.window_size iterations.
        # oob_improvement_ is the different in negative log partial likelihood
        # between the previous and current iteration.
        start = iteration - self.window_size + 1
        end = iteration + 1
        improvement = np.mean(estimator.oob_improvement_[start:end])

        if improvement > 1e-6:
            self._best_step = iteration
            return False  # continue fitting

        # stop fitting if there was no improvement
        # in last max_iter_without_improvement iterations
        diff = iteration - self._best_step
        return diff >= self.max_iter_without_improvement

# OK for models in sksurv which predict the risk score when using self.predict()
def sksurv_risk_score(model, X_test):
    predictions = model.pipeline.predict(X_test)  # Predict the risk score
    scaler = MinMaxScaler()
    risk_score = scaler.fit_transform(predictions.reshape(-1, 1))
    #The range of this number has to be between 0 and 1, with larger numbers being associated with higher probability of having HF. The values, -Inf, Inf and NA, are not allowed.
    return risk_score.to_numpy().flatten()


def xgb_risk_score(model, X_test):  # OK for models in sksurv which predict the risk score
    # Predict the survival time, take the negative to convert to risk scores
    predictions = - model.pipeline.predict(X_test)
    scaler = MinMaxScaler()
    risk_score = scaler.fit_transform(predictions.reshape(-1, 1))
    #The range of this number has to be between 0 and 1, with larger numbers being associated with higher probability of having HF. The values, -Inf, Inf and NA, are not allowed.
    return risk_score.to_numpy().flatten()


class candidate_model:
    def __init__(self):
        self.monitor = None
        self.with_pca = False

    def cross_validation(self, X_train, y_train, n_iter):
        randsearchcv = RandomizedSearchCV(
            self.pipeline,
            self.distributions,
            random_state=0,
            n_iter=n_iter,
            n_jobs=-1,
            verbose=0,
            error_score='raise',
        )
        self.pipeline = randsearchcv.fit(X_train, y_train)
        return self

    def evaluate(self, X_train, X_test, y_train, y_test):
        """
        self.harrell_C_training = self.estimator.score(X_train, y_train)
        self.harrell_C_test = self.estimator.score(X_test, y_test)
        """
        self.harrell_C_training = concordance_index_censored(
            y_train['Event'], y_train['Event_time'], self.risk_score(X_train))[0]
        self.harrell_C_test = concordance_index_censored(
            y_test['Event'], y_test['Event_time'], self.risk_score(X_test))[0]

        return self

    def objective(trial):
        score = cross_val_score(pipeline, X, y, scoring='f1')
        f1 = score.mean()  # calculate the mean of scores
        return f1

    # maximise the score during tuning
    #study = optuna.create_study(direction="maximize")
    #study.optimize(objective, n_trials=100)  # run the objective function 100 times

    def create_pipeline(self):
        numeric_transformer = Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy="mean")),
                ("scaler", StandardScaler()),
            ]
        )

        categorical_transformer = Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy="most_frequent")),
            ]
        )

        pca_transformer = ColumnTransformer(
            transformers=[("reduce_dim", PCA(), selector(pattern="k__"))], remainder='passthrough')

        preprocessor = ColumnTransformer(
            transformers=[
                ("num", numeric_transformer, selector(
                    dtype_exclude=["bool", "category", "Int64"])),
                ("cat", categorical_transformer, selector(
                    dtype_include=["bool", "category", "Int64"])),
            ]
        )

        regressor = Pipeline(
            steps=[("preprocessor", preprocessor), ("reduce_dim", pca_transformer), ("estimator", self.estimator)])

        with open("regressor.html", "w") as f:
            f.write(estimator_html_repr(regressor))

        regressor.fit = lambda X_train, y_train: regressor.fit(
            X_train, y_train, estimator__monitor=self.monitor
        )
        return regressor



In [10]:
class sksurv_gbt(candidate_model):
    def __init__(self):
        super().__init__()
        self.monitor = EarlyStoppingMonitor(25, 50)

        self.estimator = GradientBoostingSurvivalAnalysis()

        self.pipeline = self.create_pipeline()

        self.distributions = dict(
            reduce_dim = ['passthrough', PCA(0.95), PCA(0.98)],
            estimator__learning_rate=uniform(loc=1e-2, scale=0.4),
            estimator__max_depth=randint(2, 6),
            estimator__loss=["coxph"],
            estimator__n_estimators=randint(100, 350),
            estimator__min_samples_split=randint(2, 6),
            estimator__min_samples_leaf=randint(1, 10),
            estimator__subsample=uniform(loc=0.5, scale=0.5),
            estimator__max_leaf_nodes=randint(2, 30),
            estimator__dropout_rate=uniform(loc=0, scale=1),
        )

    def risk_score(self, X_test):
        risk_score = sksurv_risk_score(self, X_test)
        return risk_score


In [11]:

model  = sksurv_gbt()
 
model = model.cross_validation(X_train, y_train, 1)
 




In [16]:
model.risk_score(X_test)


AttributeError: 'sksurv_gbt' object has no attribute 'predict'

In [17]:
model.pipeline.predict(X_test)

array([ 0.00254693, -0.00354231,  0.00027235, ..., -0.00116553,
       -0.00188565,  0.03875545])