In [1]:
import random
import sys

from pipeline import experiment_pipeline
from preprocessing import load_data

import os

# Seed Python's global RNG so any `random`-based step in this notebook is repeatable.
random.seed(10)

# Repository/data root handed to `load_data`.
# The former `arguments = sys.argv` assignment was a dead store (it was
# overwritten on the next statement), so it is dropped. The root can now be
# overridden with the DREAMHF_ROOT environment variable; when the variable is
# unset, the original hard-coded checkout path is used, so behaviour on the
# original machine is unchanged.
# `arguments` keeps its argv-like [placeholder, root] layout for any later cell
# that still indexes into it.
arguments = [0, os.environ.get("DREAMHF_ROOT", '/home/tristan/Desktop/Repos/DreamHF')]
ROOT = arguments[1]

print("Loading the data...")
# `load_data` comes from the project's `preprocessing` module; it returns the
# train/test phenotype frames followed by the train/test read-count frames.
pheno_df_train, pheno_df_test, readcounts_df_train, readcounts_df_test = load_data(
    ROOT)


  from .autonotebook import tqdm as notebook_tqdm


Loading the data...


In [2]:
import os
import pathlib

import numpy as np
import pandas as pd
import sklearn
from sksurv.metrics import concordance_index_censored

import wandb
from model_evaluation import evaluate_model
from preprocessing import CLINICAL_COVARIATES, Salosensaari_processing, clr_processing
from survival_models import (
    Coxnet,
    CoxPH,
    IPCRidge_sksurv,
    sksurv_gbt,
    sksurv_gbt_optuna, 
    xgb_optuna,
    xgbse_weibull,
)

# Global scikit-learn option: ask transformers to output pandas containers
# (transform_output="pandas") for every pipeline built later in this notebook.
sklearn.set_config(transform_output="pandas")

In [3]:
# Pre-processing configuration for this run.
processing = 'MI_clr'  # one of: 'Salosensaari', 'MI_clr'
clinical_covariates = CLINICAL_COVARIATES
# Only forwarded to `clr_processing` (presumably the number of taxa kept by its
# feature selection -- TODO confirm against the preprocessing module).
n_taxa = 50

if processing == 'Salosensaari':
    X_train, X_test, y_train, y_test, test_sample_ids = Salosensaari_processing(
        pheno_df_train, pheno_df_test, readcounts_df_train, readcounts_df_test, clinical_covariates
    )
elif processing == 'MI_clr':
    ## Feature selection
    X_train, X_test, y_train, y_test, test_sample_ids = clr_processing(
        pheno_df_train, pheno_df_test, readcounts_df_train, readcounts_df_test, clinical_covariates, n_taxa
    )
else:
    # Without this branch an unrecognised name would silently leave X_train & co.
    # undefined (or stale from a previous run of the cell).
    raise ValueError(
        "Unknown processing %r; expected 'Salosensaari' or 'MI_clr'" % (processing,)
    )


In [4]:
# Imports: Optuna search, scikit-learn pipeline utilities, survival estimators and metrics
import numpy as np
from optuna import create_study
from optuna.samplers import TPESampler
from scipy.stats import randint, uniform
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.impute import SimpleImputer
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV, RepeatedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, StandardScaler
from sklearn.utils import estimator_html_repr
from sklearn.utils.validation import check_is_fitted
from sksurv.ensemble import GradientBoostingSurvivalAnalysis
from sksurv.linear_model import CoxnetSurvivalAnalysis, CoxPHSurvivalAnalysis, IPCRidge
from sksurv.metrics import concordance_index_censored
from xgbse import XGBSEStackedWeibull
from xgbse.converters import convert_y
from xgbse.metrics import concordance_index

from xgboost_wrapper import XGBSurvival


def bind(instance, method):
    """Return a callable that invokes ``method`` with ``instance`` as its first argument.

    Every positional and keyword argument given to the returned callable is
    forwarded to ``method`` after ``instance``; the result of ``method`` is
    returned unchanged.
    """

    def binding_scope_fn(*call_args, **call_kwargs):
        # `instance` is captured from the enclosing call, emulating a bound method.
        outcome = method(instance, *call_args, **call_kwargs)
        return outcome

    return binding_scope_fn


class EarlyStoppingMonitor:
    """Callable monitor that tells a boosting fit when to stop early.

    The monitor looks at the mean of the estimator's ``oob_improvement_`` over
    the most recent ``window_size`` iterations. ``oob_improvement_`` is the
    difference in negative log partial likelihood between the previous and the
    current iteration. Fitting is stopped once that windowed mean has not
    exceeded 1e-6 for ``max_iter_without_improvement`` iterations.
    """

    def __init__(self, window_size, max_iter_without_improvement):
        self.window_size = window_size
        self.max_iter_without_improvement = max_iter_without_improvement
        # Last iteration at which the windowed improvement was positive.
        self._best_step = -1

    def __call__(self, iteration, estimator, args):
        """Return True to stop fitting, False to continue.

        ``args`` is accepted for signature compatibility and is not used.
        """
        # The first `window_size` iterations never stop the fit.
        if iteration >= self.window_size:
            recent = estimator.oob_improvement_[
                iteration - self.window_size + 1 : iteration + 1
            ]
            if np.mean(recent) > 1e-6:
                # Still improving: remember this step and keep fitting.
                self._best_step = iteration
            else:
                # Stop once the last improving step is too far behind.
                stalled_for = iteration - self._best_step
                return stalled_for >= self.max_iter_without_improvement
        return False



In [5]:
from survival_models import sksurv_model
from sklearn import model_selection

In [6]:
class sksurv_gbt_optuna(sksurv_model):
    """Gradient-boosted survival model whose hyper-parameters are tuned with Optuna.

    Wraps a ``GradientBoostingSurvivalAnalysis`` estimator in the pipeline built
    by ``create_pipeline`` (inherited from ``sksurv_model``, not shown in this
    notebook) and searches its hyper-parameters with a TPE sampler.

    NOTE(review): this notebook-local definition shadows the class of the same
    name imported from ``survival_models`` at the top of the notebook.
    """

    # Optuna only supports None/bool/int/float/str as categorical choices, so the
    # dimensionality-reduction step is sampled by label and decoded afterwards.
    _REDUCE_DIM_CHOICES = ("passthrough", "pca_0.95", "pca_0.98")
    # Label -> argument passed to PCA(...) when building the `reduce_dim` step.
    _REDUCE_DIM_PCA = {"pca_0.95": 0.95, "pca_0.98": 0.98}

    def __init__(self):
        super().__init__()

        # Seed for the Optuna sampler.
        self.RS = 124
        # Not read anywhere in this class; kept for backward compatibility.
        self.EARLY_STOPPING_ROUNDS = 50
        # Passed to TPESampler(multivariate=...).
        self.MULTIVARIATE = True

        self.sampler = TPESampler(seed=self.RS, multivariate=self.MULTIVARIATE)

        # The estimator must exist before the pipeline is created.
        self.estimator = GradientBoostingSurvivalAnalysis()

        self.pipeline = self.create_pipeline()

    @classmethod
    def _to_pipeline_params(cls, sampled):
        """Translate Optuna-named parameters into ``pipeline.set_params`` keywords.

        Parameters
        ----------
        sampled : dict
            Mapping keyed by the names used in ``trial.suggest_*`` (this is also
            the layout of ``study.best_params``). Must contain ``"reduce_dim"``.

        Returns
        -------
        dict
            ``"reduce_dim"`` mapped to ``'passthrough'`` or a ``PCA`` instance,
            every other entry prefixed with ``"estimator__"``, plus the fixed
            ``"estimator__loss": "coxph"``.

        Raises
        ------
        KeyError
            If ``"reduce_dim"`` is missing or holds an unknown label.
        """
        params = {
            "estimator__" + name: value
            for name, value in sampled.items()
            if name != "reduce_dim"
        }
        params["estimator__loss"] = "coxph"

        label = sampled["reduce_dim"]
        if label == "passthrough":
            params["reduce_dim"] = "passthrough"
        else:
            params["reduce_dim"] = PCA(cls._REDUCE_DIM_PCA[label])
        return params

    def cross_validation(self, X_train, y_train, n_iter):
        """Run the Optuna search, then refit the pipeline with the best parameters.

        Parameters
        ----------
        X_train, y_train
            Training features and survival outcome, forwarded unchanged to
            ``cross_val_score`` and to ``pipeline.fit``.
        n_iter : int
            Number of Optuna trials (stored as ``self.N_TRIALS``).

        Returns
        -------
        self
            With ``self.optimal_hp`` set to ``study.best_params`` and
            ``self.pipeline`` replaced by the result of ``pipeline.fit``.
        """
        self.N_TRIALS = n_iter

        study = create_study(direction="maximize", sampler=self.sampler)
        study.optimize(
            lambda trial: self.objective(
                trial,
                X_train,
                y_train,
                n_jobs=-1,
            ),
            n_trials=self.N_TRIALS,
            n_jobs=-1,
        )
        # `best_params` is keyed by the Optuna names ("learning_rate", ...), not
        # by pipeline keys ("estimator__learning_rate"), so it has to be
        # translated before being handed to `set_params`.
        self.optimal_hp = study.best_params
        self.pipeline.set_params(**self._to_pipeline_params(self.optimal_hp))
        self.pipeline = self.pipeline.fit(X_train, y_train)
        return self

    def objective(
        self,
        trial,
        X_train,
        y_train,
        n_jobs=-1,
    ):
        """Optuna objective: mean 3-fold cross-validated score of one candidate.

        No ``scoring`` is passed to ``cross_val_score``, so the pipeline's own
        ``score`` method is used (presumably the concordance index for
        scikit-survival estimators -- TODO confirm).

        Parameters
        ----------
        trial : optuna.trial.Trial
            Trial used to sample the hyper-parameters.
        X_train, y_train
            Data forwarded to ``cross_val_score``.
        n_jobs : int, default -1
            Parallelism of the cross-validation folds.

        Returns
        -------
        float
            Mean of the per-fold scores (the study maximizes this value).
        """
        # Local import keeps this block self-contained.
        from sklearn.base import clone

        # Suggest order is kept stable so seeded sampling stays reproducible.
        sampled = {
            "reduce_dim": trial.suggest_categorical("reduce_dim", list(self._REDUCE_DIM_CHOICES)),
            "learning_rate": trial.suggest_float("learning_rate", 1e-2, 0.4, log=False),
            "max_depth": trial.suggest_int("max_depth", 2, 6),
            "n_estimators": trial.suggest_int("n_estimators", 100, 350),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 6),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
            "subsample": trial.suggest_float("subsample", 0.4, 0.8, log=False),
            "max_leaf_nodes": trial.suggest_int("max_leaf_nodes", 2, 30),
            "dropout_rate": trial.suggest_float("dropout_rate", 0, 1, log=False),
        }

        # Trials may run concurrently (study.optimize(n_jobs=-1)); configuring a
        # clone avoids one trial overwriting another's parameters on the shared
        # pipeline. cross_val_score clones the estimator per fold anyway, so the
        # fitted candidates are the same as before.
        candidate = clone(self.pipeline)
        candidate.set_params(**self._to_pipeline_params(sampled))

        scores = model_selection.cross_val_score(
            candidate, X_train, y_train, n_jobs=n_jobs, cv=3
        )
        # A single-objective study needs one scalar, not the per-fold array.
        return float(scores.mean())


In [16]:

# Baseline: instantiate `sksurv_gbt` (imported from survival_models) and fit its
# pipeline on the training split.
# NOTE(review): the recorded output of this cell is a TypeError about an
# unexpected 'estimator__monitor' keyword raised inside survival_models.
model  =  sksurv_gbt()
model.pipeline.fit(X_train, y_train)


TypeError: candidate_model.create_pipeline.<locals>.<lambda>() got an unexpected keyword argument 'estimator__monitor'

In [7]:

# Rebind `model` to the Optuna-tuned variant. The name resolves to the class
# defined in this notebook, which shadows the one imported from survival_models.
model  =  sksurv_gbt_optuna()




In [8]:
# Debug: inspect the early-stopping monitor attached to the model. It is not set
# in this notebook's class body -- presumably by the base class; TODO confirm.
model.monitor

<survival_models.EarlyStoppingMonitor at 0x7fe441abbbb0>

In [38]:
# NOTE(review): `model.estimator` holds an estimator *instance*, not a class, so
# calling it raises the TypeError recorded below.
# TODO: confirm the intended way of attaching `model.monitor` to the estimator.
model.estimator(monitor = model.monitor)


TypeError: 'GradientBoostingSurvivalAnalysis' object is not callable

In [15]:
# Debug: check what `fit` is bound to. The recorded repr shows a lambda created
# in survival_models' `create_pipeline`, taking (X_train, y_train).
model.pipeline.fit

<function survival_models.candidate_model.create_pipeline.<locals>.<lambda>(X_train, y_train)>

In [13]:
# Fit the model's pipeline on the training split.
# NOTE(review): the recorded output is a TypeError ('estimator__monitor' is an
# unexpected keyword) coming from the lambda that survival_models installs as
# `fit`; the cause lies outside this notebook -- TODO confirm and fix there.
model.pipeline.fit(X_train, y_train)

TypeError: candidate_model.create_pipeline.<locals>.<lambda>() got an unexpected keyword argument 'estimator__monitor'

In [28]:
# One-trial smoke test of the Optuna search: the same create_study/optimize
# calls that `cross_validation` issues, run by hand against `model`.
study = create_study(sampler=model.sampler, direction="maximize")
model.N_TRIALS = 1
study.optimize(
    lambda t: model.objective(t, X_train, y_train, n_jobs=-1),
    n_jobs=-1,
    n_trials=model.N_TRIALS,
)

[32m[I 2023-01-02 14:53:40,712][0m A new study created in memory with name: no-name-3cde507b-1754-4b78-a3f0-6bbef9f2eff8[0m


KeyboardInterrupt: 

In [23]:
# Inspect the best hyper-parameters of the manual study. The recorded output is
# a ValueError because no trial had completed (the run above was interrupted).
study.best_params


ValueError: No trials are completed yet.

In [None]:
 
# Run the hyper-parameter search with a single trial (n_iter=1); for the class
# defined in this notebook, `cross_validation` refits the pipeline and returns
# the model itself, so rebinding `model` keeps the same object.
model = model.cross_validation(X_train, y_train, 1)
 


In [16]:
# NOTE(review): the recorded AttributeError names a `sksurv_gbt` object, so this
# cell ran while `model` was bound to the baseline model (cells were executed
# out of order). `risk_score` is defined outside this notebook; per the error it
# reaches for a `predict` attribute the model object does not have.
model.risk_score(X_test)


AttributeError: 'sksurv_gbt' object has no attribute 'predict'

In [17]:
# Call `predict` on the pipeline directly, bypassing `model.risk_score`
# (which failed in the previous cell).
model.pipeline.predict(X_test)

array([ 0.00254693, -0.00354231,  0.00027235, ..., -0.00116553,
       -0.00188565,  0.03875545])