# How to use this code
## !Use Google Colab:
https://colab.research.google.com/drive/1vR5auTG_f2rbNG_gUIrRnWhLo_-6aoAV
1. Run chunk 1
2. Define the correct file path for the training data in Chunk 2 + run chunk 2
3. Run chunk 3

### To perform nested resampling:
Adapt params in chunk 4 and run it
This stores the best model as a pth file which can be ignored
### To train model given parameters and save model as .pkl:
Adapt the parameters in chunk 5 as explained and then run it. The .pkl file is required for getting predictions on test data.
### To get model predictions on test data:
In chunk 6 define the path to the .pkl model file, as well as to expression and pData test data sets. Then run the chunk.


In [None]:
!pip install lifelines
!pip install scikit-learn==1.5.2
!pip install scikit-survival==0.23.1
# Required imports
import numpy as np
import copy
import pandas as pd
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
import torch
from lifelines.utils import concordance_index
from sklearn.utils.validation import check_X_y, check_is_fitted
import logging
from sklearn.model_selection import train_test_split, LeaveOneGroupOut, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.utils import check_random_state
from sksurv.util import Surv
import os
import pickle
import matplotlib.pyplot as plt
from collections import defaultdict
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

Collecting lifelines
  Downloading lifelines-0.30.0-py3-none-any.whl.metadata (3.2 kB)
Collecting autograd-gamma>=0.3 (from lifelines)
  Downloading autograd-gamma-0.5.0.tar.gz (4.0 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting formulaic>=0.2.2 (from lifelines)
  Downloading formulaic-1.1.1-py3-none-any.whl.metadata (6.9 kB)
Collecting interface-meta>=1.2.0 (from formulaic>=0.2.2->lifelines)
  Downloading interface_meta-1.3.0-py3-none-any.whl.metadata (6.7 kB)
Downloading lifelines-0.30.0-py3-none-any.whl (349 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m349.3/349.3 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading formulaic-1.1.1-py3-none-any.whl (115 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.7/115.7 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading interface_meta-1.3.0-py3-none-any.whl (14 kB)
Building wheels for collected packages: autograd-gamma
  Building wheel for autograd-gamma (se

In [None]:
### Chunk 2
# Paths to the data used for model training.
# One expression data file and one pData (clinical data) file are needed.
# Two expression inputs are provided (intersect genes and common genes);
# exactly one of them must be active, the other one stays commented out.
# /content is the standard upload folder in Google Colab.
#EXPRESSION_DATA_PATH = '/content/exprs_intersect.csv'
EXPRESSION_DATA_PATH = '/content/common_genes_knn_imputed.csv'
# Clinical data; read by ModellingProcess.prepare_data() together with the expression file.
CLINICAL_DATA_PATH = '/content/merged_imputed_pData.csv'

In [None]:
### Chunk 3
# Basic logging setup: INFO level, messages prefixed with timestamp and level.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# Module-level logger used by the helper functions and classes defined below.
logger = logging.getLogger(__name__)


class DeepSurvNet(nn.Module):
    """
    PyTorch based neural network architecture designed for survival prediction.

    The network is a stack of fully connected layers, each followed by a ReLU
    activation and dropout, and a final bias-free linear layer that outputs a
    single hazard (risk) value per sample.

    Parameters
    ----------
    n_features : int
        Number of input features.
    hidden_layers : sequence of int, default=(32, 16)
        Width of each hidden layer, in order. An empty sequence yields a
        purely linear model.
    dropout : float, default=0.2
        Dropout probability applied after every hidden layer.
    """

    # The default is an immutable tuple so it cannot be mutated and shared
    # between instances by accident.
    def __init__(self, n_features, hidden_layers=(32, 16), dropout=0.2):
        super().__init__()
        layers = []
        prev_size = n_features

        # Build hidden layers with ReLU activation and dropout
        for size in hidden_layers:
            layers += [
                nn.Linear(prev_size, size),
                nn.ReLU(),
                nn.Dropout(dropout),
            ]
            prev_size = size

        # Final layer for hazard prediction (no bias term)
        layers.append(nn.Linear(prev_size, 1, bias=False))

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the risk score for each row of ``x`` (one output column)."""
        return self.model(x)


class DeepSurvModel(BaseEstimator, RegressorMixin):
    """
    Implementation of the DeepSurv model that integrates
    with scikit-learn, specifying configurable architecture,
    training procedures, and evaluation metrics.

    The model includes:
    - Customizable neural network architecture
    - Mini-batch training with early stopping
    - CPU/GPU support
    - Concordance index evaluation
    - Compatibility with scikit-learn's cross-validation and pipeline features
    - Reproducible training through seed control

    The model follows scikit-learn's estimator interface by implementing
    fit(), predict(), get_params() and set_params() methods.

    Parameters
    ----------
    n_features : int or None
        Stored as given; the network input size is taken from ``X`` in fit().
    hidden_layers : sequence of int
        Hidden layer widths passed to DeepSurvNet.
    dropout : float
        Dropout probability passed to DeepSurvNet.
    learning_rate : float
        Learning rate of the Adam optimizer.
    device : str
        'cuda' is honoured only when CUDA is available; anything else
        resolves to 'cpu'.
    random_state : int
        Seed for NumPy, PyTorch, the train/validation split and shuffling.
    batch_size : int
        Mini-batch size of the training loader.
    num_epochs : int
        Maximum number of training epochs.
    patience : int
        Training stops once the validation loss has not improved for more
        than ``patience`` consecutive epochs.
    """

    # Main model class for survival prediction
    def __init__(self, n_features=None, hidden_layers=(16, 16), dropout=0.5,
                 learning_rate=0.01, device='cpu', random_state=123,
                 batch_size=128, num_epochs=100, patience=12):
        self.n_features = n_features
        self.hidden_layers = hidden_layers
        self.dropout = dropout
        self.learning_rate = learning_rate
        self.device = self._resolve_device(device)
        self.random_state = random_state
        self.batch_size = batch_size
        self.num_epochs = num_epochs
        self.patience = patience

        torch.manual_seed(random_state)
        np.random.seed(random_state)

        self.scaler = StandardScaler()
        self.model = None
        self.is_fitted_ = False
        self.training_history_ = {'train_loss': [], 'val_loss': []}
        self.n_features_in_ = None

    @staticmethod
    def _resolve_device(device):
        """Return ``device`` if it is 'cuda' and CUDA is available, else 'cpu'."""
        return device if torch.cuda.is_available() and device == 'cuda' else 'cpu'

    @staticmethod
    def _event_field(y):
        """Name of the event indicator field of structured array ``y``."""
        return 'status' if 'status' in y.dtype.names else 'event'

    def _seed_everything(self):
        """Seed NumPy and PyTorch (CPU and, if present, CUDA) with random_state."""
        np.random.seed(self.random_state)
        torch.manual_seed(self.random_state)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(self.random_state)
            torch.cuda.manual_seed_all(self.random_state)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    def _check_fitted(self):
        """Raise NotFittedError unless fit() has completed.

        ``is_fitted_`` already exists (as False) after __init__, so
        check_is_fitted alone would accept an unfitted model.
        """
        check_is_fitted(self, 'is_fitted_')
        if not self.is_fitted_ or self.model is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "This DeepSurvModel instance is not fitted yet. "
                "Call 'fit' before using this estimator."
            )

    def fit(self, X, y):
        """Train the network on ``X`` and structured survival array ``y``.

        ``y`` must provide a 'time' field and an event field named 'status'
        or 'event'. 10% of the samples are held out for validation-based
        early stopping; the weights with the lowest validation loss are
        restored at the end.

        Returns
        -------
        self
        """
        X, y = check_X_y(X, y, accept_sparse=True)

        # Seed BEFORE building the network so that the weight initialisation
        # is reproducible and does not depend on the global RNG state.
        self._seed_everything()
        self.n_features_in_ = X.shape[1]
        self.init_network(self.n_features_in_)
        # Seed again so stochastic training ops (e.g. dropout) start from a
        # state that does not depend on how many draws the init consumed.
        self._seed_everything()

        # Start a fresh history so repeated fit() calls do not accumulate.
        self.training_history_ = {'train_loss': [], 'val_loss': []}

        # Create DataLoaders
        train_dataset, val_dataset = self._prepare_data(X, y, val_split=0.1)

        generator = torch.Generator()
        generator.manual_seed(self.random_state)

        train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True, generator=generator)
        val_loader = DataLoader(val_dataset, batch_size=32, shuffle=True, generator=generator)

        best_val_loss = float('inf')
        best_model_state = None
        epochs_without_improvement = 0
        for epoch in range(self.num_epochs):
            # Training phase
            self.model.train()
            epoch_loss = 0.0
            n_batches = 0
            for X_batch, time_batch, event_batch in train_loader:
                epoch_loss += self._train_step(X_batch, time_batch, event_batch)
                n_batches += 1
            self.training_history_['train_loss'].append(epoch_loss / n_batches)

            # Validation phase
            self.model.eval()
            val_loss = 0.0
            with torch.no_grad():
                for X_batch, time_batch, event_batch in val_loader:
                    val_loss += self._eval_step(X_batch, time_batch, event_batch)

            val_loss = val_loss / len(val_loader)
            self.training_history_['val_loss'].append(val_loss)

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_model_state = copy.deepcopy(self.model.state_dict())
                epochs_without_improvement = 0
            else:
                epochs_without_improvement += 1

            if epochs_without_improvement > self.patience:
                print(f"Early stopping at epoch {epoch+1}")
                break

        # Restore the weights of the best validation epoch
        if best_model_state is not None:
            self.model.load_state_dict(best_model_state)

        self.is_fitted_ = True
        return self

    def predict(self, X):
        """Return a 1-D numpy array of risk scores (higher = higher risk)."""
        self._check_fitted()
        # Works for DataFrames, arrays and nested lists alike.
        features = np.asarray(X, dtype=np.float32)
        X_tensor = torch.as_tensor(features).to(self.device)
        self.model.eval()
        with torch.no_grad():
            risk_scores = self.model(X_tensor).cpu().numpy()
        return risk_scores.flatten()

    def score(self, X, y):
        """Concordance index of the predictions on ``X`` against ``y``."""
        self._check_fitted()
        preds = self.predict(X)
        # concordance_index expects scores that grow with survival time,
        # hence the negated risk.
        return self.c_index(-preds, y)

    def get_params(self, deep=True):
        """Return the constructor parameters as a dict (``deep`` is unused)."""
        return {
            "n_features": self.n_features,
            "hidden_layers": self.hidden_layers,
            "dropout": self.dropout,
            "learning_rate": self.learning_rate,
            "device": self.device,
            "random_state": self.random_state,
            "batch_size": self.batch_size,
            "num_epochs": self.num_epochs,
            "patience": self.patience
        }

    def set_params(self, **parameters):
        """Set parameters by name and return self.

        ``device`` goes through the same CUDA-availability fallback as in
        __init__, so requesting 'cuda' on a CPU-only machine yields 'cpu'
        instead of failing later in fit().
        """
        for parameter, value in parameters.items():
            if parameter == 'device':
                value = self._resolve_device(value)
            setattr(self, parameter, value)
        return self

    def clone(self):
        """Return a new, unfitted estimator with a copy of the parameters."""
        return type(self)(**copy.deepcopy(self.get_params()))

    def _prepare_data(self, X, y, val_split=0.1):
        """Split into train/validation parts and wrap both as TensorDatasets.

        Returns
        -------
        (TensorDataset, TensorDataset)
            Train and validation datasets of (features, time, event) tensors
            located on ``self.device``.
        """
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=val_split, random_state=self.random_state)
        return self._to_dataset(X_train, y_train), self._to_dataset(X_val, y_val)

    def _to_dataset(self, X, y):
        """Convert features and structured survival array to a TensorDataset."""
        times = np.ascontiguousarray(y['time']).astype(np.float32)
        events = np.ascontiguousarray(y[self._event_field(y)]).astype(np.float32)
        return TensorDataset(
            torch.FloatTensor(X).to(self.device),
            torch.FloatTensor(times).to(self.device),
            torch.FloatTensor(events).to(self.device),
        )

    def _negative_log_likelihood(self, risk_pred, times, events):
        """Negative Cox partial log-likelihood, averaged over the batch.

        The samples are sorted by descending time so that the cumulative sum
        of exp(risk) at position i covers the risk set of sample i.
        """
        # The network emits shape (N, 1). Flatten to (N,): multiplying the
        # (N,) event vector with an (N, 1) tensor would broadcast to (N, N)
        # and no longer weight each sample by its own event indicator.
        risk_pred = risk_pred.reshape(-1)
        _, idx = torch.sort(times, descending=True)
        log_risk = risk_pred[idx]
        events = events[idx]
        cumsum_risk = torch.cumsum(torch.exp(log_risk), dim=0)
        log_cumsum_risk = torch.log(cumsum_risk + 1e-10)  # epsilon guards log(0)
        event_loss = events * (log_risk - log_cumsum_risk)
        return -torch.mean(event_loss)

    def _train_step(self, X, times, events):
        """Run one optimizer step on a batch and return the loss as float."""
        self.optimizer.zero_grad()
        risk_pred = self.model(X)
        loss = self._negative_log_likelihood(risk_pred, times, events)
        loss.backward()
        self.optimizer.step()
        return loss.item()

    def _eval_step(self, X, times, events):
        """Return the loss of a batch as float without updating weights."""
        risk_pred = self.model(X)
        loss = self._negative_log_likelihood(risk_pred, times, events)
        return loss.item()

    def _check_early_stopping(self, counter):
        """Update ``counter`` by comparing the last two validation losses.

        Returns 0.0 when fewer than two losses were recorded or the last one
        improved on its predecessor; otherwise ``counter + 1.0``.
        Not called by fit(), which tracks the best loss itself.
        """
        if len(self.training_history_['val_loss']) < 2:
            return 0.0

        if self.training_history_['val_loss'][-1] < self.training_history_['val_loss'][-2]:
            counter = 0.0
        else:
            counter += 1.0
        return counter

    def c_index(self, risk_pred, y):
        """Concordance index of ``risk_pred`` against survival data ``y``.

        Non-numpy inputs are detached and moved to numpy first. Returns NaN
        when every prediction is NaN.
        """
        if not isinstance(y, np.ndarray):
            y = y.detach().cpu().numpy()
        time = y['time']
        event = y[self._event_field(y)]
        if not isinstance(risk_pred, np.ndarray):
            risk_pred = risk_pred.detach().cpu().numpy()
        if np.isnan(risk_pred).all():
            return np.nan
        return concordance_index(time, risk_pred, event)

    def init_network(self, n_features):
        """Create the network on ``self.device`` and its Adam optimizer."""
        self.model = DeepSurvNet(n_features=n_features, hidden_layers=self.hidden_layers, dropout=self.dropout).to(self.device)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)


# Helper functions for survival data handling
def _get_survival_subset(y, indices):
    """Extract survival data subset"""
    subset = np.empty(len(indices), dtype=y.dtype)
    event_field = 'status' if 'status' in y.dtype.names else 'event'
    subset[event_field] = y[event_field][indices]
    subset['time'] = y['time'][indices]
    return subset

def _aggregate_results(results):
    """Aggregates nested CV results."""
    scores = [res['test_score'] for res in results]
    if np.isnan(scores).all():
        logger.warning(f"Found only NaN values in CV-results: {scores}")
        mean_score, std_score = np.nan, np.nan
    else:
        mean_score = np.nanmean(scores)
        std_score = np.nanstd(scores)

    logger.info(f"Aggregated results:")
    logger.info(f"Mean score: {mean_score:.3f} ± {std_score:.3f}")
    logger.info(f"Individual scores: {scores}")

    return {
        'mean_score': mean_score,
        'std_score': std_score,
        'fold_results': results
    }

def nested_resampling(estimator, X, y, groups, param_grid, monitor = None, ss = GridSearchCV,
                     outer_cv = LeaveOneGroupOut(), inner_cv = LeaveOneGroupOut(), scoring = None):
    """Nested resampling for hyperparameter optimization.

    For every outer split, a hyperparameter search (``ss``) is run on the
    outer training part using ``inner_cv``; the refitted best estimator is
    then scored on the outer test part.

    Parameters
    ----------
    estimator : estimator or Pipeline
        Model to tune.
    X : pandas.DataFrame
        Feature matrix (rows are selected with ``.iloc``).
    y : structured numpy array
        Survival data with a 'time' field and a 'status' or 'event' field.
    groups : array-like or None
        Group label per sample, forwarded to both CV splitters.
    param_grid : dict
        Search space handed to ``ss``.
    monitor : object, optional
        If given, forwarded to fit as ``model__monitor``.
    ss : class
        Search class, called as ``ss(estimator, param_grid, cv=..., ...)``.
    outer_cv, inner_cv : CV splitter
        Splitters for the outer and inner loop.
    scoring : optional
        If given, forwarded to ``ss`` as ``scoring`` for the inner model
        selection. The outer test score always uses the estimator's own
        ``score`` method.

    Returns
    -------
    dict
        Output of ``_aggregate_results`` for all outer folds.
    """
    logger.info("Starting nested resampling...")
    logger.info(f"Data shape: X={X.shape}, groups={len(np.unique(groups))} unique")

    # Only pass `scoring` on when requested, so search classes that do not
    # accept that keyword keep working.
    search_kwargs = {} if scoring is None else {'scoring': scoring}

    outer_results = []
    splits = list(outer_cv.split(X, y, groups))

    for i, (train_idx, test_idx) in enumerate(splits):
        logger.info(f"\nOuter fold {i+1}")

        # Distinct but deterministic seed per outer fold
        fold_seed = 42 + i
        np.random.seed(fold_seed)
        torch.manual_seed(fold_seed)

        X_train = X.iloc[train_idx]
        X_test = X.iloc[test_idx]
        y_train = _get_survival_subset(y, train_idx)
        y_test = _get_survival_subset(y, test_idx)
        train_groups = groups[train_idx] if groups is not None else None

        test_cohort = groups[test_idx][0] if groups is not None else None
        logger.info(f"Test cohort: {test_cohort}")

        inner_gcv = ss(estimator, param_grid, cv = inner_cv, refit = True, n_jobs=1, verbose = 2,
                       **search_kwargs)
        fit_params = {} if monitor is None else {'model__monitor': monitor}
        inner_results = inner_gcv.fit(X_train, y_train, groups = train_groups, **fit_params)

        inner_cv_results = inner_results.cv_results_
        inner_best_params = inner_results.best_params_

        # Score through the complete refitted estimator: scoring only the
        # final 'model' step would skip any preprocessing steps of a
        # pipeline and feed untransformed test data to the model.
        test_score = inner_results.best_estimator_.score(X_test, y_test)

        logger.info(f"Best parameters: {inner_best_params}")
        logger.info(f"Test score: {test_score:.3f}")

        outer_results.append({
            'test_cohort': test_cohort,
            'test_score': test_score,
            'best_params': inner_best_params,
            'inner_cv_results': inner_cv_results
        })

    return _aggregate_results(outer_results)

class ModellingProcess():
    """
    This class manages the entire modeling process including data preparation,
    nested cross-validation, model training, and result saving. It is a
    standardized way of modeling used for several of the implemented model types
    and supports both simple training and complex nested resampling approaches.
    Results can be automatically saved and evaluated.

    Attributes set during use
    -------------------------
    X, y, groups : features, structured survival array and group labels
        (filled by prepare_data()).
    nrs : aggregated nested resampling results.
    cmplt_model, cmplt_pipeline : model / pipeline fitted on the full data.
    resampling_cmplt : the fitted search object of fit_cmplt_model().
    """

    # Main class for model training pipeline
    def __init__(self) -> None:
        self.outer_cv = LeaveOneGroupOut()
        self.inner_cv = LeaveOneGroupOut()
        self.ss = GridSearchCV
        self.pipe = None
        self.cmplt_model = None
        self.cmplt_pipeline = None
        self.resampling_cmplt = None
        self.nrs = None
        self.X = None
        self.y = None
        self.groups = None
        self.path = None
        self.fname_cv = None

    def prepare_survival_data(self, pdata):
        """Build a structured survival array ('status', 'time') from pdata.

        Reads the columns 'BCR_STATUS' (cast to bool) and 'MONTH_TO_BCR'
        (cast to float).
        """
        status = pdata['BCR_STATUS'].astype(bool).values
        time = pdata['MONTH_TO_BCR'].astype(float).values
        y = Surv.from_arrays(
            event=status,
            time=time,
            name_event='status',
            name_time='time'
        )
        return y

    def prepare_data(self, expression_path=None, clinical_path=None):
        """Load expression and clinical data and fill X, y and groups.

        Parameters
        ----------
        expression_path : str, optional
            CSV with expression data (first column is the sample index).
            Defaults to the global EXPRESSION_DATA_PATH.
        clinical_path : str, optional
            CSV with clinical data (first column is the sample index).
            Defaults to the global CLINICAL_DATA_PATH.

        Raises
        ------
        ValueError
            If the combined feature table does not have exactly the samples
            (and order) of the clinical table, since y is built from the
            clinical table row by row.
        """
        # Resolve the globals lazily so they may be (re)defined in another cell.
        if expression_path is None:
            expression_path = EXPRESSION_DATA_PATH
        if clinical_path is None:
            clinical_path = CLINICAL_DATA_PATH

        exprs = pd.read_csv(expression_path, index_col=0)
        pdata = pd.read_csv(clinical_path, index_col=0)

        clinical_features = ['AGE', 'TISSUE', 'GLEASON_SCORE', 'PRE_OPERATIVE_PSA']
        categorical_features = ['TISSUE']

        # One-hot encode TISSUE (first category dropped); the remaining
        # clinical columns are passed through unchanged.
        clinical_preprocessor = ColumnTransformer(
            transformers=[
                ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_features)
            ],
            remainder='passthrough'
        )

        clinical_data = clinical_preprocessor.fit_transform(pdata[clinical_features])
        clinical_df = pd.DataFrame(
            clinical_data,
            index=pdata.index
        )

        exprs.columns = exprs.columns.astype(str)
        clinical_df.columns = [f'clinical_{i}' for i in range(clinical_df.shape[1])]

        features = pd.concat([clinical_df, exprs], axis=1)
        # y is derived from pdata row by row, so the rows of X must be the
        # very same samples in the same order; fail early otherwise.
        if not features.index.equals(pdata.index):
            raise ValueError(
                "Sample IDs of expression data and clinical data do not match; "
                "cannot align features with survival targets"
            )
        self.X = features

        target_cols = ['MONTH_TO_BCR', 'BCR_STATUS']
        target_df = pdata[target_cols]
        self.y = self.prepare_survival_data(target_df)

        # The group (cohort) is the part of the sample ID before the first '.'
        self.groups = np.array([idx.split('.')[0] for idx in self.X.index])

    def do_modelling(self, pipeline_steps, config):
        """Run the modeling workflow described by ``config``.

        Parameters
        ----------
        pipeline_steps : list of (name, estimator)
            Steps for the sklearn Pipeline; one step must be named 'model'.
        config : dict
            Optional keys: 'params_mp' (attributes to set on this object),
            'path' and 'fname_cv' (where to save CV results), 'params_cv'
            (hyperparameter grid), 'monitor', 'do_nested_resampling',
            'refit'.

        Returns
        -------
        tuple
            (nested resampling results, complete model, complete pipeline).
        """
        self._set_seed()

        if config.get("params_mp", None) is not None:
            self.set_params(config['params_mp'])

        if config.get("path", None) is None or config.get("fname_cv", None) is None:
            logger.warning("Didn't get sufficient path info for saving cv-results")
        else:
            self.path = config['path']
            self.fname_cv = config['fname_cv']

        err, mes = self._check_modelling_prerequs(pipeline_steps)
        if err:
            logger.error("Requirements setup error: %s", mes)
            raise Exception(mes)
        else:
            self.pipe = Pipeline(pipeline_steps)

        param_grid, monitor, do_nested_resampling, refit_hp_tuning = self._get_config_vals(config)

        try:
            logger.info("Start model training...")
            logger.info(f"Input data shape: X={self.X.shape}")

            if do_nested_resampling:
                logger.info("Nested resampling...")
                self.nrs = nested_resampling(self.pipe, self.X, self.y, self.groups, param_grid, monitor, self.ss, self.outer_cv, self.inner_cv)
                if (self.fname_cv is not None) and (self.path is not None):
                    self.save_results(self.path, self.fname_cv, model = None, cv_results = self.nrs, pipe = None)
        except Exception as e:
            logger.error(f"Error during nested resampling: {str(e)}")
            raise

        if refit_hp_tuning:
            try:
                logger.info("Do HP Tuning for complete model; refit + set complete model")
                # fit_cmplt_model() stores the model and pipeline on self.
                # Its (model, search) return tuple must not be assigned to
                # cmplt_model, otherwise the model would be replaced by a tuple.
                self.fit_cmplt_model(param_grid, monitor)
            except Exception as e:
                logger.error(f"Error during complete model training: {str(e)}")
                raise
        elif refit_hp_tuning is False and do_nested_resampling is False:
            logger.info("Fit complete model wo. HP tuning (on default params)")
            # cmplt_model keeps holding the fitted pipeline (as before);
            # cmplt_pipeline is set as well so it is not returned as None.
            self.cmplt_model = self.pipe.fit(self.X, self.y)
            self.cmplt_pipeline = self.cmplt_model

        return self.nrs, self.cmplt_model, self.cmplt_pipeline

    def fit_cmplt_model(self, param_grid, monitor=None):
        """Tune hyperparameters on the complete data and refit the best model.

        Stores the fitted search in ``resampling_cmplt``, the best pipeline
        in ``cmplt_pipeline`` and its 'model' step in ``cmplt_model``.

        Returns
        -------
        tuple
            (best model step, fitted search object).
        """
        logger.info("Do HP Tuning for complete model")
        res = self.ss(
            estimator=self.pipe,
            param_grid=param_grid,
            cv=self.outer_cv,
            n_jobs=1,
            verbose=2,
            refit=True
        )
        fit_params = {} if monitor is None else {'model__monitor': monitor}
        res.fit(self.X, self.y, groups=self.groups, **fit_params)
        self.resampling_cmplt = res
        self.cmplt_pipeline = res.best_estimator_
        self.cmplt_model = res.best_estimator_.named_steps['model']
        return self.cmplt_model, res

    def save_results(self, path, fname, model=None, cv_results=None, pipe=None):
        """Save the model (.pth) and/or the CV results (.csv) below ``path``.

        The model is written to ``<path>/model/<fname>.pth`` after being
        moved to the CPU; CV results go to ``<path>/results/<fname>_cv.csv``.
        Anything passed as None is skipped with a warning. ``pipe`` is
        accepted but not used.
        """
        if model is None:
            logger.warning("Won't save any model, since it's not provided")
        else:
            # Save the model
            model_dir = os.path.join(path, 'model')
            os.makedirs(model_dir, exist_ok=True)
            model.model.to(torch.device('cpu'))
            torch.save(model.model, os.path.join(model_dir, f"{fname}.pth"))
            logger.info(f"Saved model to {model_dir}")

        if cv_results is None:
            logger.warning("Won't save any CV results, since it's not provided")
        else:
            # Save cross-validation results
            results_dir = os.path.join(path, 'results')
            os.makedirs(results_dir, exist_ok=True)
            results_file = os.path.join(results_dir, f"{fname}_cv.csv")
            pd.DataFrame(cv_results).to_csv(results_file)
            logger.info(f"Saved CV results to {results_file}")

    def _check_modelling_prerequs(self, pipeline_steps):
        """Check that data is loaded and a step named 'model' exists.

        Returns
        -------
        (bool, str)
            Error flag and the concatenated error message(s).
        """
        err = False
        mes = ""
        if self.X is None or self.y is None:
            mes = mes + "1) Please call prepare_data() with your preferred config or set X, y, and groups"
            err = True
        # Compare the step NAME (first tuple element), not any tuple member.
        if not any(step[0] == 'model' for step in pipeline_steps):
            mes = mes + "2) Caution! Your pipeline must include a step named 'model' for the model"
            err = True
        return err, mes

    def _get_config_vals(self, config):
        """Extract (param_grid, monitor, do_nested_resampling, refit).

        Without 'params_cv' no tuning is possible, so both flags are False
        and monitor is None. Otherwise the flags default to True.
        """
        if config.get("params_cv", None) is None:
            logger.warning("No param grid for (nested) resampling detected - will fit model with default HPs on complete data")
            # monitor is None (not False): None is the "no monitor" value
            # that the fitting code checks for.
            return None, None, False, False
        if config.get('monitor', None) is None:
            logger.info("No additional monitoring detected")
        return config['params_cv'], config.get('monitor', None), config.get('do_nested_resampling', True), config.get('refit', True)

    def set_params(self, params):
        """Set every key of ``params`` as an attribute of this object."""
        for key, value in params.items():
            setattr(self, key, value)

    def _set_seed(self, seed = 1234):
        """Seed NumPy and PyTorch and publish a global ``random_state``."""
        np.random.seed(seed)
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(seed)
            torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        global random_state
        random_state = check_random_state(seed)





In [None]:
### Chunk 4
# Define the hyperparameter grid for the nested resampling by adapting params_cv.
# Keys are prefixed with 'model__' because the estimator is the pipeline step named 'model'.
# refit and do_nested_resampling should both be True.
# fname_cv is the name under which the results are stored in a csv file -> adapt.

MODEL_CONFIG = {
    'params_cv'  : {
        # Number and size of the hidden layers (one list per candidate architecture)
        'model__hidden_layers': [[512, 256, 128, 64],[512, 256, 128],[512, 128],[512, 256], [256, 128], [1024], [512], [256], [128]],
        # Learning rate of the optimizer
        'model__learning_rate': [0.00001, 0.0001],
        # Batch size for training
        'model__batch_size': [64],
        # Maximum number of training epochs (early stopping may end training sooner)
        'model__num_epochs': [500],
        # Dropout rate for regularization
        'model__dropout': [0.2, 0.4],
        # Device to train on
        'model__device': ['cuda']
    },
    'refit': True,
    'do_nested_resampling': True,
    # Base directory for saved results; with "" the 'results' folder is relative to the working directory
    'path' : "",
    'fname_cv' : 'result_intersection'
}


# Load the training data (paths are defined in chunk 2)
mp = ModellingProcess()
mp.prepare_data()

# The pipeline consists of a single step, which must be named 'model'
ds_pipeline_steps = [
    ('model', DeepSurvModel())
]

# Run nested resampling and, since refit is True, tune and refit on the complete data
nstd_res_result, cmplt_model, cmplt_pipeline = mp.do_modelling(ds_pipeline_steps, MODEL_CONFIG)

print(nstd_res_result)

Fitting 8 folds for each of 1 candidates, totalling 8 fits
Early stopping at epoch 15
[CV] END model__batch_size=64, model__device=cuda, model__dropout=0.2, model__hidden_layers=[128], model__learning_rate=0.0001, model__num_epochs=500; total time=   6.1s
Early stopping at epoch 18
[CV] END model__batch_size=64, model__device=cuda, model__dropout=0.2, model__hidden_layers=[128], model__learning_rate=0.0001, model__num_epochs=500; total time=   1.3s
Early stopping at epoch 14
[CV] END model__batch_size=64, model__device=cuda, model__dropout=0.2, model__hidden_layers=[128], model__learning_rate=0.0001, model__num_epochs=500; total time=   1.1s
Early stopping at epoch 16
[CV] END model__batch_size=64, model__device=cuda, model__dropout=0.2, model__hidden_layers=[128], model__learning_rate=0.0001, model__num_epochs=500; total time=   1.3s
Early stopping at epoch 16
[CV] END model__batch_size=64, model__device=cuda, model__dropout=0.2, model__hidden_layers=[128], model__learning_rate=0.0001



Early stopping at epoch 16
Fitting 9 folds for each of 1 candidates, totalling 9 fits
Early stopping at epoch 22
[CV] END model__batch_size=64, model__device=cuda, model__dropout=0.2, model__hidden_layers=[128], model__learning_rate=0.0001, model__num_epochs=500; total time=   1.3s
Early stopping at epoch 17
[CV] END model__batch_size=64, model__device=cuda, model__dropout=0.2, model__hidden_layers=[128], model__learning_rate=0.0001, model__num_epochs=500; total time=   1.0s
Early stopping at epoch 15
[CV] END model__batch_size=64, model__device=cuda, model__dropout=0.2, model__hidden_layers=[128], model__learning_rate=0.0001, model__num_epochs=500; total time=   1.3s
Early stopping at epoch 19
[CV] END model__batch_size=64, model__device=cuda, model__dropout=0.2, model__hidden_layers=[128], model__learning_rate=0.0001, model__num_epochs=500; total time=   1.5s
Early stopping at epoch 17
[CV] END model__batch_size=64, model__device=cuda, model__dropout=0.2, model__hidden_layers=[128], 

In [None]:
### Chunk 5
# Train a model with fixed parameters on the full training data and save it as .pkl.
# Example parameters for the model -> adapt as needed
model_params = {
    'hidden_layers': [256, 128],
    'learning_rate': 0.00001,
    'batch_size': 64,
    'num_epochs': 500,
    'dropout': 0.2,
    'device': 'cuda',
    'random_state': 123
}

# Initialize modeling process and prepare data
mp = ModellingProcess()
mp.prepare_data()

# Create a new model with the specified parameters
model_to_save = DeepSurvModel(**model_params)



# Train the model on the full dataset (features are passed as a numpy array)
model_to_save.fit(mp.X.values, mp.y)

# Generate predictions on the training data (not used further in this chunk)
preds_train = model_to_save.predict(mp.X.values)

# Define directory to save the model
save_dir = "/content/my_saved_model"
os.makedirs(save_dir, exist_ok=True)

# Define the file name for the saved model
# NOTE: chunk 6 loads the model from a different path/file name -> adapt one of them.
model_file = os.path.join(save_dir, "deep_surv_model_common_genes_pData.pkl")

# Save the trained model (the whole DeepSurvModel object) to a file
with open(model_file, 'wb') as f:
    pickle.dump(model_to_save, f)




Early stopping at epoch 36


In [None]:
### Chunk 6
# 1. Load the trained model
# NOTE: pickle.load executes code stored in the file -> only load .pkl files you created yourself.
# NOTE: the DeepSurvModel / DeepSurvNet classes (chunk 3) must be defined before unpickling.
model_file = "/content/deep_surv_model_common_genes_and_pData[256, 128].pkl"
with open(model_file, 'rb') as f:
    loaded_model = pickle.load(f)
print("Model successfully loaded.")

# 2. Load test data
# Load expression data
exprs_test = pd.read_csv('/content/example_exprs.csv', index_col=0)
# Load clinical data
test_pdata = pd.read_csv('/content/example_pData.csv', index_col=0)

# Align indices between expression and clinical data
# NOTE(review): this overwrites the clinical index without checking it; it assumes both
# files list the same samples in the same row order - TODO confirm for your data.
test_pdata.index = exprs_test.index

# 3. Prepare clinical features
# Numeric features
clinical_numeric = test_pdata[['AGE', 'GLEASON_SCORE', 'PRE_OPERATIVE_PSA']]

# Manually create categorical features (leave one variable out)
# NOTE(review): the tissue type is hard-coded to 'Fresh_frozen' for every test sample
# instead of being derived from test_pdata - adapt if the test samples differ.
clinical_categorical = pd.DataFrame(index=test_pdata.index)
clinical_categorical['TISSUE_Fresh_frozen'] = 1
clinical_categorical['TISSUE_Snap_frozen'] = 0

# Combine all features (clinical and expression)
# NOTE(review): the model receives plain values, so the column order must equal the one used
# for training. prepare_data() presumably yields the one-hot TISSUE columns BEFORE the numeric
# clinical columns, whereas here the numeric columns come first - verify against the training
# setup of the loaded model. The gene columns must be in the training order as well.
X_test = pd.concat([clinical_numeric, clinical_categorical, exprs_test], axis=1)



# 4. Create a survival object for evaluation
test_status = test_pdata['BCR_STATUS'].astype(bool).values
test_time = test_pdata['MONTH_TO_BCR'].astype(float).values
y_test = Surv.from_arrays(
    event=test_status,
    time=test_time,
    name_event='status',
    name_time='time'
)

# 5. Use the test DataFrame for predictions
# X_test is already a DataFrame, no conversion to numpy array needed
test_predictions = loaded_model.predict(X_test)  # predict() accepts a DataFrame directly


# 6. Calculate the C-index on test data
# The risk scores are negated, as DeepSurvModel.score() does before calling c_index()
test_cindex = loaded_model.c_index(-test_predictions, y_test)


# 7. Optional: Save predictions with sample IDs
results_df = pd.DataFrame({
    'sample_id': X_test.index,
    'risk_score': test_predictions
})
results_df.to_csv('/content/test_predictions_combined.csv')
print(test_cindex)



Model successfully loaded.
0.8722129189653267
