In [1]:
# import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import time
import sys
import json
import pickle
import json
from ast import literal_eval

from tqdm import tqdm
import itertools

from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, precision_recall_curve, roc_curve, auc
from sklearn.metrics import make_scorer, roc_auc_score

from torchsurv.loss import cox
from lifelines.utils import concordance_index

sys.path.append('./../src/')
from utils import *
from utils_XGBMLP import *

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, TensorDataset


# In this notebook we obtain the outputs of the top 5 concepts for a run with seed 999

These outputs are used to draw the Kaplan-Meier (KM) plots in Figure <>


### XGBoost derived concepts

In [2]:
# Dataset: rebuild the scaled feature matrix, then load the trained XGBMLP model
# and pick its five highest-|weight| concepts.
torch.manual_seed(0)

data_df = pd.read_csv('./../Data/1000_features_survival_3classes.csv', index_col=0).drop(['index', 'y'], axis=1)
data_df_event_time = data_df[['event', 'time']]

# One-hot encode the covariates and min-max scale every resulting column.
data_df = pd.get_dummies(data_df.drop(['event', 'time'], axis=1), dtype='int')
scaler = MinMaxScaler()
# Wrapping the scaled array in a new DataFrame gives it a fresh 0..n-1 index,
# while `data_df_event_time` still carries the index read from the CSV.
data_df = pd.DataFrame(scaler.fit_transform(data_df), columns=data_df.columns)
# Attach both survival labels positionally. Assigning the 'time' Series directly
# would align on index labels, so any mismatch with the CSV index would yield
# NaN times that the mean-imputation below would then silently mask.
data_df['event'] = [int(flag) for flag in data_df_event_time['event']]
data_df['time'] = data_df_event_time['time'].to_numpy()

# Mean-impute whatever missing values remain.
data_df = data_df.fillna(data_df.mean())


# Bookkeeping containers / settings; not read anywhere in this cell.
train_ci_ls = []
valid_ci_ls = []
test_ci_ls = []
epoch_ls = []
elapsed_time_ls = []
nconcepts_ls = []

seeds = [999]
test_size = 0.3
batch_size = 64


for seed in seeds:
    print("*******************")
    print(seed)

    # The checkpoint is a fully pickled module (we call .eval() on it), so it
    # needs weights_only=False; stating it explicitly keeps the load working on
    # PyTorch versions whose default is weights_only=True.
    # Only load checkpoints from a trusted source: this unpickles arbitrary objects.
    model = torch.load(f'./../models/XGBMLP/ntopfeatures/XGBMLP_top100_seed{seed}.pt', weights_only=False)
    model.eval()

    # Model inputs plus the survival labels as plain arrays.
    X = torch.tensor(data_df.drop(['event', 'time'], axis=1).to_numpy(), dtype=torch.float32)
    e = data_df['event'].to_numpy()
    t = data_df['time'].to_numpy()

    # NOTE(security): pickle.load executes arbitrary code; the file must be trusted.
    # The `with` block closes the file on exit, so no explicit close() is needed.
    with open(f'./../models/XGBMLP/ntopfeatures/concept_weights_top100_seed{seed}.pkl', 'rb') as f:
        dict_to_save = pickle.load(f)

    # Rank concepts by absolute weight, largest first.
    concepts_weights_df = pd.DataFrame(dict_to_save)
    concepts_weights_df['abs_weights'] = [np.abs(w) for w in concepts_weights_df['weights']]
    concepts_weights_df['concepts'] = [f'concept{i}' for i in range(len(concepts_weights_df))]
    concepts_weights_df = concepts_weights_df.sort_values('abs_weights', ascending=False)

    # The frame is already sorted, so the first five rows are the top-5 concepts.
    feature_groups = concepts_weights_df['feature_groups_idx'].iloc[:5].to_list()
            

*******************
999


  model = torch.load('./../models/XGBMLP/ntopfeatures/XGBMLP_top'+str(100)+'_seed'+str(seed)+'.pt')


In [4]:
# Run the loaded model's concept layers on the full feature matrix for the
# selected top-5 feature groups. This is inference only, so autograd tracking
# is switched off.
# NOTE(review): `get_concept_layers_output` comes from the wildcard utils import;
# it appears to return the outputs together with a list of concept indices
# (used as column labels in the next cell) -- confirm against its definition.
with torch.no_grad():
    concept_tensor, concept_fidx_ls = get_concept_layers_output(model, X, feature_groups)
# Keep the tensor under its own name instead of rebinding `concept_output`
# from tensor to list.
concept_output = concept_tensor.tolist()
concept_fidx_ls

[1, 3, 2, 4, 0]

In [5]:
# Assemble one row per sample: the concept outputs (columns labelled c<idx>)
# followed by the event indicator and time, then persist the table as CSV.
ConSurv_XGB_df = pd.DataFrame(
    concept_output,
    columns=[f'c{fidx}' for fidx in concept_fidx_ls],
).assign(event=e, time=t)
ConSurv_XGB_df.to_csv('./../results/Top5_Concept_op_consurvxgb_top100.csv')

### Rule-derived concepts

In [6]:
# Dataset: rebuild the scaled feature matrix, then load the trained RuleMLP model
# and pick its five highest-|weight| concepts.
torch.manual_seed(0)

data_df = pd.read_csv('./../Data/1000_features_survival_3classes.csv', index_col=0).drop(['index', 'y'], axis=1)
data_df_event_time = data_df[['event', 'time']]

# One-hot encode the covariates and min-max scale every resulting column.
data_df = pd.get_dummies(data_df.drop(['event', 'time'], axis=1), dtype='int')
scaler = MinMaxScaler()
# Wrapping the scaled array in a new DataFrame gives it a fresh 0..n-1 index,
# while `data_df_event_time` still carries the index read from the CSV.
data_df = pd.DataFrame(scaler.fit_transform(data_df), columns=data_df.columns)
# Attach both survival labels positionally. Assigning the 'time' Series directly
# would align on index labels, so any mismatch with the CSV index would yield
# NaN times that the mean-imputation below would then silently mask.
data_df['event'] = [int(flag) for flag in data_df_event_time['event']]
data_df['time'] = data_df_event_time['time'].to_numpy()

# Mean-impute whatever missing values remain.
data_df = data_df.fillna(data_df.mean())


# Bookkeeping containers / settings; not read anywhere in this cell.
train_ci_ls = []
valid_ci_ls = []
test_ci_ls = []
epoch_ls = []
elapsed_time_ls = []
nconcepts_ls = []

seeds = [999]
test_size = 0.3
batch_size = 64

for seed in seeds:
    print("*******************")
    print(seed)

    # The checkpoint is a fully pickled module (we call .eval() on it), so it
    # needs weights_only=False; stating it explicitly keeps the load working on
    # PyTorch versions whose default is weights_only=True.
    # Only load checkpoints from a trusted source: this unpickles arbitrary objects.
    model = torch.load(f'./../models/RuleMLP/ntopfeatures/RuleMLP_catrulekit_3class_1hot_top25seed{seed}.pt', weights_only=False)
    model.eval()

    # Model inputs plus the survival labels as plain arrays.
    X = torch.tensor(data_df.drop(['event', 'time'], axis=1).to_numpy(), dtype=torch.float32)
    e = data_df['event'].to_numpy()
    t = data_df['time'].to_numpy()

    # NOTE(security): pickle.load executes arbitrary code; the file must be trusted.
    # The `with` block closes the file on exit, so no explicit close() is needed.
    with open(f'./../models/RuleMLP/ntopfeatures/catrulekit_3class_1hot_concept_weights_top25seed{seed}.pkl', 'rb') as f:
        dict_to_save = pickle.load(f)

    # Rank concepts by absolute weight, largest first.
    concepts_weights_df = pd.DataFrame(dict_to_save)
    concepts_weights_df['abs_weights'] = [np.abs(w) for w in concepts_weights_df['weights']]
    concepts_weights_df['concepts'] = [f'concept{i}' for i in range(len(concepts_weights_df))]
    concepts_weights_df = concepts_weights_df.sort_values('abs_weights', ascending=False)

    # The frame is already sorted, so the first five rows are the top-5 concepts.
    feature_groups = concepts_weights_df['feature_groups_idx'].iloc[:5].to_list()
            

*******************
999


  model = torch.load('./../models/RuleMLP//ntopfeatures/RuleMLP_catrulekit_3class_1hot_top25seed999.pt')


In [8]:
# Run the loaded model's concept layers on the full feature matrix for the
# selected top-5 feature groups. This is inference only, so autograd tracking
# is switched off.
# NOTE(review): `get_concept_layers_output` comes from the wildcard utils import;
# it appears to return the outputs together with a list of concept indices
# (used as column labels in the next cell) -- confirm against its definition.
with torch.no_grad():
    concept_tensor, concept_fidx_ls = get_concept_layers_output(model, X, feature_groups)
# Keep the tensor under its own name instead of rebinding `concept_output`
# from tensor to list.
concept_output = concept_tensor.tolist()
concept_fidx_ls

[0, 3, 1, 2, 4]

In [9]:
# Assemble one row per sample: the concept outputs (columns labelled c<idx>)
# followed by the event indicator and time, then persist the table as CSV.
ConSurv_Rule_df = pd.DataFrame(
    concept_output,
    columns=[f'c{fidx}' for fidx in concept_fidx_ls],
).assign(event=e, time=t)
ConSurv_Rule_df.to_csv('./../results/Top5_Concept_op_consurvrule_top25.csv')