In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pathlib
import sklearn 
from sksurv.functions import StepFunction
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.metrics import concordance_index_censored
from sksurv.column import encode_categorical
from sksurv.ensemble import RandomSurvivalForest
from pathlib import Path
from dotenv import load_dotenv

In [23]:
load_dotenv()
root = os.environ.get("root_folder")

In [24]:
# Load the data
pheno_df_train = pd.read_csv(root + 'train/pheno_training.csv')
readcounts_df_train = pd.read_csv(root + 'train/readcounts_training.csv')

pheno_df_test = pd.read_csv(root + 'test/pheno_test.csv')
readcounts_df_test = pd.read_csv(root + 'test/readcounts_test.csv')

NameError: name 'pd' is not defined

In [3]:
pheno_df_train.describe()

Unnamed: 0,Age,BodyMassIndex,Smoking,BPTreatment,PrevalentDiabetes,PrevalentCHD,PrevalentHFAIL,Event,Event_time,SystolicBP,NonHDLcholesterol,Sex
count,3615.0,3614.0,3599.0,3615.0,3564.0,3564.0,3564.0,3564.0,3564.0,3615.0,3608.0,3615.0
mean,49.504423,27.071102,0.232842,0.155463,0.061728,0.032828,0.026094,0.084175,13.756255,136.341473,4.086256,0.443707
std,14.848086,4.724096,0.422701,0.362396,0.240695,0.178212,0.159438,0.277689,5.686619,22.037649,1.092486,0.49689
min,24.098,15.839,0.0,0.0,0.0,0.0,0.0,0.0,-23.709,88.097,1.558,0.0
25%,36.8085,23.77425,0.0,0.0,0.0,0.0,0.0,0.0,15.74,120.0195,3.336,0.0
50%,50.909,26.402,0.0,0.0,0.0,0.0,0.0,0.0,15.82,132.962,3.99,0.0
75%,61.759,29.49975,0.0,0.0,0.0,0.0,0.0,0.0,15.881,149.0615,4.721,1.0
max,74.242,56.935,1.0,1.0,1.0,1.0,1.0,1.0,16.0,253.075,12.988,1.0


In [4]:
pheno_df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3615 entries, 0 to 3614
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         3615 non-null   object 
 1   Age                3615 non-null   float64
 2   BodyMassIndex      3614 non-null   float64
 3   Smoking            3599 non-null   float64
 4   BPTreatment        3615 non-null   int64  
 5   PrevalentDiabetes  3564 non-null   float64
 6   PrevalentCHD       3564 non-null   float64
 7   PrevalentHFAIL     3564 non-null   float64
 8   Event              3564 non-null   float64
 9   Event_time         3564 non-null   float64
 10  SystolicBP         3615 non-null   float64
 11  NonHDLcholesterol  3608 non-null   float64
 12  Sex                3615 non-null   int64  
dtypes: float64(10), int64(2), object(1)
memory usage: 367.3+ KB


Check for NaN, and null

In [5]:
pheno_df_train.isnull().sum()


Unnamed: 0            0
Age                   0
BodyMassIndex         1
Smoking              16
BPTreatment           0
PrevalentDiabetes    51
PrevalentCHD         51
PrevalentHFAIL       51
Event                51
Event_time           51
SystolicBP            0
NonHDLcholesterol     7
Sex                   0
dtype: int64

In [6]:
pheno_df_train.dropna(inplace=True)
pheno_df_test.dropna(inplace=True)

In [7]:
pheno_df_train = pheno_df_train.convert_dtypes()
pheno_df_train = pheno_df_train.astype({'Smoking':'category', 'PrevalentCHD':'category', 'BPTreatment':'category', 'PrevalentDiabetes':'category', 'PrevalentHFAIL':'category',
                                        'Event':'bool', 'Sex':'category'})


pheno_df_test = pheno_df_test.convert_dtypes()
pheno_df_test = pheno_df_test.astype({'Smoking':'category', 'PrevalentCHD':'category', 'BPTreatment':'category', 'PrevalentDiabetes':'category', 'PrevalentHFAIL':'category',
                                        'Event':'bool', 'Sex':'category'})


In [8]:
pheno_df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3540 entries, 0 to 3614
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   Unnamed: 0         3540 non-null   string  
 1   Age                3540 non-null   Float64 
 2   BodyMassIndex      3540 non-null   Float64 
 3   Smoking            3540 non-null   category
 4   BPTreatment        3540 non-null   category
 5   PrevalentDiabetes  3540 non-null   category
 6   PrevalentCHD       3540 non-null   category
 7   PrevalentHFAIL     3540 non-null   category
 8   Event              3540 non-null   bool    
 9   Event_time         3540 non-null   Float64 
 10  SystolicBP         3540 non-null   Float64 
 11  NonHDLcholesterol  3540 non-null   Float64 
 12  Sex                3540 non-null   category
dtypes: Float64(5), bool(1), category(6), string(1)
memory usage: 235.8 KB


In [9]:
pheno_df_train.head()

Unnamed: 0.1,Unnamed: 0,Age,BodyMassIndex,Smoking,BPTreatment,PrevalentDiabetes,PrevalentCHD,PrevalentHFAIL,Event,Event_time,SystolicBP,NonHDLcholesterol,Sex
0,Simulated_328,53.618,24.127,0,0,0,0,0,False,15.75,133.077,3.02,0
1,Simulated_1644,36.811,27.992,0,0,0,0,0,False,15.881,108.914,5.48,0
2,Simulated_1710,49.429,23.664,0,0,0,0,0,False,15.891,110.064,4.388,1
3,Simulated_1732,48.842,26.804,0,0,0,0,0,False,15.918,128.059,5.119,0
4,Simulated_1727,60.738,29.862,0,0,0,0,0,False,15.841,169.913,5.74,1


In [10]:
pheno_df_train.set_index('Unnamed: 0',inplace=True)
pheno_df_test.set_index('Unnamed: 0',inplace=True)

pheno_df_train = pheno_df_train.rename_axis(index=None, columns=pheno_df_train.index.name)
pheno_df_test = pheno_df_test.rename_axis(index=None, columns=pheno_df_test.index.name)

pheno_df_train.head()

Unnamed: 0,Age,BodyMassIndex,Smoking,BPTreatment,PrevalentDiabetes,PrevalentCHD,PrevalentHFAIL,Event,Event_time,SystolicBP,NonHDLcholesterol,Sex
Simulated_328,53.618,24.127,0,0,0,0,0,False,15.75,133.077,3.02,0
Simulated_1644,36.811,27.992,0,0,0,0,0,False,15.881,108.914,5.48,0
Simulated_1710,49.429,23.664,0,0,0,0,0,False,15.891,110.064,4.388,1
Simulated_1732,48.842,26.804,0,0,0,0,0,False,15.918,128.059,5.119,0
Simulated_1727,60.738,29.862,0,0,0,0,0,False,15.841,169.913,5.74,1


## Processing the microbiote data

In [11]:
readcounts_df_train = readcounts_df_train.transpose()
readcounts_df_train.columns = readcounts_df_train.iloc[0] 

In [12]:
readcounts_df_train =readcounts_df_train.drop(labels = ['Unnamed: 0'], axis = 0)
 

In [13]:
readcounts_df_train.describe()

Unnamed: 0,k__Archaea;p__;c__;o__;f__;g__;s__,k__Archaea;p__Candidatus_Korarchaeota;c__;o__;f__;g__;s__,k__Archaea;p__Crenarchaeota;c__Thermoprotei;o__;f__;g__;s__,k__Archaea;p__Crenarchaeota;c__Thermoprotei;o__Acidilobales;f__Acidilobaceae;g__Acidilobus;s__Acidilobus_saccharovorans,k__Archaea;p__Crenarchaeota;c__Thermoprotei;o__Acidilobales;f__Caldisphaeraceae;g__Caldisphaera;s__Caldisphaera_lagunensis,k__Archaea;p__Crenarchaeota;c__Thermoprotei;o__Desulfurococcales;f__Desulfurococcaceae;g__;s__,k__Archaea;p__Crenarchaeota;c__Thermoprotei;o__Desulfurococcales;f__Desulfurococcaceae;g__Aeropyrum;s__Aeropyrum_camini,k__Archaea;p__Crenarchaeota;c__Thermoprotei;o__Desulfurococcales;f__Desulfurococcaceae;g__Desulfurococcus;s__Desulfurococcus_amylolyticus,k__Archaea;p__Crenarchaeota;c__Thermoprotei;o__Desulfurococcales;f__Desulfurococcaceae;g__Desulfurococcus;s__Desulfurococcus_mucosus,k__Archaea;p__Crenarchaeota;c__Thermoprotei;o__Desulfurococcales;f__Desulfurococcaceae;g__Ignicoccus;s__Ignicoccus_hospitalis,...,k__Viruses;p__unclassified_bacterial_viruses;c__Streptococcus_phage_phiARI0460-1;o__;f__;g__;s__,k__Viruses;p__unclassified_bacterial_viruses;c__Streptococcus_phage_phiARI0462;o__;f__;g__;s__,k__Viruses;p__unclassified_bacterial_viruses;c__Streptococcus_phage_phiARI0468-1;o__;f__;g__;s__,k__Viruses;p__unclassified_bacterial_viruses;c__Streptococcus_phage_phiARI0468-2;o__;f__;g__;s__,k__Viruses;p__unclassified_bacterial_viruses;c__Streptococcus_phage_phiBHN167;o__;f__;g__;s__,k__Viruses;p__unclassified_bacterial_viruses;c__Synechococcus_phage_S-CAM3;o__;f__;g__;s__,k__Viruses;p__unclassified_viruses;c__Leptopilina_boulardi_filamentous_virus;o__;f__;g__;s__,k__Viruses;p__unclassified_viruses;c__Mollivirus_sibericum;o__;f__;g__;s__,k__Viruses;p__unclassified_viruses;c__Smacovirusgroup;o__;f__;g__;s__,k__Viruses;p__unclassified_viruses;c__Torulaspora_delbrueckii_dsRNA_Mbarr-1_killer_virus;o__;f__;g__;s__
count,3615,3615,3615,3615,3615,3615,3615,3615,3615,3615,...,3615,3615,3615,3615,3615,3615,3615,3615,3615,3615
unique,1,4,3,8,11,1,2,2,6,4,...,8,5,1,2,5,2,2,3,6,7
top,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
freq,3615,3574,3608,3443,3548,3615,3610,3611,3019,3550,...,3596,3595,3615,3611,3600,3613,3614,3585,3608,3554


In [14]:
readcounts_df_test = readcounts_df_test.transpose()
readcounts_df_test.columns = readcounts_df_test.iloc[0]
readcounts_df_test = readcounts_df_test.drop(labels = ['Unnamed: 0'], axis = 0)
 

## Start with a baseline model 
     

In [15]:
pheno_df_test = encode_categorical(pheno_df_test, columns = ['Smoking', 'BPTreatment', 'PrevalentDiabetes', 'PrevalentCHD', 'PrevalentHFAIL', 'Sex'])
pheno_df_train = encode_categorical(pheno_df_train, columns = ['Smoking', 'BPTreatment', 'PrevalentDiabetes', 'PrevalentCHD', 'PrevalentHFAIL', 'Sex'])

# Remove PrevalentHFAIL=1
pheno_df_test.pop('PrevalentHFAIL=1') 
pheno_df_train.pop('PrevalentHFAIL=1') 

Simulated_328     0.0
Simulated_1644    0.0
Simulated_1710    0.0
Simulated_1732    0.0
Simulated_1727    0.0
                 ... 
Simulated_1783    0.0
Simulated_3425    0.0
Simulated_1789    0.0
Simulated_1592    0.0
Simulated_1731    0.0
Name: PrevalentHFAIL=1, Length: 3540, dtype: float64

In [16]:
def test_base_model(covariates, base_model, df_train, df_test):
    
    X_train = df_train.loc[df_train.loc[:,'Event_time']>=0, covariates]
    X_test = df_test.loc[df_test.loc[:,'Event_time']>=0,covariates]    
    y_train = df_train.loc[df_train.loc[:,'Event_time']>=0,['Event', 'Event_time']]
    y_test = df_test.loc[df_test.loc[:,'Event_time']>=0,['Event', 'Event_time']]
    
    """
    X_train = df_train.loc[:, covariates]
    X_test = df_test.loc[:,covariates]    
    y_train = df_train.loc[:,['Event', 'Event_time']]
    y_test = df_test.loc[:,['Event', 'Event_time']]
    """
    y_train =y_train.to_records(index = False)
    y_test =y_test.to_records(index = False)
     
    base_model.fit(X_train, y_train)

    preds_train = base_model.predict(X_train)
    preds_test = base_model.predict(X_test)
    
    result_train = concordance_index_censored(y_train["Event"], y_train["Event_time"], preds_train)

    result_test = concordance_index_censored(y_test["Event"], y_test["Event_time"], preds_test)
    return (result_train[0], result_test[0])

### Cox model with only Age + Sex covariates

In [17]:
base_model = CoxPHSurvivalAnalysis(alpha=0, ties='breslow', n_iter=100, tol=1e-09, verbose=0)
  
covariates = ['Sex=1', 'Age']

test_base_model(covariates, base_model, pheno_df_train, pheno_df_test)

  X = check_array(X, **check_params)
  X = check_array(X, **check_params)


(0.7124037995431339, 0.7205041251202106)

### Cox model with all clinical covariates

In [18]:
base_model = CoxPHSurvivalAnalysis(alpha=0, ties='breslow', n_iter=100, tol=1e-09, verbose=0)

covariates = ['Age', 'BodyMassIndex', 'Smoking=1', 'BPTreatment=1',
       'PrevalentDiabetes=1', 'PrevalentCHD=1', 'SystolicBP', 'NonHDLcholesterol', 'Sex=1']

test_base_model(covariates, base_model, pheno_df_train, pheno_df_test)

  X = check_array(X, **check_params)
  X = check_array(X, **check_params)


(0.7158075974650258, 0.7080094577690367)

### Random forest survival model with all clinical covariates + microbiome data

Remove the columns with unique values

In [19]:
for col in readcounts_df_test.columns:
    if len(readcounts_df_test[col].unique()) == 1 and len(readcounts_df_train[col].unique()) == 1:
        readcounts_df_test.drop(col,inplace=True,axis=1)
        readcounts_df_train.drop(col,inplace=True,axis=1)

In [20]:
df_train = pheno_df_train.join(readcounts_df_train)
df_test = pheno_df_test.join(readcounts_df_test)
base_model = RandomSurvivalForest(n_estimators=100, max_depth=None, min_samples_split=6, min_samples_leaf=3)  

covariates = df_train.columns                   
test_base_model(covariates, base_model, df_train, df_test)

  X = check_array(X, **check_params)


(0.9948071613508839, 0.968922407248064)