In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn_pandas import DataFrameMapper
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
from lifelines import CoxPHFitter

# Read data

In [2]:
df_train = pd.read_csv('train.csv')
df_train = pd.DataFrame(df_train)

df_test = pd.read_csv('test.csv')
df_test = pd.DataFrame(df_test)

In [3]:
df_train.head()

Unnamed: 0,Sex,Age,Size,Site,Grade,Lymph.nodes.examined,Lymph.nodes.positive,T,N,M,Stage,duration,event
0,Male,41,65,Ileocecal Junction,II,29,1,T3,N1a,M0,IIIB,110,0
1,Male,54,95,Ascending Colon,II,37,0,T4a,N0,M0,IIB,107,0
2,Female,78,65,Sigmoid Colon,II,20,2,T4b,N1b,M0,IIIC,106,0
3,Female,33,150,"Large Intestine, NOS",II,9,0,T4a,N0,M0,IIB,105,0
4,Male,58,100,Descending Colon,III,6,1,T4a,N1a,M0,IIIB,102,0


In [4]:
# to label encoding categorical variables
le = LabelEncoder() #label encoder
cat_list = ['Sex','Site', 'Grade', 'T', 'N', 'M', 'Stage'] #list of categorical variables
for cat in cat_list:
    df_train[cat] = le.fit_transform(df_train[cat])
    df_test[cat] = le.fit_transform(df_test[cat])

df_train.head()

Unnamed: 0,Sex,Age,Size,Site,Grade,Lymph.nodes.examined,Lymph.nodes.positive,T,N,M,Stage,duration,event
0,1,41,65,3,1,29,1,2,2,0,7,110,0
1,1,54,95,0,1,37,0,4,0,0,3,107,0
2,0,78,65,7,1,20,2,5,3,0,8,106,0
3,0,33,150,4,1,9,0,4,0,0,3,105,0
4,1,58,100,1,2,6,1,4,2,0,7,102,0


In [5]:
cph = CoxPHFitter()
cph.fit(df_train, duration_col='duration', event_col='event')
cph.concordance_index_

0.7343125551461047

In [25]:
cph.score?

In [6]:
# External
cph.score(df_test, scoring_method="concordance_index")

0.6642561983471075

# Bootstrap

In [8]:
# Internal
in_tem = []
for i in range(1000):
    in_samps = df_train.sample(n=100, replace=True,ignore_index= True)
    c_tem = cph.score(in_samps, scoring_method="concordance_index")
    in_tem.append(c_tem)

in_mean = np.mean(in_tem)
in_se = np.std(in_tem) / np.power(1000,0.5)
print("Bootstrap Internal validation:")
print("The C-index:{}".format(in_mean))
print("The C-index Low 95% CI:{}".format(in_mean - 1.96 * in_se))
print("The C-index High 95% CI:{}".format(in_mean + 1.96 * in_se))

Bootstrap Internal validation:
The C-index:0.7354206697393617
The C-index Low 95% CI:0.7332246406105529
The C-index High 95% CI:0.7376166988681705


In [9]:
# External
ex_tem = []
for i in range(1000):
    ex_samps = df_test.sample(n=100, replace=True, ignore_index= True)
    c_tem = cph.score(ex_samps, scoring_method="concordance_index")
    ex_tem.append(c_tem)

ex_mean = np.mean(ex_tem)
ex_se = np.std(ex_tem) / np.power(1000,0.5)
print("Bootstrap External validation:")
print("The C-index:{}".format(ex_mean))
print("The C-index Low 95% CI:{}".format(ex_mean - 1.96 * ex_se))
print("The C-index High 95% CI:{}".format(ex_mean + 1.96 * ex_se))

Bootstrap External validation:
The C-index:0.6638263053661655
The C-index Low 95% CI:0.6607537102791238
The C-index High 95% CI:0.6668989004532072


# LASSO var

In [13]:
df_train_lasso = df_train[["Age","Size","Site","Grade","Lymph.nodes.examined","Lymph.nodes.positive","T","N","M","Stage","duration","event"]]
df_test_lasso = df_test[["Age","Size","Site","Grade","Lymph.nodes.examined","Lymph.nodes.positive","T","N","M","Stage","duration","event"]]
df_train_lasso.head()

Unnamed: 0,Age,Size,Site,Grade,Lymph.nodes.examined,Lymph.nodes.positive,T,N,M,Stage,duration,event
0,41,65,3,1,29,1,2,2,0,7,110,0
1,54,95,0,1,37,0,4,0,0,3,107,0
2,78,65,7,1,20,2,5,3,0,8,106,0
3,33,150,4,1,9,0,4,0,0,3,105,0
4,58,100,1,2,6,1,4,2,0,7,102,0


In [14]:
cph = CoxPHFitter()
cph.fit(df_train_lasso, duration_col='duration', event_col='event')
cph.concordance_index_

0.7346758706596772

In [15]:
# External
cph.score(df_test_lasso, scoring_method="concordance_index")

0.6637396694214877

In [16]:
# Internal
in_tem = []
for i in range(1000):
    in_samps = df_train_lasso.sample(n=100, replace=True,ignore_index= True)
    c_tem = cph.score(in_samps, scoring_method="concordance_index")
    in_tem.append(c_tem)

in_mean = np.mean(in_tem)
in_se = np.std(in_tem) / np.power(1000,0.5)
print("Bootstrap Internal validation:")
print("The C-index:{}".format(in_mean))
print("The C-index Low 95% CI:{}".format(in_mean - 1.96 * in_se))
print("The C-index High 95% CI:{}".format(in_mean + 1.96 * in_se))

Bootstrap Internal validation:
The C-index:0.7342913862896576
The C-index Low 95% CI:0.7320741596535361
The C-index High 95% CI:0.7365086129257792


In [17]:
# External
ex_tem = []
for i in range(1000):
    ex_samps = df_test_lasso.sample(n=100, replace=True, ignore_index= True)
    c_tem = cph.score(ex_samps, scoring_method="concordance_index")
    ex_tem.append(c_tem)

ex_mean = np.mean(ex_tem)
ex_se = np.std(ex_tem) / np.power(1000,0.5)
print("Bootstrap External validation:")
print("The C-index:{}".format(ex_mean))
print("The C-index Low 95% CI:{}".format(ex_mean - 1.96 * ex_se))
print("The C-index High 95% CI:{}".format(ex_mean + 1.96 * ex_se))

Bootstrap External validation:
The C-index:0.6631962761496667
The C-index Low 95% CI:0.6601561137151516
The C-index High 95% CI:0.6662364385841818


# TNM

In [18]:
df_train_tnm = df_train[["T","N","M","Stage","duration","event"]]
df_test_tnm = df_test[["T","N","M","Stage","duration","event"]]
df_train_tnm.head()

Unnamed: 0,T,N,M,Stage,duration,event
0,2,2,0,7,110,0
1,4,0,0,3,107,0
2,5,3,0,8,106,0
3,4,0,0,3,105,0
4,4,2,0,7,102,0


In [19]:
cph = CoxPHFitter()
cph.fit(df_train_tnm, duration_col='duration', event_col='event')
cph.concordance_index_

0.6687081538381688

In [20]:
# External
cph.score(df_test_tnm, scoring_method="concordance_index")

0.6707128099173554

In [21]:
# Internal
in_tem = []
for i in range(1000):
    in_samps = df_train_tnm.sample(n=100, replace=True,ignore_index= True)
    c_tem = cph.score(in_samps, scoring_method="concordance_index")
    in_tem.append(c_tem)

in_mean = np.mean(in_tem)
in_se = np.std(in_tem) / np.power(1000,0.5)
print("Bootstrap Internal validation:")
print("The C-index:{}".format(in_mean))
print("The C-index Low 95% CI:{}".format(in_mean - 1.96 * in_se))
print("The C-index High 95% CI:{}".format(in_mean + 1.96 * in_se))

Bootstrap Internal validation:
The C-index:0.6716735909288538
The C-index Low 95% CI:0.669177989208674
The C-index High 95% CI:0.6741691926490336


In [22]:
# External
ex_tem = []
for i in range(1000):
    ex_samps = df_test_tnm.sample(n=100, replace=True, ignore_index= True)
    c_tem = cph.score(ex_samps, scoring_method="concordance_index")
    ex_tem.append(c_tem)

ex_mean = np.mean(ex_tem)
ex_se = np.std(ex_tem) / np.power(1000,0.5)
print("Bootstrap External validation:")
print("The C-index:{}".format(ex_mean))
print("The C-index Low 95% CI:{}".format(ex_mean - 1.96 * ex_se))
print("The C-index High 95% CI:{}".format(ex_mean + 1.96 * ex_se))

Bootstrap External validation:
The C-index:0.6709610672738849
The C-index Low 95% CI:0.6681332908101328
The C-index High 95% CI:0.673788843737637
