In [None]:
!pip install lifelines 

import pandas as pd
import matplotlib.pyplot as plt
from lifelines import KaplanMeierFitter, CoxPHFitter
from lifelines.statistics import logrank_test



Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [None]:
# Load the dataset and take a first look at its rows and column types.
df=pd.read_csv('veteran.csv')
print(df.head())
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()

In [None]:
# Separate the rows by the Y flag: Y=1 is death, Y=0 is start or censoring.
# NOTE(review): only Y == 1 rows are kept, so every duration handed to the
# fitter below is an observed event - confirm that leaving out the Y == 0
# (censoring) rows is intended.
df_event = df.loc[df['Y'] == 1]
kmf = KaplanMeierFitter()

# Overall Kaplan-Meier curve across all selected rows.
event_times = df_event['TIME']
event_flags = df_event['Y']
kmf.fit(durations=event_times, event_observed=event_flags)
kmf.plot_survival_function()
plt.title('Kaplan-Meier Survival Curve (Overall)')
plt.show()

# One curve per treatment arm; the same fitter object is refit for each arm
# and each curve is labelled with the arm's value.
for arm in df_event['trt'].unique():
    arm_rows = df_event.loc[df_event['trt'] == arm]
    kmf.fit(arm_rows['TIME'], arm_rows['Y'], label=str(arm))
    kmf.plot_survival_function()

plt.title('KM Survival Curves by Treatment')
plt.show()

In [None]:
# Refit the Kaplan-Meier estimator on each treatment arm and report its
# median survival time.
for trt_group in df_event['trt'].unique():
    arm_rows = df_event.loc[df_event['trt'] == trt_group]
    kmf.fit(
        arm_rows['TIME'],
        arm_rows['Y'],
        label=str(trt_group),
    )
    print(f"Median survival for treatment {trt_group}: {kmf.median_survival_time_}")

In [None]:
# Log-rank test comparing the durations of the two treatment arms.
from lifelines.statistics import logrank_test

# The arms are selected by the literal labels 'standard' and 'test'.
group1=df_event[df_event['trt']=='standard']
group2=df_event[df_event['trt']=='test']

# If 'trt' is coded differently (for example numerically), the masks above
# select nothing. Stop with a clear message rather than testing empty samples.
if group1.empty or group2.empty:
    raise ValueError(
        "No rows matched trt == 'standard' and/or trt == 'test'; "
        f"values present in 'trt': {df_event['trt'].unique().tolist()}"
    )

results=logrank_test(group1['TIME'],group2['TIME'], event_observed_A=group1['Y'], event_observed_B=group2['Y'])
print(results.summary)

In [None]:
from lifelines import CoxPHFitter

#Build the modelling frame from the event rows. CoxPHFitter.fit treats every
#column other than duration_col and event_col as a covariate, so the subject
#identifier is removed first; otherwise 'ID' would be fitted as a predictor.
df_wide=df_event.drop(columns=['ID'])

#Encode categorical variables (first level of each is dropped as the reference)
df_wide=pd.get_dummies(df_wide,columns=['trt','celltype','priortherapy'],drop_first=True)

#Fit the Cox proportional hazards model, then show the summary table and the
#coefficient plot.
cph=CoxPHFitter()
cph.fit(df_wide,duration_col='TIME', event_col='Y')
cph.print_summary()
cph.plot()
plt.show()

In [None]:
# Step 1: split the rows of df on the Y flag and rename the time columns.
#   Y == 0 rows (start of observation / censoring): TIME becomes "start".
#   Y == 1 rows (death): TIME becomes "stop" and Y becomes "event".
df_start = (
    df.loc[df["Y"] == 0]
    .copy()
    .rename(columns={"TIME": "start"})
)
df_event = (
    df.loc[df["Y"] == 1]
    .copy()
    .rename(columns={"TIME": "stop", "Y": "event"})
)

# Step 2: attach each subject's stop time and event flag to its start row.
# The default inner join keeps only IDs present in both frames.
stop_columns = df_event[["ID", "stop", "event"]]
df_timevarying = df_start.merge(stop_columns, on="ID")

# Step 3: one-hot encode the categorical variables, dropping the first level
# of each as the reference category.
categorical_columns = ["trt", "celltype", "priortherapy"]
df_timevarying = pd.get_dummies(
    df_timevarying,
    columns=categorical_columns,
    drop_first=True,
)

# Final check of the interval columns.
print(df_timevarying[['ID','start','stop','event']].head())

In [None]:
import numpy as np

# Sanity checks on the modelling frame.

# Missing values anywhere in the frame?
has_nan = df_timevarying.isnull().values.any()
print("Any NaNs?", has_nan)

# Infinite values? np.isinf is only applied to the numeric columns.
numeric_part = df_timevarying.select_dtypes(include=[np.number])
has_inf = np.isinf(numeric_part).values.any()
print("Any Infs?", has_inf)

In [None]:
# Rows that contain at least one missing value.
rows_with_nan = df_timevarying.isnull().any(axis=1)
print(df_timevarying[rows_with_nan])

# Rows that contain at least one infinite value in a numeric column.
numeric_columns = df_timevarying.select_dtypes(include=[np.number])
rows_with_inf = np.isinf(numeric_columns).any(axis=1)
print(df_timevarying[rows_with_inf])

In [None]:
# Make sure the raw 'Y' flag is not part of the modelling data.
# errors='ignore' makes this a no-op when the column is already absent.
df_timevarying = df_timevarying.drop(columns=['Y'], errors='ignore')

In [None]:
# List the columns that remain in the time-varying modelling frame.
print(df_timevarying.columns)

In [None]:
from lifelines import CoxTimeVaryingFitter

# Column roles for the time-varying Cox model. 'event' is the renamed Y flag
# and 'start'/'stop' bound each subject's interval.
interval_columns = {
    'id_col': 'ID',
    'start_col': 'start',
    'stop_col': 'stop',
    'event_col': 'event',
}

# Create the model and fit it on the transformed dataframe.
ctv = CoxTimeVaryingFitter()
ctv.fit(df_timevarying, **interval_columns)

# Model summary table.
ctv.print_summary()

# Coefficient plot.
ctv.plot()
plt.title("Cox Time-Varying Coefficients")
plt.show()

In [None]:
from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt

# Kaplan-Meier curves per treatment arm, fitted on all rows of df with Y as
# the event indicator and drawn together on one set of axes.
kmf = KaplanMeierFitter()
fig, ax = plt.subplots(figsize=(10,6))

for arm in df['trt'].unique():
    arm_rows = df.loc[df['trt'] == arm]
    kmf.fit(arm_rows['TIME'], event_observed=arm_rows['Y'], label=str(arm))
    kmf.plot_survival_function(ax=ax)

ax.set_title('Survival by Treatment Group')
ax.set_xlabel('Time (days)')
ax.set_ylabel('Survival Probability')
ax.grid(True)
ax.legend(title="Treatment")
plt.show()

In [None]:
# Number of rows per distinct value of 'trt'.
print(df['trt'].value_counts())

In [None]:
from lifelines.statistics import logrank_test

#The two arms are selected by the string labels 'standard' and 'test' in 'trt'.
group1=df[df['trt']=='standard']
group2=df[df['trt']=='test']

#If 'trt' is coded differently (for example 1/2), the masks above select
#nothing. Stop with a clear message rather than testing empty samples.
if group1.empty or group2.empty:
    raise ValueError(
        "No rows matched trt == 'standard' and/or trt == 'test'; "
        f"values present in 'trt': {df['trt'].unique().tolist()}"
    )

#Perform the log-rank test using the event column 'Y'
results=logrank_test(
    group1['TIME'],group2['TIME'],
    event_observed_A=group1['Y'],
    event_observed_B=group2['Y']
)

#Report the p-value and interpret it at the 0.05 level
print(f"Log-Rank Test p-value: {results.p_value:.4f}")
if results.p_value<0.05:
    print("-> Significant difference in survival.")
else:
    print("-> No statistically significant difference.")

In [None]:
import pandas as pd
from lifelines import CoxPHFitter

#Encode categorical variables. get_dummies names each new column
#'<source column>_<level>' and drop_first removes the reference level.
df_encoded=pd.get_dummies(df,columns=['celltype','trt','priortherapy'],drop_first=True)

#Collect the dummy columns of each encoded variable. The prefixes must be the
#source column names used above ('celltype', 'trt', 'priortherapy'); a prefix
#that matches no column would silently leave that variable out of the model.
celltype_dummies=[col for col in df_encoded.columns if col.startswith('celltype_')]
treatment_dummies=[col for col in df_encoded.columns if col.startswith('trt_')]
prior_dummies=[col for col in df_encoded.columns if col.startswith('priortherapy_')]

#Define final feature set: duration, event flag, numeric covariates and dummies
features=['TIME','Y','age','karno','diagtime'] +celltype_dummies +treatment_dummies+prior_dummies

#Prepare dataset restricted to the selected columns
cox_df=df_encoded[features]

#Fit Cox Proportional Hazard Model and print the coefficient table
cph=CoxPHFitter()
cph.fit(cox_df,duration_col='TIME',event_col='Y')
cph.print_summary()