In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from lifelines import CoxPHFitter


## Data Preprocessing    

In [26]:
# get data from s3
url = "https://s3.amazonaws.com/survival2024/hurricane.csv"
df = pd.read_csv(url)

In [27]:
#check for motor failure
df['motor']=0
df.loc[df['reason'] == 2, 'motor'] = 1

#grab hour columns but first rename hour column
df = df.rename(columns={'hour': 'failure_hour'})
hour_columns = [col for col in df.columns if col.startswith('h')]

#fill na's in the hour_columns with 0's, I'm assuming that since they are missing they are not turned on
df[hour_columns] = df[hour_columns].fillna(0)

#add pump numbers to df
df = df.reset_index(names='pump')

#melt df to reshape it
df_long = df.melt(
    id_vars=[col for col in df.columns if col not in hour_columns],  # Static columns to keep
    value_vars=hour_columns,  # Columns to unpivot
    var_name='run_hour_stop',  # New column name for hour labels
    value_name='running'  # Values from the hour columns
)

#grab numeric hour
df_long['run_hour_stop'] = df_long['run_hour_stop'].str.extract(r'(\d+)').astype(float).fillna(-1).astype(int)

#create a continous runtime column. how many hours has runnning=0 in a row
df_long['continuous_runtime'] = (
    df_long.groupby('pump')['running']
    .cumsum() * df_long['running']
)

df_long = df_long.sort_values(['pump', 'run_hour_stop']).reset_index(drop=True)
df_long.head()


Unnamed: 0,pump,backup,age,bridgecrane,servo,gear,trashrack,slope,elevation,survive,failure_hour,reason,reason2,motor,run_hour_stop,running,continuous_runtime
0,0,0,6.0,0,0,0,1,3,2,1,48,0,5,0,1,1.0,1.0
1,0,0,6.0,0,0,0,1,3,2,1,48,0,5,0,2,1.0,2.0
2,0,0,6.0,0,0,0,1,3,2,1,48,0,5,0,3,1.0,3.0
3,0,0,6.0,0,0,0,1,3,2,1,48,0,5,0,4,1.0,4.0
4,0,0,6.0,0,0,0,1,3,2,1,48,0,5,0,5,1.0,5.0


Now I'm going to grab only the columns in which I need to predict

In [28]:
df_motor = df_long[['pump', 'backup', 'age', 'bridgecrane', 'servo', 'gear', 'trashrack', 'slope', 'elevation', 'failure_hour', 'motor', 'run_hour_stop', 'running', 'continuous_runtime']]
df_motor = df_motor.copy()
df_motor['12+'] = (df_motor['continuous_runtime'] >= 12).astype(int)
df_motor['run_hour_start'] = df_motor['run_hour_stop']-1
df_motor.head()

Unnamed: 0,pump,backup,age,bridgecrane,servo,gear,trashrack,slope,elevation,failure_hour,motor,run_hour_stop,running,continuous_runtime,12+,run_hour_start
0,0,0,6.0,0,0,0,1,3,2,48,0,1,1.0,1.0,0,0
1,0,0,6.0,0,0,0,1,3,2,48,0,2,1.0,2.0,0,1
2,0,0,6.0,0,0,0,1,3,2,48,0,3,1.0,3.0,0,2
3,0,0,6.0,0,0,0,1,3,2,48,0,4,1.0,4.0,0,3
4,0,0,6.0,0,0,0,1,3,2,48,0,5,1.0,5.0,0,4


In [31]:
print(df_motor.columns)


Index(['pump', 'backup', 'age', 'bridgecrane', 'servo', 'gear', 'trashrack',
       'slope', 'elevation', 'failure_hour', 'motor', 'run_hour_stop',
       'running', 'continuous_runtime', '12+', 'run_hour_start'],
      dtype='object')


## Cox regression

In [36]:
cph = CoxPHFitter()
# cph.fit(df_motor.loc[:, ~df_motor.columns.isin(['run_hour', 'running', 'continuous_runtime'])], duration_col=['run_hour_start','run_hour_stop'], event_col='motor')
# cph.fit(df_motor,duration_col=['run_hour_start', 'run_hour_stop'], event_col='motor')
cph.fit(df_motor.loc[:, ~df_motor.columns.isin(['run_hour', 'running', 'continuous_runtime', 'run_hour_start', 'pump'])],duration_col='run_hour_stop', event_col='motor')
cph.print_summary()



>>> events = df['motor'].astype(bool)
>>> print(df.loc[events, 'trashrack'].var())
>>> print(df.loc[~events, 'trashrack'].var())

A very low variance means that the column trashrack completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.




0,1
model,lifelines.CoxPHFitter
duration col,'run_hour_stop'
event col,'motor'
baseline estimation,breslow
number of observations,36960
number of events observed,5376
partial log-likelihood,-45093.91
time fit was run,2024-11-20 03:31:26 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
backup,0.02,1.02,0.03,-0.04,0.07,0.96,1.07,0.0,0.55,0.58,0.79
age,-0.81,0.45,0.02,-0.84,-0.77,0.43,0.46,0.0,-42.78,<0.005,inf
bridgecrane,-0.11,0.89,0.04,-0.2,-0.03,0.82,0.97,0.0,-2.73,0.01,7.32
servo,0.32,1.37,0.03,0.26,0.38,1.3,1.46,0.0,10.47,<0.005,82.85
gear,0.27,1.32,0.05,0.18,0.37,1.19,1.45,0.0,5.58,<0.005,25.3
trashrack,-19.0,0.0,196.09,-403.33,365.33,0.0,4.56e+158,0.0,-0.1,0.92,0.12
slope,-0.31,0.73,0.01,-0.33,-0.29,0.72,0.75,0.0,-33.49,<0.005,814.53
elevation,0.09,1.1,0.02,0.06,0.13,1.06,1.14,0.0,5.03,<0.005,20.95
failure_hour,0.01,1.01,0.0,0.01,0.02,1.01,1.02,0.0,10.57,<0.005,84.39
12+,-0.32,0.73,0.03,-0.39,-0.25,0.68,0.78,0.0,-9.5,<0.005,68.68

0,1
Concordance,0.89
Partial AIC,90207.81
log-likelihood ratio test,12711.81 on 10 df
-log2(p) of ll-ratio test,inf
