In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from lifelines import CoxPHFitter


## Data Preprocessing    

In [56]:
# get data from s3
url = "https://s3.amazonaws.com/survival2024/hurricane.csv"
df = pd.read_csv(url)

In [57]:
#check for motor failure
df['motor']=0
df.loc[df['reason'] == 2, 'motor'] = 1

#grab hour columns but first rename hour column
df = df.rename(columns={'hour': 'failure_hour'})
hour_columns = [col for col in df.columns if col.startswith('h')]

#fill na's in the hour_columns with 0's, I'm assuming that since they are missing they are not turned on
df[hour_columns] = df[hour_columns].fillna(0)

#add pump numbers to df
df = df.reset_index(names='pump')

#melt df to reshape it
df_long = df.melt(
    id_vars=[col for col in df.columns if col not in hour_columns],  # Static columns to keep
    value_vars=hour_columns,  # Columns to unpivot
    var_name='run_hour',  # New column name for hour labels
    value_name='running'  # Values from the hour columns
)

#grab numeric hour
df_long['run_hour'] = df_long['run_hour'].str.extract(r'(\d+)').astype(float).fillna(-1).astype(int)

#create a continous runtime column. how many hours has runnning=0 in a row
df_long['continuous_runtime'] = (
    df_long.groupby('pump')['running']
    .cumsum() * df_long['running']
)

df_long = df_long.sort_values(['pump', 'run_hour']).reset_index(drop=True)
df_long.head()


Unnamed: 0,pump,backup,age,bridgecrane,servo,gear,trashrack,slope,elevation,survive,failure_hour,reason,reason2,motor,run_hour,running,continuous_runtime
0,0,0,6.0,0,0,0,1,3,2,1,48,0,5,0,1,1.0,1.0
1,0,0,6.0,0,0,0,1,3,2,1,48,0,5,0,2,1.0,2.0
2,0,0,6.0,0,0,0,1,3,2,1,48,0,5,0,3,1.0,3.0
3,0,0,6.0,0,0,0,1,3,2,1,48,0,5,0,4,1.0,4.0
4,0,0,6.0,0,0,0,1,3,2,1,48,0,5,0,5,1.0,5.0


Now I'm going to grab only the columns in which I need to predict

In [58]:
df_motor = df_long[['pump', 'backup', 'age', 'bridgecrane', 'servo', 'gear', 'trashrack', 'slope', 'elevation', 'failure_hour', 'motor', 'run_hour', 'running', 'continuous_runtime']]
df_motor.head()

Unnamed: 0,pump,backup,age,bridgecrane,servo,gear,trashrack,slope,elevation,failure_hour,motor,run_hour,running,continuous_runtime
0,0,0,6.0,0,0,0,1,3,2,48,0,1,1.0,1.0
1,0,0,6.0,0,0,0,1,3,2,48,0,2,1.0,2.0
2,0,0,6.0,0,0,0,1,3,2,48,0,3,1.0,3.0
3,0,0,6.0,0,0,0,1,3,2,48,0,4,1.0,4.0
4,0,0,6.0,0,0,0,1,3,2,48,0,5,1.0,5.0


## Cox regression

In [59]:
cph = CoxPHFitter()
cph.fit(df_motor, duration_col='failure_hour', event_col='motor')
cph.print_summary()


>>> events = df['motor'].astype(bool)
>>> print(df.loc[events, 'trashrack'].var())
>>> print(df.loc[~events, 'trashrack'].var())

A very low variance means that the column trashrack completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.




0,1
model,lifelines.CoxPHFitter
duration col,'failure_hour'
event col,'motor'
baseline estimation,breslow
number of observations,36960
number of events observed,5376
partial log-likelihood,-43091.32
time fit was run,2024-11-18 00:49:15 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
pump,0.01,1.01,0.0,0.01,0.01,1.01,1.01,0.0,60.62,<0.005,inf
backup,-0.09,0.91,0.03,-0.15,-0.04,0.86,0.96,0.0,-3.22,<0.005,9.59
age,-1.06,0.35,0.02,-1.1,-1.03,0.33,0.36,0.0,-57.15,<0.005,inf
bridgecrane,-0.45,0.64,0.04,-0.53,-0.37,0.59,0.69,0.0,-10.63,<0.005,85.22
servo,0.23,1.25,0.03,0.16,0.29,1.18,1.34,0.0,6.95,<0.005,37.99
gear,-0.12,0.88,0.05,-0.22,-0.03,0.8,0.97,0.0,-2.49,0.01,6.3
trashrack,-18.67,0.0,256.17,-520.75,483.41,0.0,8.75e+209,0.0,-0.07,0.94,0.09
slope,-0.25,0.78,0.01,-0.27,-0.23,0.77,0.8,0.0,-23.29,<0.005,396.11
elevation,0.12,1.12,0.02,0.08,0.15,1.09,1.16,0.0,6.55,<0.005,34.0
run_hour,0.0,1.0,0.0,-0.0,0.0,1.0,1.0,0.0,1.55,0.12,3.04

0,1
Concordance,0.94
Partial AIC,86206.64
log-likelihood ratio test,20736.73 on 12 df
-log2(p) of ll-ratio test,inf
