In [2]:
# Importing Packages
import numpy as np
import pandas as pd
import matplotlib
from sklearn.feature_selection import VarianceThreshold
from lifelines import CoxPHFitter
from lifelines.datasets import load_rossi
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
import math

In [3]:
# Read the tab-separated training set; row 0 of the file supplies the column names.
train_data = pd.read_csv(
    'data/car_breakdown_train.tsv',
    sep='\t',
    header=0,
)
train_data.head()

Unnamed: 0,vehicleId,days,ecoMode,cityMode,sportMode,s1,s2,s3,s4,s5,...,s12,s13,s14,s15,s16,s17,s18,s19,s20,s21
0,1,1,-0.0007,-0.0004,100,518.67,641.82,1589.7,1400.6,14.62,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100,39.06,23.419
1,1,2,0.0019,-0.0003,100,518.67,642.15,1591.82,1403.14,14.62,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100,39.0,23.4236
2,1,3,-0.0043,0.0003,100,518.67,642.35,1587.99,1404.2,14.62,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100,38.95,23.3442
3,1,4,0.0007,0.0,100,518.67,642.35,1582.79,1401.87,14.62,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100,38.88,23.3739
4,1,5,-0.0019,-0.0002,100,518.67,642.37,1582.85,1406.22,14.62,...,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100,38.9,23.4044


In [4]:
# Placeholder event column: every daily row starts out as 0.
train_data['breakdown'] = 0

# Bucket days into weeks with ceil(days / 7); the original author's stated aim
# for the weekly roll-up is to increase the proportion of failure rows.
train_data['week'] = np.ceil(train_data['days'].div(7))

# One row per (vehicleId, week): every other column is replaced by its mean.
weekly_groups = train_data.groupby(['vehicleId', 'week'], as_index=False)
train_data_agg = weekly_groups.mean()
print(train_data_agg.shape)
train_data_agg.head()

(2984, 28)


Unnamed: 0,vehicleId,week,days,ecoMode,cityMode,sportMode,s1,s2,s3,s4,...,s13,s14,s15,s16,s17,s18,s19,s20,s21,breakdown
0,1,1.0,4.0,-0.001086,-8.6e-05,100.0,518.67,642.231429,1587.42,1401.738571,...,2388.042857,8133.734286,8.4107,0.03,391.714286,2388.0,100.0,38.981429,23.387057,0.0
1,1,2.0,11.0,-0.0005,0.000114,100.0,518.67,642.307143,1586.497143,1399.572857,...,2388.035714,8131.928571,8.406914,0.03,392.142857,2388.0,100.0,39.011429,23.384229,0.0
2,1,3.0,18.0,-0.000829,2.9e-05,100.0,518.67,642.422857,1586.077143,1400.915714,...,2388.055714,8132.361429,8.418357,0.03,391.714286,2388.0,100.0,38.94,23.373443,0.0
3,1,4.0,25.0,0.000186,-1.4e-05,100.0,518.67,642.43,1589.777143,1398.651429,...,2388.057143,8131.692857,8.410157,0.03,392.285714,2388.0,100.0,38.952857,23.397143,0.0
4,1,5.0,32.0,-0.000214,-2.9e-05,100.0,518.67,642.307143,1587.682857,1400.034286,...,2388.052857,8132.442857,8.414829,0.03,391.714286,2388.0,100.0,38.958571,23.383157,0.0


In [5]:
# Replace the placeholder event column with a per-vehicle ramp.
#
# Within each vehicle the weekly rows are ordered by week and the k-th row
# counted from the end receives breakdown = 1/k: the last recorded week gets
# 1.0, the one before it 0.5, ... and the first week 1/n, where n is the number
# of weekly rows for that vehicle.
#
# Built with a single sort + groupby instead of growing a frame inside a loop:
# DataFrame.append no longer exists in pandas 2.x, and sorting *before*
# numbering guarantees the ramp follows week order even if the input is not
# already sorted.
uni_veh_id = train_data_agg.vehicleId.unique()

# Rows end up grouped by ascending vehicleId, and by ascending week within
# each vehicle, with a fresh 0..N-1 index.
train_data_new = (
    train_data_agg
    .sort_values(['vehicleId', 'week'])
    .reset_index(drop=True)
)

# cumcount(ascending=False) numbers each vehicle's rows n-1 .. 0, so adding 1
# yields the count of rows from the current one to the vehicle's last (n .. 1).
rows_until_last = train_data_new.groupby('vehicleId').cumcount(ascending=False) + 1
train_data_new['breakdown'] = 1.0 / rows_until_last


In [6]:
# Drop the vehicle identifier and the day column; everything that remains is
# carried forward to the variable-selection steps. Shape and column names are
# printed as plain lists for a quick sanity check.
id_and_day_columns = ['vehicleId', 'days']
train_data_wo_veh = train_data_new.drop(id_and_day_columns, axis=1)
print(list(train_data_wo_veh.shape))
print(list(train_data_wo_veh.columns))

[2984, 26]
['week', 'ecoMode', 'cityMode', 'sportMode', 's1', 's2', 's3', 's4', 's5', 's6', 's7', 's8', 's9', 's10', 's11', 's12', 's13', 's14', 's15', 's16', 's17', 's18', 's19', 's20', 's21', 'breakdown']


In [7]:
# Keep only columns whose variance, rounded to 10 decimal places, is strictly
# positive -- i.e. discard columns that are (numerically) constant.
rounded_variance = train_data_wo_veh.var().round(10)
has_variance = rounded_variance > 0
informative_columns = train_data_wo_veh.columns[has_variance]
train_data_2 = train_data_wo_veh[informative_columns]
print(list(train_data_2.columns))
print(train_data_2.shape)
print(train_data_2.head())

['week', 'ecoMode', 'cityMode', 's2', 's3', 's4', 's6', 's7', 's8', 's9', 's11', 's12', 's13', 's14', 's15', 's17', 's20', 's21', 'breakdown']
(2984, 19)
   week   ecoMode  cityMode          s2           s3           s4     s6  \
0   1.0 -0.001086 -0.000086  642.231429  1587.420000  1401.738571  21.61   
1   2.0 -0.000500  0.000114  642.307143  1586.497143  1399.572857  21.61   
2   3.0 -0.000829  0.000029  642.422857  1586.077143  1400.915714  21.61   
3   4.0  0.000186 -0.000014  642.430000  1589.777143  1398.651429  21.61   
4   5.0 -0.000214 -0.000029  642.307143  1587.682857  1400.034286  21.61   

           s7           s8           s9        s11         s12          s13  \
0  554.261429  2388.055714  9050.948571  47.308571  522.201429  2388.042857   
1  554.015714  2388.064286  9047.447143  47.244286  521.824286  2388.035714   
2  554.092857  2388.067143  9050.581429  47.234286  522.002857  2388.055714   
3  553.994286  2388.062857  9050.045714  47.332857  522.151429  2388.0571

In [8]:
# Covariates retained after a few manual fitting rounds, keeping only those
# whose p-values were below 0.05 (per the original author's note).
# NOTE(review): 'breakdown' is passed as the event column but holds fractional
# values assigned earlier in the notebook rather than a 0/1 indicator --
# confirm how lifelines interprets a non-binary event column.
cox_columns = [
    'week',
    'ecoMode', 'cityMode',
    's4', 's6', 's8', 's9', 's11', 's12', 's13', 's20', 's21',
    'breakdown',
]
train_data_3 = train_data_2[cox_columns]

# Fit a Cox proportional-hazards model with 'week' as the duration, logging
# each optimiser iteration and starting from a step size of 0.5.
cph = CoxPHFitter()
cph.fit(
    train_data_3,
    duration_col='week',
    event_col='breakdown',
    show_progress=True,
    step_size=0.5,
)
cph.print_summary()

Iteration 1: norm_delta = 0.75060, step_size = 0.5000, ll = -20895.96304, newton_decrement = 874.62297, seconds_since_start = 0.0Iteration 2: norm_delta = 0.44584, step_size = 0.5000, ll = -20239.49412, newton_decrement = 280.60778, seconds_since_start = 0.0Iteration 3: norm_delta = 0.27875, step_size = 0.5000, ll = -20025.28685, newton_decrement = 98.12565, seconds_since_start = 0.0Iteration 4: norm_delta = 0.16301, step_size = 0.6000, ll = -19950.66857, newton_decrement = 31.69929, seconds_since_start = 0.0Iteration 5: norm_delta = 0.07409, step_size = 0.7200, ll = -19923.65686, newton_decrement = 6.54188, seconds_since_start = 0.0Iteration 6: norm_delta = 0.02288, step_size = 0.8640, ll = -19917.55493, newton_decrement = 0.64519, seconds_since_start = 0.0Iteration 7: norm_delta = 0.00341, step_size = 1.0000, ll = -19916.91711, newton_decrement = 0.01466, seconds_since_start = 0.1Iteration 8: norm_delta = 0.00002, step_size = 1.0000, ll = -19916.90242, newton_decrement = 0.00



In [9]:
# Load the tab-separated test set and bucket its daily rows into weeks using
# ceil(days / 7); the shape is printed before and after the weekly roll-up.
test_data = pd.read_csv(
    'data/car_breakdown_test.tsv',
    sep='\t',
    header=0,
)
test_data = test_data.assign(week=np.ceil(test_data['days'] / 7))
print(test_data.shape)

# One row per (vehicleId, week): every other column is replaced by its mean.
test_weekly_groups = test_data.groupby(['vehicleId', 'week'], as_index=False)
test_data = test_weekly_groups.mean()
print(test_data.shape)
test_data.head()

(13096, 27)
(1911, 27)


Unnamed: 0,vehicleId,week,days,ecoMode,cityMode,sportMode,s1,s2,s3,s4,...,s12,s13,s14,s15,s16,s17,s18,s19,s20,s21
0,1,1.0,4.0,0.000957,8.6e-05,100.0,518.67,642.337143,1584.921429,1400.468571,...,521.901429,2388.048571,8131.448571,8.405657,0.03,391.714286,2388.0,100.0,38.958571,23.387186
1,1,2.0,11.0,-0.000871,-4.3e-05,100.0,518.67,642.177143,1585.734286,1402.897143,...,522.057143,2388.054286,8131.47,8.414586,0.03,391.571429,2388.0,100.0,39.042857,23.389543
2,1,3.0,18.0,0.001514,1.4e-05,100.0,518.67,642.477143,1585.03,1402.365714,...,522.022857,2388.05,8130.315714,8.421829,0.03,391.285714,2388.0,100.0,38.997143,23.374357
3,1,4.0,25.0,0.0015,-2.9e-05,100.0,518.67,642.232857,1587.312857,1401.314286,...,521.905714,2388.065714,8130.065714,8.423814,0.03,392.285714,2388.0,100.0,38.915714,23.367171
4,1,5.0,30.0,-0.000567,0.0003,100.0,518.67,642.44,1584.696667,1399.33,...,521.986667,2388.076667,8132.676667,8.411533,0.03,392.0,2388.0,100.0,38.973333,23.374733


In [10]:
# Predict the remaining useful life (RUL, in days) of every test vehicle.
#
# For each vehicle, the fitted Cox model is asked for survival curves at weeks
# 1..51 for all of that vehicle's weekly rows (`predict` has one row per
# requested week and, per the lifelines API, one column per input row).  The
# curves are averaged across columns, and the number of weeks whose mean
# survival is still above SURVIVAL_THRESHOLD is counted.  Subtracting the
# number of columns (weeks already observed) gives the remaining weeks, which
# are floored at zero and converted to days.
#
# Rows are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append no longer exists in pandas 2.x and re-copied the whole
# frame on every iteration.
SURVIVAL_THRESHOLD = 0.1             # mean survival must exceed this to count a week
PREDICTION_WEEKS = list(range(1, 52))  # weeks 1..51 at which survival is evaluated
DAYS_PER_WEEK = 7

test_uni_veh_id = test_data.vehicleId.unique()
rul_records = []
for i in test_uni_veh_id:
    test_data_1 = test_data[test_data['vehicleId'] == i]
    predict = cph.predict_survival_function(test_data_1, times=PREDICTION_WEEKS)
    weeks_above_threshold = int((predict.mean(axis=1) > SURVIVAL_THRESHOLD).sum())
    failure = max(weeks_above_threshold - predict.shape[1], 0) * DAYS_PER_WEEK
    rul_records.append({'vehicleId': i, 'RUL': failure})

# NOTE(review): the previous append-based code is believed to have emitted the
# columns in alphabetical order (RUL, vehicleId); that order is kept here so
# positional readers of the CSV are unaffected -- confirm against an old file.
test_predict = pd.DataFrame(rul_records, columns=['RUL', 'vehicleId'])

test_predict.to_csv("test_predict.csv")
