In [2]:
import pandas as pd
import numpy as np

# Seed shared by the DataFrame shuffle in data_pipeline() and by every
# train_test_split call in this notebook.
rs = 42

In [3]:
# Columns whose missing values linear_model_data() replaces with the column mean.
mean_vars = [
    'LATITUDE',
    'LONGITUDE',
    'AGE',
    'YEARS_WITH_COMPANY',
    'ANNUAL_KILOMETERS',
    'DAILY_KILOMETERS',
    'YEARS_LICENSED',
    'YEARS_WITH_PRIOR_COMPANY',
    'VEHICLE_AGE',
]

# Constant used by linear_model_data() to fill missing values, keyed by column name.
# NOTE(review): the original literal listed 'YEARS_SINCE_MAJOR_CONVICTION' twice
# (first 6, then 3). Python keeps the last value of a repeated key, so 3 is the
# value that was actually in effect; the dead `6` entry is removed here so the
# literal says what it does. Confirm that 3 (and not 6) is the intended fill.
years_since = {
    'YEARS_SINCE_AT_FAULT_CLAIM': 6,
    'YEARS_SINCE_MAJOR_CONVICTION': 3,
    'YEARS_SINCE_NOT_AT_FAULT_CLAIM': 6,
    'YEARS_SINCE_MINOR_CONVICTION': 3,
    'YEARS_SINCE_SERIOUS_CONVICTION': 3,
}

In [19]:
def data_pipeline(path='large.csv', random_state=None):
    """Load the claims CSV and return the cleaned features and DCPD loss target.

    Steps, in order:
      1. read the CSV at ``path``;
      2. shuffle all rows with ``random_state``;
      3. keep only rows whose ``INCURRED_LOSS_DCPD`` is strictly positive;
      4. drop the collision and comprehensive incurred-loss columns;
      5. encode ``GENDER`` ('M' -> 1), ``DRIVER_MARTIAL_STATUS`` ('S' -> 1) and
         ``DRIVER_TRAINING_IND`` ('Y' -> 1); every other value, including a
         missing one, becomes 0;
      6. raise negative ``VEHICLE_AGE`` values to 0 (missing values stay missing).

    Parameters
    ----------
    path : str, default 'large.csv'
        CSV file to read.
    random_state : optional
        Seed for the row shuffle. ``None`` means "use the notebook-level
        seed ``rs``", which is what a bare ``data_pipeline()`` call did before
        this parameter existed.

    Returns
    -------
    (X, y)
        ``X`` is the cleaned frame without ``INCURRED_LOSS_DCPD``; ``y`` is
        that column. Both keep the shuffled row index. Missing values in the
        remaining columns are NOT imputed here.
    """
    # Resolve the seed lazily so the global is only needed when no seed is passed.
    seed = rs if random_state is None else random_state

    claims = pd.read_csv(path)
    # Shuffle before filtering: the permutation depends on the full row count,
    # so this order must be kept to reproduce the same row sequence.
    claims = claims.sample(frac=1, random_state=seed)
    claims = claims[claims['INCURRED_LOSS_DCPD'] > 0]
    claims = claims.drop(columns=['INCURRED_LOSS_COLLISION', 'INCURRED_LOSS_COMPREHENSIVE'])

    # Vectorised equivalents of the former row-wise `.apply(lambda ...)` calls.
    claims = claims.assign(
        GENDER=(claims['GENDER'] == 'M').astype('int64'),
        DRIVER_MARTIAL_STATUS=(claims['DRIVER_MARTIAL_STATUS'] == 'S').astype('int64'),
        DRIVER_TRAINING_IND=(claims['DRIVER_TRAINING_IND'] == 'Y').astype('int64'),
        VEHICLE_AGE=claims['VEHICLE_AGE'].clip(lower=0),
    )

    y = claims['INCURRED_LOSS_DCPD']
    X = claims.drop(columns='INCURRED_LOSS_DCPD')

    return X, y

def linear_model_data():
    """Return (X, y) from data_pipeline() with missing values filled and dummies added.

    Every column of ``X`` is filled by the first rule that matches:
      * column listed in ``mean_vars``   -> the column mean;
      * column is a key of ``years_since`` -> the constant stored there;
      * anything else                    -> the column's most frequent value.

    ``PAYMENT_METHOD`` and ``PRIOR_COMPANY`` are then one-hot encoded with the
    first level dropped.

    NOTE(review): the means and modes are computed over every row returned by
    data_pipeline(), i.e. before the caller splits into train/validation/test.
    Left as is because the pickled linear model was presumably prepared the
    same way — confirm before changing.
    """
    X, y = data_pipeline()

    for col in X.columns:
        if col in mean_vars:
            fill_value = X[col].mean()
        elif col in years_since:
            fill_value = years_since[col]
        else:
            modes = X[col].mode()
            if modes.empty:
                # An all-missing column has no mode; indexing `[0]` on the
                # empty result used to raise. Nothing sensible to fill with,
                # so leave the column untouched.
                continue
            fill_value = modes.iloc[0]
        X[col] = X[col].fillna(fill_value)

    X = pd.get_dummies(X, columns=['PAYMENT_METHOD', 'PRIOR_COMPANY'], drop_first=True)

    return X, y

In [20]:
from pickle import load

# Restore the two pickled objects used as models below.
# NOTE(review): unpickling can execute arbitrary code stored in the file, so
# these artifacts must come from a trusted source. What exactly each file
# holds is not visible here; later cells only rely on a `.predict` method.
with open('xgb.pkl', 'rb') as xgb_file:
    xgb = load(xgb_file)

with open('ridge.pkl', 'rb') as ridge_file:
    linear = load(ridge_file)
    
class Baseline:
    """Constant predictor: always predicts the mean of the training target."""

    def __init__(self):
        # Mean of the training target; stays None until fit() has been called.
        self.pred = None

    def fit(self, X_train, y_train):
        """Remember ``y_train.mean()``. ``X_train`` is accepted but not used."""
        self.pred = y_train.mean()

    def predict(self, X_test):
        """Return a list holding the stored mean once per element of ``X_test``.

        Raises
        ------
        RuntimeError
            If called before fit(). Previously this silently returned a list
            of ``None`` values, which only failed later inside the metric.
        """
        if self.pred is None:
            raise RuntimeError("Baseline.predict() called before fit()")
        return [self.pred] * len(X_test)


# Single shared baseline instance; fitted in a later cell.
base = Baseline()
    





In [21]:
from sklearn.model_selection import train_test_split

# Cleaned (but not imputed or dummy-encoded) features and target.
X, y = data_pipeline()

# 80% of the rows go to training; the remaining 20% is the hold-out.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.2, random_state=rs
)

# The hold-out is halved into validation and test.
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=rs
)

In [22]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# Predictions of the pickled `xgb` model on the test split.
xgb_pred = xgb.predict(X_test)

# RMSE on the test split. Arguments now follow scikit-learn's documented
# (y_true, y_pred) order; the squared error is symmetric, so the value is
# unchanged.
xgb_rmse = sqrt(mean_squared_error(y_test, xgb_pred))

xgb_rmse

2426.5505144234485

In [23]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# Fit the constant-mean baseline on the training target, then predict the
# test split.
base.fit(X_train, y_train)

base_pred = base.predict(X_test)

# RMSE on the test split. Arguments now follow scikit-learn's documented
# (y_true, y_pred) order; the squared error is symmetric, so the value is
# unchanged.
base_rmse = sqrt(mean_squared_error(y_test, base_pred))

base_rmse

2688.0711807907887

In [24]:
from sklearn.model_selection import train_test_split

# Imputed and dummy-encoded features for the linear model. This rebinds X, y
# and all split variables created by the earlier split cell.
X, y = linear_model_data()

# 80% of the rows go to training; the remaining 20% is the hold-out.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.2, random_state=rs
)

# The hold-out is halved into validation and test.
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=rs
)

In [25]:
from sklearn.metrics import mean_squared_error
from math import sqrt,exp

# Element-wise exponential. np.exp is a true vectorised ufunc, whereas the
# previous np.vectorize(math.exp) looped in Python, raised OverflowError on
# large inputs and failed on empty arrays. The name `vf` is kept in case other
# cells use it.
vf = np.exp

# The model output is exponentiated before scoring (presumably the linear
# model was fit on log-transformed losses — TODO confirm).
linear_pred = vf(linear.predict(X_test))

# RMSE on the test split, with arguments in scikit-learn's documented
# (y_true, y_pred) order.
linear_rmse = sqrt(mean_squared_error(y_test, linear_pred))

linear_rmse

2549.78247393673

In [28]:
# Side-by-side table of the true test losses and the three models'
# predictions; y_test is the only labelled input (the predictions come from
# .predict calls and the baseline list).
d = {
    'True Value': y_test,
    'XGB Prediction': xgb_pred,
    'Linear Prediction': linear_pred,
    'Baseline Prediction': base_pred,
}
df = pd.DataFrame(d)
df

Unnamed: 0,True Value,XGB Prediction,Linear Prediction,Baseline Prediction
310471,18419,16928.933594,15990.637484,17225.129484
119461,8899,17252.109375,17215.192840,17225.129484
273721,15745,15832.087891,15613.789241,17225.129484
468757,19022,15891.789062,15988.608777,17225.129484
100938,17997,17995.257812,18479.197349,17225.129484
...,...,...,...,...
297834,20029,18270.841797,17081.014858,17225.129484
331404,18954,17905.271484,17588.110775,17225.129484
293181,17908,17066.800781,17626.118155,17225.129484
295873,17966,17772.039062,18246.128016,17225.129484


In [31]:
# Order the comparison table by the true loss, ascending, so the plots below
# can use it as the x axis.
df = df.sort_values('True Value')
df

Unnamed: 0,True Value,XGB Prediction,Linear Prediction,Baseline Prediction
196795,6587,16295.699219,17212.227241,17225.129484
363362,7174,15436.832031,16344.227270,17225.129484
221822,7352,16377.806641,16286.998215,17225.129484
13116,7718,16074.304688,16076.240414,17225.129484
63527,7961,14763.991211,15273.773188,17225.129484
...,...,...,...,...
40391,24027,21116.320312,18325.932394,17225.129484
479796,24363,17905.769531,17004.664829,17225.129484
391186,24394,22320.746094,18758.642713,17225.129484
196760,24642,21757.738281,18505.794914,17225.129484


In [32]:
import plotly.express as px

# XGB predictions drawn against the true loss, taken from the comparison table `df`.
fig = px.line(data_frame=df, x="True Value", y="XGB Prediction")
fig.show()

In [33]:
import plotly.express as px

# Linear-model predictions drawn against the true loss, taken from the comparison table `df`.
fig = px.line(data_frame=df, x="True Value", y="Linear Prediction")
fig.show()

In [34]:
import plotly.express as px

# Baseline predictions drawn against the true loss, taken from the comparison table `df`.
fig = px.line(data_frame=df, x="True Value", y="Baseline Prediction")
fig.show()