In [1]:
import pandas as pd
import numpy as np

# Shared seed: passed as random_state to the shuffle, the train/test splits
# and the XGBoost grid so that repeated runs give the same results.
rs = 42

In [2]:
def data_pipeline(path='large.csv', random_state=None):
    """Load the claims CSV and return model-ready features and target.

    Steps, in order:
      1. read ``path`` and shuffle every row,
      2. keep only rows whose ``INCURRED_LOSS_DCPD`` is strictly positive,
      3. drop the two other incurred-loss columns,
      4. encode ``GENDER`` ('M' -> 1), ``DRIVER_MARTIAL_STATUS`` ('S' -> 1) and
         ``DRIVER_TRAINING_IND`` ('Y' -> 1); every other value, including
         missing ones, becomes 0,
      5. floor ``VEHICLE_AGE`` at 0 (missing values stay missing).

    Parameters
    ----------
    path : str, default 'large.csv'
        CSV file to read.
    random_state : int or None, default None
        Seed for the shuffle. ``None`` falls back to the notebook-level
        seed ``rs`` so existing ``data_pipeline()`` calls behave as before.

    Returns
    -------
    X : pandas.DataFrame
        All remaining columns except ``INCURRED_LOSS_DCPD``.
    y : pandas.Series
        The ``INCURRED_LOSS_DCPD`` column, aligned with ``X``.
    """
    seed = rs if random_state is None else random_state

    df = pd.read_csv(path)
    df = df.sample(frac=1, random_state=seed)
    df = df[df['INCURRED_LOSS_DCPD'] > 0]
    df = df.drop(['INCURRED_LOSS_COLLISION', 'INCURRED_LOSS_COMPREHENSIVE'], axis=1)

    # Vectorised comparisons replace the per-row lambdas; a missing value
    # compares unequal, so it is encoded as 0 exactly as before.
    df = df.assign(
        GENDER=df['GENDER'].eq('M').astype('int64'),
        DRIVER_MARTIAL_STATUS=df['DRIVER_MARTIAL_STATUS'].eq('S').astype('int64'),
        DRIVER_TRAINING_IND=df['DRIVER_TRAINING_IND'].eq('Y').astype('int64'),
        VEHICLE_AGE=df['VEHICLE_AGE'].clip(lower=0),
    )

    y = df['INCURRED_LOSS_DCPD']
    X = df.drop(labels='INCURRED_LOSS_DCPD', axis=1)

    return X, y

In [3]:
# Build the feature matrix X and the INCURRED_LOSS_DCPD target y.
X, y = data_pipeline()

In [8]:
from sklearn.model_selection import train_test_split

# 80 / 10 / 10 split: first carve off 20% as a hold-out (temporarily bound to
# X_test / y_test), then cut that hold-out in half into validation and test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=rs
)
X_val, X_test, y_val, y_test = train_test_split(
    X_test, y_test, test_size=0.5, random_state=rs
)

In [7]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

# Search space: 6 depths x 3 column-sampling rates x 3 row-sampling rates
# x 5 learning rates = 270 candidates, each allowed up to 1000 boosting rounds.
params = dict(
    max_depth=[2, 3, 5, 7, 9, 11],
    n_estimators=[1000],
    colsample_bytree=[0.6, 0.8, 1],
    subsample=[0.6, 0.8, 1],
    learning_rate=[0.01, 0.05, 0.1, 0.2, 0.3],
    random_state=[rs],
)

# Exhaustive 5-fold search over the grid, using every available core.
xgb = GridSearchCV(
    estimator=XGBRegressor(),
    param_grid=params,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

# The extra keyword arguments are forwarded to XGBRegressor.fit for each fit.
# Training stops once the RMSE on the last eval_set entry (the validation
# split) has not improved for 10 consecutive rounds.
# NOTE(review): newer xgboost releases expect eval_metric and
# early_stopping_rounds on the estimator constructor instead of fit() —
# confirm against the installed version before upgrading.
xgb.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    eval_metric="rmse",
    early_stopping_rounds=10,
    verbose=True,
)



Fitting 5 folds for each of 270 candidates, totalling 1350 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 13.9min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 17.1min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed: 34.0min
[Parallel(n_jobs=-1)]: Done 1005 tasks      | elapsed: 52.4min
[Parallel(n_jobs=-1)]: Done 1350 out of 1350 | elapsed: 58.8min finished
  if getattr(data, 'base', None) is not None and \


[0]	validation_0-rmse:16567.2	validation_1-rmse:16555.3
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 10 rounds.
[1]	validation_0-rmse:15758.7	validation_1-rmse:15746.6
[2]	validation_0-rmse:14990.8	validation_1-rmse:14979.5
[3]	validation_0-rmse:14263.2	validation_1-rmse:14251.6
[4]	validation_0-rmse:13570.9	validation_1-rmse:13560
[5]	validation_0-rmse:12915.4	validation_1-rmse:12905.4
[6]	validation_0-rmse:12292.6	validation_1-rmse:12283.3
[7]	validation_0-rmse:11703.2	validation_1-rmse:11694.7
[8]	validation_0-rmse:11143.4	validation_1-rmse:11135.6
[9]	validation_0-rmse:10612.8	validation_1-rmse:10605.6
[10]	validation_0-rmse:10111.5	validation_1-rmse:10104.4
[11]	validation_0-rmse:9635.1	validation_1-rmse:9628.74
[12]	validation_0-rmse:9185.46	validation_1-rmse:9179.49
[13]	validation_0-rmse:8758.46	validation_1-rmse:8752.82
[14]	validation_0-rmse:8355.62	validation_1-rmse:8351.29

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0,
       importance_type='gain', learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=1, verbosity=1),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_depth': [2, 3, 5, 7, 9, 11], 'n_estimators': [1000], 'colsample_bytree': [0.6, 0.8, 1], 'subsample': [0.6, 0.8, 1], 'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3], 'random_state': [42]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [13]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# Root-mean-squared error of the refit best estimator on the test split.
y_pred = xgb.best_estimator_.predict(X_test)
rmse = sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
rmse

2426.5505144234485

In [9]:
from pickle import dump

# Persist only the refit best estimator, not the whole GridSearchCV wrapper.
with open('xgb.pkl', 'wb') as model_file:
    dump(xgb.best_estimator_, model_file)

In [3]:
# Re-run the pipeline to get a fresh X / y for the cells that follow.
X, y = data_pipeline()

In [4]:
# Show every feature name, one per line.
for column_name in X.columns:
    print(column_name)

LATITUDE
LONGITUDE
AGE
YEARS_WITH_COMPANY
ANNUAL_KILOMETERS
DAILY_KILOMETERS
YEARS_LICENSED
VEHICLE_YEAR
NUMBER_OF_DRIVERS
NUMBER_OF_VEHICLES
PRIOR_COMPANY
DEDUCTIBLE_COLLISION
DEDUCTIBLE_COMPREHENSIVE
DEDUCTIBLE_DCPD
PAYMENT_METHOD
AT_FAULT_CLAIMS
NOT_AT_FAULT_CLAIMS
MINOR_CONVICTIONS
MAJOR_CONVICTIONS
SERIOUS_CONVICTIONS
DRIVING_EXPERIENCE
YEARS_WITH_PRIOR_COMPANY
GENDER
DRIVER_MARTIAL_STATUS
DRIVER_TRAINING_IND
VEHICLE_AGE
YEARS_SINCE_AT_FAULT_CLAIM
YEARS_SINCE_NOT_AT_FAULT_CLAIM
YEARS_SINCE_MINOR_CONVICTION
YEARS_SINCE_MAJOR_CONVICTION
YEARS_SINCE_SERIOUS_CONVICTION


In [5]:
# Columns whose missing values are replaced by the column mean.
mean_vars = [
    'LATITUDE',
    'LONGITUDE',
    'AGE',
    'YEARS_WITH_COMPANY',
    'ANNUAL_KILOMETERS',
    'DAILY_KILOMETERS',
    'YEARS_LICENSED',
    'YEARS_WITH_PRIOR_COMPANY',
    'VEHICLE_AGE',
]

# Constant used to fill a missing YEARS_SINCE_* value, per column.
# The original literal listed 'YEARS_SINCE_MAJOR_CONVICTION' twice (6, then 3).
# Python keeps the last occurrence, so 3 was the value actually in effect and
# is the one kept here.
# NOTE(review): confirm 3 (not 6) is the intended fill for major convictions.
years_since = {
    'YEARS_SINCE_AT_FAULT_CLAIM': 6,
    'YEARS_SINCE_NOT_AT_FAULT_CLAIM': 6,
    'YEARS_SINCE_MINOR_CONVICTION': 3,
    'YEARS_SINCE_MAJOR_CONVICTION': 3,
    'YEARS_SINCE_SERIOUS_CONVICTION': 3,
}

In [6]:
# Fill missing values column by column: the mean for columns in mean_vars,
# a fixed constant for columns in years_since, the most frequent value
# otherwise.
# NOTE(review): the fill statistics are computed on the whole of X, i.e. before
# the train/validation/test split below, so hold-out rows influence them —
# confirm this is acceptable.
for col in X.columns:
    if col in mean_vars:
        fill_value = X[col].mean()
    elif col in years_since:
        fill_value = years_since[col]
    else:
        fill_value = X[col].mode()[0]
    X[col] = X[col].fillna(fill_value)

In [7]:
from sklearn.model_selection import train_test_split

# Same 80 / 10 / 10 scheme, now on the imputed features: 20% is held out
# (temporarily as X_test / y_test) and then halved into validation and test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=rs
)
X_val, X_test, y_val, y_test = train_test_split(
    X_test, y_test, test_size=0.5, random_state=rs
)

In [48]:
# Sanity check: number of missing values left in each training column.
X_train.isna().sum()

LATITUDE                          0
LONGITUDE                         0
AGE                               0
YEARS_WITH_COMPANY                0
ANNUAL_KILOMETERS                 0
DAILY_KILOMETERS                  0
YEARS_LICENSED                    0
VEHICLE_YEAR                      0
NUMBER_OF_DRIVERS                 0
NUMBER_OF_VEHICLES                0
PRIOR_COMPANY                     0
DEDUCTIBLE_COLLISION              0
DEDUCTIBLE_COMPREHENSIVE          0
DEDUCTIBLE_DCPD                   0
PAYMENT_METHOD                    0
AT_FAULT_CLAIMS                   0
NOT_AT_FAULT_CLAIMS               0
MINOR_CONVICTIONS                 0
MAJOR_CONVICTIONS                 0
SERIOUS_CONVICTIONS               0
DRIVING_EXPERIENCE                0
YEARS_WITH_PRIOR_COMPANY          0
GENDER                            0
DRIVER_MARTIAL_STATUS             0
DRIVER_TRAINING_IND               0
VEHICLE_AGE                       0
YEARS_SINCE_AT_FAULT_CLAIM        0
YEARS_SINCE_NOT_AT_FAULT_CLA

In [55]:
from sklearn.linear_model import LinearRegression

# Baseline: ordinary least squares on the imputed features.
# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in
# 1.2 — confirm the pinned version. The saved output below this cell reports
# normalize=False, so it looks stale relative to this code.
linear_regressor = LinearRegression(normalize=True)  # regressors are rescaled by the estimator before fitting
linear_regressor.fit(X_train, y_train)  # last expression of the cell, so the fitted estimator is displayed

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [56]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# Root-mean-squared error of the linear baseline on the test split.
y_pred = linear_regressor.predict(X_test)
rmse = sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
rmse

2549.7012458949234

In [8]:
# One-hot encode PAYMENT_METHOD and PRIOR_COMPANY; drop_first=True omits one
# indicator column per variable.
X_2 = pd.get_dummies(X, columns = ['PAYMENT_METHOD', 'PRIOR_COMPANY'], drop_first=True)

In [65]:
# Share of rows with a missing PRIOR_COMPANY.
# No cell defines a notebook-level `df` (the name is local to data_pipeline),
# so the original expression fails on a fresh kernel. The pipeline output is
# re-read instead, because X has already had its missing values filled.
# NOTE(review): the saved figure came from an unknown `df` — confirm this
# reproduces it.
data_pipeline()[0]['PRIOR_COMPANY'].isna().mean()

0.012520413718018509

In [66]:
from sklearn.model_selection import train_test_split

# 80 / 10 / 10 split of the one-hot encoded features X_2 against the raw
# target: hold out 20% (temporarily as X_test / y_test), then halve it.
X_train, X_test, y_train, y_test = train_test_split(
    X_2, y, test_size=0.2, random_state=rs
)
X_val, X_test, y_val, y_test = train_test_split(
    X_test, y_test, test_size=0.5, random_state=rs
)

In [67]:
from sklearn.linear_model import LinearRegression

# Refit the linear baseline, this time on the one-hot encoded split.
# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in
# 1.2 — confirm the pinned version.
linear_regressor = LinearRegression(normalize=True)  # regressors are rescaled by the estimator before fitting
linear_regressor.fit(X_train, y_train)  # last expression of the cell, so the fitted estimator is displayed

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

In [68]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# Root-mean-squared error of the refit linear model on the test split.
y_pred = linear_regressor.predict(X_test)
rmse = sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
rmse

2547.8214988051186

In [9]:
# Natural log of the target; data_pipeline keeps only rows with a positive
# INCURRED_LOSS_DCPD, so the log is defined for every row.
y_2 = np.log(y)

In [13]:
from sklearn.model_selection import train_test_split

# 80 / 10 / 10 split of the one-hot features X_2 against the log target y_2:
# hold out 20% (temporarily as X_test / y_test), then halve it.
X_train, X_test, y_train, y_test = train_test_split(
    X_2, y_2, test_size=0.2, random_state=rs
)
X_val, X_test, y_val, y_test = train_test_split(
    X_test, y_test, test_size=0.5, random_state=rs
)

In [72]:
from sklearn.linear_model import LinearRegression

# Refit the linear model against the log-transformed target split.
# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in
# 1.2 — confirm the pinned version.
linear_regressor = LinearRegression(normalize=True)  # regressors are rescaled by the estimator before fitting
linear_regressor.fit(X_train, y_train)  # last expression of the cell, so the fitted estimator is displayed

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

In [78]:
from sklearn.metrics import mean_squared_error
from math import sqrt,exp
# The model was fit on the log of the target, so predictions and truth are
# both mapped back with exp() before the RMSE is computed in original units.
# np.exp is a true element-wise ufunc. It replaces np.vectorize(math.exp) and
# Series.apply(exp), which loop in Python and raise OverflowError on large
# inputs. `vf` is kept under its original name for any later use.
vf = np.exp
y_pred = linear_regressor.predict(X_test)

rmse = sqrt(mean_squared_error(y_true=vf(y_test), y_pred=vf(y_pred)))

rmse

2549.583873760383

In [14]:
from sklearn.linear_model import Ridge
# Ridge regression on the current training split.
# For reference: alpha = 0 gives 2489.137215040446 (presumably the same
# back-transformed validation RMSE computed in the next cell — TODO confirm).
# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in
# 1.2 — confirm the pinned version.
linear_regressor = Ridge(alpha=0.01, normalize=True)  # small L2 penalty; regressors rescaled by the estimator
linear_regressor.fit(X_train, y_train)  # last expression of the cell, so the fitted estimator is displayed

Ridge(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=True, random_state=None, solver='auto', tol=0.001)

In [15]:
from sklearn.metrics import mean_squared_error
from math import sqrt,exp
# Validation RMSE in original units: predictions and truth are both mapped
# back with exp() before scoring.
# NOTE(review): assumes y_val still holds the log-transformed target.
# np.exp is a true element-wise ufunc. It replaces np.vectorize(math.exp) and
# Series.apply(exp), which loop in Python and raise OverflowError on large
# inputs. `vf` is kept under its original name for any later use.
vf = np.exp
y_pred = linear_regressor.predict(X_val)

rmse = sqrt(mean_squared_error(y_true=vf(y_val), y_pred=vf(y_pred)))

rmse

2488.925770036743

In [None]:
# Estimator's built-in score (R^2 for scikit-learn regressors) on the
# validation split.
# NOTE(review): if y_val is the log target, this R^2 is on the log scale.
linear_regressor.score(X_val,y_val)

In [None]:
# Number of coefficients of the fitted model that are not exactly zero.
np.sum(linear_regressor.coef_!=0)


In [16]:
from pickle import dump

# Persist the most recently fitted linear_regressor (the Ridge model when the
# cells are run top to bottom).
with open('ridge.pkl', 'wb') as model_file:
    dump(linear_regressor, model_file)