In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt

from numpy import loadtxt
from sklearn import cluster, datasets, metrics ,preprocessing
from sklearn.metrics import mean_absolute_error,accuracy_score,confusion_matrix
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from xgboost import XGBClassifier
from IPython.display import clear_output

def train_model(train_x,train_y,test_x,Re,num_class=3):
    params={'booster':'gbtree',
        'objective': 'multi:softmax', 
        'num_class':num_class,
        'gamma':0.05,
        'max_depth':7,
        'subsample':0.5,
        'colsample_bytree':0.6,
        'silent':1 ,
        'eta': 0.07,
        'seed':1000,
        'nthread':4
       }
    plst = list(params.items())
    num_rounds = 200
    offset = int(len(train_x)*0.8)
    train_x = train_x.reset_index(drop=True)
    train_y = train_y.reset_index(drop=True)
    xgtrain = xgb.DMatrix(train_x.loc[:offset,:], label=train_y[:offset+1])
    xgval = xgb.DMatrix(train_x.loc[offset:,:], label=train_y[offset:])
    watchlist = [(xgtrain, 'train'),(xgval, 'val')]
    model = xgb.train(plst, xgtrain, num_rounds, watchlist,early_stopping_rounds=40)

    test_x = test_x.reset_index(drop=True)
    Re = Re.reset_index(drop=True)
    pred1 = model.predict(xgb.DMatrix(Re),ntree_limit=model.best_iteration)
    pred2 = model.predict(xgb.DMatrix(test_x),ntree_limit=model.best_iteration)
    clear_output()
    return pred1,pred2

rgb_model = XGBRegressor(
                            learning_rate = 0.1,
                            n_estimators = 200,
                            max_depth= 7,
                            objective = 'reg:linear',
                            nthread = 4,
                            seed=1000
                         )

def modelfit(train_x,train_y2,test_x,Mask1):
    rgb_model.fit(train_x[Mask1],train_y2[Mask1])
    T1,T2 = rgb_model.predict(train_x),rgb_model.predict(test_x)
    T1[T1<0]=0
    T2[T2<0]=0
    T3 = train_x['Premium_sum']*T1
    T4 = test_x['Premium_sum']*T2
    return T3,T4

def and2(A,B):
    C=A.copy()
    C[C]=B
    return C
def div1(Y,N1,N2):
    Z = Y.copy()
    A = Y.copy()
    Z[A>=N1] = 0
    Z[np.logical_and(A<N1,A>N2)] = 1
    Z[A<=N2] = 2
    return Z
def div2(Y,N):
    Z = Y.copy()
    A = Y.copy()
    L = len(N)+1
    Z[A>=N[0]] = 0
    for i in range(1,L-2):
        Z[np.logical_and(A<N[i-1],A>=N[i])] = i
    Z[np.logical_and(A<N[-2],A>N[-1])] = L-2
    Z[A<=N[-1]] = L-1
    return Z

In [2]:
train = pd.read_csv('../data/training_policy_c2.csv')
T = pd.read_csv('../data/duplicate_policy.csv',header=None)
T = T[0]
TF = train['Policy_Number']!=T[0]
for i in range(1,len(T)):
    TF = np.logical_and(TF,train['Policy_Number']!=T[i])
train = train[TF]
train = train.reset_index(drop=True)

test = pd.read_csv('../data/testing_policy_c2.csv')
train_y = train['Next_Premium'].copy()
train_y2 = (train_y/train['Premium_sum'])
test_PN = test['Policy_Number']

del train['Next_Premium']
del test['Next_Premium']
del train['Policy_Number']
del test['Policy_Number']

train_x = train
test_x = test

In [3]:
N1,N2= 2000, 0
d00 = div1(train_y,N1,N2)
P00,P01 = train_model(train_x,d00,test_x,train_x)

Mask000,Mask001,Mask002 = P00==0,P00==1,P00==2
Mask010,Mask011,Mask012 = P01==0,P01==1,P01==2

Mask1 = np.logical_and(train_y2>0.2,train_y2<1.15)
A00,A01 = modelfit(train_x,train_y2,test_x,Mask1)
A00[Mask002] = 0
A01[Mask012] = 0
print(mean_absolute_error(train_y,A00))

1581.2489958128638


In [4]:
Mask003,Mask013 = np.logical_and(Mask000,train_x['Premium_sum']>3859),np.logical_and(Mask010,test_x['Premium_sum']>3859)

N1,N2= 1.1,0.9
d10 = div1(train_y2[Mask003],N1,N2)
P10,P11 = train_model(train_x[Mask003],d10,test_x[Mask013],train_x[Mask003])

Mask100,Mask101,Mask102 = and2(Mask003,P10==0),and2(Mask003,P10==1),and2(Mask003,P10==2)
Mask110,Mask111,Mask112 = and2(Mask013,P11==0),and2(Mask013,P11==1),and2(Mask013,P11==2)

A10,A11 = A00.copy(),A01.copy()

Mask1 = np.logical_and(train_y2>0.2,train_y2<1.35)
A12,A13 = modelfit(train_x,train_y2,test_x,Mask1)
A10[Mask100] = A12[Mask100]
A11[Mask110] = A13[Mask110]

Mask1 = np.logical_and(train_y2>0.7,train_y2<1.1)
A12,A13 = modelfit(train_x,train_y2,test_x,Mask1)
A10[Mask101] = A12[Mask101]
A11[Mask111] = A13[Mask111]

print(mean_absolute_error(train_y,A10))

1566.9884728105758


In [5]:
def Log_and(T,N1,N2):
    return np.logical_and(T>N1,T<=N2)

T20,T21 = A10/train_x['Premium_sum'],A11/test_x['Premium_sum']

Mask200,Mask210 = Log_and(T20,0.00, 0.76),Log_and(T21,0.00, 0.76)
Mask201,Mask211 = Log_and(T20,0.76, 0.80),Log_and(T21,0.76, 0.80)
Mask202,Mask212 = Log_and(T20,0.80, 0.86),Log_and(T21,0.80, 0.86)
Mask203,Mask213 = Log_and(T20,0.87, 0.90),Log_and(T21,0.87, 0.90)

A20,A21 = A10.copy(),A11.copy()

Mask1 = np.logical_and(train_y2>0.35,train_y2<0.8)
A22,A23 = modelfit(train_x,train_y2,test_x,Mask1)
A20[Mask200] = A22[Mask200]
A21[Mask210] = A23[Mask210]

Mask1 = np.logical_and(train_y2>0.55,train_y2<0.85)
A22,A23 = modelfit(train_x,train_y2,test_x,Mask1)
A20[Mask201] = A22[Mask201]
A21[Mask211] = A23[Mask211]

Mask1 = np.logical_and(train_y2>0.6,train_y2<1.0)
A22,A23 = modelfit(train_x,train_y2,test_x,Mask1)
A20[Mask202] = A22[Mask202]
A21[Mask212] = A23[Mask212]

Mask1 = np.logical_and(train_y2>0.55,train_y2<1.0)
A22,A23 = modelfit(train_x,train_y2,test_x,Mask1)
A20[Mask203] = A22[Mask203]
A21[Mask213] = A23[Mask213]

print(mean_absolute_error(train_y,A20))

1545.9712601806716


In [6]:
d30 = div2(train_y2[Mask000],[1.4,1.2,1.12,0.81,0.73,0])
P30,P31 = train_model(train_x[Mask000],d30,test_x[Mask010],train_x[Mask000],7)

Mask300,Mask301,Mask304,Mask306 = and2(Mask000,P30==0),and2(Mask000,P30==1),and2(Mask000,P30==4),and2(Mask000,P30==6)
Mask310,Mask311,Mask314,Mask316 = and2(Mask010,P31==0),and2(Mask010,P31==1),and2(Mask010,P31==4),and2(Mask010,P31==6)

A30,A31 = A20.copy(),A21.copy()

Mask1 = np.logical_and(train_y2>0.65,train_y2<2.0)
A32,A33 = modelfit(train_x,train_y2,test_x,Mask1)
A30[Mask300] = A32[Mask300]
A31[Mask310] = A33[Mask310]

Mask1 = np.logical_and(train_y2>0.1,train_y2<1.6)
A32,A33 = modelfit(train_x,train_y2,test_x,Mask1)
A30[Mask301] = A32[Mask301]
A31[Mask311] = A33[Mask311]

Mask1 = np.logical_and(train_y2>0.6,train_y2<0.85)
A32,A33 = modelfit(train_x,train_y2,test_x,Mask1)
A30[Mask304] = A32[Mask304]
A31[Mask314] = A33[Mask314]

Mask1 = np.logical_and(train_y2>0.05,train_y2<0.75)
A32,A33 = modelfit(train_x,train_y2,test_x,Mask1)
A30[Mask306] = A32[Mask306]
A31[Mask316] = A33[Mask316]

print(mean_absolute_error(train_y,A30))

1522.7630871776992


In [7]:
output_csv = pd.DataFrame({"Policy_Number":test_PN ,"Next_Premium":A31})
output_csv.to_csv("13.csv", sep=',', index=False, header=True, encoding='utf_8_sig')