In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from scipy.stats import norm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
import xgboost as xgb

import time
import warnings
warnings.simplefilter('ignore')

In [None]:
# Widen pandas' display limits so long/wide frames are shown in full in the notebook.
display_options = {
    'display.max_rows': 500,
    'display.max_columns': 500,
    'display.width': 1000,
    'display.expand_frame_repr': False,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [None]:
import os

# Print the full path of every file found anywhere below the Kaggle input directory.
input_files = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

In [None]:
# Load the three competition CSVs, using the `id` column as the index of each frame.
allstate = pd.read_csv(
    '/kaggle/input/allstate-claims-severity/train.csv',
    index_col='id',
)
test = pd.read_csv(
    '/kaggle/input/allstate-claims-severity/test.csv',
    index_col='id',
)
submission = pd.read_csv(
    '/kaggle/input/allstate-claims-severity/sample_submission.csv',
    index_col='id',
)
# Show (rows, columns) of train, test and sample submission on a single line.
print(*(frame.shape for frame in (allstate, test, submission)))

In [None]:
# Preview the first rows of the training frame.
allstate.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Print the full per-column summary of the training frame.
allstate.info(verbose=True)

In [None]:
# Print the full per-column summary of the test frame.
test.info(verbose=True)

In [None]:
# Summary statistics of the training frame.
allstate.describe()

In [None]:
# Summary statistics of the test frame.
test.describe()

In [None]:
# True if at least one cell of the training frame is missing.
allstate.isna().to_numpy().any()

In [None]:
# True if at least one cell of the test frame is missing.
test.isna().to_numpy().any()

In [None]:
# Assess the target: histogram of `loss` (KDE disabled) with a fitted normal curve.
# NOTE(review): `sns.distplot` is deprecated in recent seaborn releases; consider
# migrating to `sns.histplot` once the environment's seaborn version is confirmed.
sns.distplot(allstate['loss'], kde=False, fit=norm)

In [None]:
# Summary statistics of the target column only.
allstate['loss'].describe()

In [None]:
# Percentage of training rows whose loss exceeds the 20000 outlier cut-off.
# The mean of a boolean mask is the fraction of True values.
percent_outliers = (allstate['loss'] > 20000).mean() * 100
# The figure counts rows ABOVE the cut-off, so the message must say "greater than".
print('Loss greater than 20000 accounts for {0:.2f}% of data'.format(percent_outliers))

In [None]:
# Build the training frame by dropping every row whose loss is above 20000.
outlier_ids = allstate.index[allstate['loss'] > 20000]
train = allstate.drop(index=outlier_ids)
len(train)

In [None]:
# Target: natural log of `loss`. Features: every other column of `train`.
y = np.log(train['loss'])
X = train.drop(columns='loss')

In [None]:
# Label-encode every string (object-dtype) feature column as integer category codes.
# The `np.object` alias was deprecated in NumPy 1.20 and removed in 1.24; the
# 'object' dtype string selects the same columns on every NumPy version.
non_scalar = list(X.select_dtypes(include='object'))

for col in non_scalar:
    # Codes are the positions of each value among the column's categories.
    X[col] = X[col].astype('category').cat.codes

X.head()

In [None]:
# Fit the scaler on the encoded features, then transform those same features.
# `scaler` is kept so the identical transform can be re-applied to other data.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Hold out 10% of the rows for validation; the seed is fixed for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(
    X_scaled,
    y,
    test_size=0.1,
    random_state=43,
)

In [None]:
# One line of shapes for the train/validation splits, one for test and submission.
for group in ((X_train, y_train, X_val, y_val), (test, submission)):
    print(*(part.shape for part in group))

In [None]:
# Wrap each (features, target) split in an XGBoost DMatrix.
dtrain, dval = (
    xgb.DMatrix(features, label=target)
    for features, target in ((X_train, y_train), (X_val, y_val))
)

In [None]:
# Baseline: predict the mean training target for every validation row.
# (Targets are log-transformed, so this MAE is measured in log space.)
mean_train = np.mean(y_train)
baseline_predictions = np.full(y_val.shape, mean_train)

mae_baseline = mean_absolute_error(y_val, baseline_predictions)
print("Baseline MAE is {:.2f}".format(mae_baseline))

In [None]:
# Starting hyper-parameters; the first five are tuned in later cells.
params = dict(
    max_depth=6,
    min_child_weight=1,
    eta=.3,
    subsample=1,
    colsample_bytree=1,
    objective='reg:squarederror',
)

In [None]:
# Evaluate with mean absolute error.
params.update(eval_metric="mae")
# Number of boosting rounds requested; set to a large value.
num_boost_round = 999

In [None]:
# Train with the current params, evaluating on the validation DMatrix each round
# and stopping early after 10 rounds without improvement.
watchlist = [(dval, "Val")]
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=watchlist,
    early_stopping_rounds=10,
)

In [None]:
# `best_iteration` is 0-based, so the number of rounds is one more.
best_rounds = model.best_iteration + 1
print("Best MAE: {:.5f} with {} rounds".format(model.best_score, best_rounds))

In [None]:
# 5-fold cross-validation with the current parameters (MAE, early stopping after 10 rounds).
cv_settings = dict(
    num_boost_round=num_boost_round,
    seed=43,
    nfold=5,
    metrics={'mae'},
    early_stopping_rounds=10,
)
cv_results = xgb.cv(params, dtrain, **cv_settings)
cv_results

In [None]:
# Lowest mean test-fold MAE reached across the boosting rounds.
cv_results['test-mae-mean'].min()

In [None]:
# Candidate (max_depth, min_child_weight) pairs: depths 9-11 crossed with weights 5-7.
gridsearch_params = [
    (depth, child_weight)
    for depth in (9, 10, 11)
    for child_weight in (5, 6, 7)
]

In [None]:
# Grid search over (max_depth, min_child_weight) using 5-fold cross-validation.
# NOTE: `params` is updated in place on every iteration, so when the loop ends it
# holds the LAST combination tried, not the best one.
min_mae = float("Inf")
best_params = None

for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))

    # update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight

    # run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=43,
        nfold=5,
        metrics=['mae'],
        early_stopping_rounds=5
    )

    # update best MAE
    test_mae = cv_results['test-mae-mean']
    mean_mae = test_mae.min()
    # argmin() is a 0-based row position; the number of boosting rounds is one more.
    boost_rounds = test_mae.argmin() + 1
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))

    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (max_depth, min_child_weight)

print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

In [None]:
# Fix the tree-shape parameters chosen for the rest of the notebook.
params.update(max_depth=9, min_child_weight=7)

In [None]:
# Candidate (subsample, colsample) pairs: every combination of 0.7, 0.8, 0.9 and 1.0.
sampling_fractions = (0.7, 0.8, 0.9, 1.0)
gridsearch_params = [
    (subsample, colsample)
    for subsample in sampling_fractions
    for colsample in sampling_fractions
]

In [None]:
# Grid search over (subsample, colsample_bytree) using 5-fold cross-validation.
# NOTE: `params` is updated in place on every iteration, so when the loop ends it
# holds the LAST combination tried, not the best one.
min_mae = float("Inf")
best_params = None

# start by the largest values and go down to the smallest
for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))

    # update our parameters
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample

    # run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics=['mae'],
        early_stopping_rounds=10
    )

    # update best score
    test_mae = cv_results['test-mae-mean']
    mean_mae = test_mae.min()
    # argmin() is a 0-based row position; the number of boosting rounds is one more.
    boost_rounds = test_mae.argmin() + 1
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))

    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (subsample, colsample)

print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

In [None]:
# Fix the row/column sampling fractions chosen for the rest of the notebook.
params.update(subsample=1.0, colsample_bytree=0.8)

In [None]:
# Search over the learning rate (eta) using 5-fold cross-validation, timing each run.
# A bare `%time` line magic with no statement after it measures nothing, so the
# timing is done explicitly with `time.perf_counter()`.
# NOTE: `params['eta']` is updated in place and holds the LAST value tried afterwards.
search_start = time.perf_counter()

min_mae = float("Inf")
best_params = None
for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))

    # update our parameters
    params['eta'] = eta

    # run and time CV
    cv_start = time.perf_counter()
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics=['mae'],
        early_stopping_rounds=10
    )
    cv_elapsed = time.perf_counter() - cv_start
    print("\tCV wall time: {:.1f} s".format(cv_elapsed))

    # update best score
    test_mae = cv_results['test-mae-mean']
    mean_mae = test_mae.min()
    # argmin() is a 0-based row position; the number of boosting rounds is one more.
    boost_rounds = test_mae.argmin() + 1
    print("\tMAE {} for {} rounds\n".format(mean_mae, boost_rounds))

    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = eta
print("Best params: {}, MAE: {}".format(best_params, min_mae))
print("Total wall time: {:.1f} s".format(time.perf_counter() - search_start))

In [None]:
# Fix the learning rate used for the final model.
params['eta'] = .05

In [None]:
# Display the parameter dictionary as it currently stands.
params

In [None]:
# Retrain with the current params, evaluating on the validation DMatrix each round
# and stopping early after 10 rounds without improvement.
final_watchlist = [(dval, "Val")]
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=final_watchlist,
    early_stopping_rounds=10,
)

In [None]:
# Label-encode the string (object-dtype) columns of the test frame.
# The `np.object` alias was deprecated in NumPy 1.20 and removed in 1.24; the
# 'object' dtype string selects the same columns on every NumPy version.
non_scalar = list(test.select_dtypes(include='object'))

for col in non_scalar:
    # Reuse the category list of the training column so that a given string maps
    # to the same integer code the model saw during training. Deriving the codes
    # from the test column alone shifts them whenever the two sets of distinct
    # values differ.
    train_categories = train[col].astype('category').cat.categories
    # Values that never occur in the training column are encoded as -1.
    test[col] = pd.Categorical(test[col], categories=train_categories).codes

In [None]:
# The model was trained on StandardScaler-transformed features, so the test
# features must go through the same fitted scaler before prediction.
# Columns are selected in the training feature order; `test` itself is left
# untouched so this cell can be re-run safely.
test_scaled = scaler.transform(test[X.columns])
dtest = xgb.DMatrix(test_scaled)

In [None]:
# The native Booster.predict uses every trained tree by default, including the
# rounds added after the best early-stopping iteration. Restrict prediction to
# the trees up to and including the best round.
predictions = model.predict(dtest, iteration_range=(0, model.best_iteration + 1))

In [None]:
# The target was trained as log(loss), so the inverse transform is a plain exp().
# Subtracting 1 would only be correct for a log1p target and biases every
# prediction by -1. `predictions` is not overwritten, so the cell can be re-run.
submission['loss'] = np.exp(predictions)
submission.to_csv('Result.csv')
submission.head()