# 30 Days of Machine Learning by Kaggle
### Aug 16, 2021 to Aug 30, 2021
### https://www.kaggle.com/c/30-days-of-ml/
### Predicting the amount of an insurance claim (with original data synthesized and features anonymized)

Notebook Author:

| Name:  | Pradip Kumar Das  |
| ------------: | :------------ |
| **Profile:** |  [LinkedIn](https://www.linkedin.com/in/daspradipkumar/) I [GitHub](https://github.com/PradipKumarDas "GitHub") I [Kaggle](https://www.kaggle.com/pradipkumardas "Kaggle")  |
|  **Contact:** | pradipkumardas@hotmail.com (Email)  |
| **Location:** | Bengaluru, India|

**Sections:**
- Dependencies
- Exploratory Data Analysis (EDA) & Preprocessing
- Modeling & Evaluation
- Submission

## Dependencies

In [2]:
# Loads required packages

import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

import xgboost as xgb
from xgboost import XGBRegressor, cv

from hyperopt import hp, fmin, tpe, Trials, STATUS_OK

import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Initializations

random_state = 1
max_leaf_nodes = [5, 50, 500, 5000]
cross_val_folds = 5
random_forests_regressor_estimators = 100
xgboost_regressor_estimators = 100

## Exploratory Data Analysis (EDA) & Preprocessing

In [4]:
# Loads data

train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")
submission = pd.read_csv("./data/sample_submission.csv")

In [5]:
# Views data
train

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target
0,1,B,B,B,C,B,B,A,E,C,...,0.400361,0.160266,0.310921,0.389470,0.267559,0.237281,0.377873,0.322401,0.869850,8.113634
1,2,B,B,A,A,B,D,A,F,A,...,0.533087,0.558922,0.516294,0.594928,0.341439,0.906013,0.921701,0.261975,0.465083,8.481233
2,3,A,A,A,C,B,D,A,D,A,...,0.650609,0.375348,0.902567,0.555205,0.843531,0.748809,0.620126,0.541474,0.763846,8.364351
3,4,B,B,A,C,B,D,A,E,C,...,0.668980,0.239061,0.732948,0.679618,0.574844,0.346010,0.714610,0.540150,0.280682,8.049253
4,6,A,A,A,C,B,D,A,E,A,...,0.686964,0.420667,0.648182,0.684501,0.956692,1.000773,0.776742,0.625849,0.250823,7.972260
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,499993,B,B,A,A,B,D,A,E,A,...,0.769792,0.450538,0.934360,1.005077,0.853726,0.422541,1.063463,0.697685,0.506404,7.945605
299996,499996,A,B,A,C,B,B,A,E,E,...,0.528056,0.508502,0.358247,0.257825,0.433525,0.301015,0.268447,0.577055,0.823611,7.326118
299997,499997,B,B,A,C,B,C,A,E,G,...,0.688747,0.372425,0.364936,0.383224,0.551825,0.661007,0.629606,0.714139,0.245732,8.706755
299998,499998,A,B,A,C,B,B,A,E,E,...,0.344404,0.424243,0.382028,0.468819,0.351036,0.288768,0.611169,0.380254,0.332030,7.229569


In [6]:
test

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13
0,0,B,B,B,C,B,B,A,E,E,...,0.476739,0.376350,0.337884,0.321832,0.445212,0.290258,0.244476,0.087914,0.301831,0.845702
1,5,A,B,A,C,B,C,A,E,C,...,0.285509,0.860046,0.798712,0.835961,0.391657,0.288276,0.549568,0.905097,0.850684,0.693940
2,15,B,A,A,A,B,B,A,E,D,...,0.697272,0.683600,0.404089,0.879379,0.275549,0.427871,0.491667,0.384315,0.376689,0.508099
3,16,B,B,A,C,B,D,A,E,A,...,0.719306,0.777890,0.730954,0.644315,1.024017,0.391090,0.988340,0.411828,0.393585,0.461372
4,17,B,B,A,C,B,C,A,E,C,...,0.313032,0.431007,0.390992,0.408874,0.447887,0.390253,0.648932,0.385935,0.370401,0.900412
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,499987,B,A,A,C,B,D,A,E,E,...,0.287454,0.543800,0.682378,1.028978,1.022741,0.683903,0.877273,0.532410,0.605397,0.884581
199996,499990,B,A,A,C,B,B,A,E,C,...,0.794881,0.432778,0.389775,0.359871,0.550013,0.492082,0.202295,0.416875,0.406205,0.758665
199997,499991,A,B,B,C,B,B,A,E,C,...,0.514487,0.060997,0.171741,0.317185,0.150340,0.122109,0.390524,0.334026,0.378987,0.839416
199998,499994,A,A,A,C,B,D,A,D,A,...,0.286144,1.061710,0.819811,0.901241,0.555339,0.844315,0.894193,0.794102,0.844279,0.890473


In [7]:
# Checks for data types
train.dtypes

id          int64
cat0       object
cat1       object
cat2       object
cat3       object
cat4       object
cat5       object
cat6       object
cat7       object
cat8       object
cat9       object
cont0     float64
cont1     float64
cont2     float64
cont3     float64
cont4     float64
cont5     float64
cont6     float64
cont7     float64
cont8     float64
cont9     float64
cont10    float64
cont11    float64
cont12    float64
cont13    float64
target    float64
dtype: object

In [8]:
# Checks for missing values

pd.DataFrame(train.isna().sum(), columns=["Train_Missing"]).join(
    pd.DataFrame(test.isna().sum(), columns=["Test_Missing"]))

Unnamed: 0,Train_Missing,Test_Missing
id,0,0.0
cat0,0,0.0
cat1,0,0.0
cat2,0,0.0
cat3,0,0.0
cat4,0,0.0
cat5,0,0.0
cat6,0,0.0
cat7,0,0.0
cat8,0,0.0


In [9]:
# Checks whether any categorical values that appear in test data set but not in train data set

print(train.select_dtypes(include="object").apply(lambda col: col.unique(), axis=0))
print(test.select_dtypes(include="object").apply(lambda col: col.unique(), axis=0))

cat0                                           [B, A]
cat1                                           [B, A]
cat2                                           [B, A]
cat3                                     [C, A, D, B]
cat4                                     [B, C, A, D]
cat5                                     [B, D, C, A]
cat6                         [A, B, C, H, D, I, G, E]
cat7                         [E, F, D, B, G, C, A, I]
cat8                            [C, A, G, E, F, D, B]
cat9    [N, O, F, K, M, I, G, H, L, B, A, J, D, C, E]
dtype: object
cat0                                           [B, A]
cat1                                           [B, A]
cat2                                           [B, A]
cat3                                     [C, A, D, B]
cat4                                     [B, C, D, A]
cat5                                     [B, C, D, A]
cat6                         [A, B, C, I, D, H, E, G]
cat7                         [E, G, B, D, F, A, C, I]
cat8          

In [10]:
# Checks for basic statistics of the training data
train.describe()

Unnamed: 0,id,cont0,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target
count,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0
mean,250018.576947,0.527335,0.460926,0.490498,0.496689,0.491654,0.510526,0.467476,0.537119,0.498456,0.474872,0.474492,0.473216,0.494561,0.508273,8.241979
std,144450.15001,0.230599,0.214003,0.253346,0.219199,0.240074,0.228232,0.210331,0.21814,0.23992,0.218007,0.255949,0.222022,0.247292,0.22295,0.746555
min,1.0,-0.118039,-0.069309,-0.056104,0.130676,0.255908,0.045915,-0.224689,0.203763,-0.260275,0.117896,0.048732,0.052608,-0.074208,0.15105,0.140329
25%,124772.5,0.405965,0.310494,0.300604,0.329783,0.284188,0.354141,0.342873,0.355825,0.332486,0.306874,0.276017,0.308151,0.289074,0.300669,7.742071
50%,250002.5,0.497053,0.427903,0.502462,0.465026,0.39047,0.488865,0.429383,0.504661,0.439151,0.43462,0.459975,0.433812,0.422887,0.4724,8.191373
75%,375226.5,0.66806,0.615113,0.647512,0.664451,0.696599,0.669625,0.573383,0.703441,0.606056,0.614333,0.691579,0.642057,0.714502,0.758447,8.728634
max,499999.0,1.058443,0.887253,1.034704,1.03956,1.055424,1.067649,1.111552,1.032837,1.040229,0.982922,1.05596,1.071444,0.975035,0.905992,10.411992


In [11]:
# Seperates predictor variables from target

y = train.target

X = train.drop(["id", "target"], axis=1)

## Modeling & Evaluation

### Sets Pipeline

In [12]:
# List categorical variables
categorical_columns = train.select_dtypes("object").columns.tolist()

# Sets pipelines to preprocess categorical data
categorical_transformer = Pipeline(steps=[("onehot", OneHotEncoder(handle_unknown='ignore', sparse=False))])

# Bundle preprocessing steps for all categorical variables
preprocessor = ColumnTransformer(transformers=[('categorical', categorical_transformer, categorical_columns)])

### Decision Tree for Baselined Performance

**Decision Tree with Default Parameters**

In [13]:
# Creates model object
model = DecisionTreeRegressor(random_state=random_state)

# Sets pipeline to preprocess categorical data followed by model training
model_pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])

In [14]:
# Performs cross-validation 
scores = -1 * cross_val_score(
    model_pipeline, X, y, cv=cross_val_folds, scoring='neg_mean_squared_error')

In [19]:
print("Root Mean Squared Error (RMSE) of Baselined (Decision Tree) Model:", np.sqrt(scores.mean()))

Root Mean Squared Error (RMSE) of Baselined (Decision Tree) Model: 0.7685053291480958


### Random Forests

**Ramdom Forests with Default Parameters**

In [20]:
# Creates model object with required parameters
model = RandomForestRegressor(n_estimators=random_forests_regressor_estimators, random_state=random_state)

# Sets pipeline to preprocess categorical data followed by model training
model_pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])

In [23]:
# Performs cross-validation 
scores = -1 * cross_val_score(
    model_pipeline, X, y, cv=cross_val_folds, scoring='neg_mean_squared_error', n_jobs=-1, verbose=10)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  4.3min remaining:  6.5min
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:  4.4min remaining:  3.0min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  4.5min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  4.5min finished


In [24]:
print("Root Mean Squared Error (RMSE) of Random Forest (with default parameters) Model:", np.sqrt( scores.mean()))

Root Mean Squared Error (RMSE) of Random Forest (with default parameters) Model: 0.7572159022647208


### Gradient Boosted Trees

**XGBoost with Default Parameters**

In [25]:
# Creates model object with required parameters
model = XGBRegressor(n_estimators = xgboost_regressor_estimators, random_state=random_state)

# Sets pipeline to preprocess categorical data followed by model training
model_pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])

In [26]:
# Performs cross-validation 
scores = -1 * cross_val_score(
    model_pipeline, X, y, cv=cross_val_folds, scoring='neg_mean_squared_error', n_jobs=-1, verbose=10)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  1.9min remaining:  2.8min
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:  1.9min remaining:  1.3min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.9min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.9min finished


In [27]:
print("Root Mean Squared Error (RMSE) of XGBoost Regressor (with default parameters) Model:", np.sqrt( scores.mean()))

Root Mean Squared Error (RMSE) of XGBoost Regressor (with default parameters) Model: 0.7464984914224029


**Fine Tuning XGBoost**

Continuing with XGBoost as it performed better than other models in cross-validation with its default parameters.

In [13]:
# Extracts processed data to use in XGBoost
X_train = preprocessor.fit_transform(X)

In [14]:
# Creates training data in an intermediate structure that XGBoost requires for model training
dtrain = xgb.DMatrix(data=X_train, label=y)

In [14]:
# Sets initial objective of the model training
param = {'objective':'reg:squarederror'}

In [15]:
# Performs cross validation setting high boosting rounds with early stopping setting
# to estimate number of boosting rounds required
scores = xgb.cv(param, dtrain, num_boost_round=1000, nfold=cross_val_folds,
             metrics={'rmse'}, verbose_eval=False, seed=0, show_stdv=False,
             callbacks=[xgb.callback.EvaluationMonitor(show_stdv=False), xgb.callback.EarlyStopping(50)])

[0]	train-rmse:5.47048	test-rmse:5.47048
[1]	train-rmse:3.86613	test-rmse:3.86613
[2]	train-rmse:2.75808	test-rmse:2.75809
[3]	train-rmse:2.00258	test-rmse:2.00259
[4]	train-rmse:1.49926	test-rmse:1.49935
[5]	train-rmse:1.17640	test-rmse:1.17665
[6]	train-rmse:0.98004	test-rmse:0.98043
[7]	train-rmse:0.86768	test-rmse:0.86826
[8]	train-rmse:0.80686	test-rmse:0.80762
[9]	train-rmse:0.77525	test-rmse:0.77617
[10]	train-rmse:0.75923	test-rmse:0.76029
[11]	train-rmse:0.75121	test-rmse:0.75240
[12]	train-rmse:0.74720	test-rmse:0.74849
[13]	train-rmse:0.74517	test-rmse:0.74658
[14]	train-rmse:0.74412	test-rmse:0.74565
[15]	train-rmse:0.74357	test-rmse:0.74520
[16]	train-rmse:0.74325	test-rmse:0.74499
[17]	train-rmse:0.74306	test-rmse:0.74489
[18]	train-rmse:0.74294	test-rmse:0.74485
[19]	train-rmse:0.74283	test-rmse:0.74485
[20]	train-rmse:0.74275	test-rmse:0.74484
[21]	train-rmse:0.74267	test-rmse:0.74487
[22]	train-rmse:0.74262	test-rmse:0.74490
[23]	train-rmse:0.74255	test-rmse:0.74493
[2

In [16]:
# Checks scores
scores

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,5.470483,0.000354,5.470485,0.002098
1,3.866131,0.000266,3.866132,0.002186
2,2.758083,0.000242,2.758091,0.002234
3,2.002578,0.000276,2.002589,0.002296
4,1.499258,0.000349,1.49935,0.002384
5,1.1764,0.00044,1.176646,0.002507
6,0.98004,0.00052,0.980426,0.002597
7,0.867684,0.000586,0.86826,0.002658
8,0.806855,0.000629,0.807616,0.002718
9,0.775252,0.000654,0.776168,0.002745


From the above scores, it reflects that 20 boosting rounds will be optimal.

**Preparing for Automated Hyperparameter Tuning with Hyperopt**

In [23]:
# Sets up a search space for XGBoost hyperparameters
space = {
    'max_depth': hp.quniform("max_depth", 2, 8, 1),
    'min_child_weight' : hp.quniform('min_child_weight', 1, 8, 1),
    'gamma': hp.uniform ('gamma', 0.0, 1.0),
    'subsample': hp.uniform("subsample", 0.6, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1.0)
}

In [24]:
def trial_loss(space):
    """
    Trial function for Hyperopt to call by passing a set a trial hyperparamets
    to train model and perform predictions.
    
    Parameters:
    ----------
    space: A set a trial hyperparamets
    
    Returns metric for Hyperopt to estimate for further tuning in search space.
    """
    
    # Converts parameter value to int as required by XGBoost
    space["max_depth"] = int(space["max_depth"])
    
    # Splits data into training and validation set
    x_train, x_test, y_train, y_test = train_test_split(X_train, y, test_size=0.20)
    
    # Sets data into intermediate structure required by XGBoost
    dtrain = xgb.DMatrix(data=x_train, label=y_train)
    dtest = xgb.DMatrix(data=x_test, label=y_test)
    evallist = [(dtest, 'eval'), (dtrain, 'train')]
 
    # Performs model training, predictions and checks for error
    model = xgb.train(space, dtrain, num_boost_round=20, evals=evallist, verbose_eval=False)
    predictions = model.predict(dtest)
    rmse = mean_squared_error(y_test, predictions, squared=False)
    
    # Flags variables to be deleted
    del dtrain, dtest, x_train, x_test, y_train, y_test
    
    return {"loss": rmse, "status": STATUS_OK, "model": model}

In [26]:
# Begins hyperparameter tuning
# (this takes time depending upon how large search space is)
trials = Trials()
best_trial = fmin(fn=trial_loss, space=space, algo=tpe.suggest, timeout=3600, trials=trials)

  0%|          | 1411/9223372036854775807 [1:00:00<6537507899664044:56:32,  2.55s/trial, best loss: 0.7374367702949155]


In [27]:
print(best_trial)

{'colsample_bytree': 0.8915545434554547, 'gamma': 0.4467031985876537, 'max_depth': 2.0, 'min_child_weight': 4.0, 'subsample': 0.8083053066146875}


In [15]:
#best_trial = {'colsample_bytree': 0.8915545434554547, 'gamma': 0.4467031985876537, 'max_depth': 2.0, 'min_child_weight': 4.0, 'subsample': 0.8083053066146875}

## Submission

**Prepares final XGBoost model with optimized parameters**

In [18]:
# converts to int according to the requirement of xgb.cv function for this parameter
best_trial["max_depth"] = int(best_trial["max_depth"])

# Sets additional parameters for traning model
params = {
    "n_estimators": 20,
    "learning_rate": 0.3,
    "objective": "reg:squarederror"
}

# Includes optimized hyperparameters with the additional parameters 
params.update(best_trial)

In [21]:
# Creates model object with required parameters
model = XGBRegressor(**params, random_state=random_state)

# Sets pipeline to preprocess categorical data in call categorical variables
model_pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])

In [22]:
# Fits the model through pipeline
model_pipeline.fit(X, y)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('categorical',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  ['cat0', 'cat1', 'cat2',
                                                   'cat3', 'cat4', 'cat5',
                                                   'cat6', 'cat7', 'cat8',
                                                   'cat9'])])),
                ('model',
                 XGBRegressor(base_score=0.5, booster='gbtree',
                              colsample_bylevel=1, colsample_bynode=1,
                              colsample_bytree=0.89...
                              gamma=0.4467031985876537, gpu_id=-1,
                              importance_type='g

In [23]:
# Performs predictions on the test data with the model after dropping 'id' column from the data
predictions = model_pipeline.predict(test.drop(["id"], axis=1))

In [29]:
# Updates prediction in the test data set

submission["target"] = predictions

# Saves the data frame into .csv file for submission
submission.to_csv("./submission.csv", index=False)