# 30 Days of Machine Learning by Kaggle
### Aug 16, 2021 to Aug 30, 2021
### https://www.kaggle.com/c/30-days-of-ml/
### Predicting the amount of an insurance claim (with original data synthesized and features anonymized)

Notebook Author:

| Name:  | Pradip Kumar Das  |
| ------------: | :------------ |
| **Profile:** |  [LinkedIn](https://www.linkedin.com/in/daspradipkumar/) I [GitHub](https://github.com/PradipKumarDas "GitHub") I [Kaggle](https://www.kaggle.com/pradipkumardas "Kaggle")  |
|  **Contact:** | pradipkumardas@hotmail.com (Email)  |
| **Location:** | Bengaluru, India|

**Sections:**
- Dependencies
- Exploratory Data Analysis (EDA) & Preprocessing
- Modeling & Evaluation
- Submission

## Dependencies

In [1]:
# Loads required packages

import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

import xgboost as xgb
from xgboost import XGBRegressor, cv

from hyperopt import hp, fmin, tpe, Trials, STATUS_OK

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Initializations

random_state = 1
max_leaf_nodes = [5, 50, 500, 5000]
cross_val_folds = 5
random_forests_regressor_estimators = 100
xgboost_regressor_estimators = 100

## Exploratory Data Analysis (EDA) & Preprocessing

In [3]:
# Loads data

train = pd.read_csv(".data/train.csv")
test = pd.read_csv(".data/test.csv")
submission = pd.read_csv(".data/sample_submission.csv")

In [None]:
# Views data
train

In [None]:
test

In [None]:
# Checks for data types
train.dtypes

In [None]:
# Checks for missing values

pd.DataFrame(train.isna().sum(), columns=["Train_Missing"]).join(
    pd.DataFrame(test.isna().sum(), columns=["Test_Missing"]))

In [None]:
# Checks whether any categorical values that appear in test data set but not in train data set

print(train.select_dtypes(include="object").apply(lambda col: col.unique(), axis=0))
print(test.select_dtypes(include="object").apply(lambda col: col.unique(), axis=0))

In [None]:
# Checks for basic statistics of the training data
train.describe()

In [None]:
train.target.hist()

In [4]:
bins = pd.cut(train.target, bins=10, labels=list(range(10)), ordered=True)

cv = StratifiedKFold(n_splits=5)

In [5]:
# Seperates predictor variables from target

y = train.target

X = train.drop(["id", "target"], axis=1)

## Modeling & Evaluation

### Sets Pipeline

In [6]:
# List categorical variables
categorical_columns = X.select_dtypes("object").columns.tolist()

# Sets pipelines to preprocess categorical data
#categorical_transformer = Pipeline(steps=[("onehot", OneHotEncoder(handle_unknown='ignore', sparse=False))])
categorical_transformer = Pipeline(steps=[("ordinal", OrdinalEncoder())]) #minor improvement over one-hot encoder

numeric_columns = X.select_dtypes("float").columns.tolist()

#numeric_transformer = Pipeline(steps=[("std_scaled", StandardScaler())])
numeric_transformer = Pipeline(steps=[("std_scaled", MinMaxScaler())]) #minor improvement over standard scaling

# Bundle preprocessing steps for all categorical variables
preprocessor = ColumnTransformer(transformers=[
    ('categorical', categorical_transformer, categorical_columns),
    ('numeric', numeric_transformer, numeric_columns)
])

### Decision Tree

**Decision Tree with Default Parameters**

In [None]:
cv_generator = cv.split(train, bins)

# Creates model object
model = DecisionTreeRegressor(random_state=random_state)

# Sets pipeline to preprocess categorical data followed by model training
model_pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])

# Performs cross-validation 
scores = -1 * cross_val_score(
    model_pipeline, X, y, cv=cv_generator, scoring='neg_mean_squared_error')

In [None]:
print("Root Mean Squared Error (RMSE) of Baselined (Decision Tree) Model:", np.sqrt(scores.mean()))

### Random Forests

**Ramdom Forests with Default Parameters**

In [None]:
cv_generator = cv.split(train, bins)

# Creates model object with required parameters
model = RandomForestRegressor(random_state=random_state)

# Sets pipeline to preprocess categorical data followed by model training
model_pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])

# Performs cross-validation 
scores = -1 * cross_val_score(
    model_pipeline, X, y, cv=cv_generator, scoring='neg_mean_squared_error', n_jobs=-1, verbose=10)

In [None]:
print("Root Mean Squared Error (RMSE) of Random Forest (with default parameters) Model:", np.sqrt( scores.mean()))

### Gradient Boosted Trees

**XGBoost with Default Parameters**

In [None]:
cv_generator = cv.split(train, bins)

# Creates model object with required parameters
model = XGBRegressor(random_state=random_state)

# Sets pipeline to preprocess categorical data followed by model training
model_pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])

# Performs cross-validation 
scores = -1 * cross_val_score(
    model_pipeline, X, y, cv=cv_generator, scoring='neg_mean_squared_error', n_jobs=-1, verbose=10)

In [None]:
print("Root Mean Squared Error (RMSE) of XGBoost Regressor (with default parameters) Model:", np.sqrt( scores.mean()))

**Automated Hyperparameter Tuning with Hyperopt**

In [None]:
# Extracts processed data to use in XGBoost
X_train = preprocessor.fit_transform(X)

cv_generator = cv.split(train, bins)

for fold, (idx_train, idx_val) in enumerate(cv_generator):
    x_train = X_train[idx_train]
    y_train = y.iloc[idx_train]
    x_val = X_train[idx_val]
    y_val = y.iloc[idx_val]
    break

In [8]:
# Creates training and validation data in an intermediate structure that XGBoost requires for model training
dtrain = xgb.DMatrix(data=x_train, label=y_train)
dval = xgb.DMatrix(data=x_val, label=y_val)

In [17]:
# Sets up a search space for XGBoost hyperparameters
space = {
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
    'max_depth': hp.quniform("max_depth", 2, 6, 1),
    'min_child_weight' : hp.quniform('min_child_weight', 1, 8, 1),
    'reg_alpha' : hp.uniform('reg_alpha', 1e-8, 100),
    'reg_lambda' : hp.uniform('reg_lambda', 1e-8, 100),
    'gamma': hp.uniform ('gamma', 0.0, 1.0),
    'subsample': hp.uniform("subsample", 0.1, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.1, 1.0)
}

In [18]:
def trial_loss(space):
    """
    Trial function for Hyperopt to call by passing a set a trial hyperparamets
    to train model and perform predictions.
    
    Parameters:
    ----------
    space: A set a trial hyperparamets
    
    Returns metric for Hyperopt to estimate for further tuning in search space.
    """
    
    # Converts parameter value to int as required by XGBoost
    space["max_depth"] = int(space["max_depth"])
    space["objective"] = "reg:squarederror"
    space["tree_method"] = "gpu_hist"
    
    model = xgb.train(
        space, dtrain, 
        num_boost_round=1000, 
        evals=[(dval, 'eval'), (dtrain, 'train')],
        early_stopping_rounds=100, verbose_eval=False)
    
    predictions = model.predict(dval)
    
    rmse = mean_squared_error(y_val, predictions, squared=False)
    
    # Flags variables to be deleted
    del predictions, model
    
    return {"loss": rmse, "status": STATUS_OK}

In [19]:
# Begins hyperparameter tuning
# (this takes time depending upon how large search space is)
trials = Trials()
best_trial = fmin(fn=trial_loss, space=space, algo=tpe.suggest, max_evals=200, trials=trials)

100%|██████████| 200/200 [09:32<00:00,  2.86s/trial, best loss: 0.7181252418713265]


In [26]:
print(best_trial)

{'colsample_bytree': 0.10053766574830636, 'gamma': 0.001969493617053307, 'learning_rate': 0.17602061890621307, 'max_depth': 4.0, 'min_child_weight': 2.0, 'reg_alpha': 24.956570827069402, 'reg_lambda': 44.74305000233839, 'subsample': 0.7875939722688653}


## Submission

**Prepares final XGBoost model with optimized parameters**

In [25]:
X_test = preprocessor.fit_transform(test.drop(["id"], axis=1))

dtest = xgb.DMatrix(data=X_test)

In [67]:
test_predictions = []

cv_generator = cv.split(train, bins)
for fold, (idx_train, idx_val) in enumerate(cv_generator):
    print("fold", fold)
    x_train = X_train[idx_train]
    y_train = y.iloc[idx_train]
    x_val = X_train[idx_val]
    y_val = y.iloc[idx_val]
    
    dtrain = xgb.DMatrix(data=x_train, label=y_train)
    dval = xgb.DMatrix(data=x_val, label=y_val)
    
    best_trial["max_depth"] = int(best_trial["max_depth"])
    best_trial["objective"] = "reg:squarederror"
    
    model = xgb.train(
        best_trial, 
        dtrain, 
        num_boost_round=1000, 
        evals=[(dval, 'eval'), (dtrain, 'train')],
        early_stopping_rounds=100, verbose_eval=100)
    
    predictions = model.predict(dtest)
    
    print("Predicted on test data set.")
    test_predictions.append(predictions)

fold 0
[0]	eval-rmse:6.42229	train-rmse:6.42342
[100]	eval-rmse:0.72793	train-rmse:0.72621
[200]	eval-rmse:0.72152	train-rmse:0.71762
[300]	eval-rmse:0.71964	train-rmse:0.71424
[400]	eval-rmse:0.71855	train-rmse:0.71198
[500]	eval-rmse:0.71803	train-rmse:0.71026
[600]	eval-rmse:0.71777	train-rmse:0.70866
[700]	eval-rmse:0.71757	train-rmse:0.70736
[800]	eval-rmse:0.71746	train-rmse:0.70634
[900]	eval-rmse:0.71743	train-rmse:0.70537
[999]	eval-rmse:0.71746	train-rmse:0.70448
Predicted on test data set.
fold 1
[0]	eval-rmse:6.42398	train-rmse:6.42294
[100]	eval-rmse:0.72912	train-rmse:0.72577
[200]	eval-rmse:0.72272	train-rmse:0.71738
[300]	eval-rmse:0.72064	train-rmse:0.71386
[400]	eval-rmse:0.71982	train-rmse:0.71155
[500]	eval-rmse:0.71908	train-rmse:0.70983
[600]	eval-rmse:0.71879	train-rmse:0.70829
[700]	eval-rmse:0.71851	train-rmse:0.70704
[800]	eval-rmse:0.71850	train-rmse:0.70592
[900]	eval-rmse:0.71850	train-rmse:0.70486
[999]	eval-rmse:0.71852	train-rmse:0.70399
Predicted on tes

In [68]:
# Sets target columns with predictions
submission["target"] = np.mean(np.column_stack(test_predictions),axis=1)

# Checks for sumbission file before saving
submission

Unnamed: 0,id,target
0,0,8.071482
1,5,8.408277
2,15,8.427138
3,16,8.497139
4,17,8.109150
...,...,...
199995,499987,7.994358
199996,499990,8.496794
199997,499991,8.557329
199998,499994,8.209156


In [69]:
# Saves test predictions
submission.to_csv("./submission.csv", index=False)