# XGBoost

In [1]:
import xgboost
from xgboost import plot_importance
from bgpredict.helpers import S3Connection
from dotenv import load_dotenv
import multiprocessing
import json
import os
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import sqlalchemy
from sqlalchemy import create_engine
from datetime import datetime
from skopt import BayesSearchCV
from skopt.plots import plot_objective, plot_histogram
import matplotlib.pyplot as plt 

In [2]:
# Run load_dotenv() with the parent directory as the working directory, as the
# original cell did, so variables from the .env file end up in os.environ.
# The current directory is captured and restored afterwards, so the cell does
# not depend on the notebook's folder being named "Baseline", and the working
# directory is put back even if load_dotenv() raises.
notebook_dir = os.getcwd()
os.chdir(os.pardir)
try:
    load_dotenv()
finally:
    os.chdir(notebook_dir)

In [3]:
def clean_data(df):
    """Prepare the raw dataset frame for modelling.

    Steps, in order:
      1. Drop rows whose target ``bg`` is missing.
      2. Print the per-column null counts, then replace every remaining null
         with 0 (for the lag BG columns, 0 marks "data unavailable").
      3. Drop rows whose ``timestamp_clean`` value is a Python ``int`` and print
         how many were dropped (presumably these are the timestamps that were
         null before step 2 zero-filled them -- confirm against the data).
      4. Sort by ``timestamp_clean`` and move that column into the index.
      5. Drop the first remaining row of every ``subjectid``.
      6. Drop the ``timestamp``, ``date`` and ``time`` columns.
      7. Replace ``weekday`` with one ``ohe_<value>`` indicator column per
         distinct weekday value present in the data.

    Args:
        df: DataFrame with at least the columns ``bg``, ``timestamp_clean``,
            ``subjectid``, ``timestamp``, ``date``, ``time`` and ``weekday``.

    Returns:
        A new DataFrame indexed by ``timestamp_clean``; the frame passed in is
        not modified.
    """
    # Rows without a target value cannot be used.
    with_target = df.dropna(subset='bg')

    # Report how many nulls each column holds before they are zero-filled.
    print("Null values to be filled by column:")
    null_counts = with_target.isna().sum()
    for column, n_null in null_counts.items():
        if n_null > 0:
            print(column, n_null)
    zero_filled = with_target.fillna(0)

    # Timestamps that are plain ints are treated as invalid and removed.
    is_int_stamp = zero_filled['timestamp_clean'].apply(type) == int
    timestamped = zero_filled.loc[~is_int_stamp, :]
    print(f"Dropping {len(zero_filled) - len(timestamped)} invalid timestamps")

    # Chronological order, with the timestamp as the index instead of a column.
    ordered = timestamped.sort_values(by="timestamp_clean")
    ordered.index = ordered['timestamp_clean']
    ordered = ordered.drop(columns=['timestamp_clean'])

    # The first row of each subject has data quality issues, so it is discarded;
    # the raw timestamp parts are dropped as well.
    position_in_subject = ordered.groupby('subjectid').cumcount()
    trimmed = ordered[position_in_subject > 0].drop(columns=['timestamp', 'date', 'time'])

    # One-hot encode the weekday; names follow the sorted distinct values.
    weekday_values = np.unique(trimmed['weekday'])
    indicator_names = [f"ohe_{day}" for day in weekday_values]
    trimmed[indicator_names] = pd.get_dummies(trimmed.weekday)

    return trimmed.drop(columns="weekday")

In [4]:
# Build the Postgres connection string from environment variables.
# Indexing os.environ (instead of .get) raises KeyError right away when a
# variable is missing, rather than formatting the literal text "None" into the
# URL and failing later at connect time.
location = f"postgresql://postgres:{os.environ['db_password']}@{os.environ['db_location']}"
engine = create_engine(location)

# Read a 10,000-row sample of the dataset view. The context manager closes the
# connection as soon as the read finishes instead of leaving it open for the
# lifetime of the kernel.
with engine.connect() as conn:
    raw_df = pd.read_sql("select * from public.vw_final_dataset limit 10000", conn)

# The raw frame is kept untouched; cleaning returns a separate frame.
clean_df = clean_data(raw_df)

Null values to be filled by column:
timestamp 2
timestamp_clean 2
date 2
time 2
weekday 2
hour 2
minute 2
datediff_currentbg_lastbg_inseconds 3
bg_lag_1 88
bg_lag_2 86
bg_lag_3 77
bg_lag_4 85
bg_lag_5 87
bg_lag_6 75
bg_lag_7 77
bg_lag_8 85
bg_lag_9 94
bg_lag_10 83
bg_lag_11 93
bg_lag_12 96
Dropping 2 invalid timestamps


In [5]:
def create_cv_splits(df):
    """Build one (train, validation) index pair per subject.

    The frame's index is reset first, so the returned arrays contain 0-based
    row positions within ``df``. Subjects with fewer than 5 training rows or
    fewer than 5 validation rows are left out.

    Args:
        df: DataFrame with ``subjectid``, ``train_set`` and ``validation_set``
            columns; a row belongs to a set when the flag column equals 1.

    Returns:
        List of ``(train_idx, val_idx)`` tuples of NumPy integer arrays, one
        per retained subject, in sorted ``subjectid`` order.
    """
    frame = df.reset_index()
    in_train = frame['train_set'] == 1
    in_val = frame['validation_set'] == 1

    folds = []
    for subject in np.unique(frame['subjectid']):
        of_subject = frame['subjectid'] == subject
        train_idx = frame.index[(of_subject & in_train).to_numpy()].to_numpy()
        val_idx = frame.index[(of_subject & in_val).to_numpy()].to_numpy()
        # Keep only subjects with enough rows on both sides of the split.
        if len(train_idx) >= 5 and len(val_idx) >= 5:
            folds.append((train_idx, val_idx))
    return folds

In [6]:
def train_test_cv_split(df):
    """Split a cleaned frame into fit/test features and targets plus CV folds.

    Rows flagged ``train_set == 1`` or ``validation_set == 1`` form the fitting
    data; rows flagged ``test_set == 1`` form the test data. The CV folds are
    computed by ``create_cv_splits`` on the fitting rows before the bookkeeping
    columns are removed.

    Args:
        df: DataFrame containing ``bg``, ``train_set``, ``validation_set``,
            ``test_set``, ``subjectid`` and ``entryid``. After
            ``reset_index()`` a ``timestamp_clean`` column must also exist
            (presumably the index produced by ``clean_data`` -- confirm).

    Returns:
        ``(train_X, train_y, test_X, test_y, cv_splits)`` where the ``*_y``
        values are single-column (``bg``) DataFrames.
    """
    frame = df.reset_index()

    # Column and row selectors, computed once and reused.
    is_target = frame.columns == 'bg'
    in_fit = (frame['train_set'] == 1) | (frame['validation_set'] == 1)
    in_test = frame['test_set'] == 1

    train_X = frame.loc[in_fit, ~is_target]
    train_y = frame.loc[in_fit, is_target]
    test_X = frame.loc[in_test, ~is_target]
    test_y = frame.loc[in_test, is_target]

    # Folds need subjectid and the set flags, so build them before dropping.
    cv_splits = create_cv_splits(train_X)

    non_features = ['train_set', 'validation_set', 'test_set', 'subjectid', 'entryid', 'timestamp_clean']
    train_X = train_X.drop(columns=non_features)
    test_X = test_X.drop(columns=non_features)

    return train_X, train_y, test_X, test_y, cv_splits

In [7]:
# Scratch check of the fraction grid: six values stepping by 0.1 from 0.5.
[0.5 + tenth / 10 for tenth in range(6)]

[0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

In [8]:
# Fit and evaluate on a 1,000-row slice of the cleaned data.
train_X, train_y, test_X, test_y, cv_splits = train_test_cv_split(clean_df[1500:2500])

n_cpu = os.cpu_count()

# Seed for the Bayesian search so that a re-run proposes the same candidates.
SEARCH_SEED = 42

# NOTE: `param` is not passed to the search below; it is kept so the name stays
# defined. The original literal listed "n_estimators" twice (2, then the range);
# Python keeps the last value, so the shadowed entry was removed. The resulting
# dict is identical to what the original literal evaluated to.
param = {'max_depth': 2,
      "learning_rate": [0.1,0.2,0.3],
      "gamma" : 1, # 0 to inf; minimum loss reduction required to make a partition. Larger is more conservative
      "min_child_weight": 3, # minimum hessian weight of leaf nodes, 0 to inf
      "n_estimators": range(50,400,40), # equivalent to num rounds
      "reg_lambda": 5, # l2, 0 to inf
      "reg_alpha": 0.1, # l1, 0 to inf
     }

start = datetime.now()

# Shared grid 0.5, 0.6, ..., 1.0 for every sampling-fraction hyper-parameter.
fractions = [0.5 + i/10 for i in range(0, 6)]

# Candidate values per hyper-parameter.
params_test = {"max_depth": list(range(2, 20, 2)),
               "learning_rate": [0.01, 0.1,0.2,0.3, 0.5, 0.75, 1],
               "n_estimators": list(range(50,400,50)),
               "reg_alpha": [0,0.0001,0.001,0.01,0.1],
               "reg_lambda":[0,0.0001,0.001,0.01,0.1, 0.2, 0.3, 0.5, 0.75, 1, 2, 3],
               "gamma": [0, 0.25, 0.5, 1.0, 2, 4, 8, 16],
               "min_child_weight": [0.25, 0.5, 1, 3, 5, 7],
               "colsample_bytree": list(fractions),
               "colsample_bylevel": list(fractions),
               "colsample_bynode": list(fractions),
               "subsample": list(fractions),
               "max_delta_step": list(range(0,11))
              }

regressor = xgboost.XGBRegressor(eval_metric="rmse", verbosity=0)

# NOTE(review): no `scoring` is passed, so the scores printed below come from
# the estimator's own `score` method (presumably R^2 for a regressor, not the
# RMSE named in eval_metric) -- confirm this is the intended selection metric.
cv_search = BayesSearchCV(estimator=regressor,
                              search_spaces = params_test,
                              n_iter=24,
                              n_jobs=n_cpu,
                              cv=cv_splits,
                              random_state=SEARCH_SEED,
                              verbose=1)

cv_search.fit(train_X, train_y)
print(f'Finished in: {datetime.now()-start}')

print(f"val. score: {cv_search.best_score_}")
print(f"test score: {cv_search.score(test_X, test_y)}")
print(f"best params: {cv_search.best_params_}")

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fi

In [None]:
# Plot the objective for the first entry of the search's optimizer results;
# the return value is bound to `_` so the cell shows only the figure.
first_result = cv_search.optimizer_results_[0]
_ = plot_objective(first_result)

In [None]:
# Keep a handle on the best estimator found by the search and write it to a
# local JSON file (the S3 upload cell reads ./model.json back in).
best_estimator = cv_search.best_estimator_
best_estimator.save_model(fname='./model.json')

In [None]:
# Object key for the upload: current time of day (colons replaced by dots)
# followed by "_L" and the number of training rows.
time_tag = str(datetime.now().time()).replace(":", ".")
model_name = f"{time_tag}_L{len(train_X)}"
location = "models/xgboost/" + model_name
location

In [None]:
# Upload the saved model JSON to the S3 bucket under the key in `location`,
# then delete the local file.
s3_conn = S3Connection()
bucket = s3_conn.bucket_name

# Parse the file first so the handle is closed before the network call.
with open('./model.json') as model_file:
    model = json.load(model_file)

payload = json.dumps(model)
s3_conn.s3_client.put_object(Bucket=bucket, Key=location, Body=payload)
os.remove("./model.json")