In [None]:
import numpy as np 
import pandas as pd
import os

from xgboost import XGBRegressor
import xgboost as xgb

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import sklearn.metrics as metrics
from sklearn import preprocessing

import optuna
from optuna import Trial, visualization

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the three competition CSV files in a fixed order:
# training data, test data, then the sample submission.
train, test, sample_submission = map(
    pd.read_csv,
    (
        '../input/tabular-playground-series-aug-2021/train.csv',
        '../input/tabular-playground-series-aug-2021/test.csv',
        '../input/tabular-playground-series-aug-2021/sample_submission.csv',
    ),
)

In [None]:
# Remove the 'id' column from both frames in place, then preview the
# first rows of the training frame.
train.drop(columns=['id'], inplace=True)
test.drop(columns=['id'], inplace=True)
train.head()

In [None]:
# The feature names are exactly the columns of the test frame.
feature_cols = list(test.columns)

In [None]:
# Shuffled 3-fold splitter with a fixed seed.
kf = KFold(n_splits=3, shuffle=True, random_state=4022)

# Tag every training row with the number of the fold in which it is held out.
for fold_id, (_, holdout_rows) in enumerate(kf.split(train)):
    train.loc[holdout_rows, 'kfold'] = fold_id

# The column is created by partial assignment above; cast it to int once
# every row has received its fold number.
train['kfold'] = train['kfold'].astype(int)

In [None]:
# Detach the 'loss' column from the training frame: `pop` returns the
# column and removes it from `train` in the same step.
target = train.pop('loss')

In [None]:
# Author's observation: the distribution of loss is censored.
# Bar chart of the number of rows per distinct loss value, ordered by value.
fig, ax = plt.subplots(figsize=(14, 8))
target_cnt = target.value_counts().sort_index()
sns.barplot(x=target_cnt.index, y=target_cnt, ax=ax)

In [None]:
# Standardise the feature columns.
# The scaler's statistics are estimated on the training features only and
# then reused for the test features, so both frames go through the same
# transformation. Re-fitting on the test frame would scale the two frames
# with different means/variances and feed the model shifted test features.
scaler = preprocessing.StandardScaler()
train[feature_cols] = scaler.fit_transform(train[feature_cols])
test[feature_cols] = scaler.transform(test[feature_cols])

In [None]:
def fit_xgb(trial, xtr, ytr, xval, yval):
    """Train one Tweedie-objective XGBoost regressor with trial-suggested hyperparameters.

    Parameters
    ----------
    trial
        Optuna trial used to sample the hyperparameters
        (via ``suggest_float`` / ``suggest_int``).
    xtr, ytr
        Training features and targets. ``ytr`` is flattened to 1-D before
        fitting.
    xval, yval
        Hold-out features and targets. They are only used for scoring, not
        for fitting.

    Returns
    -------
    tuple
        ``(model, log)``: the fitted regressor and a dict with the keys
        ``"train rmse"`` and ``"val rmse"``. Both scores are computed on
        predictions clipped to a minimum of 0.1.
    """
    # `suggest_float(..., step=...)` / `suggest_float(..., log=True)` replace the
    # deprecated suggest_discrete_uniform / suggest_uniform / suggest_loguniform,
    # and `step` is passed by keyword because positional use is deprecated too.
    params = {
        'tweedie_variance_power': trial.suggest_float('tweedie_variance_power', 1.0, 2.0, step=0.1),
        'n_estimators': trial.suggest_int('n_estimators', 20, 200, step=20),
        'subsample': trial.suggest_float('subsample', 0.5, 0.9, step=0.1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 0.9, step=0.1),
        'eta': trial.suggest_float('eta', 0.01, 0.2),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'reg_alpha': trial.suggest_int('reg_alpha', 1, 50),
        'reg_lambda': trial.suggest_int('reg_lambda', 1, 50),
        'min_child_weight': trial.suggest_int('min_child_weight', 5, 20),
        'gamma': trial.suggest_float('gamma', 1e-4, 1e4, log=True),
    }

    # NOTE(review): 'gpu_hist' / 'gpu_predictor' need a GPU-enabled XGBoost
    # build; confirm these spellings are still accepted by the installed version.
    model = xgb.XGBRegressor(
        objective='reg:tweedie',
        tree_method='gpu_hist',
        predictor='gpu_predictor',
        booster='gbtree',
        n_jobs=4,
        random_state=4022,
        eval_metric="rmse",
        **params,
    )

    # np.ravel flattens the targets and also accepts inputs that are not
    # already numpy arrays.
    model.fit(xtr, np.ravel(ytr))

    # Predictions are floored at 0.1 before scoring.
    y_tr_pred = np.clip(model.predict(xtr), 0.1, None)
    y_val_pred = np.clip(model.predict(xval), 0.1, None)

    log = {
        "train rmse": np.sqrt(mean_squared_error(ytr, y_tr_pred)),
        "val rmse": np.sqrt(mean_squared_error(yval, y_val_pred)),
    }

    return model, log

In [None]:
def objective(trial):
    """Optuna objective: mean validation RMSE of ``fit_xgb`` over the CV folds.

    The folds are read from the ``kfold`` column of the module-level ``train``
    frame, so the number of folds always matches the split that was actually
    assigned instead of being hard-coded here.

    Parameters
    ----------
    trial
        Optuna trial, forwarded to ``fit_xgb`` for hyperparameter sampling.

    Returns
    -------
    float
        Average of the per-fold ``"val rmse"`` values.
    """
    fold_ids = sorted(train['kfold'].unique())

    fold_scores = []
    for fold in fold_ids:
        # Rows of the current fold are held out; all other rows are used to fit.
        val_mask = train['kfold'] == fold
        trn_mask = ~val_mask

        xtr = train.loc[trn_mask, feature_cols].values
        ytr = target.loc[trn_mask].values
        xval = train.loc[val_mask, feature_cols].values
        yval = target.loc[val_mask].values

        # Only the score log is needed here; the fitted model is discarded.
        _, log = fit_xgb(trial, xtr, ytr, xval, yval)
        fold_scores.append(log['val rmse'])

    return float(np.mean(fold_scores))

In [None]:
from optuna.samplers import TPESampler

# Seed the sampler so that the sequence of suggested hyperparameters is
# repeatable; 4022 is the seed already used for the KFold split and the
# XGBoost models in this notebook.
study = optuna.create_study(
    direction="minimize",
    sampler=TPESampler(seed=4022),
    study_name='pérdida',
)
study.optimize(objective, n_trials=1)

In [None]:
# Show the hyperparameters of the best trial recorded by the study.
study.best_trial.params

In [None]:
# Best parameters found through 15 iterations of Optuna.
params_opt = dict(
    tweedie_variance_power=1.8,
    n_estimators=120,
    subsample=0.8,
    colsample_bytree=0.7,
    eta=0.10856302051283587,
    max_depth=17,
    reg_alpha=37,
    reg_lambda=13,
    min_child_weight=20,
    gamma=35.89368788090382,
)

In [None]:
# Final model, configured like the models evaluated in fit_xgb:
# - objective='reg:tweedie' is the objective the hyperparameters were tuned
#   under; without it the regressor uses its default objective and the tuned
#   `tweedie_variance_power` is not applied.
# - random_state matches the seed used during tuning.
# - eval_metric is given to the constructor (as fit_xgb does) rather than to
#   fit(), where the argument is deprecated in recent XGBoost releases.
clf = xgb.XGBRegressor(
    objective='reg:tweedie',
    random_state=4022,
    eval_metric="rmse",
    **params_opt,
)
clf.fit(train[feature_cols], target)

In [None]:
# Predict on the test rows, restricted to the feature columns.
predictions = clf.predict(test.loc[:, feature_cols])

In [None]:
# Assemble the submission: ids come from the sample submission file and the
# predictions are cast to int.
# NOTE(review): astype(int) truncates the fractional part rather than rounding;
# confirm that this is the intended output format.
submission = pd.DataFrame(
    {
        'id': sample_submission['id'].to_numpy(),
        'loss': predictions.astype(int),
    }
)

# Write the file without the DataFrame index.
submission.to_csv('my_submission.csv', index=False)