# Imports

In [None]:
import pandas as pd
import numpy as np

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor

from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import mean_squared_error

from warnings import simplefilter
import gc

# Competition data locations (Kaggle input directory layout).
train_path = "../input/tabular-playground-series-aug-2021/train.csv"
test_path = "../input/tabular-playground-series-aug-2021/test.csv"
submission_path = "../input/tabular-playground-series-aug-2021/sample_submission.csv"

# Single seed reused by every randomized step in the notebook.
rs = 69420

# Silence all warnings so the notebook output stays readable.
simplefilter(action="ignore")

# Preprocessing

In [None]:
# Load both CSVs the same way: the first column becomes the row index
# rather than a feature column.
train, test = (
    pd.read_csv(csv_path, index_col=0) for csv_path in (train_path, test_path)
)

In [None]:
# Notebook display: preview the first rows of the training frame.
train.head()

In [None]:
# Notebook display: preview the first rows of the test frame.
test.head()

In [None]:
# Pull the regression target out of the training frame and keep every
# remaining column as the feature matrix; both are plain NumPy arrays.
y = train["loss"].values
X = train.drop(columns=["loss"]).values

# Notebook display: shapes of the feature matrix and the target vector.
X.shape, y.shape

In [None]:
# Hold out 25% of the training rows for a quick sanity-check evaluation.
# The split is stratified on the target itself, so the distribution of
# target values is kept similar in both parts.
# NOTE: X_test / y_test are this hold-out slice, not the competition test set
# (that one lives in `test`).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=rs,
    shuffle=True,
    stratify=y,
)

# Notebook display: shapes of the four resulting arrays.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Learn the scaling statistics from the training part only, then apply the
# same fitted transform to both parts so the hold-out rows never influence
# the scaler.
sc = RobustScaler().fit(X_train)

X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# HistGradientBoostingRegressor

In [None]:
# Baseline regressor (named `clf` for historical reasons; it is a regressor).
# A small learning rate is paired with a very large iteration cap; early
# stopping is what actually decides how many boosting iterations are run.
clf = HistGradientBoostingRegressor(
    max_iter=10000,
    learning_rate=0.01,
    early_stopping=True,
    random_state=rs,
)

In [None]:
%%time
# Fit the baseline model on the scaled training part of the hold-out split.
# The cell magic above must stay on the first line of the cell; it reports
# how long the fit takes.
clf.fit(X_train, y_train)

In [None]:
# Score the baseline on the hold-out slice using root mean squared error.
y_pred = clf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f"RMSE: {round(rmse, 5)}")

# Cross Validated

In [None]:
# Stratified k-fold cross-validation.
#
# For every fold: fit a fresh scaler and model on the training indices, report
# RMSE on the validation indices, and add the model's predictions for the
# competition test set to a running sum. After the loop the sum is divided by
# the number of folds, so `test_preds` ends up as the fold-averaged prediction.
#
# Changes from the previous version of this cell:
#   * the fold count is defined once (`n_splits`) and reused for the final
#     averaging instead of a separate hard-coded 10;
#   * the unused `eval_set` and the unused (but costly) prediction over the
#     training fold were removed;
#   * the test frame is converted to a NumPy array once, outside the loop, so
#     the scaler sees the same input type it was fitted on (`X` is an array).
n_splits = 10
scores = []

# Feature matrix for the submission rows, in the same array form as `X`.
test_features = test.values
# Running sum of per-fold test predictions; averaged after the loop.
test_preds = np.zeros(test_features.shape[0])

print("Training...")

kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=rs)
for fold, (tr_index, val_index) in enumerate(kf.split(X, y)):
    print("⁙" * 20)
    print(f"Fold {fold + 1}")

    X_train, X_val = X[tr_index], X[val_index]
    y_train, y_val = y[tr_index], y[val_index]

    # Fit the scaler on this fold's training rows only, so no validation or
    # test information influences the scaling statistics.
    sc = RobustScaler()
    X_train = sc.fit_transform(X_train)
    X_val = sc.transform(X_val)

    # Only X_train / y_train are passed to fit, so early stopping relies on
    # whatever the estimator holds out internally; X_val is used purely for
    # scoring below.
    model = HistGradientBoostingRegressor(
        learning_rate=0.01,
        max_iter=10000,
        random_state=rs,
        early_stopping=True,
    )
    model.fit(X_train, y_train)

    val_preds = model.predict(X_val)

    rmse = np.sqrt(mean_squared_error(y_val, val_preds))
    scores.append(rmse)
    print(f"RMSE: {round(rmse, 5)}")

    # Scale the test rows with this fold's scaler before predicting.
    test_sub = sc.transform(test_features)
    test_preds += model.predict(test_sub)

print("-" * 20)
print("Training Done!")
print(f"Mean RMSE: {round(np.mean(scores), 5)}")

# Turn the accumulated sum into the mean prediction across folds.
test_preds /= n_splits

# Submission

In [None]:
# Start from the sample submission and overwrite its `loss` column with the
# fold-averaged predictions, then write the file without the pandas index.
# NOTE(review): the predictions are assigned by position, so this assumes the
# sample submission rows are in the same order as test.csv - confirm.
submission = pd.read_csv(submission_path).assign(loss=test_preds)
submission.to_csv("submission.csv", index=False)