In [3]:
from sklearnex import patch_sklearn
patch_sklearn()

import pandas as pd
import seaborn as sns
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import make_scorer
import time
from sklearn.preprocessing import LabelEncoder

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [6]:
# Load the train and test tables (in that order) from the working directory.
train, test = (
    pd.read_csv(csv_name)
    for csv_name in (
        "train_with_seasonal_cluster.csv",
        "test_with_seasonal_cluster.csv",
    )
)

In [7]:
# Discrete candidate values per hyperparameter; handed to RandomizedSearchCV
# as `param_distributions` when tuning the XGBRegressor.
param_dist = dict(
    n_estimators=[100, 300, 500, 800],
    max_depth=[3, 5, 7, 10],
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    gamma=[0, 1, 5],
    min_child_weight=[1, 3, 5],
)

In [11]:
# Name of the label column: train_and_eval_tuned drops it from the feature
# matrix and uses it as the regression target.
target_col = "electricity_consumption"

In [None]:
def train_and_eval_tuned(
    df_input,
    label,
    *,
    target=None,
    param_distributions=None,
    n_iter=50,
    cv=3,
    test_size=0.2,
    random_state=42,
):
    """Tune an XGBRegressor with a randomized search and report hold-out RMSE.

    The frame is split into a training part and a validation part. A
    ``RandomizedSearchCV`` is fitted on the training part only, and the
    refitted best estimator is scored on the validation part. The RMSE and
    the winning hyperparameters are printed.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Frame holding the feature columns plus the target column.
    label : str
        Free-text tag inserted into the printed RMSE line.
    target : str, optional
        Name of the target column. Defaults to the notebook-level
        ``target_col``.
    param_distributions : dict, optional
        Search space passed to ``RandomizedSearchCV``. Defaults to the
        notebook-level ``param_dist``.
    n_iter : int, default 50
        Number of parameter settings sampled by the search.
    cv : int, default 3
        Number of cross-validation folds used by the search.
    test_size : float, default 0.2
        Fraction of ``df_input`` held out for the validation RMSE.
    random_state : int, default 42
        Seed shared by the split, the regressor and the search.

    Returns
    -------
    XGBRegressor
        ``best_estimator_`` of the search, i.e. the best configuration
        refitted on the training part of the split (not on all of
        ``df_input``).

    Raises
    ------
    KeyError
        If the target column is not present in ``df_input``.
    """
    # Fall back to the notebook globals only when the caller gives no override,
    # so the two-argument call keeps its previous behaviour.
    target_name = target_col if target is None else target
    search_space = param_dist if param_distributions is None else param_distributions

    if target_name not in df_input.columns:
        raise KeyError(f"target column {target_name!r} not found in df_input")

    X = df_input.drop(columns=[target_name])
    y = df_input[target_name]
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    xgb = XGBRegressor(objective="reg:squarederror", random_state=random_state)

    # Candidates are scored by cross-validated negative RMSE on X_train only;
    # the validation part stays untouched until the final check below.
    random_search = RandomizedSearchCV(
        estimator=xgb,
        param_distributions=search_space,
        n_iter=n_iter,
        scoring='neg_root_mean_squared_error',
        cv=cv,
        verbose=1,
        random_state=random_state,
        n_jobs=-1
    )

    random_search.fit(X_train, y_train)
    best_model = random_search.best_estimator_

    # Hold-out evaluation of the refitted best estimator.
    y_pred = best_model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    print(f"Tuned {label} RMSE: {rmse:.4f}")
    print(f"Best params: {random_search.best_params_}")

    return best_model

In [19]:
# Training frame without the "ID" column; the target column is kept.
train_cluster_date = train.drop(columns="ID")

In [20]:
# Run the randomized search on the ID-free training frame and keep the
# returned best estimator for the test-set predictions.
model = train_and_eval_tuned(
    df_input=train_cluster_date,
    label="With seasonal and clusterid features",
)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
Tuned With seasonal and clusterid features RMSE: 27.9672
Best params: {'subsample': 0.6, 'n_estimators': 800, 'min_child_weight': 1, 'max_depth': 5, 'learning_rate': 0.05, 'gamma': 0, 'colsample_bytree': 0.6}


In [21]:
# Test features: the test table without its "ID" column.
X_test = test.drop(columns="ID")

In [22]:
# Predict the target for every test row with the tuned model returned by
# train_and_eval_tuned.
y_test_pred = model.predict(X_test)

In [23]:
# Pair each test ID with its prediction, then write the two-column table
# to disk without the index.
submission = test[["ID"]].assign(electricity_consumption=y_test_pred)
submission.to_csv("submission8_50can.csv", index=False)