In [6]:
import pandas as pd
import seaborn as sns
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
import time
from sklearn.preprocessing import LabelEncoder

In [7]:
# Read the train and test splits (seasonal-cluster variants) from the working directory.
train, test = map(
    pd.read_csv,
    ("train_with_seasonal_cluster.csv", "test_with_seasonal_cluster.csv"),
)

In [18]:
# Working copy of the training frame without the row identifier column.
train_cluster_date = train.drop(labels=["ID"], axis="columns")

In [15]:
# Search space for RandomizedSearchCV: every value is a list of candidates.
# Single-element lists pin a setting for all sampled configurations.
params = {
    'boosting_type': ['gbdt'],
    'objective': ['regression'],
    'metric': ['rmse'],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'num_leaves': [31, 50, 100, 200],
    'max_depth': [3, 5, 10, 15],
    'min_data_in_leaf': [20, 50, 75, 100],
    'feature_fraction': [0.6, 0.8, 1.0],
    'bagging_fraction': [0.6, 0.8, 1.0],
    'bagging_freq': [1, 5, 10],
    'lambda_l1': [0, 0.1, 1.0],
    'lambda_l2': [0, 0.1, 1.0],
    'min_split_gain': [0.0, 0.01, 0.1],
    'verbosity': [-1],
    'random_state': [42],
    'n_jobs': [-1],
    'device': ['cpu']
}


def _rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    Computed as sqrt(MSE) instead of ``mean_squared_error(squared=False)``
    because the ``squared`` keyword is deprecated/removed in newer
    scikit-learn releases.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))


# RMSE is a loss: lower is better. make_scorer defaults to
# greater_is_better=True, which would make the search pick the candidate
# with the HIGHEST error. With greater_is_better=False the scorer returns
# -RMSE, so maximising the score minimises the error (best_score_ is negative).
rmse_scorer = make_scorer(_rmse, greater_is_better=False)

In [9]:
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit

In [20]:
# Lag / rolling features of the target, computed independently per cluster.
consumption_by_cluster = train_cluster_date.groupby('cluster_id')['electricity_consumption']

# Value 1 and 4 rows earlier within the same cluster.
train_cluster_date['lag_1'] = consumption_by_cluster.shift(1)
train_cluster_date['lag_4'] = consumption_by_cluster.shift(4)

# Mean of the previous 4 values of the same cluster. Both the shift and the
# rolling window run inside transform() so the window never spans rows that
# belong to a different cluster (calling .rolling() on the result of a grouped
# .shift() operates on the flat series and mixes clusters when their rows are
# interleaved).
train_cluster_date['rolling_4'] = consumption_by_cluster.transform(
    lambda s: s.shift(1).rolling(4).mean()
)

# Drop rows with any missing value (this includes the first rows of each
# cluster, which have no lag history) and renumber the index.
train_cluster_date = train_cluster_date.dropna().reset_index(drop=True)

In [19]:
# Split the prepared frame into target vector and feature matrix.
# Run this after the lag-feature cell so that X contains lag_1 / lag_4 / rolling_4.
y = train_cluster_date.loc[:, 'electricity_consumption']
X = train_cluster_date.drop(labels=['electricity_consumption'], axis='columns')

In [21]:
# Base LightGBM regressor with n_estimators fixed at 1000; it is passed as the
# `estimator` of the randomized search, which supplies the remaining
# hyper-parameters from `params`.
model = LGBMRegressor(n_estimators=1000)

In [22]:
# 5-split time-series cross-validation: each split validates on rows that come
# after its training rows. NOTE(review): this presumes the rows of X are in
# chronological order — confirm how the CSV is sorted.
tscv = TimeSeriesSplit(n_splits=5)

In [23]:
# Randomized hyper-parameter search: 20 configurations sampled from `params`,
# each evaluated with the time-series splitter and the RMSE scorer.
search = RandomizedSearchCV(
    estimator=model,
    param_distributions=params,
    n_iter=20,
    scoring=rmse_scorer,
    cv=tscv,
    verbose=1,
    # Seed the candidate sampling so a Restart & Run All evaluates the same
    # 20 configurations and selects the same model.
    random_state=42,
    # NOTE(review): `params` also sets LightGBM's own n_jobs=-1; parallel
    # search workers each spawning all-core fits may oversubscribe the CPU.
    n_jobs=-1
)

In [24]:
# Run the search (20 candidates x 5 folds = 100 fits, per the logged output).
search.fit(X, y)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [29]:
# Number of trailing training rows kept per cluster as lag history
# (lag_4 / rolling_4 need at least 4).
n_history = 8

# Seed every cluster with ITS OWN most recent observed consumption, so the
# test-time lags are built the same way as the training lags (per cluster).
history = (
    train_cluster_date
    .groupby('cluster_id')
    .tail(n_history)[['cluster_id', 'electricity_consumption']]
    .assign(_is_test=False)
)

# Flag the rows explicitly instead of relying on a positional slice, since the
# number of history rows now depends on the number of clusters.
combined = pd.concat([history, test.assign(_is_test=True)], ignore_index=True)

# Lag features per cluster, mirroring the training feature definitions.
# Test rows carry no observed consumption, so lags that would reference other
# test rows stay NaN (only the first few test rows of each cluster get values
# from the training history).
test_consumption_by_cluster = combined.groupby('cluster_id')['electricity_consumption']
combined['lag_1'] = test_consumption_by_cluster.shift(1)
combined['lag_4'] = test_consumption_by_cluster.shift(4)
combined['rolling_4'] = test_consumption_by_cluster.transform(
    lambda s: s.shift(1).rolling(4).mean()
)

# Keep only the test rows and select exactly the training feature columns, in
# the training order. This drops ID / target / helper columns and prevents
# silently misaligned features at predict time; a missing column raises KeyError.
X_test = combined.loc[combined['_is_test'], X.columns].reset_index(drop=True)

In [30]:
# Estimator refit by the search on all of (X, y) using the selected parameter
# set (available because RandomizedSearchCV refits by default).
best_model = search.best_estimator_

In [31]:
# Predict consumption for the test rows. X_test is expected to hold the same
# feature columns, in the same order, as the X used for fitting.
y_pred = best_model.predict(X_test)

In [32]:
# Pair each test ID with its prediction (column order: ID, electricity_consumption)
# and write the submission file without the index column.
submission = test[["ID"]].assign(electricity_consumption=y_pred)
submission.to_csv("submission14.csv", index=False)