In [1]:
import pandas as pd
import numpy as np
import sklearn.ensemble
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from lazypredict.Supervised import LazyRegressor
from sklearn.metrics import mean_absolute_error
import optuna

# 1. Data preprocessing

In [2]:
# Raw competition files; the `date` column is parsed to datetime at read time.
X_train_orig = pd.read_csv("Xtrain_hgcGIrA.csv", parse_dates=['date'])
X_test_orig = pd.read_csv("Xtest.csv", parse_dates=['date'])
Y_train_orig = pd.read_csv("Ytrain_yL5OjS4.csv")

# Working copies: feature engineering below writes new columns into the
# frames it receives, so the *_orig frames are kept as pristine references.
Xtrain = X_train_orig.copy()
Xtest = X_test_orig.copy()
# Only the `p0q0` column of the label file is used as the regression target.
Y_train = Y_train_orig.copy()['p0q0']

In [3]:
def data_label(X, X_test):
    """Add ``day``, ``hour_id`` and ``station_id`` feature columns to both frames.

    Both frames are modified in place (new columns are written into them) and
    are also returned, so the call can be chained.

    Parameters
    ----------
    X, X_test : pandas.DataFrame
        Frames with a datetime ``date`` column plus ``hour`` and ``station``
        columns. ``hour`` may contain missing values.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The same ``X`` and ``X_test`` objects, with these columns added:

        * ``day``        - weekday number taken from ``date`` (``dt.weekday``).
        * ``hour_id``    - integer label code of ``hour``; ``-1`` where ``hour``
          is missing.
        * ``station_id`` - integer label code of ``station``.

    Notes
    -----
    Each encoder is fitted on the union of the values found in both frames so
    that a given hour/station maps to the same code in train and test.
    """
    frames = (X, X_test)

    for frame in frames:
        frame['day'] = frame['date'].dt.weekday

    # Fit on observed hours only. Feeding NaN to LabelEncoder relies on the
    # installed scikit-learn tolerating missing values; dropping them first
    # avoids that dependency and leaves the codes of real hours unchanged.
    hour_encoder = LabelEncoder()
    hour_encoder.fit(pd.concat([frame['hour'] for frame in frames]).dropna())
    for frame in frames:
        has_hour = frame['hour'].notna().to_numpy()
        # Start from the "missing" sentinel and overwrite the known rows, so
        # the resulting column is a genuine int64 column rather than inheriting
        # the dtype of the raw `hour` column.
        hour_codes = np.full(len(frame), -1, dtype=np.int64)
        if has_hour.any():
            hour_codes[has_hour] = hour_encoder.transform(frame.loc[has_hour, 'hour'])
        frame['hour_id'] = hour_codes

    station_encoder = LabelEncoder()
    station_encoder.fit(pd.concat([frame['station'] for frame in frames]))
    for frame in frames:
        frame['station_id'] = station_encoder.transform(frame['station'])

    return X, X_test

In [4]:
def data_drop(X, X_test):
    """Project both frames onto the fixed list of model feature columns.

    Parameters
    ----------
    X, X_test : pandas.DataFrame
        Frames that already contain every feature column listed below.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        New frames holding only the feature columns, in the listed order.
        A missing column raises ``KeyError``.
    """
    feature_columns = (
        'p1q0', 'p2q0', 'p3q0',
        'p0q1', 'p0q2', 'p0q3',
        'hour_id', 'station_id', 'train', 'day',
    )
    selected = list(feature_columns)
    return X[selected], X_test[selected]

In [5]:
def na_lag_0(X, X_test):
    """Replace every missing value in both frames with 0.

    Parameters
    ----------
    X, X_test : pandas.DataFrame
        Frames that may contain missing values.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        Zero-filled copies; the input frames are left unchanged.
    """
    filled_train, filled_test = (frame.fillna(0) for frame in (X, X_test))
    return filled_train, filled_test

In [6]:
# Preprocessing pipeline, one stage per line:
# encode categorical/date features -> keep model columns -> zero-fill gaps.
labelled_train, labelled_test = data_label(Xtrain, Xtest)
selected_train, selected_test = data_drop(labelled_train, labelled_test)
X_train, X_test = na_lag_0(selected_train, selected_test)

# 2. Model selection

In [7]:
# Fixed-seed 80/20 hold-out split used for model screening.
x_train, x_val, y_train, y_val = train_test_split(
    X_train,
    Y_train,
    test_size=0.2,
    random_state=26,
)

In [8]:
# Screen LazyPredict's regressors on the hold-out split, reporting MAE as an
# extra metric. Warnings are not suppressed, so models that fail to fit are
# reported in the cell output.
reg = LazyRegressor(
    custom_metric=mean_absolute_error,
    ignore_warnings=False,
    verbose=0,
)
models, predictions = reg.fit(x_train, x_val, y_train, y_val)

 21%|██▏       | 9/42 [00:16<01:36,  2.92s/it]

GammaRegressor model failed to execute
Some value(s) of y are out of the valid range of the loss 'HalfGammaLoss'.


 79%|███████▊  | 33/42 [06:36<01:31, 10.22s/it]

QuantileRegressor model failed to execute
Solver interior-point is not anymore available in SciPy >= 1.11.0.


100%|██████████| 42/42 [07:02<00:00, 10.05s/it]


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000913 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1636
[LightGBM] [Info] Number of data points in the train set: 24895, number of used features: 10
[LightGBM] [Info] Start training from score 0.249139


In [9]:
# Shortlist the models whose validation MAE is below 0.011, best first.
validation_mae = pd.to_numeric(models['mean_absolute_error'])
shortlist = models.loc[
    validation_mae < 0.011,
    ['mean_absolute_error', 'RMSE', 'Adjusted R-Squared'],
]
print(shortlist.sort_values(by='mean_absolute_error'))

                       mean_absolute_error  RMSE  Adjusted R-Squared
Model                                                               
ExtraTreesRegressor                   0.01  0.02                0.99
RandomForestRegressor                 0.01  0.02                0.99


# 3. Hyperparameter optimization

In [10]:
def objective(trial):
    """Optuna objective: validation MAE of an ExtraTreesRegressor.

    Splits the notebook-level ``X_train`` / ``Y_train`` into a fixed-seed 80/20
    train/validation pair, fits an ``ExtraTreesRegressor`` with the
    hyperparameters suggested by ``trial`` and returns the mean absolute error
    on the validation part.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``n_estimators`` (500..4000, step 500) and
        ``min_samples_split`` (2..15, step 1).

    Returns
    -------
    float
        Mean absolute error on the validation split (lower is better).
    """
    fit_X, holdout_X, fit_y, holdout_y = train_test_split(
        X_train, Y_train, test_size=0.2, random_state=26
    )

    # Sampled in this order: n_estimators first, then min_samples_split.
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 500, 4000, step=500),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 15, step=1),
    }

    forest = sklearn.ensemble.ExtraTreesRegressor(n_jobs=-1, random_state=26, **params)
    forest.fit(fit_X, fit_y)

    absolute_errors = np.abs(forest.predict(holdout_X) - holdout_y)
    return absolute_errors.mean()

In [11]:
# Seed the sampler so the search is reproducible on Restart & Run All, in line
# with the fixed seed (26) used for the split and the forest. TPESampler and
# direction="minimize" are Optuna's defaults, spelled out here because the
# objective returns an error (MAE) that must be minimised.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=26),
)
study.optimize(objective, n_trials=50)

[I 2023-11-12 19:59:22,184] A new study created in memory with name: no-name-e3f4a3c7-98bc-4ba1-be00-45594bab579e
[I 2023-11-12 19:59:26,895] Trial 0 finished with value: 0.00992073682206869 and parameters: {'n_estimators': 1000, 'min_samples_split': 12}. Best is trial 0 with value: 0.00992073682206869.
[I 2023-11-12 19:59:46,135] Trial 1 finished with value: 0.00992211243620301 and parameters: {'n_estimators': 4000, 'min_samples_split': 12}. Best is trial 0 with value: 0.00992073682206869.
[I 2023-11-12 20:00:00,726] Trial 2 finished with value: 0.00990218674858264 and parameters: {'n_estimators': 2000, 'min_samples_split': 5}. Best is trial 2 with value: 0.00990218674858264.
[I 2023-11-12 20:00:12,913] Trial 3 finished with value: 0.0098630927001898 and parameters: {'n_estimators': 2000, 'min_samples_split': 7}. Best is trial 3 with value: 0.0098630927001898.
[I 2023-11-12 20:00:15,637] Trial 4 finished with value: 0.009868031832409056 and parameters: {'n_estimators': 500, 'min_sampl

In [12]:
# Best hyperparameters found and the validation MAE they achieved.
print(study.best_params, study.best_value, sep='\n')

{'n_estimators': 3000, 'min_samples_split': 8}
0.009852676629771991


# 4. The final model

In [13]:
# Refit on the full training set with the tuned hyperparameters, then predict
# the target for the test set.
model = sklearn.ensemble.ExtraTreesRegressor(
    n_jobs=-1,
    random_state=26,
    **study.best_params,
)
model.fit(X_train, Y_train)
Y_ = model.predict(X_test)
Y_

array([0.2421138 , 0.23728241, 0.24400864, ..., 0.1058166 , 0.15369956,
       0.11229617])

In [14]:
# Submission layout: a 1-based row id in a column with an empty header,
# followed by the predicted `p0q0`.
result = pd.DataFrame({'': range(1, len(Y_) + 1), 'p0q0': Y_})

# Name the file after the hyperparameters that were actually used for the
# final model, instead of hard-coding one particular search outcome; for
# n_estimators=3000, min_samples_split=8 this is still 'Y_3000_8.csv'.
submission_name = 'Y_{n_estimators}_{min_samples_split}.csv'.format(**study.best_params)
result.to_csv(submission_name, header=True, index=False, columns=['', 'p0q0'])