In [None]:
import pandas as pd

from src import (
    Dataset,
    resample_data_by_10min,
    generate_full_data,
    filter_nan_days,
    encode_datetime,
    merge_external,
    parse_target,
    post_process,
    calculate_metrics,
    create_samples,
)

In [None]:
class CustomDataset(Dataset):
    """Dataset variant whose preprocessing yields ``train`` / ``test`` sample sets."""

    def pre_process(self, data, upload):
        """Run the preprocessing pipeline and build model samples.

        Args:
            data: Raw input handed to ``generate_full_data`` (restricted to the
                09:00-16:59 window via ``start_time`` / ``end_time``).
            upload: Upload input handed to ``parse_target``, or ``None`` to
                skip building the test split.

        Returns:
            dict: ``{"train": ..., "test": ...}``. ``"train"`` is the result of
            ``create_samples`` on the processed data; ``"test"`` is the result
            of ``create_samples`` on the processed upload, or an empty dict
            when ``upload`` is ``None``.
        """
        external_csv = "data/10min.csv"
        target = ["Power(mW)"]
        features = [
            "LocationCode",
            "month",
            "day",
            "hour",
            "minute",
            "timestamp",
            "PS01",
            "PS02",
            "TX01",
            "TD01",
            "RH01",
            "WD01",
            "WD02",
            "WD07",
            "WD08",
            "PP01",
            "SS01",
            "GR01",
            "TS03",
        ]

        # Training pipeline, applied step by step in a fixed order.
        frame = generate_full_data(data, start_time="09:00", end_time="16:59")
        frame = resample_data_by_10min(frame)
        frame = filter_nan_days(frame)
        frame = merge_external(frame, external_file=external_csv)
        frame = encode_datetime(frame)

        samples = {
            "train": create_samples(frame, frame, features, target),
            "test": {},
        }
        if upload is None:
            return samples

        # The upload rows get the same external merge and datetime encoding.
        upload_frame = parse_target(upload)
        upload_frame = merge_external(upload_frame, external_file=external_csv)
        upload_frame = encode_datetime(upload_frame)

        # The processed training frame is passed as the second argument here too.
        samples["test"] = create_samples(upload_frame, frame, features, target)
        return samples

# Instantiate the dataset from the data and upload CSV files, then print it.
dataset = CustomDataset(
    data_file="./data/all_data.csv",
    upload_file="./data/upload.csv",
)
print(dataset)

In [None]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold, cross_val_score

# Estimate out-of-sample MAE with 20-fold shuffled cross-validation.
# verbosity=-1 matches the configuration used for the final fit in this
# notebook and keeps the 20 per-fold fits from flooding the cell output.
# NOTE(review): shuffle=True can place rows from the same day in different
# folds; if neighbouring samples are correlated the CV MAE may be optimistic
# -- confirm whether a grouped or time-based split is more appropriate.
lgbm = LGBMRegressor(num_leaves=4095, verbosity=-1)
kf = KFold(n_splits=20, shuffle=True, random_state=42)
scores = cross_val_score(
    lgbm,
    dataset["train"]["X"],
    dataset["train"]["y"],
    cv=kf,
    scoring="neg_mean_absolute_error",
)

# The scorer returns negated MAE (higher is better), so flip the sign.
print("Average MAE:", -scores.mean())

In [None]:
# Fit the final model on the whole training split, then report metrics on
# that same data (an in-sample fit check, not a generalisation estimate).
train_split = dataset["train"]
lgbm = LGBMRegressor(num_leaves=4095, verbosity=-1)
lgbm.fit(train_split["X"], train_split["y"])
train_predictions = lgbm.predict(train_split["X"])
calculate_metrics(train_split["y"], train_predictions)

In [None]:
# Predict the test split, put the post-processed predictions into the
# "答案" column of the upload file, and write the submission CSV.
predictions = lgbm.predict(dataset["test"]["X"])
upload = pd.read_csv("data/upload.csv").assign(
    **{"答案": post_process(predictions)}
)
upload.to_csv("submission.csv", index=False)

In [None]:
import joblib
# Persist the fitted model to disk so it can be reloaded without retraining.
joblib.dump(value=lgbm, filename="lgbm.pkl")