In [None]:
import joblib

import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold, cross_val_score

from src import (
    Dataset,
    resample_data_by_10min,
    generate_full_data,
    filter_nan_days,
    encode_datetime,
    merge_external,
    parse_target,
    post_process,
    calculate_metrics,
    create_samples,
    add_location_details,
)

In [None]:
class CustomDataset(Dataset):
    """Dataset subclass whose ``pre_process`` builds train/test samples.

    The feature set combines location/time columns with the externally
    merged weather columns enumerated inside ``pre_process``.
    """

    def pre_process(self, data, upload):
        """Assemble the ``{"train": ..., "test": ...}`` sample dictionary.

        Args:
            data: Input handed to ``generate_full_data`` to start the
                training pipeline.
            upload: Input handed to ``parse_target`` to start the test
                pipeline, or ``None`` to skip building test samples.

        Returns:
            dict: ``"train"`` maps to the ``create_samples`` result for the
            processed training frame; ``"test"`` maps to the
            ``create_samples`` result for the processed upload frame, or to
            an empty dict when ``upload`` is ``None``.
        """
        # Location and datetime columns.
        # NOTE(review): presumably produced by encode_datetime and
        # add_location_details -- confirm against src.
        location_time_features = [
            "LocationCode",
            "month",
            "day",
            "hour",
            "minute",
            "timestamp",
            "latitude",
            "longitude",
            "orientation",
            "altitude",
        ]

        # Columns carrying the "466990" prefix.
        features_466990 = [
            "466990 Station Pressure (hPa)",
            "466990 Sea Level Pressure (hPa)",
            "466990 Temperature (°C)",
            "466990 Dew Point (°C)",
            "466990 Relative Humidity (%)",
            "466990 Wind Speed (m/s)",
            "466990 Wind Direction (°)",
            "466990 Max Wind Speed (m/s)",
            "466990 Max Wind Direction (°)",
            "466990 Precipitation (mm)",
            "466990 Precipitation Duration (h)",
            "466990 Sunshine Duration (h)",
            "466990 Solar Radiation (MJ/m²)",
            "466990 Visibility (km)",
            "466990 UV Index",
            "466990 Cloud Amount (0-10)",
            "466990 Ground Temp (0cm)",
            "466990 Ground Temp (5cm)",
            "466990 Ground Temp (10cm)",
            "466990 Ground Temp (20cm)",
            "466990 Ground Temp (30cm)",
            "466990 Ground Temp (50cm)",
            "466990 Ground Temp (100cm)",
        ]

        # Columns carrying the "72T250" prefix.
        features_72t250 = [
            "72T250 Station Pressure (hPa)",
            "72T250 Sea Level Pressure (hPa)",
            "72T250 Temperature (°C)",
            "72T250 Dew Point (°C)",
            "72T250 Relative Humidity (%)",
            "72T250 Wind Speed (m/s)",
            "72T250 Wind Direction (°)",
            "72T250 Max Wind Speed (m/s)",
            "72T250 Max Wind Direction (°)",
            "72T250 Precipitation (mm)",
            "72T250 Sunshine Duration (h)",
            "72T250 Solar Radiation (MJ/m²)",
            "72T250 Ground Temp (0cm)",
            "72T250 Ground Temp (5cm)",
            "72T250 Ground Temp (10cm)",
            "72T250 Ground Temp (20cm)",
            "72T250 Ground Temp (50cm)",
            "72T250 Ground Temp (100cm)",
        ]

        # Columns carrying the "C0Z100" prefix.
        features_c0z100 = [
            "C0Z100 Max Wind Speed (m/s)",
            "C0Z100 Max Wind Direction (°)",
            "C0Z100 Precipitation (mm)",
            "C0Z100 Solar Radiation (MJ/m²)",
            "C0Z100 Station Pressure (hPa)",
            "C0Z100 Temperature (°C)",
            "C0Z100 Relative Humidity (%)",
            "C0Z100 Wind Speed (m/s)",
            "C0Z100 Wind Direction (°)",
        ]

        # Columns carrying the "IFENGL2" prefix.
        features_ifengl2 = [
            "IFENGL2 Temperature",
            "IFENGL2 Dew Point",
            "IFENGL2 Humidity",
            "IFENGL2 Wind",
            "IFENGL2 Speed",
            "IFENGL2 Gust",
            "IFENGL2 Pressure",
            "IFENGL2 Precip. Rate.",
            "IFENGL2 Precip. Accum.",
            "IFENGL2 UV",
            "IFENGL2 Solar",
        ]

        # Columns carrying the "IHUALIEN3" prefix.
        features_ihualien3 = [
            "IHUALIEN3 Temperature",
            "IHUALIEN3 Dew Point",
            "IHUALIEN3 Humidity",
            "IHUALIEN3 Wind",
            "IHUALIEN3 Speed",
            "IHUALIEN3 Gust",
            "IHUALIEN3 Pressure",
            "IHUALIEN3 Precip. Rate.",
            "IHUALIEN3 Precip. Accum.",
            "IHUALIEN3 UV",
            "IHUALIEN3 Solar",
        ]

        # The concatenation order fixes the order in which the feature names
        # are passed to create_samples, so the groups must stay in this sequence.
        feature_columns = (
            location_time_features
            + features_466990
            + features_72t250
            + features_c0z100
            + features_ifengl2
            + features_ihualien3
        )
        target_column = ["Power(mW)"]

        def add_shared_features(frame):
            # Steps applied identically to the training and upload frames.
            return (
                frame.pipe(merge_external, external_file="data/external.csv")
                .pipe(encode_datetime)
                .pipe(add_location_details)
            )

        train_frame = add_shared_features(
            generate_full_data(data, start_time="09:00", end_time="16:59")
            .pipe(resample_data_by_10min)
            .pipe(filter_nan_days)
        )
        # A copy of the processed training frame is passed as the reference
        # argument for both the train and the test sample construction.
        reference_data = train_frame.copy()
        train_samples = create_samples(
            train_frame, reference_data, feature_columns, target_column
        )

        test_samples = {}
        if upload is not None:
            test_frame = add_shared_features(parse_target(upload))
            test_samples = create_samples(
                test_frame, reference_data, feature_columns, target_column
            )

        return {"train": train_samples, "test": test_samples}

# Instantiate the dataset from the data and upload CSV paths, then print its
# string representation.
dataset = CustomDataset(
    data_file="data/all_data.csv",
    upload_file="data/upload.csv",
)
print(dataset)

In [None]:
# Run identifier used as the prefix of the saved model and submission files.
# NOTE(review): `id` shadows the Python builtin of the same name; the name is
# kept here because it is a notebook-level variable other cells may read.
id = 36

# One hyperparameter set shared by the cross-validated and the final model.
LGBM_PARAMS = {"num_leaves": 65535, "verbosity": -1}

X_train = dataset["train"]["X"]
y_train = dataset["train"]["y"]

# Shuffled 20-fold cross-validation with a fixed seed for the splits.
lgbm = LGBMRegressor(**LGBM_PARAMS)
kf = KFold(n_splits=20, shuffle=True, random_state=42)
scores = cross_val_score(
    lgbm,
    X_train,
    y_train,
    cv=kf,
    scoring="neg_mean_absolute_error",
)
# The scorer reports negated MAE, so flip the sign before printing.
print("Average MAE:", -scores.mean())

# Fit a fresh estimator on every training sample, print its metrics on that
# same training data, and persist the fitted model.
lgbm = LGBMRegressor(**LGBM_PARAMS)
lgbm.fit(X_train, y_train)
train_predictions = lgbm.predict(X_train)
print(calculate_metrics(y_train, train_predictions))
joblib.dump(lgbm, f"{id}_lgbm.pkl")

# Predict the test samples and write the post-processed predictions into the
# "答案" column of the upload file.
predictions = lgbm.predict(dataset["test"]["X"])
upload = pd.read_csv("data/upload.csv")
upload["答案"] = post_process(predictions)
upload.to_csv(f"{id}.csv", index=False)