In [None]:
import joblib

import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold, cross_val_score

from src import (
    Dataset,
    resample_data_by_10min,
    generate_full_data,
    filter_nan_days,
    encode_datetime,
    merge_external,
    parse_target,
    post_process,
    calculate_metrics,
    create_samples,
    add_location_details,
)

In [None]:
class CustomDataset(Dataset):
    """Dataset subclass whose ``pre_process`` assembles train/test sample sets.

    Features are the location/time columns plus the external weather columns
    from the DONGHUA, HUALIEN, IFENGL2 and IHUALIEN3 sources; the target is
    ``Power(mW)``.
    """

    def pre_process(self, data, upload):
        """Turn the raw frames into ``{"train": ..., "test": ...}`` samples.

        Args:
            data: Raw measurements, handed to ``generate_full_data``.
            upload: Submission template handed to ``parse_target``, or ``None``
                when no test set should be built.

        Returns:
            Dict with keys ``"train"`` and ``"test"``. Each value is whatever
            ``create_samples`` returns; ``"test"`` stays an empty dict when
            ``upload`` is ``None``.
        """
        # Feature columns, grouped by origin. The concatenation order below
        # fixes the column order passed to ``create_samples``.
        site_and_time_columns = [
            "LocationCode", "month", "day", "hour", "minute", "timestamp",
            "latitude", "longitude", "orientation", "altitude",
        ]
        donghua_columns = [
            "DONGHUA Max Wind Speed (m/s)",
            "DONGHUA Max Wind Direction (°)",
            "DONGHUA Precipitation (mm)",
            "DONGHUA Solar Radiation (MJ/m²)",
            "DONGHUA Station Pressure (hPa)",
            "DONGHUA Temperature (°C)",
            "DONGHUA Relative Humidity (%)",
            "DONGHUA Wind Speed (m/s)",
            "DONGHUA Wind Direction (°)",
        ]
        hualien_columns = [
            "HUALIEN Station Pressure (hPa)",
            "HUALIEN Sea Level Pressure (hPa)",
            "HUALIEN Temperature (°C)",
            "HUALIEN Dew Point (°C)",
            "HUALIEN Relative Humidity (%)",
            "HUALIEN Wind Speed (m/s)",
            "HUALIEN Wind Direction (°)",
            "HUALIEN Max Wind Speed (m/s)",
            "HUALIEN Max Wind Direction (°)",
            "HUALIEN Precipitation (mm)",
            "HUALIEN Precipitation Duration (h)",
            "HUALIEN Sunshine Duration (h)",
            "HUALIEN Solar Radiation (MJ/m²)",
            "HUALIEN Visibility (km)",
            "HUALIEN UV Index",
            "HUALIEN Cloud Amount (0-10)",
            "HUALIEN Ground Temp (0cm)",
            "HUALIEN Ground Temp (5cm)",
            "HUALIEN Ground Temp (10cm)",
            "HUALIEN Ground Temp (20cm)",
            "HUALIEN Ground Temp (30cm)",
            "HUALIEN Ground Temp (50cm)",
            "HUALIEN Ground Temp (100cm)",
        ]
        ifengl2_columns = [
            "IFENGL2 Temperature",
            "IFENGL2 Dew Point",
            "IFENGL2 Humidity",
            "IFENGL2 Wind",
            "IFENGL2 Speed",
            "IFENGL2 Gust",
            "IFENGL2 Pressure",
            "IFENGL2 Precip. Rate.",
            "IFENGL2 Precip. Accum.",
            "IFENGL2 UV",
            "IFENGL2 Solar",
        ]
        ihualien3_columns = [
            "IHUALIEN3 Temperature",
            "IHUALIEN3 Dew Point",
            "IHUALIEN3 Humidity",
            "IHUALIEN3 Wind",
            "IHUALIEN3 Speed",
            "IHUALIEN3 Gust",
            "IHUALIEN3 Pressure",
            "IHUALIEN3 Precip. Rate.",
            "IHUALIEN3 Precip. Accum.",
            "IHUALIEN3 UV",
            "IHUALIEN3 Solar",
        ]
        feature_columns = (
            site_and_time_columns
            + donghua_columns
            + hualien_columns
            + ifengl2_columns
            + ihualien3_columns
        )
        target_column = ["Power(mW)"]

        def attach_features(frame):
            # Steps shared by the train and upload paths: merge the external
            # CSV, encode the datetime, then add location details.
            frame = merge_external(frame, external_file="data/external.csv")
            frame = encode_datetime(frame)
            return add_location_details(frame)

        # Training path: expand to the 09:00-16:59 range passed to
        # generate_full_data, resample, filter NaN days, then attach features.
        train_frame = generate_full_data(data, start_time="09:00", end_time="16:59")
        train_frame = resample_data_by_10min(train_frame)
        train_frame = filter_nan_days(train_frame)
        train_frame = attach_features(train_frame)

        # A separate copy of the processed training frame is passed as the
        # reference argument for both the train and the test samples.
        reference_data = train_frame.copy()

        samples = {
            "train": create_samples(
                train_frame, reference_data, feature_columns, target_column
            ),
            "test": {},
        }

        if upload is not None:
            upload_frame = attach_features(parse_target(upload))
            samples["test"] = create_samples(
                upload_frame, reference_data, feature_columns, target_column
            )

        return samples

# Build the dataset from the raw measurement file and the submission template,
# then print its representation as a quick sanity check.
dataset = CustomDataset(
    data_file="data/all_data.csv",
    upload_file="data/upload.csv",
)
print(dataset)

In [None]:
# Identifier used to name this run's artifacts (model pickle and submission CSV).
# Called `run_id` rather than `id` so the builtin `id()` is not shadowed for the
# remainder of the kernel session.
run_id = 36

# Hyperparameters shared by the cross-validated estimate and the final fit,
# defined once so the two estimators cannot drift apart.
lgbm_params = {"num_leaves": 65535, "verbosity": -1}

train_X = dataset["train"]["X"]
train_y = dataset["train"]["y"]

# 20-fold shuffled cross-validation. The scorer returns negated MAE, so the
# sign is flipped back before printing.
kf = KFold(n_splits=20, shuffle=True, random_state=42)
scores = cross_val_score(
    LGBMRegressor(**lgbm_params),
    train_X,
    train_y,
    cv=kf,
    scoring="neg_mean_absolute_error",
)
print("Average MAE:", -scores.mean())

# Refit on every training sample; the metrics printed here are in-sample.
lgbm = LGBMRegressor(**lgbm_params)
lgbm.fit(train_X, train_y)
print(calculate_metrics(train_y, lgbm.predict(train_X)))
joblib.dump(lgbm, f"{run_id}_lgbm.pkl")

# Predict on the test samples and write the post-processed predictions into the
# answer column of the upload template.
predictions = lgbm.predict(dataset["test"]["X"])
upload = pd.read_csv("data/upload.csv")
upload["答案"] = post_process(predictions)
upload.to_csv(f"{run_id}.csv", index=False)