# Step1. Import and Setup

In [None]:
from pathlib import Path
import pandas as pd
from src import (
    merge_csv,
    prepare_external_data,
    generate_full_data,
    resample_data_by_10min,
    merge_external,
    encode_datetime,
    add_location_details,
    parse_target,
    create_samples,
    train_and_predict, 
    create_ensemble_submission
)

# Folder handed to the pipeline helpers as their output location; Step 3 reads
# the sample CSVs back from it.
output_folder = Path("AICUP")
# Folder passed to prepare_external_data.
external_data_folder = Path("ExternalData")

# Training-data folders passed together to merge_csv.
input_folder = [Path(name) for name in ("TrainingData", "TrainingData_Additional")]
# Submission template CSV (the "no answer" file shipped with the test set).
upload_template = Path("TestSet_SubmissionTemplate/upload(no answer).csv")

# Step2. Data Preprocessing (optional)
Skip this step if you already have `train_x.csv`, `train_y.csv`, and `test_x.csv` in the output folder (`AICUP`).

In [None]:
# Raw inputs: the merged training CSVs and the prepared external data.
training_data = merge_csv(input_folder, output_folder)
external_data = prepare_external_data(external_data_folder, output_folder)

# Build the training frame one stage at a time; each stage receives the
# previous stage's result as its first argument.
data = generate_full_data(training_data, start_time="08:00", end_time="16:59")
data = resample_data_by_10min(data).dropna()  # discard rows with missing values
data = merge_external(data, external_data)
data = encode_datetime(data)
data = add_location_details(data)

# Independent copy passed to create_samples as the reference frame.
reference_data = data.copy()

# Columns that are NOT used as features: the timestamp plus the listed
# measurement columns (presumably the raw sensor readings and the target --
# confirm against create_samples).
non_feature_columns = (
    "DateTime",
    "WindSpeed(m/s)",
    "Pressure(hpa)",
    "Temperature(°C)",
    "Humidity(%)",
    "Sunlight(Lux)",
    "Power(mW)",
)
feature_columns = [col for col in data.columns if col not in non_feature_columns]

# Presumably writes the training samples into output_folder -- see create_samples.
create_samples(data, external_data, reference_data, feature_columns, output_folder)

# Run the submission template through the same feature stages as the training
# frame, after parse_target has processed the template rows.
upload = pd.read_csv(upload_template)
upload = parse_target(upload)
upload = merge_external(upload, external_data)
upload = encode_datetime(upload)
upload = add_location_details(upload)

# Presumably writes the test samples into output_folder -- see create_samples.
create_samples(upload, external_data, reference_data, feature_columns, output_folder)

# Step3. Data Loading

In [None]:
# Load the prepared sample CSVs from the output folder. Paths are joined with
# the pathlib "/" operator, matching how the ensemble step builds its paths.
dataset = {
    "train": {
        "X": pd.read_csv(output_folder / "train_x.csv"),
        # squeeze("columns") turns a single-column frame into a Series and,
        # unlike a bare squeeze(), never collapses a one-row file to a scalar.
        "y": pd.read_csv(output_folder / "train_y.csv").squeeze("columns"),
    },
    "test": {
        "X": pd.read_csv(output_folder / "test_x.csv"),
    },
}

# Quick sanity check of what was loaded.
print("Shapes of the data:")
print(f"Train X: {dataset['train']['X'].shape}")
print(f"Train y: {dataset['train']['y'].shape}")
print(f"Test X: {dataset['test']['X'].shape}")

# Step4. Model Training (CatBoost, LightGBM, XGBoost)

In [None]:
from catboost import CatBoostRegressor

# CatBoost on GPU: 10,000,000 iterations; verbose=100_000 is the logging
# interval (see the CatBoost parameter reference).
model = CatBoostRegressor(
    iterations=10_000_000,
    task_type="GPU",
    verbose=100_000,
)

# Fit the model and produce its predictions under the name "catboost".
train_and_predict(
    model=model,
    model_name="catboost",
    dataset=dataset,
    upload_template=upload_template,
    output_folder=output_folder,
)

In [None]:
from lightgbm import LGBMRegressor

# LightGBM with num_leaves = 2**15 - 1 (32767); all other settings are the
# library defaults.
model = LGBMRegressor(
    num_leaves=2**15 - 1,
)

# Fit the model and produce its predictions under the name "lightgbm".
train_and_predict(
    model=model,
    model_name="lightgbm",
    dataset=dataset,
    upload_template=upload_template,
    output_folder=output_folder,
)

In [None]:
from xgboost import XGBRegressor

# XGBoost: 1,000,000 trees at learning rate 0.001, histogram tree method.
# NOTE(review): device="cuda" presumably requires a GPU-enabled XGBoost 2.x
# build -- confirm against the installed version.
model = XGBRegressor(
    n_estimators=1_000_000,
    learning_rate=0.001,
    tree_method="hist",
    device="cuda",
)

# Fit the model and produce its predictions under the name "xgboost".
train_and_predict(
    model=model,
    model_name="xgboost",
    dataset=dataset,
    upload_template=upload_template,
    output_folder=output_folder,
)

# Step5. Ensemble

In [None]:
# Per-model prediction files inside output_folder, in the order
# CatBoost, LightGBM, XGBoost.
prediction_files = [
    output_folder / file_name
    for file_name in ("catboost_pred.csv", "lightgbm_pred.csv", "xgboost_pred.csv")
]

# Combine the three prediction files into the final submission CSV.
create_ensemble_submission(
    model_preds=prediction_files,
    upload_template=upload_template,
    output_file=output_folder / "submission.csv",
)