In [1]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

import lightgbm as lgb

In [2]:
# Import data
# Directory holding the preprocessed parquet files, relative to this notebook.
DATA_DIR = '../../preprocessing/data'


def _load_frames(prefix, drop_cols):
    """Load `<prefix>_A`, `<prefix>_B` and `<prefix>_C` parquet files.

    Parameters
    ----------
    prefix : str
        File-name prefix: 'obs', 'est' or 'test'.
    drop_cols : str or list of str
        Column(s) removed from every frame right after loading.

    Returns
    -------
    list of pandas.DataFrame
        One frame per site, in the order A, B, C.
    """
    return [
        pd.read_parquet(f'{DATA_DIR}/{prefix}_{site}.parquet').drop(columns=drop_cols)
        for site in ('A', 'B', 'C')
    ]


# The obs files only carry 'date_forecast'; est/test files additionally carry 'date_calc'.
obs_A, obs_B, obs_C = _load_frames('obs', 'date_forecast')
est_A, est_B, est_C = _load_frames('est', ['date_forecast', 'date_calc'])
test_A, test_B, test_C = _load_frames('test', ['date_forecast', 'date_calc'])

In [3]:


# Concatenate observed + estimated data per site, keep a fixed column subset,
# and strip characters from the column names before building the X / y splits.

# Columns kept for every site (target first, then the features), written with
# the raw names used in the parquet files.
_SELECTED_COLUMNS = [
    'pv_measurement', 'absolute_humidity_2m:gm3', 'air_density_2m:kgm3',
    'clear_sky_energy_1h:J', 'clear_sky_rad:W', 'dew_or_rime:idx',
    'dew_point_2m:K', 'diffuse_rad:W', 'diffuse_rad_1h:J', 'direct_rad:W',
    'direct_rad_1h:J', 'effective_cloud_cover:p', 'elevation:m',
    'fresh_snow_12h:cm', 'fresh_snow_1h:cm', 'fresh_snow_24h:cm',
    'fresh_snow_3h:cm', 'fresh_snow_6h:cm', 'is_day:idx',
    'is_in_shadow:idx', 'msl_pressure:hPa', 'precip_5min:mm',
    'precip_type_5min:idx', 'pressure_100m:hPa', 'pressure_50m:hPa',
    'prob_rime:p', 'rain_water:kgm2', 'relative_humidity_1000hPa:p',
    'sfc_pressure:hPa', 'snow_depth:cm', 'snow_melt_10min:mm',
    'snow_water:kgm2', 'sun_azimuth:d', 'sun_elevation:d',
    'super_cooled_liquid_water:kgm2', 't_1000hPa:K', 'total_cloud_cover:p',
    'visibility:m', 'wind_speed_10m:ms', 'wind_speed_u_10m:ms',
    'wind_speed_v_10m:ms', 'wind_speed_w_1000hPa:ms',
]

# Deletes each of  [ ] , { } ( ) " ' : \  from a string.
_FORBIDDEN_CHARS = str.maketrans('', '', '[],{}()"\':\\')


def _clean_name(col):
    """Return `col` with every character in `_FORBIDDEN_CHARS` removed."""
    return col.translate(_FORBIDDEN_CHARS)


def _combine(observed, estimated):
    """Stack two frames row-wise and return the selected, renamed columns.

    Names are cleaned *before* the selection, and the selection uses the
    cleaned form of `_SELECTED_COLUMNS`. The result is the same as selecting
    first and cleaning afterwards, but the cell also keeps working when the
    input frames already carry cleaned column names (e.g. when this cell is
    re-run after a later cell renamed `obs_*` / `est_*` in place).
    The input frames themselves are not modified.
    """
    combined = pd.concat([observed, estimated])
    combined.columns = [_clean_name(col) for col in combined.columns]
    return combined[[_clean_name(col) for col in _SELECTED_COLUMNS]]


A = _combine(obs_A, est_A)
B = _combine(obs_B, est_B)
C = _combine(obs_C, est_C)

# Data splits for submissions
X_A = A.drop(columns='pv_measurement')
y_A = A['pv_measurement']
X_B = B.drop(columns='pv_measurement')
y_B = B['pv_measurement']
X_C = C.drop(columns='pv_measurement')
y_C = C['pv_measurement']

In [7]:
# Deletes each of  [ ] , { } ( ) " ' : \  from a string.
_STRIP_CHARS = str.maketrans('', '', '[],{}()"\':\\')

# Rename the columns of every loaded frame in place. Removing characters is
# idempotent, so re-running this cell leaves the names unchanged.
for _frame in (obs_A, est_A, test_A,
               obs_B, est_B, test_B,
               obs_C, est_C, test_C):
    _frame.columns = [name.translate(_STRIP_CHARS) for name in _frame.columns]


def _split_target(frame):
    """Return `(features, target)` where target is the 'pv_measurement' column."""
    return frame.drop(columns='pv_measurement'), frame['pv_measurement']


# Values used in LightGBM: fit on the observed frames, validate on the estimated frames.
X_train_A, y_train_A = _split_target(obs_A)
X_test_A, y_test_A = _split_target(est_A)

X_train_B, y_train_B = _split_target(obs_B)
X_test_B, y_test_B = _split_target(est_B)

X_train_C, y_train_C = _split_target(obs_C)
X_test_C, y_test_C = _split_target(est_C)

# Concatenate
# NOTE(review): this rebinds A/B/C and X_*/y_* using *all* columns of the
# obs/est frames, replacing the column-subset versions built by the earlier
# cell that defines the same names. Confirm which variant the submission
# model is meant to be trained on.
A = pd.concat([obs_A, est_A])
B = pd.concat([obs_B, est_B])
C = pd.concat([obs_C, est_C])

# Data splits for submissions
X_A, y_A = _split_target(A)
X_B, y_B = _split_target(B)
X_C, y_C = _split_target(C)



In [None]:

def _dataset_pair(X_fit, y_fit, X_val, y_val):
    """Build a training Dataset and a validation Dataset that references it.

    Passing `reference=` ties the validation set to the training set, as the
    LightGBM API expects for validation data.
    """
    fit_set = lgb.Dataset(X_fit, label=y_fit)
    val_set = lgb.Dataset(X_val, label=y_val, reference=fit_set)
    return fit_set, val_set


# One (train, validation) pair per site: observed data for fitting,
# estimated data for validation.
train_data_A, val_data_A = _dataset_pair(X_train_A, y_train_A, X_test_A, y_test_A)
train_data_B, val_data_B = _dataset_pair(X_train_B, y_train_B, X_test_B, y_test_B)
train_data_C, val_data_C = _dataset_pair(X_train_C, y_train_C, X_test_C, y_test_C)

# Model hyper-parameters shared by all three sites.
params = {
    'boosting_type': 'dart',
    'objective': 'regression',
    'metric': 'mae',          # metric reported on the validation sets
    'num_leaves': 100,
    'learning_rate': 0.05,
    'verbose': 1,
}

In [None]:
# Train the model (evaluation)
def _fit_with_validation(train_set, valid_set):
    """Train one booster with `params`, monitoring `valid_set`.

    Early stopping and periodic logging are passed as callbacks rather than
    the `early_stopping_rounds=` / `verbose_eval=` keywords, which
    `lgb.train` no longer accepts from LightGBM 4.0 on. The callbacks used
    here require LightGBM 3.3 or newer.

    NOTE: with 'boosting_type': 'dart' LightGBM does not perform early
    stopping (it emits a warning and trains all rounds); the callback only
    takes effect if the boosting type is changed.
    """
    return lgb.train(
        params,
        train_set,
        num_boost_round=500,
        valid_sets=[valid_set],
        callbacks=[
            lgb.early_stopping(stopping_rounds=50),
            lgb.log_evaluation(period=50),   # print the metric every 50 rounds
        ],
    )


gbm_A = _fit_with_validation(train_data_A, val_data_A)
gbm_B = _fit_with_validation(train_data_B, val_data_B)
gbm_C = _fit_with_validation(train_data_C, val_data_C)

In [6]:
# Train the model (submission)

params = {
    'boosting_type': 'dart',
    'objective': 'regression',
    'metric': 'mae',
    'num_leaves': 100,
    'learning_rate': 0.05,
    'verbose': 1
}

# NOTE(review): A/B/C are rebound here from DataFrames to lgb.Dataset objects;
# kept as-is so the names visible to other cells do not change.
A = lgb.Dataset(X_A, label=y_A)
B = lgb.Dataset(X_B, label=y_B)
C = lgb.Dataset(X_C, label=y_C)


def _fit_full(train_set):
    """Train one booster on `train_set` for a fixed 350 rounds.

    No validation set is supplied, so there is nothing for early stopping or
    per-round evaluation logging to act on. The former
    `early_stopping_rounds` / `verbose_eval` arguments were therefore
    dropped; this also keeps the call valid on LightGBM 4.x, where those
    keywords no longer exist.
    """
    return lgb.train(params, train_set, num_boost_round=350)


gbm_A = _fit_full(A)
gbm_B = _fit_full(B)
gbm_C = _fit_full(C)



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7353
[LightGBM] [Info] Number of data points in the train set: 34085, number of used features: 40
[LightGBM] [Info] Start training from score 630.594707




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7359
[LightGBM] [Info] Number of data points in the train set: 32843, number of used features: 40
[LightGBM] [Info] Start training from score 96.827726




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7688
[LightGBM] [Info] Number of data points in the train set: 26095, number of used features: 41
[LightGBM] [Info] Start training from score 77.631060


In [None]:
# Evaluate the model
# NOTE(review): gbm_A/B/C are whichever models were trained last. When the
# notebook is run top to bottom, the preceding cell rebinds them to the
# submission models, which are fitted on obs + est data, so the est rows
# scored here were part of training and the MAE below is optimistic.
# Run this cell right after the evaluation-training cell for a fair estimate.
y_pred_A, y_pred_B, y_pred_C = (
    booster.predict(features, num_iteration=booster.best_iteration)
    for booster, features in (
        (gbm_A, X_test_A),
        (gbm_B, X_test_B),
        (gbm_C, X_test_C),
    )
)

# Mean absolute error per site on the estimated-data split.
for label, truth, predicted in (
    ('MAE A:', y_test_A, y_pred_A),
    ('MAE B:', y_test_B, y_pred_B),
    ('MAE C:', y_test_C, y_pred_C),
):
    print(label, mean_absolute_error(truth, predicted))

In [None]:
# Predict site A on its validation features (all trained iterations).
y_pred_A_test = gbm_A.predict(X_test_A)

# Plotting the actual vs predicted values against the sample position
fig, ax = plt.subplots(figsize=(10, 5))
sample_index = range(len(y_test_A))
ax.scatter(sample_index, y_test_A, color='blue', label='Actual')
ax.scatter(range(len(y_pred_A_test)), y_pred_A_test,
           color='red', label='Predicted', alpha=0.5)
ax.set_title('Actual vs Predicted Values')
ax.set_xlabel('Test Sample Index')
ax.set_ylabel('Target Value')
ax.legend()
plt.show()

In [None]:
# Display summary statistics for every column of the site-A test frame.
test_A.describe()

In [8]:
# Create submission

output_file = 'LGBM_submission.csv'


def _predict_non_negative(model, features):
    """Predict with `model` on `features` and clip negative values to zero.

    LightGBM matches DataFrame columns to model features by position, not by
    name. The frame is therefore first reduced and re-ordered to exactly the
    feature names the booster was trained with. A missing feature raises a
    KeyError instead of silently producing wrong predictions.
    """
    aligned = features[model.feature_name()]
    return np.clip(model.predict(aligned), 0, None)


pred_A = _predict_non_negative(gbm_A, test_A)
pred_B = _predict_non_negative(gbm_B, test_B)
pred_C = _predict_non_negative(gbm_C, test_C)

# Concatenate predictions in the order A, B, C
predictions = np.concatenate([pred_A, pred_B, pred_C])

# Create an id array: consecutive integers starting at 0, one per prediction
ids = np.arange(0, len(predictions))

# Create a DataFrame
df = pd.DataFrame({
    'id': ids,
    'prediction': predictions
})

# Save to CSV
df.to_csv(output_file, index=False)
print(f"Submission saved to {output_file}")

Submission saved to LGBM_submission.csv


In [None]:
# Display the parameter dict stored on the site-A booster.
gbm_A.params