In [90]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from mlxtend.regressor import StackingCVRegressor
from sklearn.model_selection import train_test_split

# Raise the display limit to 500 columns so wide DataFrames are shown in full
# instead of being elided when rendered in the notebook.
pd.set_option('display.max_columns', 500)

In [94]:
def keep_columns(df, columns):
    """Return ``df`` restricted to the given columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to select from; the caller's object is not modified.
    columns : list of str
        Column labels to keep, in the order they should appear in the result.

    Returns
    -------
    pandas.DataFrame
        The result of ``df[columns]``.

    Raises
    ------
    KeyError
        If a requested label is not a column of ``df``.
    """
    return df[columns]

# Per-location training tables. The `obs_*` / `est_*` CSVs are presumably the
# observed and estimated weather features joined with the target column
# (confirm against the step that writes ./data).
obs_A = pd.read_csv('./data/obs_A.csv')
est_A = pd.read_csv('./data/est_A.csv')
obs_B = pd.read_csv('./data/obs_B.csv')
est_B = pd.read_csv('./data/est_B.csv')
obs_C = pd.read_csv('./data/obs_C.csv')
est_C = pd.read_csv('./data/est_C.csv')

# Concatenate: `obs_*` rows first, `est_*` rows second; row order matters
# because the later split is not shuffled.
A = pd.concat([obs_A, est_A])
B = pd.concat([obs_B, est_B])
C = pd.concat([obs_C, est_C])

# Keep columns: the feature set fed to the models at prediction time.
columns_to_keep = [
    "absolute_humidity_2m:gm3",
    "air_density_2m:kgm3",
    "clear_sky_energy_1h:J",
    "clear_sky_rad:W",
    "dew_point_2m:K",
    "diffuse_rad:W",
    "diffuse_rad_1h:J",
    "direct_rad:W",
    "direct_rad_1h:J",
    "effective_cloud_cover:p",
    "elevation:m",
    "is_day:idx",
    "is_in_shadow:idx",
    "msl_pressure:hPa",
    "pressure_100m:hPa",
    "pressure_50m:hPa",
    "relative_humidity_1000hPa:p",
    "sfc_pressure:hPa",
    "snow_water:kgm2",
    "sun_azimuth:d",
    "sun_elevation:d",
    "super_cooled_liquid_water:kgm2",
    "t_1000hPa:K",
    "total_cloud_cover:p",
    "visibility:m",
    "wind_speed_10m:ms",
    "wind_speed_u_10m:ms",
    "wind_speed_v_10m:ms"
]

# Hourly resolution for the competition features. Lowercase 'h' is used
# because the uppercase 'H' alias is deprecated in pandas >= 2.2.
RESAMPLE_FREQ = '1h'


def load_competition_features(path, columns=columns_to_keep, freq=RESAMPLE_FREQ):
    """Load one location's competition features as complete hourly rows.

    Parameters
    ----------
    path : str
        Parquet file holding a ``date_forecast`` column plus feature columns.
    columns : list of str
        Feature columns to keep (defaults to ``columns_to_keep``).
    freq : str
        Resampling frequency (defaults to ``RESAMPLE_FREQ``).

    Returns
    -------
    pandas.DataFrame
        Per-``freq`` means of ``columns`` indexed by ``date_forecast``, with
        every row containing a missing value removed.

    Raises
    ------
    KeyError
        If ``date_forecast`` or one of ``columns`` is absent from the file.
    """
    raw = pd.read_parquet(path)
    # Select before resampling so the mean is only computed for the columns
    # that are needed; per-column means are unaffected by the other columns.
    selected = keep_columns(raw.set_index('date_forecast'), columns)
    # Resampling inserts all-NaN rows for hours without data; dropna removes
    # those together with rows that have any missing feature.
    return selected.resample(freq).mean().dropna()


# Competition data
X_A = load_competition_features('../../data/A/X_test_estimated.parquet')
X_B = load_competition_features('../../data/B/X_test_estimated.parquet')
X_C = load_competition_features('../../data/C/X_test_estimated.parquet')

In [95]:
# Split into train and test
def _split_tail_holdout(frame, target='pv_measurement', holdout=0.001):
    """Split ``frame`` without shuffling and separate the target column.

    The trailing ``holdout`` fraction of rows becomes the test part (only
    0.1% of the rows with the default), everything before it the train part.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` where the ``X`` frames no
        longer contain ``target`` and the ``y`` series are that column.
    """
    head, tail = train_test_split(frame, test_size=holdout, shuffle=False)
    return (
        head.drop(columns=[target]),
        tail.drop(columns=[target]),
        head[target],
        tail[target],
    )


X_train_A, X_test_A, y_train_A, y_test_A = _split_tail_holdout(A)
X_train_B, X_test_B, y_train_B, y_test_B = _split_tail_holdout(B)
X_train_C, X_test_C, y_train_C, y_test_C = _split_tail_holdout(C)

# Training models

In [96]:
# Hyperparameters shared by the Random Forest of every location.
_RF_PARAMS = dict(
    n_estimators=100,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='log2',
    max_depth=10,
    bootstrap=True,
    random_state=0,
)

# One independently fitted Random Forest per location (fit returns the estimator).
rf_A = RandomForestRegressor(**_RF_PARAMS).fit(X_train_A, y_train_A)
rf_B = RandomForestRegressor(**_RF_PARAMS).fit(X_train_B, y_train_B)
rf_C = RandomForestRegressor(**_RF_PARAMS).fit(X_train_C, y_train_C)

# Keep the fitted Location C model as the cell's displayed value.
rf_C


In [82]:
# Hyperparameters shared by the XGBoost regressor of every location.
_XGB_PARAMS = dict(n_estimators=100, learning_rate=0.01, max_depth=10, random_state=0)

# One independently fitted XGBoost regressor per location (fit returns the estimator).
xgb_A = xgb.XGBRegressor(**_XGB_PARAMS).fit(X_train_A, y_train_A)
xgb_B = xgb.XGBRegressor(**_XGB_PARAMS).fit(X_train_B, y_train_B)
xgb_C = xgb.XGBRegressor(**_XGB_PARAMS).fit(X_train_C, y_train_C)

# Keep the fitted Location C model as the cell's displayed value.
xgb_C

In [83]:
# Ordinary least-squares baseline, one per location, with default settings
# (fit returns the estimator).
lr_A = LinearRegression().fit(X_train_A, y_train_A)
lr_B = LinearRegression().fit(X_train_B, y_train_B)
lr_C = LinearRegression().fit(X_train_C, y_train_C)

# Keep the fitted Location C model as the cell's displayed value.
lr_C

In [84]:
# Stacked ensembles: the three base models of a location feed an XGBoost
# meta-regressor, which also sees the original features
# (use_features_in_secondary=True).
#
# random_state=0 matches the seed of the other models. Without it the shuffled
# CV folds used to build the meta-features differ between runs, so the stacked
# scores are not reproducible.
#
# NOTE(review): the same estimator object is passed as base model and as
# meta-regressor. mlxtend is documented to fit clones when refit=True (the
# default), so this should be harmless -- confirm for the installed version.

# StackingCVRegressor for Location A
stack_A = StackingCVRegressor(regressors=(rf_A, xgb_A, lr_A), meta_regressor=xgb_A, use_features_in_secondary=True, random_state=0)
stack_A.fit(X_train_A, y_train_A)

# StackingCVRegressor for Location B
stack_B = StackingCVRegressor(regressors=(rf_B, xgb_B, lr_B), meta_regressor=xgb_B, use_features_in_secondary=True, random_state=0)
stack_B.fit(X_train_B, y_train_B)

# StackingCVRegressor for Location C
stack_C = StackingCVRegressor(regressors=(rf_C, xgb_C, lr_C), meta_regressor=xgb_C, use_features_in_secondary=True, random_state=0)
stack_C.fit(X_train_C, y_train_C)

In [68]:
# Hold-out predictions of every model family, grouped per family and ordered
# by location (A, B, C).
pred_rf_A, pred_rf_B, pred_rf_C = (
    rf_A.predict(X_test_A),
    rf_B.predict(X_test_B),
    rf_C.predict(X_test_C),
)

pred_xgb_A, pred_xgb_B, pred_xgb_C = (
    xgb_A.predict(X_test_A),
    xgb_B.predict(X_test_B),
    xgb_C.predict(X_test_C),
)

pred_lr_A, pred_lr_B, pred_lr_C = (
    lr_A.predict(X_test_A),
    lr_B.predict(X_test_B),
    lr_C.predict(X_test_C),
)

pred_stack_A, pred_stack_B, pred_stack_C = (
    stack_A.predict(X_test_A),
    stack_B.predict(X_test_B),
    stack_C.predict(X_test_C),
)



In [69]:
# Evaluate: print MAE, MSE and R2 on the hold-out rows, grouped by model
# family and then by location.
_holdout_targets = {
    'Location A': y_test_A,
    'Location B': y_test_B,
    'Location C': y_test_C,
}

# Insertion order of both dict levels defines the order of the printed report.
_holdout_predictions = {
    'Random Forest': {
        'Location A': pred_rf_A,
        'Location B': pred_rf_B,
        'Location C': pred_rf_C,
    },
    'XGBoost': {
        'Location A': pred_xgb_A,
        'Location B': pred_xgb_B,
        'Location C': pred_xgb_C,
    },
    'Linear Regression': {
        'Location A': pred_lr_A,
        'Location B': pred_lr_B,
        'Location C': pred_lr_C,
    },
    'StackingCVRegressor': {
        'Location A': pred_stack_A,
        'Location B': pred_stack_B,
        'Location C': pred_stack_C,
    },
}

_metrics = (
    ('MAE:', mean_absolute_error),
    ('MSE:', mean_squared_error),
    ('R2:', r2_score),
)

for _model_name, _by_location in _holdout_predictions.items():
    print(_model_name)
    for _location, _predicted in _by_location.items():
        print(_location)
        for _label, _metric in _metrics:
            print(_label, _metric(_holdout_targets[_location], _predicted))

Random Forest
Location A
MAE: 172.94401941426761
MSE: 155792.9100403284
R2: 0.8560237608297434
Location B
MAE: 28.558703917139432
MSE: 4500.7378354373495
R2: 0.7768072464465352
Location C
MAE: 22.113094844265312
MSE: 2881.6395662971145
R2: 0.7983941048978477
XGBoost
Location A
MAE: 235.34695418968985
MSE: 327761.988693513
R2: 0.6970982924522007
Location B
MAE: 34.58801167284669
MSE: 8584.299568691657
R2: 0.5743023637194518
Location C
MAE: 25.725962315755325
MSE: 4186.352761747179
R2: 0.7071134760861391
Linear Regression
Location A
MAE: 234.46931598158366
MSE: 217987.10888715612
R2: 0.7985469035975663
Location B
MAE: 43.2212839323854
MSE: 22201.164320904645
R2: -0.10096148188425635
Location C
MAE: 32.727768902862174
MSE: 5727.678715713179
R2: 0.5992788939170743
StackingCVRegressor
Location A
MAE: 453.7795105950637
MSE: 1125822.3357310607
R2: -0.04043031117701834
Location B
MAE: 49.00736768064598
MSE: 19978.31112764603
R2: 0.009270382998469495
Location C
MAE: 43.46388485313307
MSE: 13772

In [97]:
# Competition predictions. Despite the `cv_reg` in the names, these come from
# the Random Forest models, not from the stacked regressors.
pred_cv_reg_A, pred_cv_reg_B, pred_cv_reg_C = (
    rf_A.predict(X_A),
    rf_B.predict(X_B),
    rf_C.predict(X_C),
)

# Wrap each location's predictions in a Series named "prediction"
pred_A, pred_B, pred_C = (
    pd.Series(values, name="prediction")
    for values in (pred_cv_reg_A, pred_cv_reg_B, pred_cv_reg_C)
)

# Stack A, B, C in that order under a fresh 0..n-1 index, then expose that
# index as the 'id' column of the submission frame.
pred = (
    pd.concat([pred_A, pred_B, pred_C], ignore_index=True)
    .rename_axis('id')
    .reset_index()
)

# Save to csv
pred.to_csv('./pred.csv', index=False, header=True)


In [98]:
# Verifications
# Summary statistics of the submission frame (`id` and `prediction` columns),
# used as a sanity check on the row count and the range of predicted values.
pred.describe()

Unnamed: 0,id,prediction
count,2160.0,2160.0
mean,1079.5,1111.094644
std,623.682612,1166.386639
min,0.0,1.745925
25%,539.75,475.217932
50%,1079.5,532.69125
75%,1619.25,2543.1691
max,2159.0,3189.4456
