In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.feature_selection import SequentialFeatureSelector as SFS
import xgboost as xgb

In [2]:
# Directory holding the preprocessed parquet files, relative to this notebook.
DATA_DIR = '../../preprocessing/data'


def load_frame(name):
    """Read ``<DATA_DIR>/<name>.parquet`` and drop its ``date_forecast`` column.

    Parameters
    ----------
    name : str
        File stem without extension, e.g. ``'obs_A'``.

    Returns
    -------
    pandas.DataFrame
        The file's contents without the ``date_forecast`` column.
    """
    return pd.read_parquet(f'{DATA_DIR}/{name}.parquet').drop(columns='date_forecast')


# One frame per (kind, location); files are read in the order obs, est, test.
obs_A, obs_B, obs_C = (load_frame(f'obs_{loc}') for loc in 'ABC')
est_A, est_B, est_C = (load_frame(f'est_{loc}') for loc in 'ABC')
test_A, test_B, test_C = (load_frame(f'test_{loc}') for loc in 'ABC')

# Concatenate the obs_* and est_* rows of each location into one frame
A = pd.concat([obs_A, est_A])
B = pd.concat([obs_B, est_B])
C = pd.concat([obs_C, est_C])

# Data splits for submissions: features / target over all rows of a location
X_A = A.drop(columns='pv_measurement')
y_A = A['pv_measurement']
X_B = B.drop(columns='pv_measurement')
y_B = B['pv_measurement']
X_C = C.drop(columns='pv_measurement')
y_C = C['pv_measurement']

# 80/20 random hold-out split, seeded so the partition is reproducible.
# NOTE(review): a `date_forecast` column was dropped above, so rows may be
# time-ordered; confirm a shuffled split is appropriate for evaluation.
X_train_A, X_test_A, y_train_A, y_test_A = train_test_split(X_A, y_A, test_size=0.2, random_state=42)
X_train_B, X_test_B, y_train_B, y_test_B = train_test_split(X_B, y_B, test_size=0.2, random_state=42)
X_train_C, X_test_C, y_train_C, y_test_C = train_test_split(X_C, y_C, test_size=0.2, random_state=42)

In [3]:
# Hyper-parameters for the XGBoost base regressor.
# The original literal listed 'random_state' twice (0, then 42); Python keeps
# the last value of a repeated key, so 42 was the effective seed. The dead
# entry is removed and the effective value kept.
parameters = {
    'colsample_bytree': 0.8,
    'gamma': 0.8,
    'learning_rate': 0.008,
    'max_depth': 10,
    'min_child_weight': 10,
    'n_estimators': 350,
    'reg_alpha': 1,
    'reg_lambda': 3,
    'subsample': 0.912,
    'booster': 'gbtree',
    'n_jobs': -1,
    'random_state': 42,
}
xgb_model = xgb.XGBRegressor(**parameters)

# Forward sequential feature selection around the regressor: starts from no
# features and greedily adds one at a time, scoring each candidate by 5-fold
# cross-validated negative MSE.
# NOTE(review): both the regressor and the selector request n_jobs=-1, which
# may oversubscribe CPU cores; confirm this is intended.
sfs = SFS(
    estimator=xgb_model,
    n_features_to_select='auto',
    scoring='neg_mean_squared_error',  # or another scoring function
    cv=5,
    direction='forward',
    n_jobs=-1,
)

In [4]:
# Run the sequential feature search on the full A dataset (X_A / y_A).
# NOTE(review): this uses every row of A, including those held out in
# X_test_A; confirm that is intended if X_test_A is later used for evaluation.
sfs = sfs.fit(X=X_A, y=y_A)

In [5]:
# Boolean mask over the input columns of the fitted selector (True = kept)
selected_features_mask = sfs.get_support()

# Map the mask back to column names. The selector was fitted on X_A, so index
# X_A's columns directly instead of going through the train split.
selected_features = X_A.columns[selected_features_mask]
print(f'Selected features: {selected_features}')

# Best features for A (forward selection), recorded from a previous run:
#
# ['total_radiation', 'snow_accumulation', 'average_wind_speed',
#  'clear_sky_rad:W', 'month', 't_1000hPa:C', 'effective_cloud_cover:p',
#  'hour', 'year', 'temp_dewpoint_diff']

Selected features: Index(['total_radiation', 'snow_accumulation', 'average_wind_speed',
       'clear_sky_rad:W', 'month', 't_1000hPa:C', 'effective_cloud_cover:p',
       'hour', 'year', 'temp_dewpoint_diff'],
      dtype='object')


In [6]:
# Fit the model with selected features
# xgb_model.fit(X_train_A[selected_features], y_train_A)

In [7]:
# Evaluate the model
# y_pred_A = xgb_model.predict(X_test_A[selected_features])
# mae = mean_absolute_error(y_test_A, y_pred_A)
# print(f'Test MAE: {mae}')

In [8]:
# Hyper-parameters for the XGBoost base regressor.
# The original literal listed 'random_state' twice (0, then 42); Python keeps
# the last value of a repeated key, so 42 was the effective seed. The dead
# entry is removed and the effective value kept.
parameters = {
    'colsample_bytree': 0.8,
    'gamma': 0.8,
    'learning_rate': 0.008,
    'max_depth': 10,
    'min_child_weight': 10,
    'n_estimators': 350,
    'reg_alpha': 1,
    'reg_lambda': 3,
    'subsample': 0.912,
    'booster': 'gbtree',
    'n_jobs': -1,
    'random_state': 42,
}
xgb_model = xgb.XGBRegressor(**parameters)

# Backward sequential feature selection around the regressor: starts from all
# features and greedily removes one at a time, scoring each candidate by
# 5-fold cross-validated negative MSE.
# NOTE(review): both the regressor and the selector request n_jobs=-1, which
# may oversubscribe CPU cores; confirm this is intended.
sfs = SFS(
    estimator=xgb_model,
    n_features_to_select='auto',
    scoring='neg_mean_squared_error',  # or another scoring function
    cv=5,
    direction='backward',
    n_jobs=-1,
)

In [9]:
# Run the sequential feature search on the full A dataset (X_A / y_A).
# NOTE(review): this uses every row of A, including those held out in
# X_test_A; confirm that is intended if X_test_A is later used for evaluation.
sfs = sfs.fit(X=X_A, y=y_A)

In [10]:
# Boolean mask over the input columns of the fitted selector (True = kept)
selected_features_mask = sfs.get_support()

# Map the mask back to column names. The selector was fitted on X_A, so index
# X_A's columns directly instead of going through the train split.
selected_features = X_A.columns[selected_features_mask]
print(f'Selected features: {selected_features}')

# Best features for A (backward selection), recorded from a previous run:
#
# ['total_radiation', 'snow_accumulation', 'average_wind_speed',
#  'clear_sky_rad:W', 't_1000hPa:C', 'effective_cloud_cover:p', 'hour',
#  'total_cloud_cover:p', 'year', 'temp_dewpoint_diff']

Selected features: Index(['total_radiation', 'snow_accumulation', 'average_wind_speed',
       'clear_sky_rad:W', 't_1000hPa:C', 'effective_cloud_cover:p', 'hour',
       'total_cloud_cover:p', 'year', 'temp_dewpoint_diff'],
      dtype='object')


In [11]:
# Fit the model with selected features
# xgb_model.fit(X_train_A[selected_features], y_train_A)

In [12]:
# Evaluate the model
# y_pred_A = xgb_model.predict(X_test_A[selected_features])
# mae = mean_absolute_error(y_test_A, y_pred_A)
# print(f'Test MAE: {mae}')