In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [2]:
df = pd.read_csv('Pasion et al dataset.csv')

In [3]:
# Encode location data
df_with_location_en = pd.get_dummies(df, columns=['Location'], drop_first=True)

In [4]:
# Encode season data
df_with_loc_season_en = pd.get_dummies(df_with_location_en, columns=['Season'], drop_first=True)

In [5]:
min_hour_of_interest = 10
max_hour_of_interest = 15

In [6]:
df_with_loc_season_en['delta_hr']= df_with_loc_season_en.Hour - min_hour_of_interest

In [7]:
# Create Cyclic date features
df_with_loc_season_en['sine_mon']= np.sin((df_with_loc_season_en.Month - 1)*np.pi/11)
df_with_loc_season_en['cos_mon']= np.cos((df_with_loc_season_en.Month - 1)*np.pi/11)
df_with_loc_season_en['sine_hr']= np.sin((df_with_loc_season_en.delta_hr*np.pi/(max_hour_of_interest - min_hour_of_interest)))
df_with_loc_season_en['cos_hr']= np.cos((df_with_loc_season_en.delta_hr*np.pi/(max_hour_of_interest - min_hour_of_interest)))

In [8]:
selected_columns = ['Latitude', 'Humidity', 'AmbientTemp', 'PolyPwr', 'Wind.Speed',
                     'Visibility', 'Pressure', 'Cloud.Ceiling', 'Location_Grissom',
                     'Location_Hill Weber', 'Location_JDMT', 'Location_Kahului',
                     'Location_MNANG', 'Location_Malmstrom', 'Location_March AFB',
                     'Location_Offutt', 'Location_Peterson', 'Location_Travis','Location_USAFA',
                    'Season_Spring', 'Season_Summer', 'Season_Winter',
                     'sine_mon', 'cos_mon', 'sine_hr', 'cos_hr']

In [9]:
df_processed = df_with_loc_season_en[selected_columns].reset_index(drop=True)

In [10]:
target_label = 'PolyPwr'

In [11]:
input_feat = list(set(selected_columns).difference(set([target_label])))

In [12]:
df_X = df_processed[input_feat].reset_index(drop=True)

In [13]:
df_y = df_processed[target_label]

In [14]:
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.2, random_state=42)

In [15]:
xg_best_params = {'subsample': 0.6818181818181819,
 'scale_pos_weight': 1,
 'sampling_method': 'uniform',
 'n_estimators': 500,
 'min_split_loss': 0.5,
 'min_child_weight': 10,
 'max_depth': 8,
 'learning_rate': 0.018787878787878787,
 'lambda': 0.1,
 'colsample_bytree': 0.7323232323232324,
 'alpha': 0.1}

In [16]:
xg_model = xgb.XGBRegressor(**xg_best_params)

In [17]:
rf_best_params = {'max_depth':40, 'max_features':'sqrt', 'min_samples_split':5,
                      'n_estimators':1900}

In [18]:
rf_model = RandomForestRegressor(**rf_best_params)

In [19]:
# Scale training data
svm_estimators = []
svm_estimators.append(('standardscaler', StandardScaler()))
# estimators.append(('minmax', MinMaxScaler()))
svm_estimators.append(('svr', SVR(epsilon=0.2)))
svm_pipeline = Pipeline(svm_estimators)

In [20]:
svm_best_params = {'svr__kernel': 'poly', 'svr__epsilon': 1.0, 'svr__C': 160.0}

In [21]:
svm_model = svm_pipeline.set_params(**svm_best_params)


In [22]:
ada_best_params = {'n_estimators': 100,
 'loss': 'exponential',
 'learning_rate': 0.05686868686868687,
 'estimator': None}

In [23]:
ada_model = AdaBoostRegressor(**ada_best_params)

In [24]:
# Define the base models
base0 = list()
base0.append(('xg', xg_model))
base0.append(('rf', rf_model))
base0.append(('svm', svm_model))
base0.append(('ada',ada_model))

In [25]:
# Define meta learner model
base1 = GradientBoostingRegressor()

In [26]:
# Define the stacking ensemble
stacked_model = StackingRegressor(estimators=base0, final_estimator=base1, cv=4, passthrough=True)

In [27]:
# Fit the model on the training data
stacked_model.fit(X_train, y_train)

In [28]:
from sklearn.metrics import mean_absolute_error

mean_absolute_error(stacked_model.predict(X_test), y_test)

2.669414603135988

In [29]:
from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(stacked_model.predict(X_test), y_test))

4.032559524120524

In [30]:
y_pred = stacked_model.predict(X_test)

In [31]:
# R2 score
r2_score(y_test.ravel(), y_pred)

0.6798942155261791