In [68]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score

In [69]:
df = pd.read_csv('Pasion et al dataset.csv')

In [70]:
# Encode location data
df_with_location_en = pd.get_dummies(df, columns=['Location'], drop_first=True)

In [71]:
# Encode season data
df_with_loc_season_en = pd.get_dummies(df_with_location_en, columns=['Season'], drop_first=True)

In [72]:
min_hour_of_interest = 10
max_hour_of_interest = 15

In [73]:
df_with_loc_season_en['delta_hr']= df_with_loc_season_en.Hour - min_hour_of_interest

In [74]:
# Create Cyclic date features
df_with_loc_season_en['sine_mon']= np.sin((df_with_loc_season_en.Month - 1)*np.pi/11)
df_with_loc_season_en['cos_mon']= np.cos((df_with_loc_season_en.Month - 1)*np.pi/11)
df_with_loc_season_en['sine_hr']= np.sin((df_with_loc_season_en.delta_hr*np.pi/(max_hour_of_interest - min_hour_of_interest)))
df_with_loc_season_en['cos_hr']= np.cos((df_with_loc_season_en.delta_hr*np.pi/(max_hour_of_interest - min_hour_of_interest)))

In [75]:
selected_columns = ['Latitude', 'Humidity', 'AmbientTemp', 'PolyPwr', 'Wind.Speed',
                     'Visibility', 'Pressure', 'Cloud.Ceiling', 'Location_Grissom',
                     'Location_Hill Weber', 'Location_JDMT', 'Location_Kahului',
                     'Location_MNANG', 'Location_Malmstrom', 'Location_March AFB',
                     'Location_Offutt', 'Location_Peterson', 'Location_Travis',
                     'Location_USAFA','Season_Spring', 'Season_Summer', 'Season_Winter',
                     'sine_mon', 'cos_mon', 'sine_hr', 'cos_hr']

In [76]:
df_processed = df_with_loc_season_en[selected_columns].reset_index(drop=True)

In [77]:
target_label = 'PolyPwr'

In [78]:
input_feat = list(set(selected_columns).difference(set([target_label])))

In [79]:
df_X = df_processed[input_feat].reset_index(drop=True)

In [80]:
df_y = df_processed[target_label]

In [81]:
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.2, random_state=42)

In [82]:
from sklearn.ensemble import RandomForestRegressor

In [83]:
rf_base = RandomForestRegressor(random_state=42)

In [84]:
# Create list of each hyper-param to tune
n_estimators_list = [int(x) for x in np.linspace(start = 100, stop = 2000, num = 20)]

max_features_list = ['auto', 'sqrt']

max_depth_list = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth_list.append(None)

min_samples_split_list = [2, 5, 10]

min_samples_leaf_list = [1, 2, 4]

bootstrap_list = [True, False]
# Structure model hyper-params as a dictionary
rf_grid = {'n_estimators': n_estimators_list,
           'max_features': max_features_list,
           'max_depth': max_depth_list,
           'min_samples_split': min_samples_split_list,
           'min_samples_leaf': min_samples_leaf_list,
           'bootstrap': bootstrap_list}
# Create base LGBM model
rf_base = RandomForestRegressor(random_state=42)
# Create random search for LGBM model
rf_random = RandomizedSearchCV(estimator=rf_base, param_distributions=rf_grid, 
                                 n_iter=1000, cv=4, verbose=2, random_state=42, 
                                 n_jobs=-1)

In [86]:
rf_random.fit(X_train, y_train)

Fitting 4 folds for each of 1000 candidates, totalling 4000 fits
[CV] END bootstrap=False, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=1100; total time=  13.4s
[CV] END bootstrap=False, max_depth=70, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=1300; total time=  24.5s




KeyboardInterrupt: 

In [89]:
rf_random.best_params_

AttributeError: 'RandomizedSearchCV' object has no attribute 'best_params_'

In [90]:
model = RandomForestRegressor(n_estimators = 1900,min_samples_split = 5, min_samples_leaf = 1, max_features = 'sqrt',max_depth = 40, bootstrap = True)

In [91]:
model.fit(X_train,y_train)

In [92]:
from sklearn.metrics import mean_absolute_error

mean_absolute_error(model.predict(X_test), y_test)

2.7745046475519244

In [93]:
from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(model.predict(X_test), y_test))

4.094037592980423

In [95]:
y_pred = model.predict(X_test)

In [96]:
# R2 score
r2_score(y_test.ravel(), y_pred)

0.670059520531537