In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score
from sklearn.ensemble import AdaBoostRegressor, AdaBoostClassifier
import xgboost as xgb

In [2]:
df = pd.read_csv('Pasion et al dataset.csv')

In [3]:
# Encode location data
df_with_location_en = pd.get_dummies(df, columns=['Location'], drop_first=True)

# Encode season data
df_with_loc_season_en = pd.get_dummies(df_with_location_en, columns=['Season'], drop_first=True)

min_hour_of_interest = 10
max_hour_of_interest = 15

df_with_loc_season_en['delta_hr']= df_with_loc_season_en.Hour - min_hour_of_interest

# Create Cyclic date features
df_with_loc_season_en['sine_mon']= np.sin((df_with_loc_season_en.Month - 1)*np.pi/11)
df_with_loc_season_en['cos_mon']= np.cos((df_with_loc_season_en.Month - 1)*np.pi/11)
df_with_loc_season_en['sine_hr']= np.sin((df_with_loc_season_en.delta_hr*np.pi/(max_hour_of_interest - min_hour_of_interest)))
df_with_loc_season_en['cos_hr']= np.cos((df_with_loc_season_en.delta_hr*np.pi/(max_hour_of_interest - min_hour_of_interest)))

In [4]:
selected_columns = ['Latitude', 'Humidity', 'AmbientTemp', 'PolyPwr', 'Wind.Speed',
                     'Visibility', 'Pressure', 'Cloud.Ceiling', 'Location_Grissom',
                     'Location_Hill Weber', 'Location_JDMT', 'Location_Kahului',
                     'Location_MNANG', 'Location_Malmstrom', 'Location_March AFB',
                     'Location_Offutt', 'Location_Peterson', 'Location_Travis',
                     'Location_USAFA','Season_Spring', 'Season_Summer', 'Season_Winter',
                     'sine_mon', 'cos_mon', 'sine_hr', 'cos_hr']

In [5]:
df_processed = df_with_loc_season_en[selected_columns].reset_index(drop=True)

In [6]:
target_label = 'PolyPwr'

In [7]:
input_feat = list(set(selected_columns).difference(set([target_label])))

In [8]:
df_X = df_processed[input_feat].reset_index(drop=True)

In [9]:
df_y = df_processed[target_label]

In [10]:
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.2, random_state=42)

In [11]:
base_estimator = DecisionTreeClassifier()

In [30]:
#Create lists for hyperparameter tuning

loss_list = ['linear', 'exponential', 'square']
n_estimators_list = [int(x) for x in np.linspace(start = 100, stop = 2000, num = 20)]          # Number of boosting rounds
learning_rate_list = np.linspace(0.01, 0.3, 100)
base_estimator_list =  [None, DecisionTreeClassifier(max_depth=1), DecisionTreeClassifier(max_depth=3)]
algorithm_list = ['SAMME','SAMME.R']

In [48]:
ada_grid = {
    'n_estimators': n_estimators_list,
    'learning_rate': learning_rate_list,
    'loss': loss_list,
    'estimator': base_estimator_list
                }

In [49]:
ada_base = AdaBoostRegressor()

In [50]:
ada_base.get_params()

{'base_estimator': 'deprecated',
 'estimator': None,
 'learning_rate': 1.0,
 'loss': 'linear',
 'n_estimators': 50,
 'random_state': None}

In [44]:
ada_random = RandomizedSearchCV(estimator=ada_base, param_distributions=ada_grid, 
                                 n_iter=1000, cv=4, verbose=2, random_state=42, 
                                 n_jobs=-1)

In [45]:
ada_random.fit(X_train,y_train)

Fitting 4 folds for each of 1000 candidates, totalling 4000 fits
[CV] END algorithm=SAMME.R, estimator=None, learning_rate=0.08030303030303029, n_estimators=1500; total time=   0.0s
[CV] END algorithm=SAMME, estimator=None, learning_rate=0.06272727272727271, n_estimators=100; total time=   0.0s
[CV] END algorithm=SAMME.R, estimator=DecisionTreeClassifier(max_depth=1), learning_rate=0.2531313131313131, n_estimators=400; total time=   0.0s
[CV] END algorithm=SAMME, estimator=None, learning_rate=0.07737373737373736, n_estimators=900; total time=   0.0s
[CV] END algorithm=SAMME, estimator=None, learning_rate=0.07737373737373736, n_estimators=900; total time=   0.0s
[CV] END algorithm=SAMME.R, estimator=None, learning_rate=0.1623232323232323, n_estimators=800; total time=   0.0s
[CV] END algorithm=SAMME.R, estimator=None, learning_rate=0.1623232323232323, n_estimators=800; total time=   0.0s
[CV] END algorithm=SAMME.R, estimator=DecisionTreeClassifier(max_depth=1), learning_rate=0.197474747

ValueError: 
All the 4000 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2072 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/rishisankhe/Library/Python/3.9/lib/python/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/rishisankhe/Library/Python/3.9/lib/python/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/rishisankhe/Library/Python/3.9/lib/python/site-packages/sklearn/ensemble/_weight_boosting.py", line 171, in fit
    sample_weight, estimator_weight, estimator_error = self._boost(
  File "/Users/rishisankhe/Library/Python/3.9/lib/python/site-packages/sklearn/ensemble/_weight_boosting.py", line 582, in _boost
    return self._boost_discrete(iboost, X, y, sample_weight, random_state)
  File "/Users/rishisankhe/Library/Python/3.9/lib/python/site-packages/sklearn/ensemble/_weight_boosting.py", line 647, in _boost_discrete
    estimator.fit(X, y, sample_weight=sample_weight)
  File "/Users/rishisankhe/Library/Python/3.9/lib/python/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/rishisankhe/Library/Python/3.9/lib/python/site-packages/sklearn/tree/_classes.py", line 959, in fit
    super()._fit(
  File "/Users/rishisankhe/Library/Python/3.9/lib/python/site-packages/sklearn/tree/_classes.py", line 284, in _fit
    check_classification_targets(y)
  File "/Users/rishisankhe/Library/Python/3.9/lib/python/site-packages/sklearn/utils/multiclass.py", line 215, in check_classification_targets
    raise ValueError(
ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.

--------------------------------------------------------------------------------
1928 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/rishisankhe/Library/Python/3.9/lib/python/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/rishisankhe/Library/Python/3.9/lib/python/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/rishisankhe/Library/Python/3.9/lib/python/site-packages/sklearn/ensemble/_weight_boosting.py", line 171, in fit
    sample_weight, estimator_weight, estimator_error = self._boost(
  File "/Users/rishisankhe/Library/Python/3.9/lib/python/site-packages/sklearn/ensemble/_weight_boosting.py", line 579, in _boost
    return self._boost_real(iboost, X, y, sample_weight, random_state)
  File "/Users/rishisankhe/Library/Python/3.9/lib/python/site-packages/sklearn/ensemble/_weight_boosting.py", line 588, in _boost_real
    estimator.fit(X, y, sample_weight=sample_weight)
  File "/Users/rishisankhe/Library/Python/3.9/lib/python/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/rishisankhe/Library/Python/3.9/lib/python/site-packages/sklearn/tree/_classes.py", line 959, in fit
    super()._fit(
  File "/Users/rishisankhe/Library/Python/3.9/lib/python/site-packages/sklearn/tree/_classes.py", line 284, in _fit
    check_classification_targets(y)
  File "/Users/rishisankhe/Library/Python/3.9/lib/python/site-packages/sklearn/utils/multiclass.py", line 215, in check_classification_targets
    raise ValueError(
ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.


In [41]:
from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(ada_random.predict(X_test), y_test))

NotFittedError: This RandomizedSearchCV instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [27]:
ada_random.best_params_

{'n_estimators': 100,
 'loss': 'exponential',
 'learning_rate': 0.05686868686868687,
 'estimator': None}