In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score

In [3]:
df = pd.read_csv('Pasion et al dataset.csv')

In [4]:
# Encode location data
df_with_location_en = pd.get_dummies(df, columns=['Location'], drop_first=True)

In [5]:
# Encode season data
df_with_loc_season_en = pd.get_dummies(df_with_location_en, columns=['Season'], drop_first=True)

In [6]:
min_hour_of_interest = 10
max_hour_of_interest = 15

In [7]:
df_with_loc_season_en['delta_hr']= df_with_loc_season_en.Hour - min_hour_of_interest

In [8]:
# Create Cyclic date features
df_with_loc_season_en['sine_mon']= np.sin((df_with_loc_season_en.Month - 1)*np.pi/11)
df_with_loc_season_en['cos_mon']= np.cos((df_with_loc_season_en.Month - 1)*np.pi/11)
df_with_loc_season_en['sine_hr']= np.sin((df_with_loc_season_en.delta_hr*np.pi/(max_hour_of_interest - min_hour_of_interest)))
df_with_loc_season_en['cos_hr']= np.cos((df_with_loc_season_en.delta_hr*np.pi/(max_hour_of_interest - min_hour_of_interest)))

In [9]:
selected_columns = ['Latitude', 'Humidity', 'AmbientTemp', 'PolyPwr', 'Wind.Speed',
                     'Visibility', 'Pressure', 'Cloud.Ceiling', 'Location_Grissom',
                     'Location_Hill Weber', 'Location_JDMT', 'Location_Kahului',
                     'Location_MNANG', 'Location_Malmstrom', 'Location_March AFB',
                     'Location_Offutt', 'Location_Peterson', 'Location_Travis',
                     'Location_USAFA','Season_Spring', 'Season_Summer', 'Season_Winter',
                     'sine_mon', 'cos_mon', 'sine_hr', 'cos_hr']

In [10]:
df_processed = df_with_loc_season_en[selected_columns].reset_index(drop=True)

In [11]:
target_label = 'PolyPwr'

In [12]:
input_feat = list(set(selected_columns).difference(set([target_label])))

In [13]:
df_X = df_processed[input_feat].reset_index(drop=True)

In [14]:
df_y = df_processed[target_label]

In [15]:
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.2, random_state=42)

In [16]:
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [17]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svr', SVR())
])

In [18]:
param_grid = {
    'svr__kernel': ['linear', 'poly', 'rbf','sigmoid','precomputed','callable'],
    'svr__C': [0.1, 1.0, 10.0],
    'svr__epsilon': [0.01, 0.1, 0.2],
}

In [19]:
svm_random = RandomizedSearchCV(pipeline, param_grid, n_iter=1000, cv=4, verbose=2, random_state=42, 
                                n_jobs = -1)

In [20]:
svm_random.fit(X_train, y_train)

Fitting 4 folds for each of 54 candidates, totalling 216 fits


72 fits failed out of a total of 216.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
36 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/rishisankhe/Library/Python/3.9/lib/python/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/rishisankhe/Library/Python/3.9/lib/python/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/rishisankhe/Library/Python/3.9/lib/python/site-packages/sklearn/pipeline.py", line 420, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/rishisankhe/Library/Python/3.9/lib/python/site-p

In [21]:
svm_random.best_params_

{'svr__kernel': 'rbf', 'svr__epsilon': 0.2, 'svr__C': 10.0}

In [1]:
from sklearn.metrics import mean_squared_error
mean_squared_error(svm_random.predict(X_test), y_test)

NameError: name 'svm_random' is not defined