# Wind prediction - Second assignment

## Authors

David Moreno Maldonado 100441714     
Inés Fernández Campos 100443936

## 0. Preliminaries

In [1]:
# Import some libraries
import os
import numpy as np              
import pandas as pd
import matplotlib.pyplot as plt 

import sys
import time
import math

from sklearn.experimental import enable_iterative_imputer
from sklearn import preprocessing, impute, model_selection, metrics, neighbors, ensemble, feature_selection
from sklearn.pipeline import Pipeline
import optuna
import optuna.visualization as ov

os.getcwd()

'/home/fddcampos/Documents/uc3m/2_term/BDINTELLIGENCE/practicas_big_data_intelligence/assignment_2'

In [2]:
#MAIN PARAMETERS FOR THE ASSIGNMENT
budget = 20
random_state = 3
verbose = 0
n_jobs = 1

The "wind_pickle" file contains data in a binary format called "Pickle". Pickle data loads faster than text data.

In [3]:
data = pd.read_pickle('wind_pickle.pickle')

You can visualize the attributes in the dataset. Very important, the output attribute (i.e. the value to be predicted, **energy**, is the first attribute). **Steps** represents the hours in advance of the forecast. We will not use this variable here.

In [4]:
# The dataset contains 5937 instances and 556 attributes (including the outcome to be predicted)
print(data.shape)
#data.columns.values.tolist() 

(5937, 556)


In [5]:
#-1 for training, 0 for validation, 1 for testing
year_to_part = {
    2005: -1,
    2006: -1,
    2007: 0,
    2008: 0, 
    2009: 1,
    2010: 1
}
data['partition'] = data['year'].apply(lambda x: year_to_part[x])

We now remove the columns that cannot be used for training the models from the DataFrame

In [6]:
# Steps, month, day, hour, year should be removed, they cannot be used for training the models
to_remove = ['steps', 'month', 'year', 'day', 'hour']
for m in to_remove: data = data.drop(m, 1)

In [7]:
from numpy.random import randint

# we add na values at random
my_NIA = 100443936 + 100441714
np.random.seed(my_NIA)

how_many_nas = round(data.shape[0]*data.shape[1]*0.05)
print('Lets put '+str(how_many_nas)+' missing values \n')
x_locations = randint(0, data.shape[0], size=how_many_nas)
y_locations = randint(1, data.shape[1]-2, size=how_many_nas)

for i in range(len(x_locations)):
    data.iat[x_locations[i], y_locations[i]] = np.nan
    
data.to_pickle('wind_pickle_with_nan.pickle')

Lets put 163861 missing values 



From this point on, the file wind_pickle_with_nan should be used.

In [3]:
data = pd.read_pickle('wind_pickle_with_nan.pickle')
data.shape

(5937, 552)

## Input missing data

In [4]:
print(data.isnull().values.any())
input_cols = data.columns.difference(['energy', 'partition'])
x = data[input_cols]
#Iterative imputer (takes too long)
'''iter_imp = impute.IterativeImputer(random_state=random_state, 
                                   initial_strategy='median', 
                                   max_iter=3,
                                   verbose=verbose)
no_nan = iter_imp.fit_transform(x)'''

#KNN imputer(takes too long)
'''knn_imp = impute.KNNImputer(weights='distance')
no_nan = knn_imp.fit_transform(x)'''

#Simple imputer
simp_imp = impute.SimpleImputer(strategy='median',
                               verbose=2)
no_nan = simp_imp.fit_transform(x)

data[input_cols] = pd.DataFrame(data=no_nan)
print(data.isnull().values.any())

True
False


## Scaling

In [5]:
scaler = preprocessing.StandardScaler().fit(data[input_cols]) 
data[input_cols] = scaler.transform(data[input_cols])

## Data split
We are going to use train/test for model evaluation (outer) and train/validation for hyperparameter tuning (inner), as follows:     
1. Train partition: the first two years of data. Given that there are 6 years worth of data, we will use the first 2/6 of the instances for training.     
2. Validation partition: the second two years of data. 
3. Test partition: the remaining data    


In [6]:
#-1 for training, 0 for validation, 1 for testing
test = data[data['partition'] == 1]
train = data[data['partition'] == -1]
val = data[data['partition'] == 0]

del test['partition']
del train['partition']

y_test = test['energy']
x_test = test[test.columns.difference(['energy'])]

y_train = train['energy']
x_train = train[train.columns.difference(['energy'])]


y_val = val['energy']
x_val = val[train.columns.difference(['energy'])]

# 1. MODEL SELECTION AND HYPER-PARAMETER TUNING

In [24]:
#Dataframes with all the information of each model
summary = {
    'knn': pd.DataFrame(columns=['Time (sec)', 'Score (RMSE)', 'N. neighbors', 'Weights', 'P']),
    'random_forest': pd.DataFrame(columns=['Time (sec)', 'Score (RMSE)', 'Min. samples split', 'Criterion', 'Max. depth', 'N. estimators','Max. features']),
    'gradient_boosting': pd.DataFrame(columns=['Time (sec)', 'Score (RMSE)'])
}

## 1.1 KNN

### 1.1.1 Default hyper-parameters

In [15]:
np.random.seed(random_state)
knn_default = neighbors.KNeighborsRegressor()

start_time = time.time()
knn_default = knn_default.fit(x_train, y_train)
y_val_pred = knn_default.predict(x_val)
score = math.sqrt(metrics.mean_squared_error(y_val, y_val_pred))
end_time = time.time()

summary['knn'] = summary['knn'].append(pd.Series({
    'Time (sec)': '{:.4f}'.format(end_time - start_time), 
    'Score (RMSE)': score, 
    'N. neighbors': 5, 
    'Weights': 'uniform', 
    'P': 2
    }, 
    name='default'))

### 1.1.2 Hyper-parameter tunning (OPTUNA)

In [16]:
min_n_neigbors = 1
max_n_neigbors = 16

In [17]:
np.random.seed(random_state)
def knn_objective(trial):
    n_neighbors = trial.suggest_int('n_neighbors', min_n_neigbors, max_n_neigbors)
    weights = trial.suggest_categorical('weights', ['uniform','distance'])
    p = trial.suggest_categorical('p', [1, 2])

    clf = neighbors.KNeighborsRegressor(
        n_neighbors=n_neighbors,
        weights=weights,
        p=p)
    
    clf = clf.fit(x_train, y_train)
    y_val_pred = clf.predict(x_val)
    return math.sqrt(metrics.mean_squared_error(y_val, y_val_pred))

knn_optuna = optuna.create_study(direction='minimize')
start_time = time.time()
knn_optuna.optimize(knn_objective, n_trials=budget)
end_time = time.time()

summary['knn'] = summary['knn'].append(pd.Series({
    'Time (sec)': '{:.4f}'.format(end_time - start_time), 
    'Score (RMSE)': knn_optuna.best_value, 
    'N. neighbors': knn_optuna.best_params['n_neighbors'], 
    'Weights': knn_optuna.best_params['weights'], 
    'P': knn_optuna.best_params['p']
    }, 
    name='optuna'))

[32m[I 2021-01-13 14:30:52,839][0m A new study created in memory with name: no-name-5643f019-8098-4f6e-a30e-e1b102ce75a1[0m
[32m[I 2021-01-13 14:30:55,885][0m Trial 0 finished with value: 461.50247231150104 and parameters: {'n_neighbors': 4, 'weights': 'distance', 'p': 2}. Best is trial 0 with value: 461.50247231150104.[0m
[32m[I 2021-01-13 14:30:59,358][0m Trial 1 finished with value: 425.8532193925109 and parameters: {'n_neighbors': 12, 'weights': 'uniform', 'p': 1}. Best is trial 1 with value: 425.8532193925109.[0m
[32m[I 2021-01-13 14:31:02,432][0m Trial 2 finished with value: 446.53181088677667 and parameters: {'n_neighbors': 6, 'weights': 'distance', 'p': 2}. Best is trial 1 with value: 425.8532193925109.[0m
[32m[I 2021-01-13 14:31:05,247][0m Trial 3 finished with value: 438.66704912167074 and parameters: {'n_neighbors': 8, 'weights': 'distance', 'p': 2}. Best is trial 1 with value: 425.8532193925109.[0m
[32m[I 2021-01-13 14:31:08,229][0m Trial 4 finished with va

## 1.2 Random Forest

### 1.2.1 Default hyper-parameters

In [20]:
np.random.seed(random_state)
rf_default = ensemble.RandomForestRegressor(random_state=random_state, verbose=verbose, n_jobs=n_jobs)

start_time = time.time()
rf_default = rf_default.fit(x_train, y_train)
y_val_pred = rf_default.predict(x_val)
score =  math.sqrt(metrics.mean_squared_error(y_val, y_val_pred))
end_time = time.time()

summary['random_forest'] = summary['random_forest'].append(pd.Series({
    'Time (sec)': '{:.4f}'.format(end_time - start_time), 
    'Score (RMSE)': score,
    'Min. samples split': 2, 
    'Criterion': 'mse', 
    'Max. depth': 'None',
    'N. estimators': 100,
    'Max. features': 1
    },
    name='default'))

### 1.2.2 Hyper-parameter tunning (OPTUNA)

In [21]:
min_max_depth = 2
max_max_depth = 50
min_n_estimators = 50
max_n_estimators = 200

In [22]:
np.random.seed(random_state)
def random_forest_objective(trial):
    min_samples_split = trial.suggest_uniform('min_samples_split', 0+sys.float_info.min, 1)
    criterion = trial.suggest_categorical('criterion', ['mse','mae'])
    max_depth = trial.suggest_int('max_depth', min_max_depth, max_max_depth)
    n_estimators = trial.suggest_int('n_estimators', min_n_estimators, max_n_estimators)
    max_features = trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2'])

    clf = ensemble.RandomForestRegressor(
        random_state=random_state,
        min_samples_split=min_samples_split,
        criterion=criterion,
        max_depth=max_depth,
        n_estimators=n_estimators,
        max_features=max_features
        )
    clf = clf.fit(x_train, y_train)
    y_val_pred = clf.predict(x_val)
    return math.sqrt(metrics.mean_squared_error(y_val, y_val_pred))

rf_optuna = optuna.create_study(direction='minimize')
start_time = time.time()
rf_optuna.optimize(random_forest_objective, n_trials=budget, n_jobs=n_jobs)
end_time = time.time()

summary['random_forest'] = summary['random_forest'].append(pd.Series({
    'Time (sec)': '{:.4f}'.format(end_time - start_time), 
    'Score (RMSE)': rf_optuna.best_value,
    'Min. samples split': rf_optuna.best_params['min_samples_split'], 
    'Criterion': rf_optuna.best_params['criterion'], 
    'Max. depth': rf_optuna.best_params['max_depth'],
    'N. estimators': rf_optuna.best_params['n_estimators'],
    'Max. features': rf_optuna.best_params['max_features']
    },
    name='optuna'))

[32m[I 2021-01-13 14:40:46,203][0m A new study created in memory with name: no-name-2663b009-5998-4065-8d0e-10ee2f47a415[0m
[32m[I 2021-01-13 14:40:48,116][0m Trial 0 finished with value: 400.9598210339549 and parameters: {'min_samples_split': 0.2585716150647326, 'criterion': 'mse', 'max_depth': 21, 'n_estimators': 119, 'max_features': 'sqrt'}. Best is trial 0 with value: 400.9598210339549.[0m
[32m[I 2021-01-13 14:40:48,275][0m Trial 1 finished with value: 668.3836578998909 and parameters: {'min_samples_split': 0.7275958768930648, 'criterion': 'mse', 'max_depth': 19, 'n_estimators': 125, 'max_features': 'sqrt'}. Best is trial 0 with value: 400.9598210339549.[0m
[32m[I 2021-01-13 14:40:48,379][0m Trial 2 finished with value: 668.4502497483671 and parameters: {'min_samples_split': 0.9734866223139015, 'criterion': 'mse', 'max_depth': 19, 'n_estimators': 83, 'max_features': 'sqrt'}. Best is trial 0 with value: 400.9598210339549.[0m
[32m[I 2021-01-13 14:40:48,521][0m Trial 3 f

## 1.3 Gradient Boosting

### 1.3.1 Default hyper-parameters

In [28]:
# implementation using sklearn
np.random.seed(random_state)
gb_sk_def = ensemble.GradientBoostingRegressor(random_state=random_state, verbose=verbose)

start_time = time.time()
gb_sk_def = gb_sk_def.fit(x_train, y_train)
y_val_pred = gb_sk_def.predict(x_val)
score =  math.sqrt(metrics.mean_squared_error(y_val, y_val_pred))
end_time = time.time()

summary['gradient_boosting'] = summary['gradient_boosting'].append(pd.Series({
    'Time (sec)': '{:.4f}'.format(end_time - start_time), 
    'Score (RMSE)': score,
    'Learning rate': 0.1,
    'N. estimators': 100,
    'Criterion': 'friedman_mse', 
    'Min. samples split': 2, 
    'Min. samples leaf': 1,
    'Max. depth': 3,
    'Max. leaf nodes': 'None'
    },
    name='default'))

In [29]:
# implementation using xgboost
import xgboost as xgb

dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test, label=y_test)

gb_xgb_def = xgb.XGBRegressor(objective='reg:squarederror')

start_time = time.time()
gb_xgb_def = gb_xgb_def.fit(x_train, y_train)
y_val_pred = gb_xgb_def.predict(x_val)
score = math.sqrt(metrics.mean_squared_error(y_val, y_val_pred))
end_time = time.time()

summary['gradient_boosting'] = summary['gradient_boosting'].append(pd.Series({
    'Time (sec)': '{:.4f}'.format(end_time - start_time), 
    'Score (RMSE)': score,
    'Learning rate': 0.3,
    'Max. depth': 6,
    'Max. leaf nodes': 0,
    'Gamma (min_split_loss)': 0,
    'Lambda': 1,
    'Alpha': 0,
    'N. estimators': gb_xgb_def.get_params()['n_estimators']
    },
    name='default_xgboost'))

### 1.3.2 Hyper-parameter tunning

In [30]:
#print(dir(dtrain))
#print('\n', dir(model))
min_max_leaf_nodes = 2
max_max_leaf_nodes = 20
min_min_samples_leaf = 1
max_min_samples_leaf = 10

In [31]:
# hyperparam tuning for sklearn ensemble.GradientBoostingRegressor
np.random.seed(random_state)

def gradboosting_objective(trial):  
    gb_sk_opt = None
    short = False
    
    learning_rate = trial.suggest_uniform('learning_rate', 0+sys.float_info.min, 1)
    n_estimators = trial.suggest_int('n_estimators', min_n_estimators, max_n_estimators)
    min_samples_split = trial.suggest_uniform('min_samples_split', 0+sys.float_info.min, 1)
    max_depth = trial.suggest_int('max_depth', min_max_depth, max_max_depth)
        
    if short == False: # it will take a long time to run 
        criterion = trial.suggest_categorical('criterion', ['mse','friedman_mse'])
        min_samples_leaf = trial.suggest_int('min_samples_leaf',min_min_samples_leaf, max_min_samples_leaf)
        max_leaf_nodes = trial.suggest_int('max_leaf_nodes', min_max_leaf_nodes, max_max_leaf_nodes)
        
        gb_sk_opt = ensemble.GradientBoostingRegressor(learning_rate=learning_rate, 
                                                   n_estimators=n_estimators,
                                                   criterion=criterion,
                                                   min_samples_split=min_samples_split,
                                                   min_samples_leaf=min_samples_leaf,
                                                   max_depth=max_depth,
                                                   max_leaf_nodes=max_leaf_nodes,
                                                   random_state=random_state,
                                                   verbose=verbose)
    else:  # will take less time        
        gb_sk_opt = ensemble.GradientBoostingRegressor(learning_rate=learning_rate, 
                                                   n_estimators=n_estimators,
                                                   min_samples_split=min_samples_split,
                                                   max_depth=max_depth,
                                                   random_state=random_state,
                                                   verbose=verbose)
        
    gb_sk_opt = gb_sk_opt.fit(x_train, y_train)
    y_val_pred = gb_sk_opt.predict(x_val)
    
    return math.sqrt(metrics.mean_squared_error(y_val, y_val_pred))

gb_optuna = optuna.create_study(direction='minimize')
start_time = time.time()
gb_optuna.optimize(gradboosting_objective, n_trials=budget)
end_time = time.time()

summary['gradient_boosting'] = summary['gradient_boosting'].append(pd.Series({
    'Time (sec)': '{:.4f}'.format(end_time - start_time), 
    'Score (RMSE)': gb_optuna.best_value,
    'Learning rate': gb_optuna.best_params['learning_rate'],
    'N. estimators': gb_optuna.best_params['n_estimators'],
    'Criterion': 'friedman_mse', 
    #Min. samples split': gb_optuna.best_params['min_samples_split'], 
    'Min. samples leaf': gb_optuna.best_params['min_samples_leaf'],
    'Min. samples leaf': 1,
    'Max. depth': gb_optuna.best_params['max_depth'],
    #Max. leaf nodes': 'None'
    'Max. leaf nodes': gb_optuna.best_params['max_leaf_nodes']
    },
    name='optuna_sklearn'))

[32m[I 2021-01-13 17:33:44,119][0m A new study created in memory with name: no-name-85840a61-4374-4c5d-87c8-f6402e3d33fa[0m
[32m[I 2021-01-13 17:34:20,446][0m Trial 0 finished with value: 426.3910818511826 and parameters: {'learning_rate': 0.9437204360706714, 'n_estimators': 179, 'min_samples_split': 0.9904165055278342, 'max_depth': 4, 'criterion': 'friedman_mse', 'min_samples_leaf': 5, 'max_leaf_nodes': 5}. Best is trial 0 with value: 426.3910818511826.[0m
[32m[I 2021-01-13 17:36:14,804][0m Trial 1 finished with value: 451.73522494764103 and parameters: {'learning_rate': 0.8426704875565854, 'n_estimators': 174, 'min_samples_split': 0.6297387035925248, 'max_depth': 6, 'criterion': 'friedman_mse', 'min_samples_leaf': 10, 'max_leaf_nodes': 15}. Best is trial 0 with value: 426.3910818511826.[0m
[32m[I 2021-01-13 17:36:52,655][0m Trial 2 finished with value: 471.6245212069269 and parameters: {'learning_rate': 0.9957578627484363, 'n_estimators': 96, 'min_samples_split': 0.8946037

In [32]:
# hyperparam tuning for XGBoost Regressor
def xgradboosting_objective(trial):
    
    eta = trial.suggest_uniform('eta', 0+sys.float_info.min, 1.0)
    max_depth = trial.suggest_int('max_depth', min_max_depth, max_max_depth)
    n_estimators = trial.suggest_int('n_estimators', min_n_estimators, max_n_estimators)
    
    gamma = trial.suggest_float('gamma', 0.01, 1.0)
    reg_lambda = trial.suggest_uniform('lambda', 0.01, 0.5)
    reg_alpha = trial.suggest_uniform('alpha', 0.01, 0.5)

    gb_xgb_opt = xgb.XGBRegressor(objective='reg:squarederror',
                                  booster='gbtree',
                                  learning_rate=eta,
                                  gamma=gamma,
                                  reg_alpha=reg_alpha,
                                  reg_lambda=reg_lambda,
                                  max_depth=max_depth,
                                  n_estimators=n_estimators,
                                  random_state=random_state,
                                  verbosity=verbose
                                 )

    gb_xgb_opt = gb_xgb_opt.fit(x_train, y_train)
    y_val_pred = gb_xgb_opt.predict(x_val)
    
    return math.sqrt(metrics.mean_squared_error(y_val, y_val_pred))


gb_optuna = optuna.create_study(direction='minimize')
start_time = time.time()
gb_optuna.optimize(xgradboosting_objective, n_trials=budget)
end_time = time.time()

summary['gradient_boosting'] = summary['gradient_boosting'].append(pd.Series({
    'Time (sec)': '{:.4f}'.format(end_time - start_time), 
    'Score (RMSE)': gb_optuna.best_value,
    'Learning rate': gb_optuna.best_params['eta'],
    'Max. depth': gb_optuna.best_params['max_depth'],
    'Gamma (min_split_loss)': gb_optuna.best_params['gamma'],
    'Lambda': gb_optuna.best_params['lambda'],
    'Alpha': gb_optuna.best_params['alpha'],
    'N. estimators': gb_optuna.best_params['n_estimators']  
    },
    name='optuna_xgboost'))

[32m[I 2021-01-13 18:00:31,508][0m A new study created in memory with name: no-name-665d982e-98ea-4bc4-b946-dd024ab898e8[0m
[32m[I 2021-01-13 18:00:43,128][0m Trial 0 finished with value: 407.8463263806792 and parameters: {'eta': 0.01578049329785458, 'max_depth': 3, 'n_estimators': 138, 'gamma': 0.6971905601309071, 'lambda': 0.3811080738096944, 'alpha': 0.20120626381188714}. Best is trial 0 with value: 407.8463263806792.[0m
[32m[I 2021-01-13 18:00:52,135][0m Trial 1 finished with value: 385.58132285995805 and parameters: {'eta': 0.11756590681846768, 'max_depth': 2, 'n_estimators': 161, 'gamma': 0.1854233470219595, 'lambda': 0.06834635874698405, 'alpha': 0.4742807662973112}. Best is trial 1 with value: 385.58132285995805.[0m
[32m[I 2021-01-13 18:01:04,613][0m Trial 2 finished with value: 421.71825083991206 and parameters: {'eta': 0.4634993828570153, 'max_depth': 4, 'n_estimators': 92, 'gamma': 0.5211171901404208, 'lambda': 0.25973284204659014, 'alpha': 0.06487483813687336}. B

In [21]:
summary['knn']

Unnamed: 0,Time (sec),Score (RMSE),N. neighbors,Weights,P
default,0.1176,455.123868,5,uniform,2
optuna,33.7278,424.95488,11,distance,1


In [48]:
summary['random_forest']

Unnamed: 0,Time (sec),Score (RMSE),Min. samples split,Criterion,Max. depth,N. estimators,Max. features
default,82.5335,375.560721,2.0,mse,,100,1
optuna,233.6654,374.129312,0.00872,mse,,171,0.667976
optuna,1031.8257,373.977008,0.00742,mae,,101,sqrt


In [55]:
summary['gradient_boosting'].sort_index(ascending=True)

Unnamed: 0,Time (sec),Score (RMSE),Alpha,Gamma (min_split_loss),Lambda,Learning rate,Max. depth,N. estimators,Criterion,Max. leaf nodes,Min. samples leaf,Min. samples split
default,51.805,389.223359,,,,0.1,3.0,100.0,friedman_mse,,1.0,2.0
default_xgboost,12.0105,409.80287,0.0,0.0,1.0,0.3,6.0,100.0,,0.0,,
optuna_sklearn,1600.0196,373.503153,,,,0.130121,10.0,60.0,friedman_mse,9.0,1.0,
optuna_xgboost,275.7636,384.920573,0.161185,0.745587,0.202009,0.068384,2.0,181.0,,,,


In [56]:
#Dummy regressor(mean)
math.sqrt(metrics.mean_squared_error(y_val, [y_val.mean() for i in range(len(y_val))]))

666.6691142412727

# 2. ATTRIBUTE SELECTION

## 2.1 Select from all attributes

**Are all 550 input attributes actually necessary in order to get a good model? Is it possible to have an accurate model that uses fewer than 550 variables? How many?**

For this question we will be using the best model we had in previous section and now include the parameter for select only certain attributes.

In [57]:
min_max_depth = 2
max_max_depth = 15
min_n_estimators = 50
max_n_estimators = 200
min_n_k = 10
max_n_k = 30

In [13]:
np.random.seed(random_state)
def random_forest_objective_attr(trial):
    k = trial.suggest_int('k', min_n_k, max_n_k)
    min_samples_split = trial.suggest_uniform('min_samples_split', 0+sys.float_info.min, 1)
    criterion = trial.suggest_categorical('criterion', ['mse','mae'])
    max_depth = trial.suggest_int('max_depth', min_max_depth, max_max_depth, log=True)
    n_estimators = trial.suggest_int('n_estimators', min_n_estimators, max_n_estimators)
    max_features = trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2'])

    clf = Pipeline([
      ('feature_selection', feature_selection.SelectKBest(feature_selection.f_regression, k=k)),
      ('regression', ensemble.RandomForestRegressor(
          random_state=random_state,
          min_samples_split=min_samples_split,
          criterion=criterion,
          max_depth=max_depth,
          n_estimators=n_estimators,
          max_features=max_features
      ))
    ])

    clf = clf.fit(x_train, y_train)
    y_val_pred = clf.predict(x_val)
    return math.sqrt(metrics.mean_squared_error(y_val, y_val_pred))

rf_attr_optuna = optuna.create_study(direction='minimize')
start_time = time.time()
rf_attr_optuna.optimize(random_forest_objective_attr, n_trials=budget, n_jobs=n_jobs)
end_time = time.time()
print(end_time-start_time)

[32m[I 2021-01-13 13:59:52,067][0m A new study created in memory with name: no-name-f677b493-b3e6-48bb-87d6-43f34315514d[0m
[32m[I 2021-01-13 14:00:57,538][0m Trial 0 finished with value: 444.03711882821585 and parameters: {'k': 59, 'min_samples_split': 0.31820927889533557, 'criterion': 'mae', 'max_depth': 15, 'n_estimators': 77, 'max_features': 'auto'}. Best is trial 0 with value: 444.03711882821585.[0m
[32m[I 2021-01-13 14:00:58,050][0m Trial 1 finished with value: 719.5022353639617 and parameters: {'k': 51, 'min_samples_split': 0.6987448939434953, 'criterion': 'mae', 'max_depth': 4, 'n_estimators': 176, 'max_features': 'auto'}. Best is trial 0 with value: 444.03711882821585.[0m
[32m[I 2021-01-13 14:00:58,225][0m Trial 2 finished with value: 668.4403422905759 and parameters: {'k': 188, 'min_samples_split': 0.8572131246824229, 'criterion': 'mse', 'max_depth': 2, 'n_estimators': 106, 'max_features': 'auto'}. Best is trial 0 with value: 444.03711882821585.[0m
[32m[I 2021-01

587.7053737640381


In [14]:
#TODO: Conclusions
print(rf_attr_optuna.best_params, rf_attr_optuna.best_value)

{'k': 273, 'min_samples_split': 0.3102503141152435, 'criterion': 'mae', 'max_depth': 9, 'n_estimators': 145, 'max_features': 'sqrt'} 413.210766994515


## 2.2 Use only Sotavento attributes
**Is it enough to use only the attributes for the actual Sotavento location? (13th location in the grid)**

We will select only Sotavento attributes and use the best model in previous section to train a model.

In [15]:
sot_attr = []
for attr in x_train.columns:
    if int(attr.split('.')[-1]) == 13:
        sot_attr.append(attr)

x_train_sot = x_train[sot_attr]
x_val_sot = x_val[sot_attr]
x_test_sot = x_test[sot_attr]
print(x_train_sot.shape,x_val_sot.shape,x_test_sot.shape)

(2528, 22) (1299, 22) (2110, 22)


In [14]:
np.random.seed(random_state)
def random_forest_sot_objective(trial):
    min_samples_split = trial.suggest_uniform('min_samples_split', 0+sys.float_info.min, 1)
    criterion = trial.suggest_categorical('criterion', ['mse','mae'])
    max_depth = trial.suggest_int('max_depth', min_max_depth, max_max_depth)
    n_estimators = trial.suggest_int('n_estimators', min_n_estimators, max_n_estimators)
    max_features = trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2'])

    clf = ensemble.RandomForestRegressor(
        random_state=random_state,
        min_samples_split=min_samples_split,
        criterion=criterion,
        max_depth=max_depth,
        n_estimators=n_estimators,
        max_features=max_features
        )

    clf = clf.fit(x_train_sot, y_train)
    y_val_pred = clf.predict(x_val_sot)
    return math.sqrt(metrics.mean_squared_error(y_val, y_val_pred))

rf_sot_optuna = optuna.create_study(direction='minimize')
start_time = time.time()
rf_sot_optuna.optimize(random_forest_sot_objective, n_trials=budget, n_jobs=n_jobs)
end_time = time.time()

[32m[I 2021-01-13 16:34:56,851][0m A new study created in memory with name: no-name-f4f69465-2c2a-4495-bc8d-3e9ad3f30b8f[0m
[33m[W 2021-01-13 16:34:56,861][0m Trial 0 failed because of the following error: NameError("name 'x_train_sot' is not defined")
Traceback (most recent call last):
  File "/home/fddcampos/.local/lib/python3.8/site-packages/optuna/_optimize.py", line 189, in _run_trial
    value = func(trial)
  File "<ipython-input-14-c360c6436cc6>", line 18, in random_forest_sot_objective
    clf = clf.fit(x_train_sot, y_train)
NameError: name 'x_train_sot' is not defined[0m


NameError: name 'x_train_sot' is not defined

In [None]:
#TODO: Conclusions
print(rf_sot_optuna.best_params, rf_sot_optuna.best_value)