# Wind prediction - Second assignment

## Authors

David Moreno Maldonado 100441714     
Inés Fernández Campos 100443936

## 0. Preliminaries

In [9]:
# Import some libraries
import os
import numpy as np              
import pandas as pd
import matplotlib.pyplot as plt 

import sys
import time
import math

from sklearn.experimental import enable_iterative_imputer
from sklearn import preprocessing, impute, model_selection, neighbors, ensemble
import optuna
import optuna.visualization as ov

os.getcwd()

'/home/fddcampos/Documents/uc3m/2_term/BDINTELLIGENCE/practicas_big_data_intelligence/assignment_2'

In [10]:
#MAIN PARAMETERS FOR THE ASSIGNMENT
budget = 20
random_state = 0
verbose = 0

#PARAMETERS FOR THE HYPER-PARAMETER TUNNING
#KNN
min_n_neigbors = 1
max_n_neigbors = 16

#RANDOM FOREST
min_max_depth = 2
max_max_depth = 20
min_n_estimators = 50
max_n_estimators = 200

The "wind_pickle" file contains data in a binary format called "Pickle". Pickle data loads faster than text data.

In [4]:
data = pd.read_pickle('wind_pickle.pickle')

You can visualize the attributes in the dataset. Very important, the output attribute (i.e. the value to be predicted, **energy**, is the first attribute). **Steps** represents the hours in advance of the forecast. We will not use this variable here.

In [5]:
# The dataset contains 5937 instances and 556 attributes (including the outcome to be predicted)
print(data.shape)
#data.columns.values.tolist() 

(5937, 556)


In [6]:
#-1 for training, 0 for validation, 1 for testing
year_to_part = {
    2005: -1,
    2006: -1,
    2007: 0,
    2008: 0, 
    2009: 1,
    2010: 1
}
data['partition'] = data['year'].apply(lambda x: year_to_part[x])

We now remove the columns that cannot be used for training the models from the DataFrame

In [6]:
# Steps, month, day, hour, year should be removed, they cannot be used for training the models
to_remove = ['steps', 'month', 'year', 'day', 'hour']
for m in to_remove: data = data.drop(m, 1)

In [7]:
from numpy.random import randint

# we add na values at random
my_NIA = 100443936 + 100441714
np.random.seed(my_NIA)

how_many_nas = round(data.shape[0]*data.shape[1]*0.05)
print('Lets put '+str(how_many_nas)+' missing values \n')
x_locations = randint(0, data.shape[0], size=how_many_nas)
y_locations = randint(1, data.shape[1]-2, size=how_many_nas)

for i in range(len(x_locations)):
    data.iat[x_locations[i], y_locations[i]] = np.nan
    
data.to_pickle('wind_pickle_with_nan.pickle')

Lets put 163861 missing values 



From this point on, the file wind_pickle_with_nan should be used.

In [11]:
data = pd.read_pickle('wind_pickle_with_nan.pickle')
data.shape

(5937, 552)

## Input missing data

In [12]:
print(data.isnull().values.any())
input_cols = data.columns.difference(['energy', 'partition'])
x = data[input_cols]
#Iterative imputer (takes too long)
'''iter_imp = impute.IterativeImputer(random_state=random_state, 
                                   initial_strategy='median', 
                                   max_iter=3,
                                   verbose=verbose)
no_nan = iter_imp.fit_transform(x)'''

#KNN imputer(takes too long)
'''knn_imp = impute.KNNImputer(weights='distance')
no_nan = knn_imp.fit_transform(x)'''

#Simple imputer
simp_imp = impute.SimpleImputer(strategy='median',
                               verbose=2)
no_nan = simp_imp.fit_transform(x)

data[input_cols] = pd.DataFrame(data=no_nan)
print(data.isnull().values.any())

True
False


## Scaling

In [13]:
scaler = preprocessing.StandardScaler().fit(data[input_cols]) 
data[input_cols] = scaler.transform(data[input_cols])

## Data split
We are going to use train/test for model evaluation (outer) and train/validation for hyperparameter tuning (inner), as follows:     
1. Train partition: the first two years of data. Given that there are 6 years worth of data, we will use the first 2/6 of the instances for training.     
2. Validation partition: the second two years of data. 
3. Test partition: the remaining data    


In [14]:
#-1 for training, 0 for validation, 1 for testing
test = data[data['partition'] == 1]
train = data[data['partition'] != 1]
tr_val_partition = model_selection.PredefinedSplit(train['partition'].tolist())

del test['partition']
del train['partition']

y_test = test['energy']
x_test = test[test.columns.difference(['energy'])]

y_train = train['energy']
x_train = train[train.columns.difference(['energy'])]

Now we convert the training and test sets from Pandas DataFrames to Numpy matrices, so that they can be used by scikit-learn.

In [15]:
mat_x_train = np.matrix(x_train)
mat_y_train = np.matrix(y_train).T
mat_x_test = np.matrix(x_test)
mat_y_test = np.matrix(y_test).T

# 1. MODEL SELECTION AND HYPER-PARAMETER TUNING

In [17]:
#Dataframes with all the information of each model
summary = {
    'knn': pd.DataFrame(columns=['Time (sec)', 'Score (RMSE)', 'N. neighbors', 'Weights', 'P']),
    'random_forest': pd.DataFrame(columns=['Time (sec)', 'Score (RMSE)', 'Min. samples split', 'Criterion', 'Max. depth', 'N. estimators','Max. features']),
    'gradient_boosting': pd.DataFrame(columns=['Time (sec)', 'Score (RMSE)'])
}


## 1.1 KNN

### 1.1.1 Default hyper-parameters

In [29]:
np.random.seed(random_state)
knn_default = neighbors.KNeighborsRegressor()

start_time = time.time()
scores = -model_selection.cross_val_score(knn_default, x_train, y_train,
                                          scoring='neg_root_mean_squared_error',
                                          cv=tr_val_partition,
                                          verbose=verbose) 
end_time = time.time()

summary['knn'] = summary['knn'].append(pd.Series({
    'Time (sec)': '{:.4f}'.format(end_time - start_time), 
    'Score (RMSE)': scores.mean(), 
    'N. neighbors': 5, 
    'Weights': 'uniform', 
    'P': 2
    }, 
    name='default'))

### 1.1.2 Hyper-parameter tunning (OPTUNA)

In [30]:
np.random.seed(random_state)
def knn_objective(trial):
    n_neighbors = trial.suggest_int('n_neighbors', min_n_neigbors, max_n_neigbors)
    weights = trial.suggest_categorical('weights', ['uniform','distance'])
    p = trial.suggest_categorical('p', [1, 2])

    clf = neighbors.KNeighborsRegressor(
        n_neighbors=n_neighbors,
        weights=weights,
        p=p)

    scores = -model_selection.cross_val_score(clf, x_train, y_train,
        cv=tr_val_partition,
        scoring='neg_root_mean_squared_error')

    return scores.mean()

knn_optuna = optuna.create_study(direction='minimize')
start_time = time.time()
knn_optuna.optimize(knn_objective, n_trials=budget)
end_time = time.time()

summary['knn'] = summary['knn'].append(pd.Series({
    'Time (sec)': '{:.4f}'.format(end_time - start_time), 
    'Score (RMSE)': knn_optuna.best_value, 
    'N. neighbors': knn_optuna.best_params['n_neighbors'], 
    'Weights': knn_optuna.best_params['weights'], 
    'P': knn_optuna.best_params['p']
    }, 
    name='optuna'))

[32m[I 2021-01-12 17:39:31,240][0m A new study created in memory with name: no-name-ec2f521e-b91a-437a-b743-0fb883f9735a[0m
[32m[I 2021-01-12 17:39:36,424][0m Trial 0 finished with value: 434.58474235770177 and parameters: {'n_neighbors': 15, 'weights': 'uniform', 'p': 2}. Best is trial 0 with value: 434.58474235770177.[0m
[32m[I 2021-01-12 17:39:42,500][0m Trial 1 finished with value: 427.1897361344681 and parameters: {'n_neighbors': 8, 'weights': 'uniform', 'p': 1}. Best is trial 1 with value: 427.1897361344681.[0m
[32m[I 2021-01-12 17:39:47,733][0m Trial 2 finished with value: 436.66754039946005 and parameters: {'n_neighbors': 12, 'weights': 'distance', 'p': 2}. Best is trial 1 with value: 427.1897361344681.[0m
[32m[I 2021-01-12 17:39:53,542][0m Trial 3 finished with value: 441.1599884529427 and parameters: {'n_neighbors': 5, 'weights': 'distance', 'p': 1}. Best is trial 1 with value: 427.1897361344681.[0m
[32m[I 2021-01-12 17:39:59,120][0m Trial 4 finished with val

## 1.2 Random Forest

### 1.2.1 Default hyper-parameters

In [21]:
np.random.seed(random_state)
rf_default = ensemble.RandomForestRegressor(random_state=random_state, verbose=verbose)

start_time = time.time()
scores = -model_selection.cross_val_score(rf_default, x_train, y_train,
                                          scoring='neg_root_mean_squared_error',
                                          cv=tr_val_partition,
                                          verbose=verbose)
end_time = time.time()

summary['random_forest'] = summary['random_forest'].append(pd.Series({
    'Time (sec)': '{:.4f}'.format(end_time - start_time), 
    'Score (RMSE)': scores.mean(),
    'Min. samples split': 2, 
    'Criterion': 'mse', 
    'Max. depth': 'None',
    'N. estimators': 100,
    'Max. features': 1
    },
    name='default'))

### 1.2.2 Hyper-parameter tunning (OPTUNA)

In [None]:
np.random.seed(random_state)
def random_forest_objective(trial):
    min_samples_split = trial.suggest_uniform('min_samples_split', 0+sys.float_info.min, 1)
    criterion = trial.suggest_categorical('criterion', ['mse','friedman_mse'])
    max_depth = trial.suggest_int('max_depth', min_max_depth, max_max_depth)
    n_estimators = trial.suggest_int('n_estimators', min_n_estimators, max_n_estimators)
    max_features = trial.suggest_uniform('max_features', 0+sys.float_info.min, 1)

    clf = ensemble.RandomForestRegressor(
        random_state=random_state,
        min_samples_split=min_samples_split,
        criterion=criterion,
        max_depth=max_depth,
        n_estimators=n_estimators,
        max_features=max_features
        )

    scores = -model_selection.cross_val_score(clf, x_train, y_train,
        cv=tr_val_partition,
        scoring='neg_root_mean_squared_error')
    print(scores)

    return scores.mean()

rf_optuna = optuna.create_study(direction='minimize')
start_time = time.time()
rf_optuna.optimize(random_forest_objective, n_trials=budget)
end_time = time.time()

summary['random_forest'] = summary['random_forest'].append(pd.Series({
    'Time (sec)': '{:.4f}'.format(end_time - start_time), 
    'Score (RMSE)': rf_optuna.best_value,
    'Min. samples split': rf_optuna.best_params['min_samples_split'], 
    'Criterion': rf_optuna.best_params['criterion'], 
    'Max. depth': rf_optuna.best_params['max_depth'],
    'N. estimators': rf_optuna.best_params['n_estimators'],
    'Max. features': rf_optuna.best_params['max_features']
    },
    name='optuna'))

[32m[I 2021-01-12 17:19:15,385][0m A new study created in memory with name: no-name-afbb499f-328b-4ec8-916e-41408b563fe5[0m
[32m[I 2021-01-12 17:19:15,507][0m Trial 0 finished with value: 668.3157608288334 and parameters: {'min_samples_split': 0.8420626511506619, 'criterion': 'mse', 'max_depth': 11, 'n_estimators': 111, 'max_features': 0.22964889811063183}. Best is trial 0 with value: 668.3157608288334.[0m


[668.31576083]


[32m[I 2021-01-12 17:19:15,953][0m Trial 1 finished with value: 416.02693466304027 and parameters: {'min_samples_split': 0.22217725686158962, 'criterion': 'mse', 'max_depth': 10, 'n_estimators': 89, 'max_features': 0.014222760723832883}. Best is trial 1 with value: 416.02693466304027.[0m


[416.02693466]


[32m[I 2021-01-12 17:19:32,902][0m Trial 2 finished with value: 397.0549870471909 and parameters: {'min_samples_split': 0.27608113381936805, 'criterion': 'friedman_mse', 'max_depth': 6, 'n_estimators': 195, 'max_features': 0.3254333086193524}. Best is trial 2 with value: 397.0549870471909.[0m
[32m[I 2021-01-12 17:19:33,037][0m Trial 3 finished with value: 668.3520154857824 and parameters: {'min_samples_split': 0.7155418283828904, 'criterion': 'mse', 'max_depth': 16, 'n_estimators': 133, 'max_features': 0.9539498563134007}. Best is trial 2 with value: 397.0549870471909.[0m


[397.05498705]
[668.35201549]


[32m[I 2021-01-12 17:19:46,527][0m Trial 4 finished with value: 439.6219384407865 and parameters: {'min_samples_split': 0.4973733826326475, 'criterion': 'friedman_mse', 'max_depth': 13, 'n_estimators': 132, 'max_features': 0.7543669851904506}. Best is trial 2 with value: 397.0549870471909.[0m


[439.62193844]


[32m[I 2021-01-12 17:19:50,193][0m Trial 5 finished with value: 531.8125595160947 and parameters: {'min_samples_split': 0.5870028727181181, 'criterion': 'mse', 'max_depth': 7, 'n_estimators': 147, 'max_features': 0.3937671236597815}. Best is trial 2 with value: 397.0549870471909.[0m


[531.81255952]


[32m[I 2021-01-12 17:19:55,240][0m Trial 6 finished with value: 398.4895565140924 and parameters: {'min_samples_split': 0.1961521278613999, 'criterion': 'mse', 'max_depth': 6, 'n_estimators': 134, 'max_features': 0.13840208553807065}. Best is trial 2 with value: 397.0549870471909.[0m


[398.48955651]


[32m[I 2021-01-12 17:19:56,503][0m Trial 7 finished with value: 410.5623510360694 and parameters: {'min_samples_split': 0.2727794669539847, 'criterion': 'mse', 'max_depth': 18, 'n_estimators': 192, 'max_features': 0.021479311142142188}. Best is trial 2 with value: 397.0549870471909.[0m
[32m[I 2021-01-12 17:19:56,596][0m Trial 8 finished with value: 668.3138720807941 and parameters: {'min_samples_split': 0.9419565388143725, 'criterion': 'mse', 'max_depth': 16, 'n_estimators': 79, 'max_features': 0.14473233375850403}. Best is trial 2 with value: 397.0549870471909.[0m


[410.56235104]
[668.31387208]


[32m[I 2021-01-12 17:20:13,588][0m Trial 9 finished with value: 400.7885514258249 and parameters: {'min_samples_split': 0.3613713193049055, 'criterion': 'mse', 'max_depth': 5, 'n_estimators': 191, 'max_features': 0.4106695790225199}. Best is trial 2 with value: 397.0549870471909.[0m


[400.78855143]


[32m[I 2021-01-12 17:20:32,533][0m Trial 10 finished with value: 416.92737393683217 and parameters: {'min_samples_split': 0.012623405217639272, 'criterion': 'friedman_mse', 'max_depth': 3, 'n_estimators': 165, 'max_features': 0.6542220404285533}. Best is trial 2 with value: 397.0549870471909.[0m


[416.92737394]


[32m[I 2021-01-12 17:20:34,043][0m Trial 11 finished with value: 458.77092445372074 and parameters: {'min_samples_split': 0.007491517955698135, 'criterion': 'friedman_mse', 'max_depth': 2, 'n_estimators': 51, 'max_features': 0.23840977483705364}. Best is trial 2 with value: 397.0549870471909.[0m


[458.77092445]


## 1.3 Gradient Boosting

### 1.3.1 Default hyper-parameters

In [18]:
# implementation using sklearn
np.random.seed(random_state)
gb_sk_def = ensemble.GradientBoostingRegressor(random_state=random_state, verbose=verbose)

start_time = time.time()
scores = -model_selection.cross_val_score(gb_sk_def, x_train, y_train,
                                          scoring='neg_root_mean_squared_error',
                                          cv=tr_val_partition,
                                          verbose=verbose)
end_time = time.time()

summary['gradient_boosting'] = summary['gradient_boosting'].append(pd.Series({
    'Time (sec)': '{:.4f}'.format(end_time - start_time), 
    'Score (RMSE)': scores.mean(),
    'Learning rate': 0.1,
    'N. estimators': 100,
    'Criterion': 'friedman_mse', 
    'Min. samples split': 2, 
    'Min. samples leaf': 1,
    'Max. depth': 3,
    'Max. leaf nodes': 'None'
    },
    name='default'))

In [51]:
# implementation using xgboost
import xgboost as xgb

dtrain = xgb.DMatrix(mat_x_train, label=mat_y_train)
dtest = xgb.DMatrix(mat_x_test, label=mat_y_test)

model = xgb.XGBRegressor(objective='reg:squarederror')

start_time = time.time()
scores = - model_selection.cross_val_score(model, x_train, y_train,
                                            cv=tr_val_partition,
                                            scoring='neg_root_mean_squared_error')
end_time = time.time()

summary['gradient_boosting'] = summary['gradient_boosting'].append(pd.Series({
    'Time (sec)': '{:.4f}'.format(end_time - start_time), 
    'Score (RMSE)': scores.mean(),
    'N. estimators': model.get_params()['n_estimators']
    },
    name='default'))

### 1.3.2 Hyper-parameter tunning

In [None]:
#print(dir(dtrain))
#print('\n', dir(model))
min_max_leaf_nodes = 2
max_max_leaf_nodes = 20
min_min_samples_leaf = 1
max_min_samples_leaf = 10

In [23]:
# hyperparam tuning for sklearn ensemble.GradientBoostingRegressor
np.random.seed(random_state)

def gradboosting_objective(trial):  
    gb_sk_opt = None
    some = 0
    
    if some == 1:
        learning_rate = trial.suggest_uniform('learning_rate', 0+sys.float_info.min, 1)
        n_estimators = trial.suggest_int('n_estimators', min_n_estimators, max_n_estimators)
        criterion = trial.suggest_categorical('criterion', ['mse','friedman_mse', 'mae'])
        min_samples_split = trial.suggest_uniform('min_samples_split', 0+sys.float_info.min, 1)
        min_samples_leaf = trial.suggest_int('min_samples_split',min_min_samples_leaf, max_min_samples_leaf)
        max_depth = trial.suggest_int('max_depth', min_max_depth, max_max_depth)
        max_leaf_nodes = trial.suggest_int('max_depth', min_max_leaf_nodes, max_max_leaf_nodes)
        
        
        gb_sk_opt = ensemble.GradientBoostingRegressor(learning_rate=learning_rate, 
                                                   n_estimators=n_estimators,
                                                   criterion=criterion,
                                                   min_samples_split=min_samples_split,
                                                   min_samples_leaf=min_samples_leaf,
                                                   max_depth=max_depth,
                                                   max_leaf_nodes=max_leaf_nodes,
                                                   random_state=random_state,
                                                   verbose=verbose)
    
    else:
        learning_rate = trial.suggest_uniform('learning_rate', 0+sys.float_info.min, 1)
        n_estimators = trial.suggest_int('n_estimators', min_n_estimators, max_n_estimators)
        #criterion = trial.suggest_categorical('criterion', ['mse','friedman_mse', 'mae'])
        min_samples_split = trial.suggest_uniform('min_samples_split', 0+sys.float_info.min, 1)
        #min_samples_leaf = trial.suggest_int('min_samples_split',min_min_samples_leaf, max_min_samples_leaf)
        max_depth = trial.suggest_int('max_depth', min_max_depth, max_max_depth)
        #max_leaf_nodes = trial.suggest_int('max_depth', min_max_leaf_nodes, max_max_leaf_nodes)
        
        
        gb_sk_opt = ensemble.GradientBoostingRegressor(learning_rate=learning_rate, 
                                                   n_estimators=n_estimators,
                                                   #criterion=criterion,
                                                   min_samples_split=min_samples_split,
                                                  # min_samples_leaf=min_samples_leaf,
                                                   max_depth=max_depth,
                                                  # max_leaf_nodes=max_leaf_nodes,
                                                   random_state=random_state,
                                                   verbose=verbose)
    
        
    
    scores = -model_selection.cross_val_score(gb_sk_opt, x_train, y_train,
                                          scoring='neg_root_mean_squared_error',
                                          cv=tr_val_partition,
                                          verbose=verbose)

    return scores.mean()

gb_optuna = optuna.create_study(direction='minimize')
start_time = time.time()
gb_optuna.optimize(gradboosting_objective, n_trials=budget)
end_time = time.time()

summary['gradient_boosting'] = summary['gradient_boosting'].append(pd.Series({
    'Time (sec)': '{:.4f}'.format(end_time - start_time), 
    'Score (RMSE)': gb_optuna.best_value,
    'Learning rate': gb_optuna.best_params['learning_rate'],
    'N. estimators': gb_optuna.best_params['n_estimators'],
    'Criterion': 'friedman_mse', 
    'Min. samples split': gb_optuna.best_params['min_samples_split'], 
    #'Min. samples leaf': gb_optuna.best_params['min_samples_leaf'],
    'Min. samples leaf': 1,
    'Max. depth': gb_optuna.best_params['max_depth'],
    'Max. leaf nodes': 'None'
    #'Max. leaf nodes': gb_optuna.best_params['max_leaf_nodes']
    },
    name='optuna'))

[32m[I 2021-01-12 22:32:14,604][0m A new study created in memory with name: no-name-af1e068d-957b-41ab-9d66-55f63c91f777[0m
[32m[I 2021-01-12 22:32:35,641][0m Trial 0 finished with value: 417.1559653806665 and parameters: {'learning_rate': 0.6038207442734052, 'n_estimators': 91, 'min_samples_split': 0.9701149707547224, 'max_depth': 12}. Best is trial 0 with value: 417.1559653806665.[0m
[32m[I 2021-01-12 22:34:19,453][0m Trial 1 finished with value: 386.0131527292542 and parameters: {'learning_rate': 0.14739036993273402, 'n_estimators': 68, 'min_samples_split': 0.30631877787078476, 'max_depth': 15}. Best is trial 1 with value: 386.0131527292542.[0m
[32m[I 2021-01-12 22:38:03,094][0m Trial 2 finished with value: 398.5639575103852 and parameters: {'learning_rate': 0.26108146965486645, 'n_estimators': 124, 'min_samples_split': 0.1919876374112307, 'max_depth': 11}. Best is trial 1 with value: 386.0131527292542.[0m
[32m[I 2021-01-12 22:39:07,398][0m Trial 3 finished with value:

In [31]:
summary['knn']

Unnamed: 0,Time (sec),Score (RMSE),N. neighbors,Weights,P
default,5.0999,455.123868,5,uniform,2
optuna,109.5012,425.095919,9,uniform,1


In [27]:
summary['random_forest']

Unnamed: 0,Time (sec),Score (RMSE),Min. samples split,Criterion,Max. depth,N. estimators,Max. features
default,0.0,375.560721,2.0,mse,,100,1.0
default,73.5753,375.560721,2.0,mse,,100,1.0
optuna,183.535,377.30366,0.037649,mse,10.0,52,0.227018


In [64]:
summary['gradient_boosting']

Unnamed: 0,Time (sec),Score (RMSE),Criterion,Learning rate,Max. depth,Max. leaf nodes,Min. samples leaf,Min. samples split,N. estimators
default,32.481,389.357849,friedman_mse,0.1,3.0,,1.0,2.0,100.0
optuna,2598.5261,381.286402,friedman_mse,0.154706,7.0,,1.0,0.434674,74.0


# 2. ATTRIBUTE SELECTION

You have to answer the following questions: 

- Are all 550 input attributes actually necessary in order to get a good model? Is it possible to have an accurate model that uses fewer than 550 variables? How many? 
- Is it enough to use only the attributes for the actual Sotavento location? (13th location in the grid)

In [None]:
#<USE AS MANY CELLS AS YOU NEED>