# Wind prediction - Second assignment

## Authors

David Moreno Maldonado 100441714     
Inés Fernández Campos 100443936

## 0. Preliminaries

In [1]:
# Import some libraries
import os
import numpy as np              
import pandas as pd
import matplotlib.pyplot as plt 

import sys
import time
import math

from sklearn.experimental import enable_iterative_imputer
from sklearn import preprocessing, impute, model_selection, neighbors, ensemble
import optuna
import optuna.visualization as ov

os.getcwd()

'/Users/roni/Desktop/master/2nd quarter/big data intelligence/assignments/assignment_2'

In [2]:
#MAIN PARAMETERS FOR THE ASSIGNMENT
budget = 20
random_state = 0
verbose = 0

#PARAMETERS FOR THE HYPER-PARAMETER TUNNING
#KNN
min_n_neigbors = 1
max_n_neigbors = 16

#RANDOM FOREST
min_max_depth = 2
max_max_depth = 20
min_n_estimators = 50
max_n_estimators = 200

The "wind_pickle" file contains data in a binary format called "Pickle". Pickle data loads faster than text data.

In [3]:
data = pd.read_pickle('wind_pickle.pickle')

You can visualize the attributes in the dataset. Very important, the output attribute (i.e. the value to be predicted, **energy**, is the first attribute). **Steps** represents the hours in advance of the forecast. We will not use this variable here.

In [4]:
# The dataset contains 5937 instances and 556 attributes (including the outcome to be predicted)
print(data.shape)
#data.columns.values.tolist() 

(5937, 556)


In [5]:
#-1 for training, 0 for validation, 1 for testing
year_to_part = {
    2005: -1,
    2006: -1,
    2007: 0,
    2008: 0, 
    2009: 1,
    2010: 1
}
data['partition'] = data['year'].apply(lambda x: year_to_part[x])

We now remove the columns that cannot be used for training the models from the DataFrame

In [6]:
# Steps, month, day, hour, year should be removed, they cannot be used for training the models
to_remove = ['steps', 'month', 'year', 'day', 'hour']
for m in to_remove: data = data.drop(m, 1)

In [7]:
from numpy.random import randint

# we add na values at random
my_NIA = 100443936 + 100441714
np.random.seed(my_NIA)

how_many_nas = round(data.shape[0]*data.shape[1]*0.05)
print('Lets put '+str(how_many_nas)+' missing values \n')
x_locations = randint(0, data.shape[0], size=how_many_nas)
y_locations = randint(1, data.shape[1]-2, size=how_many_nas)

for i in range(len(x_locations)):
    data.iat[x_locations[i], y_locations[i]] = np.nan
    
data.to_pickle('wind_pickle_with_nan.pickle')

Lets put 163861 missing values 



From this point on, the file wind_pickle_with_nan should be used.

In [8]:
data = pd.read_pickle('wind_pickle_with_nan.pickle')
data.shape

(5937, 552)

## Input missing data

In [9]:
print(data.isnull().values.any())
input_cols = data.columns.difference(['energy', 'partition'])
x = data[input_cols]
#Iterative imputer (too long)
'''iter_imp = impute.IterativeImputer(random_state=random_state, 
                                   initial_strategy='median', 
                                   max_iter=3,
                                   verbose=verbose)
no_nan = iter_imp.fit_transform(x)'''

#KNN imputer(too long)
'''knn_imp = impute.KNNImputer(weights='distance')
no_nan = knn_imp.fit_transform(x)'''

#Simple imputer
simp_imp = impute.SimpleImputer(strategy='median',
                               verbose=2)
no_nan = simp_imp.fit_transform(x)

data[input_cols] = pd.DataFrame(data=no_nan)
print(data.isnull().values.any())

True
False


## Scaling

In [10]:
scaler = preprocessing.StandardScaler().fit(data[input_cols]) 
data[input_cols] = scaler.transform(data[input_cols])

## Data split
We are going to use train/test for model evaluation (outer) and train/validation for hyperparameter tuning (inner), as follows:     
1. Train partition: the first two years of data. Given that there are 6 years worth of data, we will use the first 2/6 of the instances for training.     
2. Validation partition: the second two years of data. 
3. Test partition: the remaining data    


In [11]:
#-1 for training, 0 for validation, 1 for testing
test = data[data['partition'] == 1]
train = data[data['partition'] != 1]
tr_val_partition = model_selection.PredefinedSplit(train['partition'].tolist())

del test['partition']
del train['partition']

y_test = test['energy']
x_test = test[test.columns.difference(['energy'])]

y_train = train['energy']
x_train = train[train.columns.difference(['energy'])]

Now we convert the training and test sets from Pandas DataFrames to Numpy matrices, so that they can be used by scikit-learn.

In [12]:
mat_x_train = np.matrix(x_train)
mat_y_train = np.matrix(y_train).T
mat_x_test = np.matrix(x_test)
mat_y_test = np.matrix(y_test).T

# 1. MODEL SELECTION AND HYPER-PARAMETER TUNING

In [13]:
#Dataframes with all the information of each model
summary = {
    'knn': pd.DataFrame(columns=['Time (sec)', 'Score (RMSE)', 'N. neighbors', 'Weights', 'P']),
    'random_forest': pd.DataFrame(columns=['Time (sec)', 'Score (RMSE)', 'Min. samples split', 'Criterion', 'Max. depth', 'N. estimators','Max. features'])
}

## 1.1 KNN

### 1.1.1 Default hyper-parameters

In [None]:
np.random.seed(random_state)
knn_default = neighbors.KNeighborsRegressor()

start = time.time()
scores = -model_selection.cross_val_score(knn_default, x_train, y_train,
                                          scoring='neg_root_mean_squared_error',
                                          cv=tr_val_partition,
                                          verbose=verbose) 
end = time.time()

summary['knn'] = summary['knn'].append(pd.Series({
    'Time (sec)': '{:.4f}'.format(end_time - start_time), 
    'Score (RMSE)': scores.mean(), 
    'N. neighbors': 5, 
    'Weights': 'uniform', 
    'P': 2
    }, 
    name='default'))

### 1.1.2 Hyper-parameter tunning (OPTUNA)

In [None]:
np.random.seed(random_state)
def knn_objective(trial):
    n_neighbors = trial.suggest_int('n_neighbors', min_n_neigbors, max_n_neigbors)
    weights = trial.suggest_categorical('weights', ['uniform','distance'])
    p = trial.suggest_categorical('p', [1, 2])

    clf = neighbors.KNeighborsRegressor(
        n_neighbors=n_neighbors,
        weights=weights,
        p=p)

    scores = -model_selection.cross_val_score(clf, x_train, y_train,
        cv=tr_val_partition,
        scoring='neg_root_mean_squared_error')

    return scores.mean()

knn_optuna = optuna.create_study(direction='minimize')
start_time = time.time()
knn_optuna.optimize(knn_objective, n_trials=budget)
end_time = time.time()

summary['knn'] = summary['knn'].append(pd.Series({
    'Time (sec)': '{:.4f}'.format(end_time - start_time), 
    'Score (RMSE)': knn_optuna.best_value, 
    'N. neighbors': knn_optuna.best_params['n_neighbors'], 
    'Weights': knn_optuna.best_params['weights'], 
    'P': knn_optuna.best_params['p']
    }, 
    name='optuna'))

## 1.2 Random Forest

### 1.2.1 Default hyper-parameters

In [14]:
np.random.seed(random_state)
rf_default = ensemble.RandomForestRegressor(random_state=random_state, verbose=verbose)

start = time.time()
scores = -model_selection.cross_val_score(rf_default, x_train, y_train,
                                          scoring='neg_root_mean_squared_error',
                                          cv=tr_val_partition,
                                          verbose=verbose)
end = time.time()

summary['random_forest'] = summary['random_forest'].append(pd.Series({
    'Time (sec)': '{:.4f}'.format(end_time - start_time), 
    'Score (RMSE)': scores.mean(),
    'Min. samples split': 2, 
    'Criterion': 'mse', 
    'Max. depth': 'None',
    'N. estimators': 100,
    'Max. features': 1
    },
    name='default'))

In [16]:
summary['random_forest']

Unnamed: 0,Time (sec),Score (RMSE),Min. samples split,Criterion,Max. depth,N. estimators,Max. features
default,0,375.560721,2,mse,,100,1


### 1.2.2 Hyper-parameter tunning (OPTUNA)

In [15]:
np.random.seed(random_state)
def random_forest_objective(trial):
    min_samples_split = trial.suggest_uniform('min_samples_split', 0+sys.float_info.min, 1)
    criterion = trial.suggest_categorical('criterion', ['mse','friedman_mse'])
    max_depth = trial.suggest_int('max_depth', min_max_depth, max_max_depth)
    n_estimators = trial.suggest_int('n_estimators', min_n_estimators, max_n_estimators)
    max_features = trial.suggest_uniform('max_features', 0+sys.float_info.min, 1)

    clf = ensemble.RandomForestRegressor(
        random_state=random_state,
        min_samples_split=min_samples_split,
        criterion=criterion,
        max_depth=max_depth,
        n_estimators=n_estimators,
        max_features=max_features
        )

    scores = -model_selection.cross_val_score(clf, x_train, y_train,
        cv=tr_val_partition,
        scoring='neg_root_mean_squared_error')

    return scores.mean()

rf_optuna = optuna.create_study(direction='minimize')
start_time = time.time()
rf_optuna.optimize(random_forest_objective, n_trials=budget)
end_time = time.time()

summary['random_forest'] = summary['random_forest'].append(pd.Series({
    'Time (sec)': '{:.4f}'.format(end_time - start_time), 
    'Score (RMSE)': rf_optuna.best_value,
    'Min. samples split': rf_optuna.best_params['min_samples_split'], 
    'Criterion': rf_optuna.best_params['criterion'], 
    'Max. depth': rf_optuna.best_params['max_depth'],
    'N. estimators': rf_optuna.best_params['n_estimators'],
    'Max. features': rf_optuna.best_params['max_features']
    },
    name='optuna'))

[32m[I 2021-01-12 17:03:55,526][0m A new study created in memory with name: no-name-a9f89a39-3ff0-41b3-9e7b-2c21f44990de[0m
[33m[W 2021-01-12 17:03:55,528][0m Trial 0 failed because of the following error: NameError("name 'sys' is not defined")
Traceback (most recent call last):
  File "/usr/local/anaconda3/envs/master/lib/python3.8/site-packages/optuna/_optimize.py", line 189, in _run_trial
    value = func(trial)
  File "<ipython-input-15-a869bb9179aa>", line 3, in random_forest_objective
    min_samples_split = trial.suggest_uniform('min_samples_split', 0+sys.float_info.min, 1)
NameError: name 'sys' is not defined[0m


NameError: name 'sys' is not defined

## 1.3 Gradient Boosting

### 1.3.1 Default hyper-parameters

In [None]:
#code

### 1.3.2 Hyper-parameter tunning

In [None]:
#code

In [None]:


rf = RandomForestRegressor()
# help('sklearn.ensemble.RandomForestRegressor')

gb = GradientBoostingRegressor()
# help('sklearn.ensemble.GradientBoostingRegressor')

# YOU CAN USE ADVANCED IMPLEMTATIONS OF GRADIENT BOOSTING: XGBOOST, LIGHTBOOST, CATBOOST, ...

# 2. ATTRIBUTE SELECTION

You have to answer the following questions: 

- Are all 550 input attributes actually necessary in order to get a good model? Is it possible to have an accurate model that uses fewer than 550 variables? How many? 
- Is it enough to use only the attributes for the actual Sotavento location? (13th location in the grid)

In [None]:
#<USE AS MANY CELLS AS YOU NEED>