In [1]:
import pandas as pd
import numpy as np  


import seaborn as sns
import matplotlib.pyplot as plt


from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
import sklearn


from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import optuna

import xgboost as xgb


In [2]:
solar_train = pd.read_csv('../data/external/solar_train.csv')
solar_train.head()

Unnamed: 0,UNIXTime,Data,Time,Radiation,Temperature,Pressure,Humidity,WindDirection(Degrees),Speed,TimeSunRise,TimeSunSet
0,1475175023,9/29/2016 12:00:00 AM,08:50:23,634.99,61,30.46,41,14.96,6.75,06:13:00,18:13:00
1,1481799902,12/15/2016 12:00:00 AM,01:05:02,1.27,37,30.26,70,207.43,5.62,06:50:00,17:46:00
2,1478339417,11/4/2016 12:00:00 AM,23:50:17,1.21,47,30.49,33,168.2,5.62,06:25:00,17:47:00
3,1472887208,9/2/2016 12:00:00 AM,21:20:08,1.67,54,30.46,101,152.6,3.37,06:07:00,18:37:00
4,1478724901,11/9/2016 12:00:00 AM,10:55:01,839.78,62,30.47,36,291.95,7.87,06:28:00,17:45:00


In [3]:
sol_phys_train = pd.read_csv('../data/interim/solar_phys_time_train.csv')
sol_phys_train = sol_phys_train.drop('Unnamed: 0', axis=1)
sol_phys_train.head()

Unnamed: 0,Radiation,Temperature,Pressure,Humidity,WindDirection(Degrees),Speed,Time,SunTime,DayOfYear,DayOfWeek
0,634.99,289.26,4060.99,41,0.0416,3.02,0.368322,0.5,273,3
1,1.27,275.93,4034.32,70,0.5762,2.51,0.045162,0.455556,350,3
2,1.21,281.48,4064.99,33,0.4672,2.51,0.993252,0.473611,309,4
3,1.67,285.37,4060.99,101,0.4239,1.51,0.888981,0.520833,246,4
4,839.78,289.82,4062.32,36,0.811,3.52,0.454873,0.470139,314,2


In [5]:
#Pressure is irrelevant

sol_phys_train.drop('Pressure', axis=1, inplace=True)


In [6]:

sol_phys_train.head()


Unnamed: 0,Radiation,Temperature,Humidity,WindDirection(Degrees),Speed,Time,SunTime,DayOfYear,DayOfWeek
0,634.99,289.26,41,0.0416,3.02,0.368322,0.5,273,3
1,1.27,275.93,70,0.5762,2.51,0.045162,0.455556,350,3
2,1.21,281.48,33,0.4672,2.51,0.993252,0.473611,309,4
3,1.67,285.37,101,0.4239,1.51,0.888981,0.520833,246,4
4,839.78,289.82,36,0.811,3.52,0.454873,0.470139,314,2


In [8]:
sol_phys_train['WeekOfYear'] = solar_train['Data'].apply(lambda x:  pd.Period(x.split(" ")[0], freq='W').weekofyear)

In [12]:
sol_phys_train['Month'] = solar_train['Data'].apply(lambda x:  pd.Period(x.split(" ")[0], freq='M').month)

In [16]:

sol_phys_train['TimeSunRise'] = pd.to_timedelta(solar_train['TimeSunRise'])/pd.to_timedelta(1, unit='D')
sol_phys_train['TimeSunSet'] = pd.to_timedelta(solar_train['TimeSunSet'])/pd.to_timedelta(1, unit='D')


In [17]:
sol_phys_train.head()

Unnamed: 0,Radiation,Temperature,Humidity,WindDirection(Degrees),Speed,Time,SunTime,DayOfYear,DayOfWeek,WeekOfYear,Month,TimeSunRise,TimeSunSet
0,634.99,289.26,41,0.0416,3.02,0.368322,0.5,273,3,39,9,0.259028,0.759028
1,1.27,275.93,70,0.5762,2.51,0.045162,0.455556,350,3,50,12,0.284722,0.740278
2,1.21,281.48,33,0.4672,2.51,0.993252,0.473611,309,4,44,11,0.267361,0.740972
3,1.67,285.37,101,0.4239,1.51,0.888981,0.520833,246,4,35,9,0.254861,0.775694
4,839.78,289.82,36,0.811,3.52,0.454873,0.470139,314,2,45,11,0.269444,0.739583


In [18]:
sol_phys_train.to_csv('../data/interim/solar_pro1_train.csv')

In [21]:
sol_phys_test = pd.read_csv('../data/interim/solar_phys_time_test.csv')
sol_phys_test = sol_phys_test.drop('Unnamed: 0', axis=1)
sol_phys_test.head()

Unnamed: 0,Temperature,Pressure,Humidity,WindDirection(Degrees),Speed,Time,SunTime,DayOfYear,DayOfWeek
0,288.15,4062.32,44,0.8685,1.51,0.399387,0.470139,314,2
1,288.15,4063.65,83,0.1056,3.02,0.503507,0.510417,260,4
2,281.48,4051.66,78,0.5934,2.51,0.180799,0.490972,284,0
3,280.37,4052.99,98,0.4906,2.01,0.28537,0.45625,346,6
4,280.37,4052.99,34,0.4886,3.02,0.198113,0.479861,300,2


In [22]:
#Pressure is irrelevant

sol_phys_test.drop('Pressure', axis=1, inplace=True)


In [23]:
solar_original_test = pd.read_csv('../data/external/solar_test.csv')
solar_original_test.head()

Unnamed: 0,id,UNIXTime,Data,Time,Temperature,Pressure,Humidity,WindDirection(Degrees),Speed,TimeSunRise,TimeSunSet
0,0,1478720107,11/9/2016 12:00:00 AM,09:35:07,59,30.47,44,312.67,3.37,06:28:00,17:45:00
1,1,1474063503,9/16/2016 12:00:00 AM,12:05:03,59,30.48,83,38.01,6.75,06:10:00,18:25:00
2,2,1476109221,10/10/2016 12:00:00 AM,04:20:21,47,30.39,78,213.62,5.62,06:16:00,18:03:00
3,3,1481475056,12/11/2016 12:00:00 AM,06:50:56,45,30.4,98,176.63,4.5,06:47:00,17:44:00
4,4,1477493117,10/26/2016 12:00:00 AM,04:45:17,45,30.4,34,175.89,6.75,06:21:00,17:52:00


In [24]:

sol_phys_test['WeekOfYear'] = solar_original_test['Data'].apply(lambda x:  pd.Period(x.split(" ")[0], freq='W').weekofyear)
sol_phys_test['Month'] = solar_original_test['Data'].apply(lambda x:  pd.Period(x.split(" ")[0], freq='M').month)
sol_phys_test['TimeSunRise'] = pd.to_timedelta(solar_original_test['TimeSunRise'])/pd.to_timedelta(1, unit='D')
sol_phys_test['TimeSunSet'] = pd.to_timedelta(solar_original_test['TimeSunSet'])/pd.to_timedelta(1, unit='D')


In [26]:
sol_phys_test.head()

Unnamed: 0,Temperature,Humidity,WindDirection(Degrees),Speed,Time,SunTime,DayOfYear,DayOfWeek,WeekOfYear,Month,TimeSunRise,TimeSunSet
0,288.15,44,0.8685,1.51,0.399387,0.470139,314,2,45,11,0.269444,0.739583
1,288.15,83,0.1056,3.02,0.503507,0.510417,260,4,37,9,0.256944,0.767361
2,281.48,78,0.5934,2.51,0.180799,0.490972,284,0,41,10,0.261111,0.752083
3,280.37,98,0.4906,2.01,0.28537,0.45625,346,6,49,12,0.282639,0.738889
4,280.37,34,0.4886,3.02,0.198113,0.479861,300,2,43,10,0.264583,0.744444


In [27]:
sol_phys_test.to_csv('../data/interim/solar_pro1_tes.csv')

In [28]:
X = sol_phys_train[[c for c in sol_phys_train if c != 'Radiation']].values
y = sol_phys_train[['Radiation']].values

In [29]:
X_train, X_val, y_train, y_val = sklearn.model_selection.train_test_split(X, y,random_state=0, test_size=0.30)

In [30]:
def objective_rm(trial):
# Invoke suggest methods of a Trial object to generate hyperparameters.

    rf_max_depth = trial.suggest_int('rf_max_depth', 2, 64)
    rf_max_estimators = trial.suggest_int('n_estimators', 1, 3000)
        
    
    model = RandomForestRegressor(max_depth=rf_max_depth, criterion='mse', n_estimators = rf_max_estimators)
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_val)
    error = sklearn.metrics.mean_squared_error(y_val, y_pred)
    
    
    return error # An objective value linked with the Trial object.

In [31]:
study_rm = optuna.create_study()
study_rm.optimize(objective_rm, n_trials=100)
study_rm.best_params


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().

[I 2020-05-05 22:22:27,923] Finished trial#0 with value: 6971.250833098634 with parameters: {'rf_max_depth': 16, 'n_estimators': 2502}. Best is trial#0 with value: 6971.250833098634.

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().

[I 2020-05-05 22:26:02,310] Finished trial#1 with value: 6946.910960739936 with parameters: {'rf_max_depth': 36, 'n_estimators': 2045}. Best is trial#1 with value: 6946.910960739936.

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().

[I 2020-05-05 22:28:55,774] Finished trial#2 with value: 10885.166879204642 with parameters: {'rf_max_depth': 8, 'n_estimators': 2773}. Best is trial#1 with value: 6946.910960739936.

A column-vector y was passed when a 1d array was

KeyboardInterrupt: 