In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from scipy import stats
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Lasso, Ridge
from sklearn import neighbors
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from statsmodels.stats.outliers_influence import variance_inflation_factor
import warnings
warnings.simplefilter('ignore')
from sklearn.neural_network import MLPRegressor
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Creates datetime data

In [2]:
# Creates date_cet, date, month, day_of_week and time features
df_datetime = pd.date_range(start='01/01/2015 00:00:00', end='12/31/2019 23:00:00',freq='H', tz='Europe/Berlin').to_series()
date = df_datetime.dt.date
month = df_datetime.dt.month
day_of_week = df_datetime.dt.dayofweek
time = df_datetime.dt.hour
df_datetime = pd.DataFrame(df_datetime,columns = ['Date_cet']).reset_index().drop("index", axis=1)

In [3]:
def create_datetime(data,name):
    data=data.values.tolist()
    data=pd.DataFrame(data,columns = [name])
    return data

In [4]:
date = create_datetime(date,'Date')
month = create_datetime(month,'Month')
day_of_week = create_datetime(day_of_week,'Day_of_week')
time = create_datetime(time,'Time')

In [5]:
# Merges data
df_date = pd.concat([df_datetime,date['Date'],month['Month'],day_of_week['Day_of_week'],time['Time']],axis=1)
df_date.head()

Unnamed: 0,Date_cet,Date,Month,Day_of_week,Time
0,2015-01-01 00:00:00+01:00,2015-01-01,1,3,0
1,2015-01-01 01:00:00+01:00,2015-01-01,1,3,1
2,2015-01-01 02:00:00+01:00,2015-01-01,1,3,2
3,2015-01-01 03:00:00+01:00,2015-01-01,1,3,3
4,2015-01-01 04:00:00+01:00,2015-01-01,1,3,4


In [6]:
# Creates weekend
day_of_week_dummy = pd.get_dummies(df_date['Day_of_week']).astype(int)
day_of_week_dummy['Weekend'] = day_of_week_dummy[5] + day_of_week_dummy[6]
df_date = pd.concat([df_date, day_of_week_dummy['Weekend']],axis=1)
df_date.head()

Unnamed: 0,Date_cet,Date,Month,Day_of_week,Time,Weekend
0,2015-01-01 00:00:00+01:00,2015-01-01,1,3,0,0
1,2015-01-01 01:00:00+01:00,2015-01-01,1,3,1,0
2,2015-01-01 02:00:00+01:00,2015-01-01,1,3,2,0
3,2015-01-01 03:00:00+01:00,2015-01-01,1,3,3,0
4,2015-01-01 04:00:00+01:00,2015-01-01,1,3,4,0


In [7]:
# Creates publich holidays in Germany
import holidays
def get_holidays(year):
    holidays_date = []
    for date in sorted(holidays.DE(years=year).items()):
        holidays_date.append(str(date[0]))
    return holidays_date

year = [2015,2016,2017,2018,2019]
holidays_list = get_holidays(year)
holidays_list = '|'.join(holidays_list)
df_date['Date'] = df_date['Date'].apply(str)
holidays = df_date['Date'].str.contains(holidays_list)
holidays *= 1 # True = 1, False = 0
holidays=holidays.rename("Public_holidays")
df_date = pd.concat([df_date,holidays],axis=1)
df_date.head()

Unnamed: 0,Date_cet,Date,Month,Day_of_week,Time,Weekend,Public_holidays
0,2015-01-01 00:00:00+01:00,2015-01-01,1,3,0,0,1
1,2015-01-01 01:00:00+01:00,2015-01-01,1,3,1,0,1
2,2015-01-01 02:00:00+01:00,2015-01-01,1,3,2,0,1
3,2015-01-01 03:00:00+01:00,2015-01-01,1,3,3,0,1
4,2015-01-01 04:00:00+01:00,2015-01-01,1,3,4,0,1


In [8]:
# Creat weekend with holidays
df_date['Weekend_with_holidays'] = df_date['Weekend'] + df_date['Public_holidays']
df_date["Weekend_with_holidays"].replace({2: 1}, inplace=True)
df_date.drop(['Weekend', 'Public_holidays'], axis=1, inplace=True)
df_date.head()

Unnamed: 0,Date_cet,Date,Month,Day_of_week,Time,Weekend_with_holidays
0,2015-01-01 00:00:00+01:00,2015-01-01,1,3,0,1
1,2015-01-01 01:00:00+01:00,2015-01-01,1,3,1,1
2,2015-01-01 02:00:00+01:00,2015-01-01,1,3,2,1
3,2015-01-01 03:00:00+01:00,2015-01-01,1,3,3,1
4,2015-01-01 04:00:00+01:00,2015-01-01,1,3,4,1


In [9]:
# Creates seasons
df_date['Month'] = df_date['Month'].apply(str)
df_date['Seasons'] = df_date['Month']
df_date.loc[df_date['Seasons'].str.contains('3|4|5'),'Seasons'] = '0' # Spring
df_date.loc[df_date['Seasons'].str.contains('6|7|8'),'Seasons'] = '1' # Summer
df_date.loc[df_date['Seasons'].str.contains('9|10|11'),'Seasons'] = '2' # Autumn
df_date.loc[df_date['Seasons'].str.contains('12|1|2'),'Seasons'] = '3' # Winter
df_date[['Seasons','Month']] = df_date[['Seasons','Month']].astype(int)
df_date['Date_cet'] = df_date['Date_cet'].apply(str)
df_date.head()

Unnamed: 0,Date_cet,Date,Month,Day_of_week,Time,Weekend_with_holidays,Seasons
0,2015-01-01 00:00:00+01:00,2015-01-01,1,3,0,1,3
1,2015-01-01 01:00:00+01:00,2015-01-01,1,3,1,1,3
2,2015-01-01 02:00:00+01:00,2015-01-01,1,3,2,1,3
3,2015-01-01 03:00:00+01:00,2015-01-01,1,3,3,1,3
4,2015-01-01 04:00:00+01:00,2015-01-01,1,3,4,1,3


In [10]:
df_date.dtypes

Date_cet                 object
Date                     object
Month                     int64
Day_of_week               int64
Time                      int64
Weekend_with_holidays     int64
Seasons                   int64
dtype: object

# Data Cleansing

In [11]:
# Imports data
df = pd.read_csv('German_data.csv')
df.shape

(43824, 20)

In [12]:
# Shows features
list(df)

['Date_CET',
 'Day-ahead[Euro/MWh]',
 'Load_actual[MWh]',
 'Load_forecast[MWh]',
 'Fossil_gas_actual[MWh]',
 'Fossil_brown_coal_actual[MWh]',
 'Fossil_hard_coal_actual[MWh]',
 'Nuclear_actual[MWh]',
 'Other_conventional_actual[MWh]',
 'Hydro_pumped_storage_actual[MWh]',
 'Hydropower_actual[MWh]',
 'Biomass_actual[MWh]',
 'Photovoltaics_actual[MWh]',
 'Wind_onshore_actual[MWh]',
 'Wind_offshore_actual[MWh]',
 'Other_renewable_actual[MWh]',
 'Photovoltaics_forecast[MWh]',
 'Wind_onshore_forecast[MWh]',
 'Wind_offshore_forecast[MWh]',
 'Other_forecast[MWh]']

In [13]:
# Drops nan
df_drop_nan = df.dropna()
df_drop_nan.shape

(42719, 20)

In [14]:
# Creates list of features without date
all_list = list(df)
all_list_without_data_price = all_list.copy()
all_list_without_data_price.remove('Date_CET')
all_list_without_data_price.remove('Day-ahead[Euro/MWh]')

In [15]:
# Handles skewed data
df_drop_nan_skewed = df_drop_nan.copy()

list_skewed = [] # list of skewed data features
for i in all_list_without_data_price:
    s=round(df_drop_nan_skewed[i].skew(),3)
    if s > 0.5 or s < -0.5:
        list_skewed.append(i)
        pass

ld = [] # Ceate list of lambda which requires reversing values
for i in list_skewed:
    df_drop_nan_skewed[i],lmbda = stats.boxcox(1+df_drop_nan_skewed[i])
    ld.append(lmbda)

In [16]:
# Drops outliers which is out of ±3 z-score
df_drop_nan_skewed_outliers = df_drop_nan_skewed.copy()

for i in all_list_without_data_price:
    sd = 3
    upper_lim = df_drop_nan_skewed_outliers[i].mean() + df_drop_nan_skewed_outliers[i].std() * sd
    lower_lim = df_drop_nan_skewed_outliers[i].mean() - df_drop_nan_skewed_outliers[i].std() * sd
    df_drop_nan_skewed_outliers = df_drop_nan_skewed_outliers[(df_drop_nan_skewed_outliers[i] < upper_lim) & (df_drop_nan_skewed_outliers[i] > lower_lim)]
    
df_drop_nan_skewed_outliers.shape

(42643, 20)

In [17]:
X = df_drop_nan_skewed_outliers[all_list_without_data_price]

In [18]:
# Normalization of the data
X_array = X.values
min_max_scaler = preprocessing.MinMaxScaler()
normalized_X = min_max_scaler.fit_transform(X_array)
normalized_X = pd.DataFrame(normalized_X)
normalized_X.rename(columns={i:j for i,j in zip(list(range(len(all_list_without_data_price))),all_list_without_data_price)}, inplace=True)

In [19]:
# Creates Date_CET and Day-ahead data
df_date_day_ahead = df_drop_nan_skewed_outliers.copy()
df_date_day_ahead = df_date_day_ahead[['Date_CET','Day-ahead[Euro/MWh]']].reset_index()

In [20]:
# Merges normalized data, and Date_CET and Day-ahead data 
df_normalized_data = pd.concat([df_date_day_ahead,normalized_X],axis=1).drop("index", axis=1)

In [21]:
# Standardization of the data
standard_scaler = preprocessing.StandardScaler()
standard_X = standard_scaler.fit_transform(X_array)
standard_X = pd.DataFrame(standard_X)
standard_X.rename(columns={i:j for i,j in zip(list(range(len(all_list_without_data_price))),all_list_without_data_price)}, inplace=True)

In [22]:
# Merges Standardized data, and Date_CET and Day-ahead data 
df_standardized_data = pd.concat([df_date_day_ahead,standard_X],axis=1).drop("index", axis=1)

In [23]:
# Merges German data and date_data
df_normalized_data_with_date = pd.merge(df_normalized_data, df_date, left_on='Date_CET', right_on='Date_cet', how='left').drop("Date_cet", axis=1)
df_standardized_data_with_date = pd.merge(df_standardized_data, df_date, left_on='Date_CET', right_on='Date_cet', how='left').drop("Date_cet", axis=1)

In [24]:
pd.set_option('display.max_columns', 100)

In [25]:
print(df_normalized_data_with_date.shape)
print(df_normalized_data_with_date.dtypes)
df_normalized_data_with_date.head()

(42643, 26)
Date_CET                             object
Day-ahead[Euro/MWh]                 float64
Load_actual[MWh]                    float64
Load_forecast[MWh]                  float64
Fossil_gas_actual[MWh]              float64
Fossil_brown_coal_actual[MWh]       float64
Fossil_hard_coal_actual[MWh]        float64
Nuclear_actual[MWh]                 float64
Other_conventional_actual[MWh]      float64
Hydro_pumped_storage_actual[MWh]    float64
Hydropower_actual[MWh]              float64
Biomass_actual[MWh]                 float64
Photovoltaics_actual[MWh]           float64
Wind_onshore_actual[MWh]            float64
Wind_offshore_actual[MWh]           float64
Other_renewable_actual[MWh]         float64
Photovoltaics_forecast[MWh]         float64
Wind_onshore_forecast[MWh]          float64
Wind_offshore_forecast[MWh]         float64
Other_forecast[MWh]                 float64
Date                                 object
Month                                 int64
Day_of_week         

Unnamed: 0,Date_CET,Day-ahead[Euro/MWh],Load_actual[MWh],Load_forecast[MWh],Fossil_gas_actual[MWh],Fossil_brown_coal_actual[MWh],Fossil_hard_coal_actual[MWh],Nuclear_actual[MWh],Other_conventional_actual[MWh],Hydro_pumped_storage_actual[MWh],Hydropower_actual[MWh],Biomass_actual[MWh],Photovoltaics_actual[MWh],Wind_onshore_actual[MWh],Wind_offshore_actual[MWh],Other_renewable_actual[MWh],Photovoltaics_forecast[MWh],Wind_onshore_forecast[MWh],Wind_offshore_forecast[MWh],Other_forecast[MWh],Date,Month,Day_of_week,Time,Weekend_with_holidays,Seasons
0,2015-01-01 00:00:00+01:00,25.02,0.223536,0.283125,0.393846,0.508969,0.125403,0.893373,0.577126,0.567658,0.181682,0.198779,0.0,0.587724,0.331501,0.428571,0.0,0.578877,0.232431,0.461136,2015-01-01,1,3,0,1,3
1,2015-01-01 01:00:00+01:00,18.29,0.193462,0.236697,0.248136,0.47453,0.08359,0.946196,0.584182,0.486191,0.202566,0.179778,0.0,0.592219,0.331218,0.377551,0.0,0.592732,0.233109,0.433921,2015-01-01,1,3,1,1,3
2,2015-01-01 02:00:00+01:00,16.04,0.169196,0.212131,0.205872,0.429401,0.078345,0.937649,0.588437,0.430032,0.168627,0.196012,0.0,0.598508,0.330651,0.229592,0.0,0.596441,0.233109,0.410048,2015-01-01,1,3,2,1,3
3,2015-01-01 03:00:00+01:00,14.6,0.148753,0.205272,0.206357,0.369037,0.084913,0.938069,0.586918,0.236484,0.156775,0.206242,0.0,0.598711,0.332066,0.193878,0.0,0.598856,0.233109,0.379523,2015-01-01,1,3,3,1,3
4,2015-01-01 04:00:00+01:00,14.95,0.140692,0.208537,0.206841,0.371831,0.08756,0.928822,0.588025,0.300635,0.15114,0.204833,0.0,0.601095,0.332911,0.183673,0.0,0.60327,0.233671,0.377868,2015-01-01,1,3,4,1,3


In [26]:
df_standardized_data_with_date.head()

Unnamed: 0,Date_CET,Day-ahead[Euro/MWh],Load_actual[MWh],Load_forecast[MWh],Fossil_gas_actual[MWh],Fossil_brown_coal_actual[MWh],Fossil_hard_coal_actual[MWh],Nuclear_actual[MWh],Other_conventional_actual[MWh],Hydro_pumped_storage_actual[MWh],Hydropower_actual[MWh],Biomass_actual[MWh],Photovoltaics_actual[MWh],Wind_onshore_actual[MWh],Wind_offshore_actual[MWh],Other_renewable_actual[MWh],Photovoltaics_forecast[MWh],Wind_onshore_forecast[MWh],Wind_offshore_forecast[MWh],Other_forecast[MWh],Date,Month,Day_of_week,Time,Weekend_with_holidays,Seasons
0,2015-01-01 00:00:00+01:00,25.02,-1.369105,-1.354341,-0.849546,0.439218,-1.041157,1.395948,0.541887,0.593371,-1.747319,-1.316276,-1.053717,0.167324,-0.84931,-0.453125,-1.06133,0.080495,-1.354076,-0.279063,2015-01-01,1,3,0,1,3
1,2015-01-01 01:00:00+01:00,18.29,-1.51196,-1.584673,-1.609962,0.26106,-1.212162,1.65491,0.583124,0.194055,-1.636335,-1.398474,-1.053717,0.191229,-0.850609,-0.708165,-1.06133,0.154065,-1.350967,-0.429549,2015-01-01,1,3,1,1,3
2,2015-01-01 02:00:00+01:00,16.04,-1.627227,-1.706543,-1.830526,0.027594,-1.233613,1.613009,0.607992,-0.081215,-1.816698,-1.328243,-1.053717,0.224679,-0.853212,-1.447781,-1.06133,0.173757,-1.350967,-0.56155,2015-01-01,1,3,2,1,3
3,2015-01-01 03:00:00+01:00,14.6,-1.724334,-1.740572,-1.827996,-0.284682,-1.206749,1.61507,0.599119,-1.029911,-1.879682,-1.283992,-1.053717,0.225763,-0.846715,-1.626308,-1.06133,0.186581,-1.350967,-0.730333,2015-01-01,1,3,3,1,3
4,2015-01-01 04:00:00+01:00,14.95,-1.762623,-1.724374,-1.825471,-0.270226,-1.195924,1.569734,0.605584,-0.715469,-1.909625,-1.290086,-1.053717,0.23844,-0.842835,-1.677316,-1.06133,0.210017,-1.348385,-0.739482,2015-01-01,1,3,4,1,3


In [27]:
# Splits into training data and test data
nor_train, nor_test = train_test_split(df_normalized_data_with_date, test_size=0.2, random_state=0)
sta_train, sta_test = train_test_split(df_standardized_data_with_date, test_size=0.2, random_state=0)

In [28]:
# Creates independent variables lists
new_all_list = list(sta_train)
new_all_list.remove('Date_CET')
new_all_list.remove('Date')
new_all_list.remove('Day-ahead[Euro/MWh]')

In [29]:
# Test data of independent variables
X_test_nor = nor_test[new_all_list]
X_test_sta = sta_test[new_all_list]

# Test data of dependent variable
y_test_nor = nor_test['Day-ahead[Euro/MWh]']
y_test_sta = sta_test['Day-ahead[Euro/MWh]']

# Training data of independent variables
X_nor = nor_train[new_all_list]
X_sta = sta_train[new_all_list]

# Training data of dependent variable
y_nor = nor_train['Day-ahead[Euro/MWh]']
y_sta = sta_train['Day-ahead[Euro/MWh]']

In [30]:
print('X_test_nor:',X_test_nor.shape)
print('X_test_sta:',X_test_sta.shape)
print('y_test_nor:',y_test_nor.shape)
print('y_test_sta:',y_test_sta.shape)
print('X_nor:',X_nor.shape)
print('X_sta:',X_sta.shape)
print('y_nor:',y_nor.shape)
print('y_sta:',y_sta.shape)

X_test_nor: (8529, 23)
X_test_sta: (8529, 23)
y_test_nor: (8529,)
y_test_sta: (8529,)
X_nor: (34114, 23)
X_sta: (34114, 23)
y_nor: (34114,)
y_sta: (34114,)


In [31]:
X_sta.dtypes

Load_actual[MWh]                    float64
Load_forecast[MWh]                  float64
Fossil_gas_actual[MWh]              float64
Fossil_brown_coal_actual[MWh]       float64
Fossil_hard_coal_actual[MWh]        float64
Nuclear_actual[MWh]                 float64
Other_conventional_actual[MWh]      float64
Hydro_pumped_storage_actual[MWh]    float64
Hydropower_actual[MWh]              float64
Biomass_actual[MWh]                 float64
Photovoltaics_actual[MWh]           float64
Wind_onshore_actual[MWh]            float64
Wind_offshore_actual[MWh]           float64
Other_renewable_actual[MWh]         float64
Photovoltaics_forecast[MWh]         float64
Wind_onshore_forecast[MWh]          float64
Wind_offshore_forecast[MWh]         float64
Other_forecast[MWh]                 float64
Month                                 int64
Day_of_week                           int64
Time                                  int64
Weekend_with_holidays                 int64
Seasons                         

# Feature selection 

In [32]:
# Variance Inflation Factor (VIF)
# If 10 > VIF, it could be multicollineariy
vif = pd.DataFrame()
vif["features"] = X_sta.columns
vif["VIF Factor"] = [round(variance_inflation_factor(X_sta.values, i),2) for i in range(X_sta.shape[1])]
vif

Unnamed: 0,features,VIF Factor
0,Load_actual[MWh],31.06
1,Load_forecast[MWh],31.0
2,Fossil_gas_actual[MWh],3.1
3,Fossil_brown_coal_actual[MWh],2.71
4,Fossil_hard_coal_actual[MWh],5.82
5,Nuclear_actual[MWh],2.0
6,Other_conventional_actual[MWh],2.8
7,Hydro_pumped_storage_actual[MWh],1.65
8,Hydropower_actual[MWh],1.66
9,Biomass_actual[MWh],2.63


In [33]:
# Creates list of droppig features as background knowledge 
# If forecast data exists, the actual data removes
remove_list = list(['Load_actual[MWh]','Photovoltaics_actual[MWh]', 'Wind_onshore_actual[MWh]',
                    'Wind_offshore_actual[MWh]', 'Other_renewable_actual[MWh]'])

In [34]:
# Shows VIF without above remove_list features
vif = pd.DataFrame()
vif["features"] = X_sta.drop(remove_list,axis=1).columns
vif["VIF Factor"] = [round(variance_inflation_factor(X_sta.drop(remove_list,axis=1).values, i),2) for i in range(X_sta.drop(remove_list,axis=1).shape[1])]
vif

Unnamed: 0,features,VIF Factor
0,Load_forecast[MWh],9.29
1,Fossil_gas_actual[MWh],3.03
2,Fossil_brown_coal_actual[MWh],2.6
3,Fossil_hard_coal_actual[MWh],5.33
4,Nuclear_actual[MWh],1.92
5,Other_conventional_actual[MWh],2.73
6,Hydro_pumped_storage_actual[MWh],1.6
7,Hydropower_actual[MWh],1.6
8,Biomass_actual[MWh],1.8
9,Photovoltaics_forecast[MWh],2.73


In [35]:
remove_list2 = list(['Load_actual[MWh]','Photovoltaics_actual[MWh]', 'Wind_onshore_actual[MWh]',
                    'Wind_offshore_actual[MWh]', 'Other_renewable_actual[MWh]','Other_forecast[MWh]'])

In [36]:
# Shows VIF without above remove_list2 features
vif = pd.DataFrame()
vif["features"] = X_sta.drop(remove_list2,axis=1).columns
vif["VIF Factor"] = [round(variance_inflation_factor(X_sta.drop(remove_list2,axis=1).values, i),2) for i in range(X_sta.drop(remove_list2,axis=1).shape[1])]
vif

Unnamed: 0,features,VIF Factor
0,Load_forecast[MWh],6.1
1,Fossil_gas_actual[MWh],2.97
2,Fossil_brown_coal_actual[MWh],2.26
3,Fossil_hard_coal_actual[MWh],3.75
4,Nuclear_actual[MWh],1.46
5,Other_conventional_actual[MWh],2.29
6,Hydro_pumped_storage_actual[MWh],1.59
7,Hydropower_actual[MWh],1.6
8,Biomass_actual[MWh],1.79
9,Photovoltaics_forecast[MWh],2.67


# Applying machine learning models

In [37]:
# Calculate ML model's Evaluation
def model_score(X,y,model):
    kf = KFold(n_splits=5, shuffle=True, random_state=0)
    score_adjusted_r2 = []
    score_rmse = []
    for train_index, val_index in kf.split(X):
        X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
        y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]
        model.fit(X_train_fold, y_train_fold)
        y_pred = model.predict(X_val_fold)
        r2_score_result = r2_score(y_val_fold, y_pred)
        adjusted_r2_score_result = 1 - (1-r2_score_result)*(len(y_val_fold)-1)/(len(y_val_fold)-X_train_fold.shape[1]-1)
        rmse_score_result = np.sqrt(mean_squared_error(y_val_fold, y_pred))
        score_adjusted_r2.append(adjusted_r2_score_result)
        score_rmse.append(rmse_score_result)
    avg_adjusted_r2_score = round(np.mean(score_adjusted_r2),4)
    avg_rmse_score = round(np.mean(score_rmse),4)
    print('Adj_R^2:', avg_adjusted_r2_score, '\nRMSE:', avg_rmse_score)

## Linear regression

In [38]:
%%time
# Apply ML with all features
model = linear_model.LinearRegression()
print('-----Normalization-----')
model_score(X_nor,y_nor,model)
print('-----Standardization-----')
model_score(X_sta,y_sta,model)

-----Normalization-----
Adj_R^2: 0.6682 
RMSE: 9.2693
-----Standardization-----
Adj_R^2: 0.6682 
RMSE: 9.2693
CPU times: user 461 ms, sys: 29.3 ms, total: 490 ms
Wall time: 277 ms


In [39]:
%%time
# Apply ML without remove_list features
model = linear_model.LinearRegression()
print('-----Normalization-----')
model_score(X_nor.drop(remove_list,axis=1),y_nor,model)
print('-----Standardization-----')
model_score(X_sta.drop(remove_list,axis=1),y_sta,model)

-----Normalization-----
Adj_R^2: 0.664 
RMSE: 9.331
-----Standardization-----
Adj_R^2: 0.664 
RMSE: 9.331
CPU times: user 373 ms, sys: 22.7 ms, total: 396 ms
Wall time: 219 ms


In [40]:
%%time
# Apply ML without remove_list2 features
model = linear_model.LinearRegression()
print('-----Normalization-----')
model_score(X_nor.drop(remove_list2,axis=1),y_nor,model)
print('-----Standardization-----')
model_score(X_sta.drop(remove_list2,axis=1),y_sta,model)

-----Normalization-----
Adj_R^2: 0.6633 
RMSE: 9.3415
-----Standardization-----
Adj_R^2: 0.6633 
RMSE: 9.3415
CPU times: user 339 ms, sys: 20.2 ms, total: 359 ms
Wall time: 199 ms


Adj_R^2 and RMSE did not change with all features and without remove_list2 features. Thus, data without remove_list2 features are applied to other MLs.

## LASSO regression

In [41]:
%%time
# Apply ML without b features after tuning
model = linear_model.Lasso()
print('-----Normalization-----')
model_score(X_nor.drop(remove_list2,axis=1),y_nor,model)
print('-----Standardization-----')
model_score(X_sta.drop(remove_list2,axis=1),y_sta,model)

-----Normalization-----
Adj_R^2: 0.398 
RMSE: 12.4923
-----Standardization-----
Adj_R^2: 0.6279 
RMSE: 9.8208
CPU times: user 405 ms, sys: 23.5 ms, total: 429 ms
Wall time: 236 ms


In [42]:
# Tuning parameters/ finding the best configurations
X_test = X_sta.drop(remove_list2,axis=1).copy()
y_test = y_sta.copy()
X_train, X_test, Y_train, Y_test = train_test_split(X_test, y_test, train_size = 0.7, test_size = 0.3, random_state = 0)
    
def parameters_grid(parameter,model):
    param_grid = parameter
    gs = GridSearchCV(model, param_grid)
    grid_result = gs.fit(X_train,Y_train)
    print('Best Params: ', grid_result.best_params_)

In [43]:
parameter = {'alpha':[0.001, 0.01, 0.1, 1, 10, 100, 1000]}
model = linear_model.Lasso()
parameters_grid(parameter,model)

Best Params:  {'alpha': 0.001}


In [44]:
%%time
# Apply ML without b features after tuning
model = linear_model.Lasso(alpha=0.001)
print('-----Normalization-----')
model_score(X_nor.drop(remove_list2,axis=1),y_nor,model)
print('-----Standardization-----')
model_score(X_sta.drop(remove_list2,axis=1),y_sta,model)

-----Normalization-----
Adj_R^2: 0.6633 
RMSE: 9.3415
-----Standardization-----
Adj_R^2: 0.6633 
RMSE: 9.3415
CPU times: user 723 ms, sys: 24.7 ms, total: 748 ms
Wall time: 398 ms


## Ridge regression

In [45]:
%%time
# Apply ML without b features after tuning
model = linear_model.Ridge()
print('-----Normalization-----')
model_score(X_nor.drop(remove_list2,axis=1),y_nor,model)
print('-----Standardization-----')
model_score(X_sta.drop(remove_list2,axis=1),y_sta,model)

-----Normalization-----
Adj_R^2: 0.6633 
RMSE: 9.3415
-----Standardization-----
Adj_R^2: 0.6633 
RMSE: 9.3415
CPU times: user 274 ms, sys: 19.2 ms, total: 294 ms
Wall time: 162 ms


In [46]:
parameter = {'alpha':[0.001, 0.01, 0.1, 1, 10, 100, 1000]}
model = linear_model.Ridge()
parameters_grid(parameter,model)

Best Params:  {'alpha': 10}


In [47]:
%%time
# Apply ML without b features after tuning
model = linear_model.Ridge(alpha=10)
print('-----Normalization-----')
model_score(X_nor.drop(remove_list2,axis=1),y_nor,model)
print('-----Standardization-----')
model_score(X_sta.drop(remove_list2,axis=1),y_sta,model)

-----Normalization-----
Adj_R^2: 0.6633 
RMSE: 9.342
-----Standardization-----
Adj_R^2: 0.6633 
RMSE: 9.3415
CPU times: user 258 ms, sys: 19.1 ms, total: 277 ms
Wall time: 156 ms


## Polynomial regression

In [48]:
# Finds the best degrees for polynomial regression
for i in range(1,4):
    polynomial_features = PolynomialFeatures(degree = i)
    x_poly = polynomial_features.fit_transform(X_train)
    x_poly_test = polynomial_features.fit_transform(X_test)
    model = linear_model.LinearRegression()
    model.fit(x_poly, Y_train)
    y_poly_pred = model.predict(x_poly_test)
    r2 = r2_score(Y_test,y_poly_pred)
    print('r^2 for degree =',i,'is',r2)

r^2 for degree = 1 is 0.6656919279533051
r^2 for degree = 2 is 0.79086170481622
r^2 for degree = 3 is 0.8748745548535118


In [49]:
def poly_score(X,y,i):
    polynomial_features= PolynomialFeatures(degree=i)
    x_poly = polynomial_features.fit_transform(X)
    x_poly = pd.DataFrame(x_poly)
    model = linear_model.LinearRegression()
    model_score(x_poly,y,model)

In [50]:
%%time
print('-----Normalization-----')
poly_score(X_nor.drop(remove_list2,axis=1),y_nor,2)
print('-----Standardization-----')
poly_score(X_sta.drop(remove_list2,axis=1),y_sta,2)

-----Normalization-----
Adj_R^2: 0.7835 
RMSE: 7.4052
-----Standardization-----
Adj_R^2: 0.7835 
RMSE: 7.4052
CPU times: user 5.28 s, sys: 312 ms, total: 5.59 s
Wall time: 3.06 s


In [51]:
%%time
print('-----Normalization-----')
poly_score(X_nor.drop(remove_list2,axis=1),y_nor,3)
print('-----Standardization-----')
poly_score(X_sta.drop(remove_list2,axis=1),y_sta,3)

-----Normalization-----
Adj_R^2: 0.8516 
RMSE: 5.6663
-----Standardization-----
Adj_R^2: 0.8516 
RMSE: 5.6663
CPU times: user 1min 34s, sys: 4.14 s, total: 1min 38s
Wall time: 58.8 s


In [52]:
# Tuning parameters/ finding the best configurations
def PolynomialRegression(degree=2, **kwargs):
    return make_pipeline(PolynomialFeatures(degree), linear_model.LinearRegression(**kwargs))
param_grid = {'polynomialfeatures__interaction_only':[True, False],
              'polynomialfeatures__include_bias':[True, False],'polynomialfeatures__order':['C','F'],
              'linearregression__fit_intercept': [True, False], 'linearregression__normalize': [True, False]}
poly_grid = GridSearchCV(PolynomialRegression(), param_grid, cv=3)
grid_result = poly_grid.fit(X_train, Y_train)
print('Best Params: ', grid_result.best_params_)

Best Params:  {'linearregression__fit_intercept': True, 'linearregression__normalize': True, 'polynomialfeatures__include_bias': False, 'polynomialfeatures__interaction_only': False, 'polynomialfeatures__order': 'C'}


In [53]:
def poly_score_with_tuning(X,y,degree):
    polynomial_features= PolynomialFeatures(degree=degree, interaction_only=False, include_bias=False, order='C')
    x_poly = polynomial_features.fit_transform(X)
    x_poly = pd.DataFrame(x_poly)
    model = linear_model.LinearRegression(fit_intercept=True, normalize=True) # these parameters are default
    model_score(x_poly,y,model)

In [54]:
%%time
print('-----Normalization-----')
poly_score_with_tuning(X_nor.drop(remove_list2,axis=1),y_nor,2)
print('-----Standardization-----')
poly_score_with_tuning(X_sta.drop(remove_list2,axis=1),y_sta,2)

-----Normalization-----
Adj_R^2: 0.7835 
RMSE: 7.4052
-----Standardization-----
Adj_R^2: 0.7835 
RMSE: 7.4053
CPU times: user 5.55 s, sys: 308 ms, total: 5.86 s
Wall time: 3.11 s


In [55]:
%%time
print('-----Normalization-----')
poly_score_with_tuning(X_nor.drop(remove_list2,axis=1),y_nor,3)
print('-----Standardization-----')
poly_score_with_tuning(X_sta.drop(remove_list2,axis=1),y_sta,3)

-----Normalization-----
Adj_R^2: 0.8516 
RMSE: 5.6664
-----Standardization-----
Adj_R^2: 0.8516 
RMSE: 5.6664
CPU times: user 1min 32s, sys: 4.08 s, total: 1min 36s
Wall time: 56.4 s


In [56]:
def poly_score_with_tuning2(X,y,degree):
    polynomial_features= PolynomialFeatures(degree=degree, interaction_only=True, include_bias=True, order='C')
    x_poly = polynomial_features.fit_transform(X)
    x_poly = pd.DataFrame(x_poly)
    model = linear_model.LinearRegression(fit_intercept=True, normalize=True) # these parameters are default
    model_score(x_poly,y,model)
    print(len(model.coef_))

In [57]:
%%time
print('-----Normalization-----')
poly_score_with_tuning2(X_nor.drop(remove_list2,axis=1),y_nor,2)
print('-----Standardization-----')
poly_score_with_tuning2(X_sta.drop(remove_list2,axis=1),y_sta,2)

-----Normalization-----
Adj_R^2: 0.7698 
RMSE: 7.6453
-----Standardization-----
Adj_R^2: 0.7698 
RMSE: 7.6453
CPU times: user 4.89 s, sys: 183 ms, total: 5.07 s
Wall time: 2.69 s


In [58]:
%%time
print('-----Normalization-----')
poly_score_with_tuning2(X_nor.drop(remove_list2,axis=1),y_nor,3)
print('-----Standardization-----')
poly_score_with_tuning2(X_sta.drop(remove_list2,axis=1),y_sta,3)

-----Normalization-----
Adj_R^2: 0.8235 
RMSE: 6.3437
-----Standardization-----
Adj_R^2: 0.8235 
RMSE: 6.3437
CPU times: user 55.1 s, sys: 2.8 s, total: 57.9 s
Wall time: 34.4 s


## K-nearest neighbor regression

In [59]:
# Find the best k value
score_val = []
for k in range(1,11):
    model = neighbors.KNeighborsRegressor(n_neighbors = k)
    model.fit(X_train, Y_train) 
    Y_pred=model.predict(X_test) 
    r2_score_result = r2_score(Y_test, Y_pred)
    score_val.append(r2_score_result)
    print('r2 for k=' , k , 'is', round(r2_score_result,4))

r2 for k= 1 is 0.8499
r2 for k= 2 is 0.8739
r2 for k= 3 is 0.8714
r2 for k= 4 is 0.866
r2 for k= 5 is 0.8601
r2 for k= 6 is 0.8546
r2 for k= 7 is 0.8482
r2 for k= 8 is 0.846
r2 for k= 9 is 0.8423
r2 for k= 10 is 0.8381


In [60]:
%%time
model = neighbors.KNeighborsRegressor(n_neighbors=3)
print('-----Normalization-----')
model_score(X_nor.drop(remove_list2,axis=1),y_nor,model)
print('-----Standardization-----')
model_score(X_sta.drop(remove_list2,axis=1),y_sta,model)

-----Normalization-----
Adj_R^2: 0.5389 
RMSE: 10.9311
-----Standardization-----
Adj_R^2: 0.8851 
RMSE: 5.4565
CPU times: user 4.67 s, sys: 26.6 ms, total: 4.7 s
Wall time: 4.7 s


In [61]:
parameter = {'weights':['uniform','distance'], 'p':[1,2,3], 
             'algorithm':['auto', 'ball_tree', 'kd_tree', 'brute'],'leaf_size':[10,20,30]}
model = neighbors.KNeighborsRegressor(n_neighbors=3)
parameters_grid(parameter,model)

Best Params:  {'algorithm': 'auto', 'leaf_size': 10, 'p': 1, 'weights': 'distance'}


In [63]:
%%time
model = neighbors.KNeighborsRegressor(n_neighbors=3, weights='distance', p=1, leaf_size=10)
print('-----Normalization-----')
model_score(X_nor.drop(remove_list2,axis=1),y_nor,model)
print('-----Standardization-----')
model_score(X_sta.drop(remove_list2,axis=1),y_sta,model)

-----Normalization-----
Adj_R^2: 0.7784 
RMSE: 7.5789
-----Standardization-----
Adj_R^2: 0.932 
RMSE: 4.1964
CPU times: user 8.9 s, sys: 42.9 ms, total: 8.95 s
Wall time: 8.96 s


## Support vector regression

In [64]:
%%time
model = svm.SVR()
print('-----Normalization-----')
model_score(X_nor.drop(remove_list2,axis=1),y_nor,model)
print('-----Standardization-----')
model_score(X_sta.drop(remove_list2,axis=1),y_sta,model)

-----Normalization-----
Adj_R^2: 0.5086 
RMSE: 11.2862
-----Standardization-----
Adj_R^2: 0.6915 
RMSE: 8.9427
CPU times: user 7min 36s, sys: 3.93 s, total: 7min 40s
Wall time: 7min 41s


In [65]:
def parameters_manual(model):
    estimator = model
    estimator.fit(X_train, Y_train)
    y_pred = estimator.predict(X_test)
    r2 = r2_score(Y_test, y_pred)
    adjusted_r2 = 1 - (1-r2)*(len(Y_test)-1)/(len(Y_test)-X_train.shape[1]-1)
    print('Adj_R^2:',round(adjusted_r2,4))

In [66]:
for i in [3,5,7,9]:
    parameters_manual(svm.SVR(gamma='auto', C=3, epsilon=i, tol=1))

Adj_R^2: 0.8176
Adj_R^2: 0.8108
Adj_R^2: 0.7987
Adj_R^2: 0.7805


In [67]:
for i in [3,5,7,9]:
    parameters_manual(svm.SVR(gamma='auto', C=5, epsilon=i, tol=1))

Adj_R^2: 0.8417
Adj_R^2: 0.8352
Adj_R^2: 0.8213
Adj_R^2: 0.8034


In [68]:
for i in [3,5,7,9]:
    parameters_manual(svm.SVR(gamma='auto', C=7, epsilon=i, tol=1))

Adj_R^2: 0.8557
Adj_R^2: 0.848
Adj_R^2: 0.8343
Adj_R^2: 0.8159


In [69]:
for i in [3,5,7,9]:
    parameters_manual(svm.SVR(gamma='auto', C=9, epsilon=i, tol=1))

Adj_R^2: 0.8654
Adj_R^2: 0.8563
Adj_R^2: 0.8429
Adj_R^2: 0.8254


In [70]:
%%time
model = svm.SVR(kernel='rbf', gamma='auto', C=9, epsilon=9,tol=1)
print('-----Normalization-----')
model_score(X_nor.drop(remove_list2,axis=1),y_nor,model)
print('-----Standardization-----')
model_score(X_sta.drop(remove_list2,axis=1),y_sta,model)

-----Normalization-----
Adj_R^2: 0.7148 
RMSE: 8.5988
-----Standardization-----
Adj_R^2: 0.8313 
RMSE: 6.6126
CPU times: user 1min 7s, sys: 1.52 s, total: 1min 9s
Wall time: 1min 9s


## Decision tree

In [85]:
# Finds the best max_depth value
for i in range(1,21):
    model = DecisionTreeRegressor(max_depth = i, random_state=5)
    model.fit(X_train,Y_train)
    Y_pred = model.predict(X_test)
    r2 = r2_score(Y_test, Y_pred)
    print('r^2 for depth =',i,'is',r2)

r^2 for depth = 1 is 0.27684976832242747
r^2 for depth = 2 is 0.46096542839851795
r^2 for depth = 3 is 0.549856339150795
r^2 for depth = 4 is 0.6194224841323593
r^2 for depth = 5 is 0.6674210235370288
r^2 for depth = 6 is 0.7102950497916538
r^2 for depth = 7 is 0.7353050324921583
r^2 for depth = 8 is 0.7668997578477156
r^2 for depth = 9 is 0.7859824175424858
r^2 for depth = 10 is 0.797960634553855
r^2 for depth = 11 is 0.8118715837988022
r^2 for depth = 12 is 0.8180623172416275
r^2 for depth = 13 is 0.8174491518029942
r^2 for depth = 14 is 0.8138222453595874
r^2 for depth = 15 is 0.814535477258898
r^2 for depth = 16 is 0.8081846906412253
r^2 for depth = 17 is 0.8115340806431228
r^2 for depth = 18 is 0.8042958931720383
r^2 for depth = 19 is 0.8096439324549878
r^2 for depth = 20 is 0.8037651845712372


In [86]:
%%time
model = DecisionTreeRegressor(max_depth = 16, random_state=5)
print('-----Normalization-----')
model_score(X_nor.drop(remove_list2,axis=1),y_nor,model)
print('-----Standardization-----')
model_score(X_sta.drop(remove_list2,axis=1),y_sta,model)

-----Normalization-----
Adj_R^2: 0.8275 
RMSE: 6.6846
-----Standardization-----
Adj_R^2: 0.8275 
RMSE: 6.6846
CPU times: user 4.85 s, sys: 50.2 ms, total: 4.9 s
Wall time: 5.08 s


In [73]:
parameter = {'splitter':['best', 'random'], 'max_features':['auto', 'sqrt', 'log2',None]}
model = DecisionTreeRegressor(max_depth = 16, random_state=0)
parameters_grid(parameter,model)

Best Params:  {'max_features': 'auto', 'splitter': 'best'}


In [74]:
%%time
model = DecisionTreeRegressor(max_depth = 16,splitter='best', max_features= 'auto', random_state=0)
print('-----Normalization-----')
model_score(X_nor.drop(remove_list2,axis=1),y_nor,model)
print('-----Standardization-----')
model_score(X_sta.drop(remove_list2,axis=1),y_sta,model)

-----Normalization-----
Adj_R^2: 0.8286 
RMSE: 6.6659
-----Standardization-----
Adj_R^2: 0.8286 
RMSE: 6.6659
CPU times: user 4.62 s, sys: 27.2 ms, total: 4.65 s
Wall time: 4.65 s


In [75]:
%%time
model = DecisionTreeRegressor(max_depth = 16,splitter='random', max_features= 'auto', random_state=0)
print('-----Normalization-----')
model_score(X_nor.drop(remove_list2,axis=1),y_nor,model)
print('-----Standardization-----')
model_score(X_sta.drop(remove_list2,axis=1),y_sta,model)

-----Normalization-----
Adj_R^2: 0.8179 
RMSE: 6.8682
-----Standardization-----
Adj_R^2: 0.8179 
RMSE: 6.8682
CPU times: user 988 ms, sys: 21.5 ms, total: 1.01 s
Wall time: 1.01 s


In [76]:
parameter = {'min_samples_split':[2,4,6,8,10], 'min_samples_leaf':[None,2,4,6,8,10]}
model = DecisionTreeRegressor(max_depth = 16,splitter='random',random_state=0)
parameters_grid(parameter,model)

Best Params:  {'min_samples_leaf': 2, 'min_samples_split': 8}


In [87]:
%%time
model = DecisionTreeRegressor(max_depth = 16,splitter='random', max_features='auto',random_state=0,
                              min_samples_split= 8,min_samples_leaf = 2)
print('-----Normalization-----')
model_score(X_nor.drop(remove_list2,axis=1),y_nor,model)
print('-----Standardization-----')
model_score(X_sta.drop(remove_list2,axis=1),y_sta,model)

-----Normalization-----
Adj_R^2: 0.8354 
RMSE: 6.5284
-----Standardization-----
Adj_R^2: 0.8354 
RMSE: 6.5284
CPU times: user 811 ms, sys: 17.8 ms, total: 829 ms
Wall time: 847 ms


## Random forest

In [78]:
# Finds the best n_estimators value
for i in range(1,16):
    model = RandomForestRegressor(n_estimators = i, random_state=0)
    model.fit(X_train,Y_train)
    Y_pred = model.predict(X_test)
    r2 = r2_score(Y_test, Y_pred)
    print('r^2 for depth =',i,'is',r2)

r^2 for depth = 1 is 0.7605352842163745
r^2 for depth = 2 is 0.8451448762009024
r^2 for depth = 3 is 0.8686613385623274
r^2 for depth = 4 is 0.8802134045805494
r^2 for depth = 5 is 0.8878975302250235
r^2 for depth = 6 is 0.8921686436899764
r^2 for depth = 7 is 0.8970161824078045
r^2 for depth = 8 is 0.9004619977600778
r^2 for depth = 9 is 0.900964698060687
r^2 for depth = 10 is 0.9016978449519385
r^2 for depth = 11 is 0.902856981076061
r^2 for depth = 12 is 0.904519816867833
r^2 for depth = 13 is 0.9056370134828435
r^2 for depth = 14 is 0.9069267504113016
r^2 for depth = 15 is 0.9079398711750583


In [79]:
%%time
# Apply ML without b features
model = RandomForestRegressor(n_estimators = 6, random_state=0)
print('-----Normalization-----')
model_score(X_nor.drop(remove_list2,axis=1),y_nor,model)
print('-----Standardization-----')
model_score(X_sta.drop(remove_list2,axis=1),y_sta,model)

-----Normalization-----
Adj_R^2: 0.9027 
RMSE: 5.023
-----Standardization-----
Adj_R^2: 0.9027 
RMSE: 5.0231
CPU times: user 18.9 s, sys: 115 ms, total: 19 s
Wall time: 19 s


In [80]:
parameter = {'bootstrap': [True, False], 'max_features': ['auto', 'sqrt'],
             'min_samples_leaf': [1, 2, 4], 'min_samples_split': [2, 5, 10]}
model = RandomForestRegressor(n_estimators = 6, random_state=0)
parameters_grid(parameter,model)

Best Params:  {'bootstrap': False, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2}


In [81]:
%%time
model = RandomForestRegressor(n_estimators = 6, min_samples_leaf=1, min_samples_split=2,
                              bootstrap=True,max_features='sqrt',random_state=0)
print('-----Normalization-----')
model_score(X_nor.drop(remove_list2,axis=1),y_nor,model)
print('-----Standardization-----')
model_score(X_sta.drop(remove_list2,axis=1),y_sta,model)

-----Normalization-----
Adj_R^2: 0.9039 
RMSE: 4.9899
-----Standardization-----
Adj_R^2: 0.9039 
RMSE: 4.99
CPU times: user 5.62 s, sys: 94.6 ms, total: 5.71 s
Wall time: 5.72 s


## Neural network

In [82]:
# Calculate ML model's Evaluation
def model_score_NN(X,y,model,epochs,batch_size,early_stopping):
    kf = KFold(n_splits=5, shuffle=True, random_state=0)
    score_adjusted_r2 = []
    score_rmse = []
    for train_index, val_index in kf.split(X):
        X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
        y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]
        model.fit(X_train_fold, y_train_fold,epochs=epochs,batch_size=batch_size,verbose=0,callbacks=[early_stopping])
        y_pred = model.predict(X_val_fold)
        r2_score_result = r2_score(y_val_fold, y_pred)
        adjusted_r2_score_result = 1 - (1-r2_score_result)*(len(y_val_fold)-1)/(len(y_val_fold)-X_train_fold.shape[1]-1)
        rmse_score_result = np.sqrt(mean_squared_error(y_val_fold, y_pred))
        score_adjusted_r2.append(adjusted_r2_score_result)
        score_rmse.append(rmse_score_result)
    avg_adjusted_r2_score = round(np.mean(score_adjusted_r2),4)
    avg_rmse_score = round(np.mean(score_rmse),4)
    print('Adj_R^2:', avg_adjusted_r2_score, '\nRMSE:', avg_rmse_score)

In [83]:
def build_model_test():
    model = keras.Sequential([
        keras.layers.Dense(64, activation=tf.nn.relu,
                           input_shape=[17]),
        keras.layers.Dense(16, activation=tf.nn.relu),
        keras.layers.Dense(8, activation=tf.nn.relu),
        keras.layers.Dense(1)
                           ])
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
    tf.random.set_seed(0)
    model.compile(loss='mse', optimizer=optimizer, metrics=['mse'])
    
    return model

In [84]:
%%time 
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='mse', min_delta=0.1, patience=10, verbose=0)
model = build_model_test()
print('-----Normalization-----')
model_score_NN(X_nor.drop(remove_list2,axis=1),y_nor,model,500,2048,early_stopping)
print('-----Standardization-----')
model = build_model_test()
model_score_NN(X_sta.drop(remove_list2,axis=1),y_sta,model,500,2048,early_stopping)

-----Normalization-----
Adj_R^2: 0.8154 
RMSE: 6.9146
-----Standardization-----
Adj_R^2: 0.9024 
RMSE: 5.0279
CPU times: user 43.8 s, sys: 4.81 s, total: 48.6 s
Wall time: 20.4 s


# Applying the best machine learning model with test data

In [85]:
# Calculate ML model's scores
def test_set_model_score(x_train,y_train,x_test,y_test,model):
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    r2_score_result = r2_score(y_test, y_pred)
    adjusted_r2_score_result = round(1 - (1-r2_score_result)*(len(y_test)-1)/(len(y_test)-x_train.shape[1]-1),4)
    rmse_score_result = round(np.sqrt(mean_squared_error(y_test, y_pred)),4)
    print('Adj_R^2:', adjusted_r2_score_result, '\nRMSE:', rmse_score_result)

In [86]:
%%time
model = RandomForestRegressor(n_estimators = 6, bootstrap=True, max_features='sqrt',
                              min_samples_leaf=1, min_samples_split=2)
print('-----Normalization-----')
test_set_model_score(X_nor.drop(remove_list2,axis=1),y_nor,X_test_nor.drop(remove_list2,axis=1),y_test_nor,model)
print('-----Standardization-----')
test_set_model_score(X_sta.drop(remove_list2,axis=1),y_sta,X_test_sta.drop(remove_list2,axis=1),y_test_sta,model)

-----Normalization-----
Adj_R^2: 0.9097 
RMSE: 4.9041
-----Standardization-----
Adj_R^2: 0.9114 
RMSE: 4.8568
CPU times: user 1.5 s, sys: 39.2 ms, total: 1.54 s
Wall time: 1.57 s
