In [1]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
from sklearn import *
from statistics import *

In [2]:
data = pd.read_csv ('firefly1010.csv',sep=',',dtype= np.float64)
x = data.iloc[:, 0:71]
y = data.iloc[:, 71]

In [3]:
#Voting Ensemble
t1 = time.process_time()

models = list()
mlp = neural_network.MLPRegressor(hidden_layer_sizes=500, activation='relu', max_iter=500, solver='adam', alpha=0.1, batch_size='auto', 
                                            learning_rate='adaptive', learning_rate_init=0.01, shuffle=True, random_state=7)
gbr = ensemble.GradientBoostingRegressor(n_estimators= 1000, learning_rate= 0.1, loss= 'ls')
rf = ensemble.RandomForestRegressor(n_estimators= 1000, random_state=7)

models.append(('mlp', mlp))
models.append(('gbr', gbr))
models.append(('rf', rf))

n_repeat = 5
outer_cv = KFold(n_splits=5, shuffle=True, random_state=7)

rmse = []
mae = []
r2 = []
for i in range(n_repeat):
    for train_idx, test_idx in outer_cv.split(x, y):
        train_data, test_data = x.iloc[train_idx], x.iloc[test_idx]
        train_target = y.iloc[train_idx]
    
        model = ensemble.VotingRegressor(estimators=models, n_jobs=-1)
    
        classifier = model.fit(train_data, train_target)
        y_pred = classifier.predict(test_data)
        rmsee = np.sqrt(metrics.mean_squared_error(y.iloc[test_idx], y_pred))
        maee = metrics.mean_absolute_error(y.iloc[test_idx], y_pred)
        r22 = metrics.r2_score(y.iloc[test_idx], y_pred)
        rmse.append(rmsee)
        mae.append(maee)
        r2.append(r22)
        #print(rmse)
t2 = time.process_time()
print('Time is ', str(t2-t1), ' secs \n')
print('Rmse per split: ', rmse)
print('Mae per split: ', mae)
print('R2 per split: ', r2)
print('Mean RMSE: ', mean(rmse))
print('Mean MAE: ', mean(mae))
print('Mean R2: ', mean(r2))

Time is  39.407597699  secs 

Rmse per split:  [6.437039154443864, 6.225777979447422, 6.7749446278959065, 6.551665250065692, 6.554372079193454, 6.437039154443865, 6.225777979447422, 6.774944627895906, 6.5573833430990245, 6.554372079193455, 6.437039154443866, 6.225777979447422, 6.7749446278959065, 6.552785433345635, 6.554405125776906, 6.437039154443865, 6.225777979447422, 6.7749446278959065, 6.553096246633328, 6.554449821922119, 6.437039154443866, 6.225777979447423, 6.7749446278959065, 6.553490768720203, 6.554405125776905]
Mae per split:  [4.674963520942608, 4.490153363887318, 4.743165400438216, 4.622146515968602, 4.684767323084304, 4.674963520942608, 4.490153363887317, 4.743165400438216, 4.626233798024877, 4.684767323084304, 4.674963520942608, 4.490153363887319, 4.743165400438216, 4.623065014534891, 4.684767323084305, 4.674963520942608, 4.490153363887319, 4.743165400438216, 4.623306832915766, 4.684767323084306, 4.674963520942608, 4.490153363887318, 4.743165400438216, 4.6236066234444415

In [None]:
#Gradient Boosting
t1 = time.process_time()
n_repeat = 5
outer_cv = KFold(n_splits=5, shuffle=True, random_state=7)

rmse = []
mae = []
r2 = []
for i in range(n_repeat):
    for train_idx, test_idx in outer_cv.split(x, y):
        train_data, test_data = x.iloc[train_idx], x.iloc[test_idx]
        train_target = y.iloc[train_idx]
    
        model = ensemble.GradientBoostingRegressor(n_estimators= 1000, learning_rate= 0.1, loss= 'ls')
    
        classifier = model.fit(train_data, train_target)
        y_pred = classifier.predict(test_data)
        rmsee = np.sqrt(metrics.mean_squared_error(y.iloc[test_idx], y_pred))
        maee = metrics.mean_absolute_error(y.iloc[test_idx], y_pred)
        r22 = metrics.r2_score(y.iloc[test_idx], y_pred)
        rmse.append(rmsee)
        mae.append(maee)
        r2.append(r22)
        #print(rmse)
t2 = time.process_time()
print('Time is ', str(t2-t1), ' secs \n')
#print('Rmse per split: ', rmse)
print('Mean RMSE: ', mean(rmse))
print('Mean MAE: ', mean(mae))
print('Mean R2: ', mean(r2))

Time is  317.674751194  secs 

Mean RMSE:  8.222900768123589
Mean MAE:  6.041342622404075
Mean R2:  0.9056319556996586


In [None]:
#Multilayer Perceptron
t1 = time.process_time()
n_repeat = 5
outer_cv = KFold(n_splits=5, shuffle=True, random_state=7)

rmse = []
mae = []
r2 = []
for i in range(n_repeat):
    for train_idx, test_idx in outer_cv.split(x, y):
        train_data, test_data = x.iloc[train_idx], x.iloc[test_idx]
        train_target = y.iloc[train_idx]
    
        model = neural_network.MLPRegressor(hidden_layer_sizes=500, activation='relu', max_iter=500, solver='adam', alpha=0.1, batch_size='auto', 
                                            learning_rate='adaptive', learning_rate_init=0.01, shuffle=True, random_state=7)
    
        classifier = model.fit(train_data, train_target)
        y_pred = classifier.predict(test_data)
        rmsee = np.sqrt(metrics.mean_squared_error(y.iloc[test_idx], y_pred))
        maee = metrics.mean_absolute_error(y.iloc[test_idx], y_pred)
        r22 = metrics.r2_score(y.iloc[test_idx], y_pred)
        rmse.append(rmsee)
        mae.append(maee)
        r2.append(r22)
        #print(rmse)
t2 = time.process_time()
print('Time is ', str(t2-t1), ' secs \n')
#print('Rmse per split: ', rmse)
print('Mean RMSE: ', mean(rmse))
print('Mean MAE: ', mean(mae))
print('Mean R2: ', mean(r2))

Time is  531.568588351  secs 

Mean RMSE:  7.111220173468457
Mean MAE:  5.366307371745769
Mean R2:  0.9294862746663128


In [None]:
#Random Forest
t1 = time.process_time()
n_repeat = 5
outer_cv = KFold(n_splits=5, shuffle=True, random_state=7)

rmse = []
mae = []
r2 = []
for i in range(n_repeat):
    for train_idx, test_idx in outer_cv.split(x, y):
        train_data, test_data = x.iloc[train_idx], x.iloc[test_idx]
        train_target = y.iloc[train_idx]
    
        model = ensemble.RandomForestRegressor(n_estimators= 1000, random_state=7)
        
        classifier = model.fit(train_data, train_target)
        y_pred = classifier.predict(test_data)
        rmsee = np.sqrt(metrics.mean_squared_error(y.iloc[test_idx], y_pred))
        maee = metrics.mean_absolute_error(y.iloc[test_idx], y_pred)
        r22 = metrics.r2_score(y.iloc[test_idx], y_pred)
        rmse.append(rmsee)
        mae.append(maee)
        r2.append(r22)
        #print(rmse)
t2 = time.process_time()
print('Time is ', str(t2-t1), ' secs \n')
#print('Rmse per split: ', rmse)
print('Mean RMSE: ', mean(rmse))
print('Mean MAE: ', mean(mae))
print('Mean R2: ', mean(r2))

Time is  961.039436066  secs 

Mean RMSE:  7.255718930783827
Mean MAE:  4.817018498965255
Mean R2:  0.9264765837103663


In [4]:
data = pd.read_csv ('firefly2020.csv',sep=',',dtype= np.float64)
x = data.iloc[:, 0:60]
y = data.iloc[:, 60]

In [5]:
#Voting Ensemble
t1 = time.process_time()

models = list()
mlp = neural_network.MLPRegressor(hidden_layer_sizes=500, activation='relu', max_iter=500, solver='adam', alpha=0.1, batch_size='auto', 
                                            learning_rate='adaptive', learning_rate_init=0.01, shuffle=True, random_state=7)
gbr = ensemble.GradientBoostingRegressor(n_estimators= 1000, learning_rate= 0.1, loss= 'ls')
rf = ensemble.RandomForestRegressor(n_estimators= 1000, random_state=7)

models.append(('mlp', mlp))
models.append(('gbr', gbr))
models.append(('rf', rf))

n_repeat = 5
outer_cv = KFold(n_splits=5, shuffle=True, random_state=7)

rmse = []
mae = []
r2 = []
for i in range(n_repeat):
    for train_idx, test_idx in outer_cv.split(x, y):
        train_data, test_data = x.iloc[train_idx], x.iloc[test_idx]
        train_target = y.iloc[train_idx]
    
        model = ensemble.VotingRegressor(estimators=models, n_jobs=-1)
    
        classifier = model.fit(train_data, train_target)
        y_pred = classifier.predict(test_data)
        rmsee = np.sqrt(metrics.mean_squared_error(y.iloc[test_idx], y_pred))
        maee = metrics.mean_absolute_error(y.iloc[test_idx], y_pred)
        r22 = metrics.r2_score(y.iloc[test_idx], y_pred)
        rmse.append(rmsee)
        mae.append(maee)
        r2.append(r22)
        #print(rmse)
t2 = time.process_time()
print('Time is ', str(t2-t1), ' secs \n')
print('Rmse per split: ', rmse)
print('Mae per split: ', mae)
print('R2 per split: ', r2)
print('Mean RMSE: ', mean(rmse))
print('Mean MAE: ', mean(mae))
print('Mean R2: ', mean(r2))

Time is  37.955703782  secs 

Rmse per split:  [7.287937534504437, 6.93228576805535, 7.912138735957302, 7.529449483322893, 7.640094359138518, 7.287937534504437, 6.93228576805535, 7.912138735957302, 7.527583704152871, 7.6382274873943405, 7.287937534504436, 6.93228576805535, 7.912138735957302, 7.524688496066304, 7.638227487394341, 7.287937534504437, 6.93228576805535, 7.912138735957302, 7.52758370415287, 7.6382274873943405, 7.287937534504437, 6.93228576805535, 7.912138735957302, 7.529402140537778, 7.638227487394341]
Mae per split:  [5.328387826420326, 4.978219106320811, 5.620107555131817, 5.451126520960062, 5.469797871677958, 5.328387826420327, 4.97821910632081, 5.620107555131817, 5.450256480739838, 5.46921154250366, 5.328387826420326, 4.97821910632081, 5.620107555131817, 5.447299279470889, 5.469211542503661, 5.328387826420326, 4.97821910632081, 5.620107555131817, 5.450256480739838, 5.469211542503661, 5.328387826420327, 4.97821910632081, 5.620107555131817, 5.451630023938986, 5.46921154250

In [None]:
#Gradient Boosting
t1 = time.process_time()
n_repeat = 5
outer_cv = KFold(n_splits=5, shuffle=True, random_state=7)

rmse = []
mae = []
r2 = []
for i in range(n_repeat):
    for train_idx, test_idx in outer_cv.split(x, y):
        train_data, test_data = x.iloc[train_idx], x.iloc[test_idx]
        train_target = y.iloc[train_idx]
    
        model = ensemble.GradientBoostingRegressor(n_estimators= 1000, learning_rate= 0.1, loss= 'ls')
    
        classifier = model.fit(train_data, train_target)
        y_pred = classifier.predict(test_data)
        rmsee = np.sqrt(metrics.mean_squared_error(y.iloc[test_idx], y_pred))
        maee = metrics.mean_absolute_error(y.iloc[test_idx], y_pred)
        r22 = metrics.r2_score(y.iloc[test_idx], y_pred)
        rmse.append(rmsee)
        mae.append(maee)
        r2.append(r22)
        #print(rmse)
t2 = time.process_time()
print('Time is ', str(t2-t1), ' secs \n')
#print('Rmse per split: ', rmse)
print('Mean RMSE: ', mean(rmse))
print('Mean MAE: ', mean(mae))
print('Mean R2: ', mean(r2))

Time is  264.510586075  secs 

Mean RMSE:  8.367031529460599
Mean MAE:  6.103736124997102
Mean R2:  0.902240102700158


In [None]:
#Multilayer Perceptron
t1 = time.process_time()
n_repeat = 5
outer_cv = KFold(n_splits=5, shuffle=True, random_state=7)

rmse = []
mae = []
r2 = []
for i in range(n_repeat):
    for train_idx, test_idx in outer_cv.split(x, y):
        train_data, test_data = x.iloc[train_idx], x.iloc[test_idx]
        train_target = y.iloc[train_idx]
    
        model = neural_network.MLPRegressor(hidden_layer_sizes=500, activation='relu', max_iter=500, solver='adam', alpha=0.1, batch_size='auto', 
                                            learning_rate='adaptive', learning_rate_init=0.01, shuffle=True, random_state=7)
    
        classifier = model.fit(train_data, train_target)
        y_pred = classifier.predict(test_data)
        rmsee = np.sqrt(metrics.mean_squared_error(y.iloc[test_idx], y_pred))
        maee = metrics.mean_absolute_error(y.iloc[test_idx], y_pred)
        r22 = metrics.r2_score(y.iloc[test_idx], y_pred)
        rmse.append(rmsee)
        mae.append(maee)
        r2.append(r22)
        #print(rmse)
t2 = time.process_time()
print('Time is ', str(t2-t1), ' secs \n')
#print('Rmse per split: ', rmse)
print('Mean RMSE: ', mean(rmse))
print('Mean MAE: ', mean(mae))
print('Mean R2: ', mean(r2))

Time is  548.8549734690001  secs 

Mean RMSE:  10.309640631172691
Mean MAE:  7.806992081209347
Mean R2:  0.850739719148558


In [None]:
#Random Forest
t1 = time.process_time()
n_repeat = 5
outer_cv = KFold(n_splits=5, shuffle=True, random_state=7)

rmse = []
mae = []
r2 = []
for i in range(n_repeat):
    for train_idx, test_idx in outer_cv.split(x, y):
        train_data, test_data = x.iloc[train_idx], x.iloc[test_idx]
        train_target = y.iloc[train_idx]
    
        model = ensemble.RandomForestRegressor(n_estimators= 1000, random_state=7)
        
        classifier = model.fit(train_data, train_target)
        y_pred = classifier.predict(test_data)
        rmsee = np.sqrt(metrics.mean_squared_error(y.iloc[test_idx], y_pred))
        maee = metrics.mean_absolute_error(y.iloc[test_idx], y_pred)
        r22 = metrics.r2_score(y.iloc[test_idx], y_pred)
        rmse.append(rmsee)
        mae.append(maee)
        r2.append(r22)
        #print(rmse)
t2 = time.process_time()
print('Time is ', str(t2-t1), ' secs \n')
#print('Rmse per split: ', rmse)
print('Mean RMSE: ', mean(rmse))
print('Mean MAE: ', mean(mae))
print('Mean R2: ', mean(r2))

Time is  837.787787072  secs 

Mean RMSE:  7.416661650407031
Mean MAE:  4.939176057120944
Mean R2:  0.9231796926661473


In [6]:
data = pd.read_csv ('firefly3040.csv',sep=',',dtype= np.float64)
x = data.iloc[:, 0:55]
y = data.iloc[:, 55]

In [7]:
#Voting Ensemble
t1 = time.process_time()

models = list()
mlp = neural_network.MLPRegressor(hidden_layer_sizes=500, activation='relu', max_iter=500, solver='adam', alpha=0.1, batch_size='auto', 
                                            learning_rate='adaptive', learning_rate_init=0.01, shuffle=True, random_state=7)
gbr = ensemble.GradientBoostingRegressor(n_estimators= 1000, learning_rate= 0.1, loss= 'ls')
rf = ensemble.RandomForestRegressor(n_estimators= 1000, random_state=7)

models.append(('mlp', mlp))
models.append(('gbr', gbr))
models.append(('rf', rf))

n_repeat = 5
outer_cv = KFold(n_splits=5, shuffle=True, random_state=7)

rmse = []
mae = []
r2 = []
for i in range(n_repeat):
    for train_idx, test_idx in outer_cv.split(x, y):
        train_data, test_data = x.iloc[train_idx], x.iloc[test_idx]
        train_target = y.iloc[train_idx]
    
        model = ensemble.VotingRegressor(estimators=models, n_jobs=-1)
    
        classifier = model.fit(train_data, train_target)
        y_pred = classifier.predict(test_data)
        rmsee = np.sqrt(metrics.mean_squared_error(y.iloc[test_idx], y_pred))
        maee = metrics.mean_absolute_error(y.iloc[test_idx], y_pred)
        r22 = metrics.r2_score(y.iloc[test_idx], y_pred)
        rmse.append(rmsee)
        mae.append(maee)
        r2.append(r22)
        #print(rmse)
t2 = time.process_time()
print('Time is ', str(t2-t1), ' secs \n')
print('Rmse per split: ', rmse)
print('Mae per split: ', mae)
print('R2 per split: ', r2)
print('Mean RMSE: ', mean(rmse))
print('Mean MAE: ', mean(mae))
print('Mean R2: ', mean(r2))

Time is  36.93246503500001  secs 

Rmse per split:  [7.149889665193083, 6.920324996866435, 7.809176772700867, 7.402018931198687, 7.342943254310085, 7.149889665193083, 6.920324996866434, 7.809176772700867, 7.402018931198687, 7.342943254310086, 7.149889665193083, 6.920324996866435, 7.809176772700867, 7.402018931198687, 7.342943254310085, 7.149889665193083, 6.920324996866435, 7.809176772700866, 7.402018931198687, 7.342943254310085, 7.149889665193083, 6.920324996866435, 7.809176772700867, 7.402018931198687, 7.342943254310086]
Mae per split:  [5.415964200382362, 5.065276535684321, 5.529768479671677, 5.45143433761257, 5.288476316388127, 5.415964200382363, 5.065276535684321, 5.529768479671677, 5.45143433761257, 5.288476316388127, 5.415964200382363, 5.065276535684321, 5.529768479671677, 5.451434337612571, 5.288476316388127, 5.415964200382362, 5.065276535684321, 5.529768479671677, 5.45143433761257, 5.288476316388127, 5.415964200382362, 5.065276535684321, 5.529768479671677, 5.451434337612571, 5.

In [None]:
#Gradient Boosting
t1 = time.process_time()
n_repeat = 5
outer_cv = KFold(n_splits=5, shuffle=True, random_state=7)

rmse = []
mae = []
r2 = []
for i in range(n_repeat):
    for train_idx, test_idx in outer_cv.split(x, y):
        train_data, test_data = x.iloc[train_idx], x.iloc[test_idx]
        train_target = y.iloc[train_idx]
    
        model = ensemble.GradientBoostingRegressor(n_estimators= 1000, learning_rate= 0.1, loss= 'ls')
    
        classifier = model.fit(train_data, train_target)
        y_pred = classifier.predict(test_data)
        rmsee = np.sqrt(metrics.mean_squared_error(y.iloc[test_idx], y_pred))
        maee = metrics.mean_absolute_error(y.iloc[test_idx], y_pred)
        r22 = metrics.r2_score(y.iloc[test_idx], y_pred)
        rmse.append(rmsee)
        mae.append(maee)
        r2.append(r22)
        #print(rmse)
t2 = time.process_time()
print('Time is ', str(t2-t1), ' secs \n')
#print('Rmse per split: ', rmse)
print('Mean RMSE: ', mean(rmse))
print('Mean MAE: ', mean(mae))
print('Mean R2: ', mean(r2))

Time is  230.02088802100002  secs 

Mean RMSE:  8.594892325172514
Mean MAE:  6.319726098075832
Mean R2:  0.8967680418785272


In [None]:
#Multilayer Perceptron
t1 = time.process_time()
n_repeat = 5
outer_cv = KFold(n_splits=5, shuffle=True, random_state=7)

rmse = []
mae = []
r2 = []
for i in range(n_repeat):
    for train_idx, test_idx in outer_cv.split(x, y):
        train_data, test_data = x.iloc[train_idx], x.iloc[test_idx]
        train_target = y.iloc[train_idx]
    
        model = neural_network.MLPRegressor(hidden_layer_sizes=500, activation='relu', max_iter=500, solver='adam', alpha=0.1, batch_size='auto', 
                                            learning_rate='adaptive', learning_rate_init=0.01, shuffle=True, random_state=7)
    
        classifier = model.fit(train_data, train_target)
        y_pred = classifier.predict(test_data)
        rmsee = np.sqrt(metrics.mean_squared_error(y.iloc[test_idx], y_pred))
        maee = metrics.mean_absolute_error(y.iloc[test_idx], y_pred)
        r22 = metrics.r2_score(y.iloc[test_idx], y_pred)
        rmse.append(rmsee)
        mae.append(maee)
        r2.append(r22)
        #print(rmse)
t2 = time.process_time()
print('Time is ', str(t2-t1), ' secs \n')
#print('Rmse per split: ', rmse)
print('Mean RMSE: ', mean(rmse))
print('Mean MAE: ', mean(mae))
print('Mean R2: ', mean(r2))

Time is  640.1209952669999  secs 

Mean RMSE:  9.323834888682349
Mean MAE:  7.122804901451214
Mean R2:  0.8786282652490014


In [None]:
#Random Forest
t1 = time.process_time()
n_repeat = 5
outer_cv = KFold(n_splits=5, shuffle=True, random_state=7)

rmse = []
mae = []
r2 = []
for i in range(n_repeat):
    for train_idx, test_idx in outer_cv.split(x, y):
        train_data, test_data = x.iloc[train_idx], x.iloc[test_idx]
        train_target = y.iloc[train_idx]
    
        model = ensemble.RandomForestRegressor(n_estimators= 1000, random_state=7)
        
        classifier = model.fit(train_data, train_target)
        y_pred = classifier.predict(test_data)
        rmsee = np.sqrt(metrics.mean_squared_error(y.iloc[test_idx], y_pred))
        maee = metrics.mean_absolute_error(y.iloc[test_idx], y_pred)
        r22 = metrics.r2_score(y.iloc[test_idx], y_pred)
        rmse.append(rmsee)
        mae.append(maee)
        r2.append(r22)
        #print(rmse)
t2 = time.process_time()
print('Time is ', str(t2-t1), ' secs \n')
#print('Rmse per split: ', rmse)
print('Mean RMSE: ', mean(rmse))
print('Mean MAE: ', mean(mae))
print('Mean R2: ', mean(r2))

Time is  739.5530782029998  secs 

Mean RMSE:  7.664001928722695
Mean MAE:  5.14093761282725
Mean R2:  0.9179024556422714


In [8]:
data = pd.read_csv ('firefly4080.csv',sep=',',dtype= np.float64)
x = data.iloc[:, 0:55]
y = data.iloc[:, 55]

In [9]:
#Voting Ensemble
t1 = time.process_time()

models = list()
mlp = neural_network.MLPRegressor(hidden_layer_sizes=500, activation='relu', max_iter=500, solver='adam', alpha=0.1, batch_size='auto', 
                                            learning_rate='adaptive', learning_rate_init=0.01, shuffle=True, random_state=7)
gbr = ensemble.GradientBoostingRegressor(n_estimators= 1000, learning_rate= 0.1, loss= 'ls')
rf = ensemble.RandomForestRegressor(n_estimators= 1000, random_state=7)

models.append(('mlp', mlp))
models.append(('gbr', gbr))
models.append(('rf', rf))

n_repeat = 5
outer_cv = KFold(n_splits=5, shuffle=True, random_state=7)

rmse = []
mae = []
r2 = []
for i in range(n_repeat):
    for train_idx, test_idx in outer_cv.split(x, y):
        train_data, test_data = x.iloc[train_idx], x.iloc[test_idx]
        train_target = y.iloc[train_idx]
    
        model = ensemble.VotingRegressor(estimators=models, n_jobs=-1)
    
        classifier = model.fit(train_data, train_target)
        y_pred = classifier.predict(test_data)
        rmsee = np.sqrt(metrics.mean_squared_error(y.iloc[test_idx], y_pred))
        maee = metrics.mean_absolute_error(y.iloc[test_idx], y_pred)
        r22 = metrics.r2_score(y.iloc[test_idx], y_pred)
        rmse.append(rmsee)
        mae.append(maee)
        r2.append(r22)
        #print(rmse)
t2 = time.process_time()
print('Time is ', str(t2-t1), ' secs \n')
print('Rmse per split: ', rmse)
print('Mae per split: ', mae)
print('R2 per split: ', r2)
print('Mean RMSE: ', mean(rmse))
print('Mean MAE: ', mean(mae))
print('Mean R2: ', mean(r2))

Time is  37.27438719300001  secs 

Rmse per split:  [7.292438218887494, 7.114306364776322, 7.769748714092917, 7.1839262914385005, 7.790159582255189, 7.2924382188874945, 7.114306364776322, 7.769748714092917, 7.1839262914385005, 7.7878567688419675, 7.292438218887494, 7.114306364776322, 7.769748714092917, 7.1839262914385005, 7.788855187220281, 7.292438218887494, 7.114306364776322, 7.769748714092917, 7.1839262914385005, 7.7854989900738145, 7.292438218887494, 7.114306364776322, 7.769748714092917, 7.1839262914385005, 7.786734196452633]
Mae per split:  [5.529200007883979, 5.229339884572641, 5.698340047818731, 5.3179181891045575, 5.65724349005999, 5.529200007883982, 5.229339884572641, 5.698340047818731, 5.317918189104557, 5.65538170522317, 5.52920000788398, 5.229339884572641, 5.698340047818732, 5.317918189104558, 5.656197066805973, 5.52920000788398, 5.229339884572641, 5.698340047818731, 5.3179181891045575, 5.653402781894681, 5.529200007883979, 5.229339884572641, 5.698340047818732, 5.3179181891

In [None]:
#Gradient Boosting
t1 = time.process_time()
n_repeat = 5
outer_cv = KFold(n_splits=5, shuffle=True, random_state=7)

rmse = []
mae = []
r2 = []
for i in range(n_repeat):
    for train_idx, test_idx in outer_cv.split(x, y):
        train_data, test_data = x.iloc[train_idx], x.iloc[test_idx]
        train_target = y.iloc[train_idx]
    
        model = ensemble.GradientBoostingRegressor(n_estimators= 1000, learning_rate= 0.1, loss= 'ls')
    
        classifier = model.fit(train_data, train_target)
        y_pred = classifier.predict(test_data)
        rmsee = np.sqrt(metrics.mean_squared_error(y.iloc[test_idx], y_pred))
        maee = metrics.mean_absolute_error(y.iloc[test_idx], y_pred)
        r22 = metrics.r2_score(y.iloc[test_idx], y_pred)
        rmse.append(rmsee)
        mae.append(maee)
        r2.append(r22)
        #print(rmse)
t2 = time.process_time()
print('Time is ', str(t2-t1), ' secs \n')
#print('Rmse per split: ', rmse)
print('Mean RMSE: ', mean(rmse))
print('Mean MAE: ', mean(mae))
print('Mean R2: ', mean(r2))

Time is  245.881248712  secs 

Mean RMSE:  8.47514877313549
Mean MAE:  6.235280121958129
Mean R2:  0.8997830462985574


In [None]:
#Multilayer Perceptron
t1 = time.process_time()
n_repeat = 5
outer_cv = KFold(n_splits=5, shuffle=True, random_state=7)

rmse = []
mae = []
r2 = []
for i in range(n_repeat):
    for train_idx, test_idx in outer_cv.split(x, y):
        train_data, test_data = x.iloc[train_idx], x.iloc[test_idx]
        train_target = y.iloc[train_idx]
    
        model = neural_network.MLPRegressor(hidden_layer_sizes=500, activation='relu', max_iter=500, solver='adam', alpha=0.1, batch_size='auto', 
                                            learning_rate='adaptive', learning_rate_init=0.01, shuffle=True, random_state=7)
    
        classifier = model.fit(train_data, train_target)
        y_pred = classifier.predict(test_data)
        rmsee = np.sqrt(metrics.mean_squared_error(y.iloc[test_idx], y_pred))
        maee = metrics.mean_absolute_error(y.iloc[test_idx], y_pred)
        r22 = metrics.r2_score(y.iloc[test_idx], y_pred)
        rmse.append(rmsee)
        mae.append(maee)
        r2.append(r22)
        #print(rmse)
t2 = time.process_time()
print('Time is ', str(t2-t1), ' secs \n')
#print('Rmse per split: ', rmse)
print('Mean RMSE: ', mean(rmse))
print('Mean MAE: ', mean(mae))
print('Mean R2: ', mean(r2))

Time is  668.029925113  secs 

Mean RMSE:  10.674163284841855
Mean MAE:  8.02508229771389
Mean R2:  0.8404966864518436


In [None]:
#Random Forest
t1 = time.process_time()
n_repeat = 5
outer_cv = KFold(n_splits=5, shuffle=True, random_state=7)

rmse = []
mae = []
r2 = []
for i in range(n_repeat):
    for train_idx, test_idx in outer_cv.split(x, y):
        train_data, test_data = x.iloc[train_idx], x.iloc[test_idx]
        train_target = y.iloc[train_idx]
    
        model = ensemble.RandomForestRegressor(n_estimators= 1000, random_state=7)
        
        classifier = model.fit(train_data, train_target)
        y_pred = classifier.predict(test_data)
        rmsee = np.sqrt(metrics.mean_squared_error(y.iloc[test_idx], y_pred))
        maee = metrics.mean_absolute_error(y.iloc[test_idx], y_pred)
        r22 = metrics.r2_score(y.iloc[test_idx], y_pred)
        rmse.append(rmsee)
        mae.append(maee)
        r2.append(r22)
        #print(rmse)
t2 = time.process_time()
print('Time is ', str(t2-t1), ' secs \n')
#print('Rmse per split: ', rmse)
print('Mean RMSE: ', mean(rmse))
print('Mean MAE: ', mean(mae))
print('Mean R2: ', mean(r2))

Time is  760.397908121  secs 

Mean RMSE:  7.511804878535899
Mean MAE:  5.008395121991641
Mean R2:  0.9211647601864605


In [10]:
data = pd.read_csv ('firefly50160.csv',sep=',',dtype= np.float64)
x = data.iloc[:, 0:53]
y = data.iloc[:, 53]

In [11]:
#Voting Ensemble
t1 = time.process_time()

models = list()
mlp = neural_network.MLPRegressor(hidden_layer_sizes=500, activation='relu', max_iter=500, solver='adam', alpha=0.1, batch_size='auto', 
                                            learning_rate='adaptive', learning_rate_init=0.01, shuffle=True, random_state=7)
gbr = ensemble.GradientBoostingRegressor(n_estimators= 1000, learning_rate= 0.1, loss= 'ls')
rf = ensemble.RandomForestRegressor(n_estimators= 1000, random_state=7)

models.append(('mlp', mlp))
models.append(('gbr', gbr))
models.append(('rf', rf))

n_repeat = 5
outer_cv = KFold(n_splits=5, shuffle=True, random_state=7)

rmse = []
mae = []
r2 = []
for i in range(n_repeat):
    for train_idx, test_idx in outer_cv.split(x, y):
        train_data, test_data = x.iloc[train_idx], x.iloc[test_idx]
        train_target = y.iloc[train_idx]
    
        model = ensemble.VotingRegressor(estimators=models, n_jobs=-1)
    
        classifier = model.fit(train_data, train_target)
        y_pred = classifier.predict(test_data)
        rmsee = np.sqrt(metrics.mean_squared_error(y.iloc[test_idx], y_pred))
        maee = metrics.mean_absolute_error(y.iloc[test_idx], y_pred)
        r22 = metrics.r2_score(y.iloc[test_idx], y_pred)
        rmse.append(rmsee)
        mae.append(maee)
        r2.append(r22)
        #print(rmse)
t2 = time.process_time()
print('Time is ', str(t2-t1), ' secs \n')
print('Rmse per split: ', rmse)
print('Mae per split: ', mae)
print('R2 per split: ', r2)
print('Mean RMSE: ', mean(rmse))
print('Mean MAE: ', mean(mae))
print('Mean R2: ', mean(r2))

Time is  37.421424  secs 

Rmse per split:  [6.561646907456498, 6.479792508661183, 7.007730475701278, 6.783597375013221, 6.731248957084808, 6.561646907456498, 6.479792508661183, 7.007730475701277, 6.783597375013222, 6.731248957084808, 6.561646907456498, 6.479792508661183, 7.007730475701277, 6.783597375013222, 6.731248957084808, 6.561646907456498, 6.479792508661183, 7.007730475701278, 6.783597375013222, 6.731248957084808, 6.561646907456498, 6.479792508661183, 7.007730475701279, 6.783597375013222, 6.731248957084808]
Mae per split:  [4.7886701047091265, 4.6855375508329855, 4.952979260072214, 4.894665753630454, 4.819684832497269, 4.7886701047091265, 4.685537550832985, 4.952979260072214, 4.894665753630454, 4.819684832497269, 4.788670104709126, 4.685537550832985, 4.952979260072214, 4.894665753630455, 4.819684832497269, 4.788670104709126, 4.6855375508329855, 4.952979260072214, 4.894665753630454, 4.81968483249727, 4.788670104709126, 4.6855375508329855, 4.952979260072214, 4.894665753630454, 4.8

In [None]:
#Gradient Boosting
t1 = time.process_time()
n_repeat = 5
outer_cv = KFold(n_splits=5, shuffle=True, random_state=7)

rmse = []
mae = []
r2 = []
for i in range(n_repeat):
    for train_idx, test_idx in outer_cv.split(x, y):
        train_data, test_data = x.iloc[train_idx], x.iloc[test_idx]
        train_target = y.iloc[train_idx]
    
        model = ensemble.GradientBoostingRegressor(n_estimators= 1000, learning_rate= 0.1, loss= 'ls')
    
        classifier = model.fit(train_data, train_target)
        y_pred = classifier.predict(test_data)
        rmsee = np.sqrt(metrics.mean_squared_error(y.iloc[test_idx], y_pred))
        maee = metrics.mean_absolute_error(y.iloc[test_idx], y_pred)
        r22 = metrics.r2_score(y.iloc[test_idx], y_pred)
        rmse.append(rmsee)
        mae.append(maee)
        r2.append(r22)
        #print(rmse)
t2 = time.process_time()
print('Time is ', str(t2-t1), ' secs \n')
#print('Rmse per split: ', rmse)
print('Mean RMSE: ', mean(rmse))
print('Mean MAE: ', mean(mae))
print('Mean R2: ', mean(r2))

Time is  240.58324537900012  secs 

Mean RMSE:  8.367912315720067
Mean MAE:  6.151712528124268
Mean R2:  0.9023211805232866


In [None]:
#Multilayer Perceptron
t1 = time.process_time()
n_repeat = 5
outer_cv = KFold(n_splits=5, shuffle=True, random_state=7)

rmse = []
mae = []
r2 = []
for i in range(n_repeat):
    for train_idx, test_idx in outer_cv.split(x, y):
        train_data, test_data = x.iloc[train_idx], x.iloc[test_idx]
        train_target = y.iloc[train_idx]
    
        model = neural_network.MLPRegressor(hidden_layer_sizes=500, activation='relu', max_iter=500, solver='adam', alpha=0.1, batch_size='auto', 
                                            learning_rate='adaptive', learning_rate_init=0.01, shuffle=True, random_state=7)
    
        classifier = model.fit(train_data, train_target)
        y_pred = classifier.predict(test_data)
        rmsee = np.sqrt(metrics.mean_squared_error(y.iloc[test_idx], y_pred))
        maee = metrics.mean_absolute_error(y.iloc[test_idx], y_pred)
        r22 = metrics.r2_score(y.iloc[test_idx], y_pred)
        rmse.append(rmsee)
        mae.append(maee)
        r2.append(r22)
        #print(rmse)
t2 = time.process_time()
print('Time is ', str(t2-t1), ' secs \n')
#print('Rmse per split: ', rmse)
print('Mean RMSE: ', mean(rmse))
print('Mean MAE: ', mean(mae))
print('Mean R2: ', mean(r2))

Time is  519.272349117  secs 

Mean RMSE:  7.601299074548188
Mean MAE:  5.76664687937883
Mean R2:  0.919421500369997


In [None]:
#Random Forest
t1 = time.process_time()
n_repeat = 5
outer_cv = KFold(n_splits=5, shuffle=True, random_state=7)

rmse = []
mae = []
r2 = []
for i in range(n_repeat):
    for train_idx, test_idx in outer_cv.split(x, y):
        train_data, test_data = x.iloc[train_idx], x.iloc[test_idx]
        train_target = y.iloc[train_idx]
    
        model = ensemble.RandomForestRegressor(n_estimators= 1000, random_state=7)
        
        classifier = model.fit(train_data, train_target)
        y_pred = classifier.predict(test_data)
        rmsee = np.sqrt(metrics.mean_squared_error(y.iloc[test_idx], y_pred))
        maee = metrics.mean_absolute_error(y.iloc[test_idx], y_pred)
        r22 = metrics.r2_score(y.iloc[test_idx], y_pred)
        rmse.append(rmsee)
        mae.append(maee)
        r2.append(r22)
        #print(rmse)
t2 = time.process_time()
print('Time is ', str(t2-t1), ' secs \n')
#print('Rmse per split: ', rmse)
print('Mean RMSE: ', mean(rmse))
print('Mean MAE: ', mean(mae))
print('Mean R2: ', mean(r2))

Time is  719.3374671890001  secs 

Mean RMSE:  7.061538540901014
Mean MAE:  4.750225900324175
Mean R2:  0.9302454927975249
