In [None]:
import datetime
import os 
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.utils import shuffle

from keras.models import Sequential
from keras.layers import LSTM,Dense,Flatten, Conv1D, MaxPooling1D
from keras.optimizers import Adam, SGD

from catboost import CatBoostRegressor

In [None]:
# Stations' data preparation: load every station file into a DataFrame keyed by file name.

# Directory prefix WITH a trailing separator, because paths are built by plain string
# concatenation (here and when exporting results). os.path.join(..., "") yields
# "data\\" on Windows (the original value) and "data/" elsewhere.
data_folder = os.path.join("data", "")

# One entry per file. Every literal must be followed by a comma: adjacent string
# literals are silently concatenated by Python, which previously fused the last file
# of one row with the first file of the next into a single non-existent file name.
stations = [
    "POMIO2009_fOF2.txt", "MALUKU2009_fOF2.txt", "DARW2009_fOF2.txt", "SUL2009_fOF2.txt", "JOG22009_fOF2.txt",
    "POMIO2015_fOF2.txt", "MALUKU2015_fOF2.txt", "DARW2015_fOF2.txt", "SUL2015_fOF2.txt", "JOG22015_fOF2.txt",
    "2009_merged_foF2.txt", "2015_merged_foF2.txt",
]

# file name -> raw DataFrame read from that file
stations_data = {}

for st in stations:
    stations_data[st] = pd.read_csv(data_folder + st, sep=',')
    print("\n\n--------------------------"+st+"--------------------------------------\n")
    # Show the first row only, as a quick sanity check of the parsed columns.
    print(stations_data[st].head(1))
    

In [None]:
# Input/output preparation, chronological train/validation/test split and scaling for the ML models.
hour_intervals = [1, 2, 12, 24, 36, 48]   # 0 (= use the full data set) is prepended at the end of this cell
hour_steps_for_height = 39                # number of rows per hour (39 different heights)

X_train_base, X_val_base, X_test_base, y_train_base, y_val_base, y_test_base = {}, {}, {}, {}, {}, {}
shift_size = hour_steps_for_height        # the target is foF2 one hour (39 rows) after the input row
scaler = MinMaxScaler()                   # swap for StandardScaler() to compare scalers

# Loop-invariant sizes. Every split is cut on a multiple of `block_rows` so that each
# sub-sampling step in `hour_intervals` divides the split evenly.
hours_lcm = int(np.lcm.reduce(hour_intervals))
block_rows = hour_steps_for_height * hours_lcm

for st in stations_data:
    station_df = stations_data[st]
    features = station_df.drop(['Date', 'Hour', 'Height', 'foF2'], axis=1).to_numpy()[:-shift_size]
    target = station_df['foF2'].to_numpy()[shift_size:]

    # Keep only whole blocks. This uses integer arithmetic and a positive stop index:
    # the previous `[:-redundancy_index]` slice emptied the arrays whenever the
    # remainder was 0 (`x[:-0]` is `x[:0]`) and relied on float truncation otherwise.
    n_blocks = len(features) // block_rows
    usable_rows = n_blocks * block_rows
    features = features[:usable_rows]
    target = target[:usable_rows]

    # 80% of the blocks (rounded down) go to training.
    split_index = int(np.floor(n_blocks * 0.8)) * block_rows
    if split_index == 0:
        raise ValueError(
            "Station '%s' has %d usable rows, too few for a non-empty training split "
            "(one block is %d rows)." % (st, usable_rows, block_rows)
        )

    # train / hold-out split (chronological, no shuffling)
    X_train_base[st] = features[:split_index]
    y_train_base[st] = target[:split_index]
    X_holdout = features[split_index:]
    y_holdout = target[split_index:]

    # The hold-out part is halved (again on block boundaries) into validation and test;
    # final result is roughly 0.8 train, 0.1 validation, 0.1 test.
    holdout_blocks = len(X_holdout) // block_rows
    val_split_index = int(np.floor(holdout_blocks * 0.5)) * block_rows

    X_val_base[st] = X_holdout[:val_split_index]
    y_val_base[st] = y_holdout[:val_split_index]
    X_test_base[st] = X_holdout[val_split_index:]
    y_test_base[st] = y_holdout[val_split_index:]

    print(st)
    print("Train: ",X_train_base[st].shape,y_train_base[st].shape,"Validation:",X_val_base[st].shape,y_val_base[st].shape,"Test: ",X_test_base[st].shape,y_test_base[st].shape,"\n")

    # Scaling: fit on the training rows only, then apply the same transform to the
    # validation and test rows. The single `scaler` object is re-fitted for every
    # station, so after the loop it holds the parameters of the last station only.
    X_train_base[st] = scaler.fit_transform(X_train_base[st])
    X_val_base[st] = scaler.transform(X_val_base[st])
    X_test_base[st] = scaler.transform(X_test_base[st])

    print("Scaled X_train: ",X_train_base[st][0],"\n") #prints first row of X_train

# Interval 0 means "no sub-sampling" in the experiment loop; it is added only now so
# that it does not take part in the LCM computed above.
hour_intervals.insert(0, 0)


In [None]:
# Result table: rows are (data split, metric) pairs; one column is added per model as results arrive.
result_array = [
    [split for split in ('Train', 'Validation', 'Test') for _metric in range(4)],
    ['MSE', 'R2', 'MAE', 'MAPE'] * 3,
]
results_df = pd.DataFrame(
    index=pd.MultiIndex.from_arrays(result_array, names=["ExperimentData", "Metric"])
)

# Top-left cell of the next block written to the Excel sheet.
save_row_index, save_column_index = 0, 0


def save_error_to_df(model_name, data_type, mse_result,r2_result,mae_result,mape_result):
    """Store the four metric values of one model for one data split in `results_df`.

    Args:
        model_name: column label in `results_df` (assigned through `.loc`).
        data_type: first level of the row index ('Train', 'Validation' or 'Test').
        mse_result, r2_result, mae_result, mape_result: values written to the
            'MSE', 'R2', 'MAE' and 'MAPE' rows of that split.
    """
    metric_values = (
        ('MSE', mse_result),
        ('R2', r2_result),
        ('MAE', mae_result),
        ('MAPE', mape_result),
    )
    for metric_name, value in metric_values:
        results_df.loc[(data_type, metric_name), model_name] = value

def printError(model_name, data_type, y_in, pred_in):
    """Print MSE, R^2, MAE and MAPE of a prediction and record them in `results_df`.

    Args:
        model_name: label used in the printed header and as the result column.
        data_type: data split label ('Train', 'Validation' or 'Test').
        y_in: true target values; flattened to 1-D before use.
        pred_in: predicted values; flattened to 1-D before use.

    MAPE is reported in percent and computed as mean(|y - pred| / |y|) * 100, so a
    zero in `y_in` produces inf/nan. The values passed on to `save_error_to_df`
    are rounded to two decimals.
    """
    # Flatten both inputs so that a (n,) target and a (n, 1) prediction (as returned
    # by Keras models) are compared element-wise. Without this the hand-written MAPE
    # below would broadcast them to an (n, n) matrix and silently report a wrong value.
    y_true = np.asarray(y_in).ravel()
    y_pred = np.asarray(pred_in).ravel()

    print("\n",model_name + " " + data_type +' Model Performance:'+"\n")
    _mse = mean_squared_error(y_true, y_pred)
    _r2 = r2_score(y_true, y_pred)
    _mae = mean_absolute_error(y_true, y_pred)
    _mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100

    print('MSE:', _mse)
    print('R^2:', _r2)
    print('MAE:', _mae)
    print('MAPE:', _mape)

    save_error_to_df(model_name,data_type,round(_mse,2),round(_r2,2),round(_mae,2),round(_mape,2))

def save_error_to_excel(st_result_header,first_run):
    """Write a one-row header followed by the current `results_df` to the Excel workbook.

    The block is placed at (`save_row_index`, `save_column_index`); afterwards
    `save_row_index` has advanced by 1 (header) + 16 (table slot) rows.

    Args:
        st_result_header: data for the header row placed above the table.
        first_run: when true the workbook is created (or overwritten) by the header
            write; otherwise header and table are both overlaid onto the existing file.
    """
    global save_row_index
    global save_column_index

    workbook_path = data_folder+"foF2_ML_Experiments.xlsx"
    header_frame = pd.DataFrame(data=st_result_header)
    header_options = dict(index=False, header=False)

    if not first_run:
        # Workbook already exists: write header and table through one appending writer.
        with pd.ExcelWriter(workbook_path, mode="a",if_sheet_exists="overlay") as book:
            header_frame.to_excel(book, startrow=save_row_index, startcol=save_column_index, **header_options)
            save_row_index += 1
            results_df.to_excel(book, startrow=save_row_index, startcol=save_column_index)
            save_row_index += 16
        return

    # First call: writing the header by path creates a fresh workbook ...
    header_frame.to_excel(workbook_path, startrow=save_row_index, startcol=save_column_index, **header_options)
    save_row_index += 1

    # ... and the result table is then overlaid onto that new file.
    with pd.ExcelWriter(workbook_path, mode="a",if_sheet_exists="overlay") as book:
        results_df.to_excel(book, startrow=save_row_index, startcol=save_column_index)
    save_row_index += 16


In [None]:
# Experiment loop: for every sub-sampling interval and every station, train six base
# models plus two CNN ensembles, report their metrics and export them to Excel.

_SPLIT_NAMES = ("Train", "Validation", "Test")
_ERROR_COLUMNS = ('reg_error', 'rf_error', 'gb_error', 'mlp_error', 'catboost_error', 'lstm_error')


def _every_nth_hour(rows, hour_interval):
    """Return a copy of every (hour_steps_for_height * hour_interval)-th row, starting at row 0."""
    return np.copy(rows[::hour_steps_for_height * hour_interval])


def _score_model(model_name, predict_fn, X_splits, y_splits):
    """Predict on the Train/Validation/Test inputs, report metrics, return predictions and errors.

    Predictions are flattened to 1-D, so models that return (n, 1) arrays (Keras)
    yield the same layout as the scikit-learn models. Returns two lists, each
    ordered Train, Validation, Test: the predictions and the residuals (y - prediction).
    """
    predictions, errors = [], []
    for split_name, X_split, y_split in zip(_SPLIT_NAMES, X_splits, y_splits):
        y_flat = np.asarray(y_split).ravel()
        pred = np.asarray(predict_fn(X_split)).ravel()
        printError(model_name, split_name, y_flat, pred)
        predictions.append(pred)
        errors.append(y_flat - pred)
    return predictions, errors


def _stack_for_cnn(columns):
    """Stack 1-D/2-D arrays column-wise and add a trailing channel axis: (n, features, 1)."""
    stacked = np.column_stack(columns)
    return np.reshape(stacked, (stacked.shape[0], stacked.shape[1], 1))


def _build_cnn(n_features):
    """Create and compile the 1-D CNN ensemble network for `n_features` input columns."""
    cnn = Sequential()
    cnn.add(Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=(n_features, 1)))
    cnn.add(MaxPooling1D(pool_size=2))
    cnn.add(Flatten())
    cnn.add(Dense(50, activation='relu'))
    cnn.add(Dense(1))
    cnn.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')
    return cnn


first_run = True

for hour_interval in hour_intervals:

    # Shallow copies: the per-station arrays are shared with the *_base dicts until a
    # station entry is replaced by its sub-sampled version below.
    X_train, X_val, X_test = X_train_base.copy(), X_val_base.copy(), X_test_base.copy()
    y_train, y_val, y_test = y_train_base.copy(), y_val_base.copy(), y_test_base.copy()

    for st in stations_data:

        station_label = st[:st.rfind('_')]
        st_result_header = [station_label]  # results' header

        if hour_interval > 0:
            st_result_header = [station_label + "_" + str(hour_interval) + "h"]

            # Keep one row every `hour_steps_for_height * hour_interval` rows.
            # NOTE(review): this keeps a single row per step rather than all 39 rows
            # of each selected hour - confirm that this is the intended sub-sampling.
            X_train[st] = _every_nth_hour(X_train_base[st], hour_interval)
            y_train[st] = _every_nth_hour(y_train_base[st], hour_interval)
            X_val[st] = _every_nth_hour(X_val_base[st], hour_interval)
            y_val[st] = _every_nth_hour(y_val_base[st], hour_interval)
            X_test[st] = _every_nth_hour(X_test_base[st], hour_interval)
            y_test[st] = _every_nth_hour(y_test_base[st], hour_interval)

        X_splits = (X_train[st], X_val[st], X_test[st])
        y_splits = (y_train[st], y_val[st], y_test[st])

        # Linear Regression model
        reg = LinearRegression().fit(X_train[st], y_train[st])
        (reg_predTrain, reg_predVal, reg_predTest), (reg_errorTrain, reg_errorVal, reg_errorTest) = \
            _score_model("Linear Regression", reg.predict, X_splits, y_splits)

        # Gradient Boosting model
        gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
        gb.fit(X_train[st], y_train[st])
        (gb_predTrain, gb_predVal, gb_predTest), (gb_errorTrain, gb_errorVal, gb_errorTest) = \
            _score_model("Gradient Boosting", gb.predict, X_splits, y_splits)

        # MLP model
        mlp = MLPRegressor(hidden_layer_sizes=(100,), activation='relu', solver='adam', max_iter=1000, random_state=42)
        mlp.fit(X_train[st], y_train[st])
        (mlp_predTrain, mlp_predVal, mlp_predTest), (mlp_errorTrain, mlp_errorVal, mlp_errorTest) = \
            _score_model("MLP", mlp.predict, X_splits, y_splits)

        # Random Forest model
        rf = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
        rf.fit(X_train[st], y_train[st])
        (rf_predTrain, rf_predVal, rf_predTest), (rf_errorTrain, rf_errorVal, rf_errorTest) = \
            _score_model("Random Forest", rf.predict, X_splits, y_splits)

        # CatBoost model
        catboost = CatBoostRegressor(iterations=1000, learning_rate=0.1, random_seed=42)
        catboost.fit(X_train[st], y_train[st], silent=True)
        (catboost_predTrain, catboost_predVal, catboost_predTest), \
            (catboost_errorTrain, catboost_errorVal, catboost_errorTest) = \
            _score_model("CatBoost", catboost.predict, X_splits, y_splits)

        # LSTM model: every feature column is fed as one time step with a single channel.
        X_lstm_train, X_lstm_val, X_lstm_test = (np.expand_dims(X_split, axis=2) for X_split in X_splits)
        y_lstm_train = np.expand_dims(y_train[st], axis=1)

        model_lstm = Sequential()
        model_lstm.add(LSTM(36, activation='relu', input_shape=(X_lstm_train.shape[1], X_lstm_train.shape[2])))
        model_lstm.add(Dense(12))
        model_lstm.add(Dense(1))
        model_lstm.compile(optimizer=Adam(0.0003), loss='mae', metrics=['mae'])

        # `use_multiprocessing` was dropped: it only applies to generator/Sequence input
        # and is not accepted by `fit` in Keras 3.
        lstm_history = model_lstm.fit(X_lstm_train, y_lstm_train, epochs=50, batch_size=78, verbose=2)

        # Predictions and errors come back flattened to 1-D, which the error
        # DataFrames further down require.
        (lstm_predTrain, lstm_predVal, lstm_predTest), (lstm_errorTrain, lstm_errorVal, lstm_errorTest) = \
            _score_model("LSTM", model_lstm.predict, (X_lstm_train, X_lstm_val, X_lstm_test), y_splits)

        # Base-model outputs per split, in the column order used by both ensembles.
        preds_train = (reg_predTrain, rf_predTrain, gb_predTrain, mlp_predTrain, catboost_predTrain, lstm_predTrain)
        preds_val = (reg_predVal, rf_predVal, gb_predVal, mlp_predVal, catboost_predVal, lstm_predVal)
        preds_test = (reg_predTest, rf_predTest, gb_predTest, mlp_predTest, catboost_predTest, lstm_predTest)

        # Both ensembles drop the first row so that predictions at row i can be paired
        # with the base-model errors at row i-1 in the second ensemble.
        y_ensemble = (y_train[st][1:], y_val[st][1:], y_test[st][1:])

        # Ensemble model with CNN (inputs: the six base-model predictions)
        X_ensembleTrain = _stack_for_cnn([p[1:] for p in preds_train])
        X_ensembleValidation = _stack_for_cnn([p[1:] for p in preds_val])
        X_ensembleTest = _stack_for_cnn([p[1:] for p in preds_test])

        model = _build_cnn(X_ensembleTrain.shape[1])
        history = model.fit(X_ensembleTrain, y_train[st][1:], epochs=50, batch_size=32)

        (cnn_predTrain, cnn_predVal, cnn_predTest), _cnn_errors = _score_model(
            "CNN", model.predict, (X_ensembleTrain, X_ensembleValidation, X_ensembleTest), y_ensemble)

        # Ensemble model with CNN + errors (inputs: predictions plus previous-row errors)
        error_dfTrain = pd.DataFrame(dict(zip(_ERROR_COLUMNS, (
            reg_errorTrain[:-1], rf_errorTrain[:-1], gb_errorTrain[:-1],
            mlp_errorTrain[:-1], catboost_errorTrain[:-1], lstm_errorTrain[:-1]))))
        error_dfVal = pd.DataFrame(dict(zip(_ERROR_COLUMNS, (
            reg_errorVal[:-1], rf_errorVal[:-1], gb_errorVal[:-1],
            mlp_errorVal[:-1], catboost_errorVal[:-1], lstm_errorVal[:-1]))))
        error_dfTest = pd.DataFrame(dict(zip(_ERROR_COLUMNS, (
            reg_errorTest[:-1], rf_errorTest[:-1], gb_errorTest[:-1],
            mlp_errorTest[:-1], catboost_errorTest[:-1], lstm_errorTest[:-1]))))

        X_ensembleTrain = _stack_for_cnn([p[1:] for p in preds_train] + [error_dfTrain.to_numpy()])
        X_ensembleVal = _stack_for_cnn([p[1:] for p in preds_val] + [error_dfVal.to_numpy()])
        X_ensembleTest = _stack_for_cnn([p[1:] for p in preds_test] + [error_dfTest.to_numpy()])

        # The input now has twice as many columns, so the network built for the first
        # ensemble cannot be reused; a fresh CNN sized for this input is trained instead.
        model = _build_cnn(X_ensembleTrain.shape[1])
        history = model.fit(X_ensembleTrain, y_train[st][1:], epochs=50, batch_size=32)

        (cnn_predTrain2, cnn_predVal2, cnn_predTest2), _cnn_errors2 = _score_model(
            "CNNwithErrors", model.predict, (X_ensembleTrain, X_ensembleVal, X_ensembleTest), y_ensemble)

        save_error_to_excel(st_result_header, first_run)
        first_run = False

    # Next interval: start a new column group at the top of the sheet.
    save_column_index += 10
    save_row_index = 0
