In [1]:
import datetime
import os 
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.utils import shuffle

from keras.models import Sequential
from keras.layers import LSTM,Dense,Flatten, Conv1D, MaxPooling1D
from keras.optimizers import Adam, SGD

from catboost import CatBoostRegressor

In [2]:
# Stations' data preparation: read every station file into a DataFrame keyed by its file name.

# os.path.join with an empty last component yields the folder name followed by the
# platform's path separator ("data\\" on Windows, "data/" elsewhere). The trailing
# separator is kept because the folder is concatenated directly with file names.
data_folder=os.path.join("data","")
stations=["BJNM2015_foF2.txt"]


stations_data={}

for st in stations:
    stations_data[st]=pd.read_csv(data_folder+st, sep =',')
    print("\n\n--------------------------"+st+"--------------------------------------\n")
    # Show only the first row as a quick check of the parsed columns.
    print(stations_data[st].head(1))
    



--------------------------BJNM2015_foF2.txt--------------------------------------

       Date  DOY  Hour  Height     HourS  HourC      CHIS      CHIC       DNS  \
0  2015-1-1    1  20.0   100.0 -0.866025    0.5  0.039803  0.999208  0.017452   

        DNC  ...  F10.7     DST_t      AP_x  Delta_Kp        DS        DC  \
0  0.999848  ...   10.7 -5.328018  5.078904 -0.011578 -0.001816  0.999998   

         IS  R_Delta_H  EqDistance   foF2  
0  0.017892   -0.16797    4442.496  2.941  

[1 rows x 21 columns]


In [3]:
# Input/output preparation, chronological split and scaling for the ML models.
#
# hour_intervals lists the sub-sampling intervals. A leading 0 is inserted at the
# very end of this cell; downstream it denotes the original, non-sub-sampled dataset.
hour_intervals=[1,2,12,24,36,48]
hour_steps_for_height=39 #39 is the number of different heights

X_train_base,X_val_base,X_test_base, y_train_base,y_val_base, y_test_base={},{},{},{},{},{}
shift_size=hour_steps_for_height
scaler = MinMaxScaler()


for st in stations_data:
    X,y={},{}
    # Features are taken from row i and the target (foF2) from row i + shift_size,
    # so each sample is paired with the foF2 value found shift_size rows later.
    X[st] = stations_data[st].drop(['Date','DOY', 'Hour', 'Height', 'EqDistance','foF2','F10.7'], axis=1).to_numpy()[:-shift_size]
    y[st] = stations_data[st]['foF2'].to_numpy()[shift_size:]

    # Trim the tail so the number of rows is an exact multiple of
    # hour_steps_for_height * lcm(hour_intervals).
    # Exact integer arithmetic is used, and the slice is expressed through an explicit
    # end index: a "[:-redundancy_index]" slice would return an EMPTY array whenever
    # redundancy_index is 0 (i.e. when the data already has the right length).
    hours_lcm=np.lcm.reduce(hour_intervals)
    block_rows=int(hour_steps_for_height*hours_lcm)
    redundancy_index=int(len(X[st])%block_rows)
    usable_rows=len(X[st])-redundancy_index

    X[st]=X[st][:usable_rows]
    y[st]=y[st][:usable_rows]

    # The split points are rounded down to whole blocks of block_rows rows.
    split_index_lcm=np.floor((len(X[st])/(hour_steps_for_height*hours_lcm))*0.6)
    split_index=int(split_index_lcm*hours_lcm*hour_steps_for_height)

    # Train/test split: the first ~60% of the rows (in file order) are used for training.
    X_train_base[st]=X[st][:split_index]
    y_train_base[st]=y[st][:split_index]
    X_test_base[st]=X[st][split_index:]
    y_test_base[st]=y[st][split_index:]

    # Validation split: the remaining rows are halved.
    # Final result = ~0.6 train, ~0.2 validation, ~0.2 test (no shuffling is applied).

    val_split_lcm=np.floor((len(X_test_base[st])/(hour_steps_for_height*hours_lcm))*0.5)
    val_split_index=int(val_split_lcm*hours_lcm*hour_steps_for_height)

    X_val_base[st]=X_test_base[st][:val_split_index]
    y_val_base[st]=y_test_base[st][:val_split_index]
    X_test_base[st]=X_test_base[st][val_split_index:]
    y_test_base[st]=y_test_base[st][val_split_index:]


    print(st)
    print("Train: ",X_train_base[st].shape,y_train_base[st].shape,"Validation:",X_val_base[st].shape,y_val_base[st].shape,"Test: ",X_test_base[st].shape,y_test_base[st].shape,"\n")

    # Scaling: the scaler is fitted on the training rows only and then applied to all
    # three splits. The same scaler object is re-fitted for every station.
    scaler.fit(X_train_base[st])
    X_train_base[st] = scaler.transform(X_train_base[st])
    X_val_base[st] = scaler.transform(X_val_base[st])
    X_test_base[st] = scaler.transform(X_test_base[st])

    print("Scaled X_train: ",X_train_base[st][0],"\n") #prints first row of X_train

# 0 stands for "no sub-sampling"; it is added only after the lcm above has been computed.
hour_intervals.insert(0,0)


BJNM2015_foF2.txt
Train:  (202176, 14) (202176,) Validation: (67392, 14) (67392,) Test:  (67392, 14) (67392,) 

Scaled X_train:  [0.0669873  0.75       0.78341677 0.35383082 0.38660358 1.
 1.         0.84524626 0.03140902 0.61429212 1.         1.
 0.         0.11023755] 



In [4]:
# Result table scaffolding: one row per (dataset split, metric) pair; model columns
# are added later as results are stored.
_experiment_splits = ['Train', 'Validation', 'Test']
_metric_labels = ['MSE', 'R2', 'MAE', 'MAPE']

# First level repeats every split once per metric, second level cycles through the metrics.
result_array = [
    [split_name for split_name in _experiment_splits for _ in _metric_labels],
    _metric_labels * len(_experiment_splits),
]
results_df = pd.DataFrame(
    index=pd.MultiIndex.from_arrays(result_array, names=["ExperimentData", "Metric"])
)

# Position of the next write inside the Excel sheet (row, column).
save_row_index, save_column_index = 0, 0


def save_error_to_df(model_name, data_type, mse_result,r2_result,mae_result,mape_result):
    """Store the four metric values of one model/split in the global results_df.

    model_name is used as the column label; data_type is the first level of the
    row index and the metric name ('MSE', 'R2', 'MAE', 'MAPE') is the second.
    """
    metric_values = (
        ('MSE', mse_result),
        ('R2', r2_result),
        ('MAE', mae_result),
        ('MAPE', mape_result),
    )
    for metric_label, metric_value in metric_values:
        results_df.loc[(data_type, metric_label), model_name] = metric_value

def printError(model_name, data_type, y_in, pred_in):
    """Print MSE, R^2, MAE and MAPE for one model/split and store them in results_df.

    Parameters:
        model_name: label printed in the header and passed on as the result column name.
        data_type:  label of the dataset split (the notebook uses 'Train',
                    'Validation' and 'Test').
        y_in:       true target values.
        pred_in:    predicted values, one per entry of y_in.

    The full-precision values are printed; the values handed to save_error_to_df
    are rounded to two decimals. MAPE is reported in percent and is not finite
    when y_in contains zeros.
    """
    print("\n",model_name + " " + data_type +' Model Performance:'+"\n")
    _mse = mean_squared_error(y_in, pred_in)
    _r2 = r2_score(y_in, pred_in)
    _mae = mean_absolute_error(y_in, pred_in)

    # Flatten both inputs before the element-wise MAPE: subtracting an (n, 1) array
    # from an (n,) array would broadcast to an (n, n) matrix and silently yield a
    # wrong percentage instead of raising.
    _y_flat = np.asarray(y_in).ravel()
    _pred_flat = np.asarray(pred_in).ravel()
    _mape = np.mean(np.abs((_y_flat - _pred_flat) / _y_flat)) * 100

    print('MSE:', _mse)
    print('R^2:', _r2)
    print('MAE:', _mae)
    print('MAPE:', _mape)

    save_error_to_df(model_name,data_type,round(_mse,2),round(_r2,2),round(_mae,2),round(_mape,2))

def save_error_to_excel(st_result_header,first_run):
    """Write a header row followed by results_df into foF2_ML_Experiments.xlsx.

    The block is placed at (save_row_index, save_column_index). On the first run the
    header is written with a plain to_excel call on the file path; on every later run
    the file is opened in append mode with if_sheet_exists="overlay". After the call,
    save_row_index has advanced by 1 (header) + 16 (results table and spacing).
    """
    global save_row_index
    global save_column_index

    workbook_path = data_folder+"foF2_ML_Experiments.xlsx"
    header_frame = pd.DataFrame(data=st_result_header)

    if first_run:
        #Export all results to an excel file
        header_frame.to_excel(workbook_path,startrow=save_row_index,startcol=save_column_index,index=False,header=False)
        save_row_index+=1

    with pd.ExcelWriter(workbook_path, mode="a",if_sheet_exists="overlay") as writer:
        if not first_run:
            # Later runs add the header through the same append-mode writer.
            header_frame.to_excel(writer,startrow=save_row_index,startcol=save_column_index,index=False,header=False)
            save_row_index+=1
        results_df.to_excel(writer,startrow=save_row_index,startcol=save_column_index)

    save_row_index+=16


In [5]:
# Experiment loop: for every sub-sampling interval and every station, train four
# regressors (Linear Regression, Decision Tree, MLP, LSTM), report their metrics on
# the train/validation/test splits and export the metric table to Excel.
first_run=True

for hour_interval in hour_intervals:

    # Shallow dict copies: the per-station arrays are replaced (never mutated in place)
    # below, so the *_base dictionaries stay untouched between intervals.
    X_train, X_val, X_test, y_train, y_val, y_test=X_train_base.copy(),X_val_base.copy(),X_test_base.copy(),y_train_base.copy(),y_val_base.copy(),y_test_base.copy()


    for st in stations_data:

        st_result_header=[st[:st.rfind('_')]] #Results' header

        if hour_interval > 0 :

            st_result_header=[st[:st.rfind('_')]+"_"+str(hour_interval)+"h"]

            # Boolean masks marking the rows to KEEP: row 0 and then every
            # (hour_steps_for_height*hour_interval)-th row of each split.
            # NOTE(review): this keeps a single row per interval, not all
            # hour_steps_for_height rows of the selected hour - confirm this is intended.
            row_step=hour_steps_for_height*hour_interval

            mask_train = np.zeros(X_train_base[st].shape[0], dtype=bool)
            mask_val = np.zeros(X_val_base[st].shape[0], dtype=bool)
            mask_test = np.zeros(X_test_base[st].shape[0], dtype=bool)

            mask_train[::row_step]=True
            mask_val[::row_step]=True
            mask_test[::row_step]=True

            X_train[st]=np.copy(X_train_base[st][mask_train])
            y_train[st]=np.copy(y_train_base[st][mask_train])

            X_val[st]=np.copy(X_val_base[st][mask_val])
            y_val[st]=np.copy(y_val_base[st][mask_val])

            X_test[st]=np.copy(X_test_base[st][mask_test])
            y_test[st]=np.copy(y_test_base[st][mask_test])



        #Linear Regression Model

        reg = LinearRegression().fit(X_train[st], y_train[st])

        reg_predTrain = reg.predict(X_train[st])
        reg_errorTrain = y_train[st] - reg_predTrain
        printError("Linear Regression","Train", y_train[st], reg_predTrain)

        reg_predVal = reg.predict(X_val[st])
        reg_errorVal = y_val[st] - reg_predVal
        printError("Linear Regression","Validation", y_val[st], reg_predVal)

        reg_predTest = reg.predict(X_test[st])
        reg_errorTest = y_test[st] - reg_predTest
        printError("Linear Regression","Test", y_test[st], reg_predTest)


        #Decision Tree Model

        dectree = DecisionTreeRegressor(max_depth=10).fit(X_train[st], y_train[st])

        dectree_predTrain = dectree.predict(X_train[st])
        dectree_errorTrain = y_train[st] - dectree_predTrain
        printError("Decision Tree","Train", y_train[st], dectree_predTrain)

        dectree_predVal = dectree.predict(X_val[st])
        dectree_errorVal = y_val[st] - dectree_predVal
        printError("Decision Tree","Validation", y_val[st], dectree_predVal)

        dectree_predTest = dectree.predict(X_test[st])
        dectree_errorTest = y_test[st] - dectree_predTest
        printError("Decision Tree","Test", y_test[st], dectree_predTest)


        # MLP model

        mlp = MLPRegressor(hidden_layer_sizes=(100,), activation='relu', solver='adam', max_iter=1000, random_state=42)
        mlp.fit(X_train[st], y_train[st])

        mlp_predTrain = mlp.predict(X_train[st])
        mlp_errorTrain = y_train[st] - mlp_predTrain
        printError("MLP","Train", y_train[st], mlp_predTrain)

        mlp_predVal = mlp.predict(X_val[st])
        mlp_errorVal = y_val[st] - mlp_predVal
        printError("MLP","Validation", y_val[st], mlp_predVal)

        mlp_predTest = mlp.predict(X_test[st])
        mlp_errorTest = y_test[st] - mlp_predTest
        printError("MLP","Test",y_test[st], mlp_predTest)


        #LSTM Model

        # A trailing axis is appended so the 2-D feature matrices become 3-D
        # (rows, columns, 1) as required by the LSTM layer; the targets become (rows, 1)
        # to match the shape of the network output.
        X_lstm_train = np.expand_dims(X_train[st], axis = 2)
        X_lstm_val = np.expand_dims(X_val[st], axis = 2)
        X_lstm_test = np.expand_dims(X_test[st], axis = 2)
        y_lstm_train = np.expand_dims(y_train[st], axis = 1)
        y_lstm_val = np.expand_dims(y_val[st], axis = 1)
        y_lstm_test = np.expand_dims(y_test[st], axis = 1)

        model_lstm = Sequential()
        model_lstm.add(LSTM(36, activation='relu', input_shape=(X_lstm_train.shape[1], X_lstm_train.shape[2])))
        model_lstm.add(Dense(12))
        model_lstm.add(Dense(1))
        model_lstm.compile(optimizer=Adam(0.0003), loss='mae', metrics=['mae'])

        # use_multiprocessing is intentionally not passed: it only applies to
        # generator / Sequence inputs (the data here are in-memory arrays) and
        # Keras 3 no longer accepts it in fit().
        lstm_history = model_lstm.fit(X_lstm_train, y_lstm_train, epochs=100,batch_size=13, verbose=2)

        lstm_predTrain = model_lstm.predict(X_lstm_train)
        lstm_errorTrain = y_lstm_train - lstm_predTrain
        printError("LSTM","Train", y_lstm_train, lstm_predTrain)

        lstm_predVal = model_lstm.predict(X_lstm_val)
        lstm_errorVal = y_lstm_val - lstm_predVal
        printError("LSTM","Validation", y_lstm_val, lstm_predVal)

        lstm_predTest = model_lstm.predict(X_lstm_test)
        lstm_errorTest = y_lstm_test - lstm_predTest
        printError("LSTM","Test", y_lstm_test, lstm_predTest)



        # Export this station's metric table; only the very first export creates the file.
        save_error_to_excel(st_result_header,first_run)
        first_run=False


    # The next interval is written 10 columns to the right, starting again from the top row.
    save_column_index+=10
    save_row_index=0



 Linear Regression Train Model Performance:

MSE: 0.6360687486581759
R^2: 0.8127456871395481
MAE: 0.6353375285665593
MAPE: 9.795772954895943

 Linear Regression Validation Model Performance:

MSE: 0.6247856068195118
R^2: 0.7022831774630556
MAE: 0.6311737845635974
MAPE: 10.765535421599017

 Linear Regression Test Model Performance:

MSE: 5.553187471944743
R^2: -0.09539722658689453
MAE: 1.9690689629047538
MAPE: 48.99741334076079

 Decision Tree Train Model Performance:

MSE: 0.011619194969504651
R^2: 0.9965793880384847
MAE: 0.06557099462309278
MAPE: 0.932982977785516

 Decision Tree Validation Model Performance:

MSE: 0.767541188747541
R^2: 0.6342586618738177
MAE: 0.649076663746802
MAPE: 10.43903425042135

 Decision Tree Test Model Performance:

MSE: 2.920376123051421
R^2: 0.42393950826572313
MAE: 1.5507476433100564
MAPE: 38.214160150769686

 MLP Train Model Performance:

MSE: 0.0036453450876163256
R^2: 0.9989268352030173
MAE: 0.043132422867177646
MAPE: 0.6756943732001859

 MLP Validati