In [3]:
import pandas as pd
import scipy
import numpy as np
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from math import sqrt
from datetime import datetime
import os

We use XGBoost for the machine-learning models. XGBoost is not part of scikit-learn, but its `XGBRegressor` follows the scikit-learn estimator interface, so it can be tuned with scikit-learn tools such as `GridSearchCV`.

In [4]:
# Hyper-parameter grid explored exhaustively by the grid search.
# Every combination of the values below is evaluated.
random_grid = dict(
    objective=['reg:squarederror'],
    n_estimators=list(range(50, 201, 50)),
    learning_rate=[0.01, 0.1, 0.2, 0.3],
    max_depth=range(3, 10),
    colsample_bytree=[0.1, 0.2],
)

In [5]:
#Mean absolute percentage error (MAPE), expressed in percent
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean of |(y_true - y_pred) / y_true|, multiplied by 100.

    Both arguments are converted with ``np.array`` before the computation.
    No special handling is done for zeros in ``y_true``: those entries are
    divided by zero and therefore yield inf/nan in the result.
    """
    actual = np.array(y_true)
    forecast = np.array(y_pred)
    relative_errors = (actual - forecast) / actual
    return np.abs(relative_errors).mean() * 100

In [6]:
#Column filter: keep only the variables whose number of zeros is below a threshold
def non_zero_variables(df, max_zero_fraction=0.4):
    """Return the names of the columns of ``df`` that contain few zeros.

    A column is kept when its number of zero entries is strictly smaller than
    ``max_zero_fraction * len(df)``.

    NOTE(review): the comments in this notebook speak of deleting variables
    with "more than 60%" zeros, but the threshold that has always been applied
    is 0.4, i.e. a column is dropped as soon as 40% or more of its values are
    zero. The default keeps that behaviour so existing results do not change;
    pass ``max_zero_fraction=0.6`` if the 60% rule is what is intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose columns are screened.
    max_zero_fraction : float, default 0.4
        Exclusive upper bound on the fraction of zeros a kept column may have.

    Returns
    -------
    list
        Labels of the kept columns, in their original order.
    """
    #Count the zeros of all columns at once instead of looping over them
    zero_counts = df.eq(0).sum(axis=0)
    keep = (zero_counts < max_zero_fraction * len(df)).values
    return df.columns[keep].tolist()

### Train the models and predict 1-month-ahead GPI values up to the month of the last official GPI:

In [115]:
#Set the last date of the official GPI (presumably in YYYYMM form; it is compared against the index of the country tables).
#Rows after this date are left out of the rolling training cell and are used as the "future" rows by the forecast cell.
lastoffic = 202103

In [116]:
#Set the last date of the initial training window (same format as lastoffic).
#Rows after lasttrain and up to lastoffic form the test set that the rolling training cell predicts one step at a time.
lasttrain = 202003

In [None]:
#code for predictions when new ground truth data is available
#
#For every country file found in `path` (the 'WE' file and '.DS_Store' are skipped) this cell:
#  1. drops the sparse columns with non_zero_variables and keeps the rows up to `lastoffic`;
#  2. builds the initial training window from the rows up to `lasttrain`;
#  3. for each later row: tunes an XGBoost regressor by grid search with time-series
#     cross-validation, predicts that row, stores the prediction and the feature
#     importances, then slides the window one row forward;
#  4. appends the results to the per-country CSV files in XGBoost_results/.

#Fraction of the oldest training rows that is skipped (0.5 -> only the most recent half is used)
train_set = 0.5

path = 'data/all_variables_and_GPI_monthly_all_countries/'

#Create the output folder up front so that a missing directory cannot fail the run after the training
os.makedirs('XGBoost_results', exist_ok=True)

#Time-series cross-validation used by every grid search
tscv = TimeSeriesSplit(n_splits=10)

for file_name in sorted(os.listdir(path)): #sorted: same processing order on every run

    if file_name == '.DS_Store':
        continue

    country = file_name.split('_')[-1].split('.')[0]

    if country == 'WE':
        continue

    print(country)

    all_var = pd.read_csv(path + 'all_variables_%s.csv'%country, index_col=0)

    #Filter out the variables that have many zeros (see non_zero_variables for the exact rule)
    variables_non_0 = non_zero_variables(all_var)
    df_country = all_var[variables_non_0]

    #Keep only the rows for which the official GPI exists
    df_country = df_country.loc[df_country.index <= lastoffic]

    #Set the target variable
    Y = df_country['GPI']

    #Set the independent variables
    X = df_country.loc[:, df_country.columns != 'GPI']

    #Set the training sets (explicit positional slicing: the index holds integers)
    in_training = Y.index <= lasttrain
    Y_train = Y.loc[in_training]
    X_train = X.loc[in_training]
    first_row = int(X_train.shape[0]*train_set)
    Y_train = Y_train.iloc[first_row:]
    X_train = X_train.iloc[first_row:]

    #Set the test sets
    Y_test = Y.loc[Y.index > lasttrain]
    X_test = X.loc[X.index > lasttrain]

    ###Rolling Training:

    #Create a dataframe to add the predictions
    Predictions = pd.DataFrame(columns = ['MonthYear','prediction1'])#we keep the monthyear of the last training data

    #Importances of every rolling step, concatenated once after the loop
    importance_frames = []

    for step in range(len(X_test)): #one training per test row

        print(step)#print the cycle

        #Train the model
        xg_reg = xgb.XGBRegressor() #model to tune

        xg_reg_random = GridSearchCV(estimator = xg_reg, param_grid = random_grid,
                                     cv = tscv,  n_jobs = -1)

        #Best model
        model = xg_reg_random.fit(X_train, Y_train)

        #Month and year of the last training row, used to label the results of this step
        last_train_month = str(X_train.index[-1])

        #Make the 1-step-ahead prediction
        Y_pred = model.predict(X_test.iloc[[step]])

        #Add monthyear and prediction on the dataframe
        Predictions.loc[len(Predictions)] = [last_train_month, Y_pred[0]]

        #Variables importance
        importances = model.best_estimator_.feature_importances_ #get the variables importance

        #Match the values with the names of the variables
        df_importances = pd.DataFrame(list(zip(X_train.columns, importances)),
                                      columns=['Variable', 'Importance'])
        df_importances['MonthYear'] = last_train_month
        importance_frames.append(df_importances)

        #Set the new training sets: drop the oldest row and add the row that was just predicted.
        #pd.concat replaces DataFrame.append/Series.append (removed in pandas 2.0); appending the
        #one-row frame instead of a Series also keeps the column dtypes unchanged.
        X_train = pd.concat([X_train.iloc[1:], X_test.iloc[[step]]])
        Y_train = pd.concat([Y_train.iloc[1:], Y_test.iloc[[step]]])

    #Build the total dataframe of importances
    if importance_frames:
        all_importances = pd.concat(importance_frames, ignore_index=True)
    else:
        all_importances = pd.DataFrame(columns = ['Variable', 'Importance','MonthYear'])

    #Save the predictions appending to the old dataframe
    Predictions.to_csv('XGBoost_results/%s_xgb_predictions.csv' %country, mode='a', index=True, header=False)

    #Save the importances appending to the old dataframe
    all_importances.to_csv('XGBoost_results/%s_xgb_impvar.csv' %country, mode='a', index=True, header=False)

    now = datetime.now()
    current_time = now.strftime("%H:%M:%S")
    print("Time ended - country %s =" %country, current_time)
       

### Forecasts: after the last official ground-truth data you can predict the GPI up to 6 months ahead.

In [142]:
#Set the months ahead you want to predict after the last ground truth values.
#The forecast cell makes one prediction per month from the rows that come after lastoffic.
preds_ahead = 6

In [143]:
# Column names of the forecast dataframe: prediction1 ... prediction<preds_ahead>
col_list = ['prediction%d' % month for month in range(1, preds_ahead + 1)]

In [None]:
#code for forecasts
#
#For every country file found in `path` (the 'WE' file and '.DS_Store' are skipped) this cell:
#  1. drops the sparse columns with non_zero_variables;
#  2. trains on the rows up to `lastoffic` (grid search with time-series cross-validation);
#  3. predicts the GPI for the first `preds_ahead` rows after `lastoffic`;
#  4. writes the forecasts and the feature importances to XGBoost_forecasts/.

path = 'data/all_variables_and_GPI_monthly_all_countries/'

#Fraction of the oldest training rows that is skipped (0.5 -> only the most recent half is used)
train_set = 0.5

#Create the output folder up front so that a missing directory cannot fail the run after the training
os.makedirs('XGBoost_forecasts', exist_ok=True)

#Time-series cross-validation used by every grid search
tscv = TimeSeriesSplit(n_splits=10)

for file_name in sorted(os.listdir(path)): #sorted: same processing order on every run

    if file_name == '.DS_Store':
        continue

    country = file_name.split('_')[-1].split('.')[0]

    if country == 'WE':
        continue

    print(country)

    all_var = pd.read_csv(path + 'all_variables_%s.csv'%country, index_col=0)

    #Filter out the variables that have many zeros (see non_zero_variables for the exact rule)
    variables_non_0 = non_zero_variables(all_var)
    df_country = all_var[variables_non_0]

    #Divide the data for training and for prediction
    df_country_training = df_country.loc[df_country.index <= lastoffic]
    df_country_future = df_country.loc[df_country.index > lastoffic]

    #Set the target variable
    Y = df_country_training['GPI']

    #Set the independent variables
    X = df_country_training.loc[:, df_country_training.columns != 'GPI']

    #Set the training sets (explicit positional slicing: the index holds integers)
    first_row = int(X.shape[0]*train_set)
    Y_train = Y.iloc[first_row:]
    X_train = X.iloc[first_row:]

    #Set the data for future prediction
    X_future = df_country_future.loc[:, df_country_future.columns != 'GPI']

    #Create a dataframe to add the predictions
    Predictions = pd.DataFrame(columns = ['MonthYear'] + col_list) #create the length based on the months to predict

    #Train the model
    xg_reg = xgb.XGBRegressor() #model to tune

    xg_reg_random = GridSearchCV(estimator = xg_reg, param_grid = random_grid,
                                 cv = tscv,  n_jobs = -1)

    #Best model
    model = xg_reg_random.fit(X_train, Y_train)

    #Make the prediction(s). When fewer than preds_ahead future rows are available,
    #predict the ones that exist and fill the remaining months with '-' instead of
    #raising an IndexError after the training has already been done.
    n_available = min(preds_ahead, len(X_future))
    l_predictions = [model.predict(X_future.iloc[[j]])[0] for j in range(n_available)]
    l_predictions = l_predictions + ['-'] * (preds_ahead - n_available)

    #Concatenate the month and year of the last training row with the prediction list
    l_predictions = [str(X_train.index[-1])] + l_predictions

    #Add monthyear and predictions on the dataframe
    Predictions.loc[len(Predictions)] = l_predictions

    #Variables importance
    importances = model.best_estimator_.feature_importances_ #get the variables importance

    #Match the values with the names of the variables
    df_importances = pd.DataFrame(list(zip(X_train.columns, importances)),
                                  columns=['Variable', 'Importance'])

    #Save the predictions
    Predictions.to_csv('XGBoost_forecasts/%s_xgb_predictions.csv' %country)

    #Save the importances
    df_importances.to_csv('XGBoost_forecasts/%s_xgb_impvar.csv' %country)

    #TODO: the tuned hyperparameters (model.best_params_) are not saved anywhere yet

    #Print the time the iteration ended
    now = datetime.now()
    current_time = now.strftime("%H:%M:%S")
    print("Time ended - country %s =" %country, current_time)
