In [None]:
import pandas as pd
import scipy
import numpy as np
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import r2_score
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from math import sqrt
from datetime import datetime
import os

In [None]:
# Hyper-parameter candidates for the XGBoost regressor.
# The dictionary is handed to the parameter search as its grid, so every
# combination of the values below is one candidate configuration.
random_grid = dict(
    objective=['reg:squarederror'],
    n_estimators=[50, 100, 150, 200],
    learning_rate=[0.01, 0.1, 0.2, 0.3],
    max_depth=range(3, 10),        # tree depths 3 through 9
    colsample_bytree=[0.1, 0.2],   # share of columns sampled per tree
)

In [None]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Both inputs are converted to NumPy arrays, so lists, arrays and pandas
    Series are accepted and compared position by position.

    Parameters
    ----------
    y_true : array-like
        Observed values; used as the denominator of each relative error.
    y_pred : array-like
        Predicted values.

    Returns
    -------
    float
        100 * mean(|(y_true - y_pred) / y_true|).
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_errors = (actual - predicted) / actual
    return np.mean(np.abs(relative_errors)) * 100

In [None]:
def non_zero_variables(df, max_zero_fraction=0.4):
    """Return the names of the columns that are not dominated by zeros.

    A column is kept when the number of its entries equal to 0 is strictly
    smaller than ``max_zero_fraction * len(df)``. With the default of 0.4
    this keeps the columns in which fewer than 40% of the values are zero.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose columns are screened.
    max_zero_fraction : float, default 0.4
        Exclusive upper bound on the share of zero entries a kept column
        may have.

    Returns
    -------
    list
        Labels of the kept columns, in their original order.
    """
    zero_limit = max_zero_fraction * len(df)
    # One vectorised pass: number of entries equal to 0 in every column
    zero_counts = df.eq(0).sum(axis=0)
    return [col for col, n_zeros in zero_counts.items() if n_zeros < zero_limit]

In [None]:
# Fraction of each country's rows (in file order) that forms the initial training window
train_set = 0.5

# Number of consecutive test rows predicted after every refit; one output
# column (prediction1 ... prediction12) is written per step ahead
max_horizon = 12

# A single directory constant is used both to list and to read the input
# files, so the two operations cannot point at different folders.
input_dir = '../../all_variables_and_GPI_monthly_all_countries/'
output_dir = '../../xgb_results/'
os.makedirs(output_dir, exist_ok=True)  # to_csv does not create missing folders

prediction_columns = ['prediction%d' % step_ahead for step_ahead in range(1, max_horizon + 1)]

for file_name in os.listdir(input_dir):

    # Skip hidden entries such as '.DS_Store'; they are not data files
    if file_name.startswith('.'):
        continue

    # The country code is the last '_'-separated token of the file name, without the extension
    country = file_name.split('_')[-1].split('.')[0]
    print(country)

    all_var = pd.read_csv(input_dir + file_name, index_col=0)

    # Drop the sparse columns: non_zero_variables keeps a column only when
    # fewer than 40% of its values are zero
    variables_non_0 = non_zero_variables(all_var)
    df_country = all_var[variables_non_0]

    # Target variable and independent variables
    Y = df_country['GPI']
    X = df_country.loc[:, df_country.columns != 'GPI']

    # Chronological split: the first part trains, the remainder is the test period
    split_at = int(len(df_country) * train_set)
    X_train, Y_train = X.iloc[:split_at], Y.iloc[:split_at]
    X_test, Y_test = X.iloc[split_at:], Y.iloc[split_at:]

    ### DYNAMIC TRAINING: refit on a sliding window, once per test row

    # Rows are collected in plain lists and turned into DataFrames once,
    # instead of growing DataFrames inside the loop
    prediction_rows = []
    importance_frames = []

    for step in range(len(X_test)):

        print(step)  # progress: index of the current training cycle

        # Time-ordered cross-validation splits over the current training window
        tscv = TimeSeriesSplit(n_splits=10).split(X_train)

        xg_reg = xgb.XGBRegressor()  # model to tune
        xg_reg_random = GridSearchCV(estimator=xg_reg, param_grid=random_grid,
                                     cv=tscv, n_jobs=-1)

        # Fitted search object; predict() delegates to the best estimator found
        model = xg_reg_random.fit(X_train, Y_train)

        # Label of the last row used for training in this cycle
        month_year = str(X_train.index[-1])

        # Near the end of the test period fewer than max_horizon rows remain.
        # Taking the minimum also covers test sets shorter than max_horizon.
        horizon = min(max_horizon, len(X_test) - step)
        l_predictions = list(model.predict(X_test.iloc[step:step + horizon]))

        # Horizons with no test row left are marked with '-'
        padding = ['-'] * (max_horizon - horizon)
        prediction_rows.append([month_year] + l_predictions + padding)

        # Feature importances of the best model, matched to the variable names
        importances = model.best_estimator_.feature_importances_
        df_importances = pd.DataFrame(list(zip(X_train.columns, importances)),
                                      columns=['Variable', 'Importance'])
        df_importances['MonthYear'] = month_year
        importance_frames.append(df_importances)

        # Slide the window: drop the oldest row and add the test row just used.
        # pd.concat replaces DataFrame.append / Series.append (removed in
        # pandas 2.0); iloc[[step]] keeps the row's index label and dtypes.
        X_train = pd.concat([X_train.iloc[1:], X_test.iloc[[step]]])
        Y_train = pd.concat([Y_train.iloc[1:], Y_test.iloc[[step]]])

    # dtype=object keeps the numeric predictions and the '-' placeholders as stored
    Predictions = pd.DataFrame(prediction_rows,
                               columns=['MonthYear'] + prediction_columns,
                               dtype=object)

    if importance_frames:
        all_importances = pd.concat(importance_frames, ignore_index=True)
    else:
        all_importances = pd.DataFrame(columns=['Variable', 'Importance', 'MonthYear'])

    # Result analytics, one entry per forecast horizon
    Pearson = []
    Rmse = []
    Mape = []

    for horizon_idx, column in enumerate(prediction_columns):
        # Keep only the real predictions of this horizon and make them numeric
        mask = Predictions[column].isin(['-'])
        Preds = Predictions.loc[~mask, column].astype(float)

        # Row r of this column is the prediction for test row r + horizon_idx,
        # so the column lines up with Y_test from position horizon_idx onwards
        test_data = Y_test.iloc[horizon_idx:]

        n_valid = len(Preds)
        # A correlation needs at least two points; the error metrics need one
        pearson = scipy.stats.pearsonr(test_data, Preds)[0] if n_valid >= 2 else np.nan
        rms = sqrt(mean_squared_error(test_data, Preds)) if n_valid else np.nan
        mape = mean_absolute_percentage_error(test_data, Preds) if n_valid else np.nan

        Pearson.append(pearson)
        Rmse.append(rms)
        Mape.append(mape)

    df_results_analytics = pd.DataFrame({'Pearson': Pearson, 'Rmse': Rmse, 'Mape': Mape})

    # Save the metrics, the predictions and the importances
    df_results_analytics.to_csv(output_dir + '%s_xgb_results.csv' % country)
    Predictions.to_csv(output_dir + '%s_xgb_predictions.csv' % country)
    all_importances.to_csv(output_dir + '%s_xgb_impvar.csv' % country)

    # Print the time the iteration ended
    current_time = datetime.now().strftime("%H:%M:%S")
    print("Time ended - country %s =" % country, current_time)

In [None]:
# Manual export of the metrics table currently held in `df_results_analytics`.
# NOTE(review): the country label is hard-coded to 'YM' and the target is an
# absolute local path - confirm both before re-running on another machine.
country = 'YM'
results_file = '/Users/vickyvoukelatou/Gdelt/GPI_project/journal/results_onlyevents_filt60/%s_xgb_results_short_training_2y.csv' % country
df_results_analytics.to_csv(results_file)


## Control check: Pearson correlation between the event-count and Goldstein columns

In [None]:
# Distinct codes, in first-seen order; a code is the text after the last '_'
# of each name in `allcol`.
# NOTE(review): `allcol` is not defined in the cells visible here - confirm where it comes from.
all_codes = list(dict.fromkeys(name.rsplit('_', 1)[-1] for name in allcol))

In [None]:
# Load the UK table and remove every column whose name matches 'tone'
all_var = pd.read_csv(path + 'all_variables_j/all_variables_UK.csv', index_col=0)
tone_columns = all_var.filter(regex='tone').columns
all_var = all_var.drop(columns=tone_columns)

In [None]:
# Copy of all_var without the columns whose name matches 'event'
all_gold = all_var.drop(columns=all_var.filter(regex='event').columns)

In [None]:
# Pearson correlation between the event-count and Goldstein columns of each code.
# The first column of all_gold is skipped; the code is the text after the last '_'.
my_dict = {}
for column_name in all_gold.columns[1:]:
    code = column_name.rsplit('_', 1)[-1]
    event_counts = all_var['event_count_%s' % code]
    goldstein = all_var['goldstein_%s' % code]
    my_dict[code] = scipy.stats.pearsonr(event_counts, goldstein)[0]

In [None]:
# Two-column table with one row per code and its correlation
df = pd.DataFrame({'code': list(my_dict.keys()), 'corr': list(my_dict.values())})