In [6]:
import pandas as pd
import os

In [2]:
# Select the variables (columns) that are not dominated by zeros.
def non_zero_variables(df, threshold=0.5):
    """Return the column labels of ``df`` whose share of zeros is below ``threshold``.

    A column is kept only when the number of entries equal to 0 is strictly
    smaller than ``threshold * len(df)``; a column with exactly that many
    zeros is dropped. Missing values (NaN) do not compare equal to 0, so they
    are not counted as zeros, but they still count towards the row total.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose columns are the candidate variables.
    threshold : float, optional
        Upper bound (exclusive) on the fraction of zero entries a column may
        have to be kept. Defaults to 0.5, the previously hard-coded cutoff.

    Returns
    -------
    list
        Labels of the retained columns, in their original order.
    """
    # One vectorised pass: zero count per column, compared against the limit.
    zero_counts = df.eq(0).sum(axis=0)
    keep_mask = zero_counts < threshold * len(df)
    return [col for col, keep in zip(df.columns, keep_mask) if keep]

In [3]:
# Keep only the countries for which both a GPI file and a GDELT file exist.

# GPI file names: the country code is the third '_'-separated token,
# with everything from the first '.' onwards stripped.
countries_gpi = [
    gpi_file.split('_')[2].split('.')[0]
    for gpi_file in os.listdir('../Datasets/Gdelt_Well-being/interpolated_gpi/')
    if gpi_file != '.DS_Store'
]

# GDELT file names: the country code is the part before the first '.'.
countries_gdelt = [
    gdelt_file.split('.')[0]
    for gdelt_file in os.listdir('../Datasets/Gdelt_Well-being/gdelt_data/')
    if gdelt_file != '.DS_Store'
]

# Intersection, in the order the GDELT files were listed.
all_countries = list(filter(lambda code: code in countries_gpi, countries_gdelt))

In [4]:
# Palestine (West Bank), code 'WE', is excluded: it does not have enough GPI data.
try:
    all_countries.remove('WE')
except ValueError:
    # 'WE' is not among the shared countries, so there is nothing to drop.
    pass

In [5]:
# For every country, build one wide monthly table (GPI score plus the GDELT
# series of each event base code), keep only the columns that are not mostly
# zero, store it in `all_variables_df` and write it to CSV.
all_variables_df = {}

for country in all_countries:
    # --- GPI series, indexed by MonthYear -----------------------------------
    gpi_data = pd.read_csv('../Datasets/Gdelt_Well-being/interpolated_gpi/interpolated_gpi_%s.csv' % country)
    # NOTE(review): this cell used to evaluate
    # `gpi_data.astype({'MonthYear': 'str'}).dtypes` and discard the result, so
    # MonthYear was never converted. The no-op is removed rather than "fixed":
    # casting only the GPI key to str could stop it aligning with the GDELT
    # MonthYear index in the concat below - confirm the intended dtype.
    gpi_frame = pd.DataFrame(
        {'MonthYear': gpi_data['MonthYear'], 'GPI': gpi_data['GPI_score']}
    ).set_index('MonthYear')

    # --- GDELT events --------------------------------------------------------
    country_data = pd.read_csv('../Datasets/Gdelt_Well-being/gdelt_data/%s.csv' % country)
    # Non-numeric event codes become NaN and are dropped with the NaN rows.
    country_data['EventBaseCode'] = pd.to_numeric(country_data['EventBaseCode'], errors='coerce')
    # NOTE(review): dropna() removes rows with a missing value in ANY column,
    # not only EventBaseCode - confirm that this is intended.
    country_data = country_data.dropna().reset_index(drop=True)
    country_data['EventBaseCode'] = country_data['EventBaseCode'].astype(int)

    # One three-column frame per event base code, collected in a list and
    # concatenated once (instead of re-copying the growing table every turn).
    # sort=False keeps the codes in order of first appearance.
    frames = [gpi_frame]
    for code, code_rows in country_data.groupby('EventBaseCode', sort=False):
        frames.append(
            code_rows.set_index('MonthYear')[['eventcount', 'tonecount', 'goldstein']].rename(
                columns={
                    'eventcount': 'event_count_' + str(code),
                    'tonecount': 'tone_count_' + str(code),
                    'goldstein': 'goldstein_' + str(code),
                }
            )
        )
    country_frame = pd.concat(frames, axis=1)

    # Removing the last 5 GDELT rows, for which we have no GPI values.
    # NOTE(review): this trims by position, so it relies on those months being
    # the final five rows of the combined table - verify per country.
    country_frame = country_frame.drop(country_frame.tail(5).index)
    # Months without any event for a given code count as zero.
    country_frame = country_frame.fillna(0)

    # Filter out the variables that have many zeros.
    country_frame = country_frame[non_zero_variables(country_frame)]

    all_variables_df[country] = country_frame
    country_frame.to_csv(r'../Datasets/Gdelt_Well-being/files_for_R_no_log/all_variables_%s.csv' % (country))
    