# Imports 

In [3]:
# General

from functools import reduce

# Data Analysis

import pandas as pd
import numpy as np

# Visualization

import matplotlib.pyplot as plt
import geopandas as gpd


# WBAPI

import wbgapi as wb

# Data Processing

from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Aesthetic

import warnings
warnings.filterwarnings('ignore')

class color:
    """ANSI escape sequences used to embolden console output."""

    # Switches the terminal to bold text.
    BOLD = '\033[1m'
    # Resets the terminal text attributes.
    END = '\033[0m'

# Manual Data Collection

In [35]:
# --- V-Dem: only the columns the pipeline reads ---
vdem_df = pd.read_csv(
    "../data/vdem/V-Dem-CY-Full+Others-v13.csv",
    usecols=['country_text_id', 'v2regsupgroupssize', 'year'],
)

# --- Our World in Data extracts (one CSV per index) ---
owid_freedom_of_expression_df = pd.read_csv(
    '../data/owid/freedom-of-expression-index.csv'
)
owid_pa_index_df = pd.read_csv(
    '../data/owid/rigorous-and-impartial-public-administration-index.csv'
)
owid_state_control_df = pd.read_csv(
    '../data/owid/percentage-of-territory-controlled-by-government.csv'
)

# --- ACLED events, restricted to the columns listed in `usecols` ---
acled_df = pd.read_csv(
    '../data/acled/2023_all_data.csv',
    usecols=['event_type', 'country', 'fatalities', 'population_best', 'year'],
)

# --- Remaining single-file sources ---
nd_df = pd.read_csv('../data/nd/gain.csv')
unhcr_df = pd.read_csv('../data/population.csv')
geo_df = pd.read_csv('../data/geodata.csv')

# --- Country name <-> alpha-3 code lookup ---
iso_list = pd.read_csv(
    "../data/all.csv",
    usecols=['name', 'alpha-3'],
)

In [37]:
# Registry of the manually collected frames. The top-level keys are the source
# labels that dim_X_complete looks up via `manual_data[ind_value[0]]`; 'OWiD'
# is itself a mapping from indicator name to frame.
manual_data = {
    "V-DEM": vdem_df,
    "OWiD": {
        "Public Administration Index": owid_pa_index_df,
        "Freedom of Expression Index": owid_freedom_of_expression_df,
        "State Control over Territory": owid_state_control_df,
    },
    "ACLED": acled_df,
    "ND": nd_df,
    "UNHCR": unhcr_df,
    # Helper tables used while building the ACLED indicators.
    "iso_list": iso_list,
    "geodata": geo_df,
}

# Functions

## WB API

In [6]:
def indicator_to_df(query, specify = False):
    """
    Search the currently selected World Bank database for series matching `query`.

    Parameters
    ----------
    query : str
        Search text forwarded to ``wb.series.Series(q=...)``.
    specify : slice, int, list-like or False, default False
        Positional selector applied with ``.iloc`` to the search results.
        ``False`` (the default) or ``None`` returns every match.

    Returns
    -------
    pandas.DataFrame or pandas.Series
        The matches after ``reset_index()`` (the series code ends up in the
        'index' column). A slice or list selector yields a DataFrame; an
        integer selector yields the single matching row as a Series.
    """

    matches = pd.DataFrame(wb.series.Series(q= query)).reset_index()

    # Compare against the sentinels explicitly: a plain truthiness test would
    # treat the valid position 0 as "no selection" and return every match.
    if specify is False or specify is None:
        return matches
    return matches.iloc[specify]

In [7]:
def wb_data_completer(indicator, coverage_threshold = 0.85, years_to_check = 10, database = None, specify = False):
    """
    Fetch one World Bank indicator, back-filling from earlier years when the
    most recent value alone does not cover enough rows.

    Parameters
    ----------
    indicator : str
        Search text forwarded to ``wb.series.Series(q=...)``.
    coverage_threshold : float, default 0.85
        Share of non-null rows to aim for.
    years_to_check : int, default 10
        Number of most recent values requested (``mrv``) and the width of the
        year window walked while back-filling.
    database : int or None, default None
        World Bank database id. When None, databases 2 and 3 are both probed
        and the one with the higher most-recent-value coverage is used
        (database 2 wins ties).
    specify : slice, int, list-like or False, default False
        Positional selector applied with ``.iloc`` to the search results.
        ``False`` or ``None`` keeps every match.

    Returns
    -------
    pandas.DataFrame
        A single-column frame. The column is labelled 'Final Value' when the
        most recent value alone exceeds the threshold, and 'Final_Value' when
        values were back-filled from earlier years.
        NOTE(review): the two labels differ; kept as-is so existing callers see
        the same output.

    Raises
    ------
    ValueError
        If the search finds no series in the selected database, or if the
        selected series do not yield exactly one data column.

    Notes
    -----
    ``wb.db`` is left set to the database that was used.
    """

    def _pct(share):
        # Scale first, then round: avoids float noise such as 95.95000000000002.
        return round(share * 100, 2)

    def _find_series():
        # Local lookup (same logic as the module-level indicator_to_df) so this
        # function stays self-contained.
        matches = pd.DataFrame(wb.series.Series(q= indicator)).reset_index()
        # Explicit sentinel check so that position 0 is a usable selector.
        if specify is False or specify is None:
            return matches
        return matches.iloc[specify]

    def _probe(database_number):
        # Returns (series lookup, most-recent-value frame, coverage) for one database.
        wb.db = database_number
        found = _find_series()

        if len(found) == 0:  # Nothing matched in this database
            return found, None, 0  # Completeness is 0%

        latest = wb.data.DataFrame(found['index'], mrv=1)
        column_coverage = latest.notna().mean()
        if len(column_coverage) != 1:
            raise ValueError(
                f"Expected exactly one data column for '{indicator}' in WB Database "
                f"{database_number}, got {len(column_coverage)}. Narrow the match with `specify`."
            )
        return found, latest, float(column_coverage.iloc[0])

    # Probe each candidate database once and reuse the result afterwards,
    # instead of querying the chosen database a second time.

    probes = {}

    if database is None:
        for candidate in (2, 3):
            probes[candidate] = _probe(candidate)

        db2_complete = probes[2][2]
        db3_complete = probes[3][2]

        database = 2 if db2_complete >= db3_complete or db3_complete == 0 else 3
    else:
        probes[database] = _probe(database)

    # Probing may have left wb.db on another database; select the chosen one.

    wb.db = database
    final_ind, latest_df, coverage_complete = probes[database]

    if len(final_ind) == 0:
        raise ValueError(f"No World Bank series matching '{indicator}' was found in WB Database {database}.")

    # Return mrv = 1 if already passing data threshold

    if coverage_complete > coverage_threshold:
        print(
            f"Data for '{indicator}' found in WB Database {database}. Returning data for the most recent year. \n"
            f"        Coverage = {_pct(coverage_complete)}%, greater than selected threshold of {_pct(coverage_threshold)}%.\n"
        )
        latest_df.columns = ['Final Value']

        return latest_df

    # Otherwise go back number of years specified

    print(
        f"Data for '{indicator}' does not meet the coverage threshold of {_pct(coverage_threshold)}% in WB Database {database}.\n"
        f"        Extracting data from previous years."
    )

    multiyear_df = wb.data.DataFrame(final_ind['index'], mrv=years_to_check)

    # Columns are expected to look like 'YR2022'; the last one is the newest year.

    current_year = int(multiyear_df.columns[-1][2:])

    # Start from an all-missing column so every year (including the newest) is
    # handled by the same fill step and the result is always defined.

    filled = pd.Series(float('nan'), index=multiyear_df.index)
    data_coverage = 0.0

    for year in range(current_year, current_year - years_to_check, -1):
        year_column = f'YR{year}'

        # Skip years that don't have a corresponding column in the DataFrame

        if year_column not in multiyear_df.columns:
            continue

        # Newer values win: only rows still missing take this year's value.

        filled = filled.fillna(multiyear_df[year_column])

        data_coverage = filled.notna().mean()
        if data_coverage >= coverage_threshold:
            print(
                f"Achieved {_pct(data_coverage)}% data coverage by going back to data from {year},\n"
                f"                exceeding minimum threshold of {_pct(coverage_threshold)}%. Returning this dataframe.\n"
            )
            break

    if data_coverage < coverage_threshold:
        print(
            f"Data coverage at {_pct(data_coverage)}% after going back {years_to_check} years.\n"
            f"            Failed to exceed minimum threshold of {_pct(coverage_threshold)}%. Returning best dataframe anyway.\n"
        )

    multiyear_df['Final_Value'] = filled

    return multiyear_df[['Final_Value']]

def indicator_returner(query, dimension = 'dim', indicator = 'ind', specify = False):
    """
    Run wb_data_completer for `query` and relabel its single column as
    ``ind_<dimension><indicator>``.

    `specify` is forwarded unchanged to wb_data_completer.
    """

    column_label = f'ind_{dimension}{indicator}'

    completed = wb_data_completer(query, specify = specify)
    completed.columns = [column_label]

    return completed


## GeoProcessing

## Data Processing

In [8]:
def scale_and_weight(merged_df, weight_list, return_nulls = False, max_nulls = 2):
    """
    Min-max scale every indicator column, centre it on zero, apply the
    per-column weights and add a row-wise mean of the weighted values.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        One column per indicator. The index is matched against the `iso_a3`
        column of the Natural Earth low-resolution dataset.
    weight_list : sequence of numbers
        One weight per column of `merged_df`, in column order.
    return_nulls : bool, default False
        When False, rows whose `weighted_mean` is missing are dropped.
    max_nulls : int, default 2
        Rows with more than this many missing indicators get a missing
        `weighted_mean`.

    Returns
    -------
    pandas.DataFrame
        The weighted indicator columns plus `weighted_mean`, sorted by
        `weighted_mean` in descending order and indexed by country name.

    Raises
    ------
    ValueError
        If the number of weights differs from the number of columns.
    """

    weights = list(weight_list)

    # zip() would silently drop surplus weights or columns, so fail loudly instead.
    if len(weights) != merged_df.shape[1]:
        raise ValueError(
            f"Expected {merged_df.shape[1]} weights (one per column), got {len(weights)}."
        )

    # Scale each column to [0, 1], then shift to [-0.5, 0.5]

    scaler = MinMaxScaler()
    scaled_df = pd.DataFrame(
        scaler.fit_transform(merged_df),
        columns=scaler.get_feature_names_out(),
        index=merged_df.index,
    ).sub(0.5)

    # Weight: multiply column i by weights[i] (matched by position)

    keys = scaled_df.columns
    weighted_df = scaled_df.mul(weights, axis='columns')

    # Mean of the weighted columns; missing values are skipped by default

    weighted_df['weighted_mean'] = weighted_df.mean(axis=1)

    # Blank the score where too many indicators are missing (vectorised
    # replacement for a row-by-row apply).

    too_sparse = weighted_df[keys].isna().sum(axis=1) > max_nulls
    weighted_df.loc[too_sparse, 'weighted_mean'] = np.nan
    weighted_df = weighted_df.sort_values('weighted_mean', ascending=False)

    # Adding country names as index
    # NOTE(review): `gpd.datasets` was deprecated in geopandas 0.14 and removed
    # in 1.0; on newer versions the Natural Earth file has to be loaded from a
    # local or downloaded copy instead.

    world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
    output_columns = list(weighted_df.columns) + ['name']
    final_df = weighted_df.merge(world, left_index=True, right_on='iso_a3')[output_columns].set_index('name')

    if return_nulls:
        return final_df

    return final_df[final_df['weighted_mean'].notnull()]



## Data Pipeline

In [47]:
# List every WB series matching the query. The positional row labels in the
# output are the positions a `specify` selector refers to (it is applied with .iloc).
indicator_to_df('Intentional homicides')

Unnamed: 0,index,SeriesName
0,VC.IHR.PSRC.FE.P5,"Intentional homicides, female (per 100,000 fem..."
1,VC.IHR.PSRC.MA.P5,"Intentional homicides, male (per 100,000 male)"
2,VC.IHR.PSRC.P5,"Intentional homicides (per 100,000 people)"


In [43]:
def _load_indicator(ind_value, dimension, ind_num, manual_data):
    """Build the single-column frame for one indicator entry, indexed by the source's country-code column."""

    source, name = ind_value[0], ind_value[1]
    column_name = f'ind_{dimension}{ind_num}'

    # Year used to filter the V-DEM and OWiD frames.
    snapshot_year = 2022

    if source == 'WB':

        # indicator_returner already labels the column, so return directly.
        # A fourth element, when present, is the positional `specify` selector.
        if len(ind_value) == 3:
            return indicator_returner(name, f"{dimension}", f"{ind_num}")
        return indicator_returner(name, f"{dimension}", f"{ind_num}", specify= ind_value[-1])

    elif source == 'V-DEM':

        rel_df = manual_data[source]
        frame = rel_df[rel_df['year'] == snapshot_year][['country_text_id', 'v2regsupgroupssize']].set_index('country_text_id')

    elif source == 'OWiD':

        # Indicator name -> value column inside the matching OWiD frame.
        owid_value_columns = {
            'Public Administration Index': 'public_admin_vdem_owid',
            'Freedom of Expression Index': 'freeexpr_vdem_owid',
            'State Control over Territory': 'terr_contr_vdem_owid',
        }
        if name not in owid_value_columns:
            raise ValueError(f"Unknown OWiD indicator {name!r} for Indicator {dimension}{ind_num}.")

        owid_df = manual_data[source][name]
        frame = owid_df[owid_df['Year'] == snapshot_year].set_index('Code')[[owid_value_columns[name]]]

    elif source == 'ACLED':

        rel_df = manual_data[source]
        iso_df = manual_data['iso_list']

        if name == 'Protest Count':

            # Inner merge: only countries with at least one protest row appear.
            # NOTE(review): unlike the fatalities indicator below, countries
            # without events are not filled with 0 here - confirm this is intended.
            grouped_df = rel_df[rel_df['event_type'] == 'Protests'][['country', 'year']].groupby(by = 'country')\
                .agg({'year': 'count'})
            frame = grouped_df.merge(iso_df, left_index=True, right_on= 'name').set_index('alpha-3')[['year']]

        elif name == 'Battle Related Fatalities':

            grouped_df = rel_df[rel_df['event_type'].isin(['Explosions/Remote violence', 'Battles'])]\
                [['country', 'fatalities']].groupby(by = 'country').agg({'fatalities': 'sum'})
            # Right merge keeps every listed country; those without events get 0.
            frame = grouped_df.merge(iso_df, left_index=True, right_on= 'name', how = 'right')\
                .set_index('alpha-3')[['fatalities']].fillna(0)

        elif name == 'Violence in Neighbouring States':

            # Sum only the fatalities column rather than every column in the frame.
            grouped_df = rel_df.groupby(by = 'country')[['fatalities']].sum()
            geo_df = manual_data['geodata']
            # Attach each bordering country's total, then add them up per country.
            merged_geo = geo_df.merge(grouped_df, left_on='country_border_name', right_index=True, how = 'left')
            merged_grouped = merged_geo.groupby('country_name')[['fatalities']].sum()
            frame = merged_grouped.merge(iso_df, left_index=True, right_on= 'name', how = 'right').set_index('alpha-3')[['fatalities']]

        else:
            raise ValueError(f"Unknown ACLED indicator {name!r} for Indicator {dimension}{ind_num}.")

    elif source == 'ND':

        rel_df = manual_data[source]
        frame = rel_df.set_index('ISO3')[['2021']]

    elif source == 'UNHCR':

        rel_df = manual_data[source]
        frame = rel_df.set_index("Country of asylum (ISO)")[["Refugees under UNHCR's mandate"]]

    else:
        raise ValueError(f"Unknown data source {source!r} for Indicator {dimension}{ind_num}.")

    frame.columns = [column_name]
    return frame


def dim_X_complete(indicator_dictionary, dimension, manual_data):
    """
    Load, merge, scale and weight every indicator defined for one dimension.

    Parameters
    ----------
    indicator_dictionary : dict
        Maps dimension -> {indicator number: [source, name, weight, optional
        `specify` selector]}. Empty entries are skipped.
    dimension : hashable
        Key of the dimension to build.
    manual_data : dict
        Pre-loaded frames keyed by source label, plus 'iso_list' and 'geodata'
        for the ACLED indicators.

    Returns
    -------
    pandas.DataFrame
        Whatever scale_and_weight returns for the merged indicator frame.

    Raises
    ------
    ValueError
        If the dimension has no indicators, or an entry names an unknown
        source or indicator. Previously such an entry silently reused the
        frame loaded for the preceding indicator.

    Notes
    -----
    Indicators are left-merged onto the first indicator's index, so rows
    missing from the first indicator do not appear in the result.
    """

    dimension_dict = {k: v for k, v in indicator_dictionary[dimension].items() if v}

    if not dimension_dict:
        raise ValueError(f"Dimension {dimension!r} has no indicators defined.")

    print(f"\n\n\n    --------------------------Dimension {dimension}------------------------------\n\n    ")

    ### Loading Data ###

    print(color.BOLD + "Loading Data.....\n" + color.END)

    full_df = None

    for ind_num, ind_value in dimension_dict.items():

        ind_x = _load_indicator(ind_value, dimension, ind_num, manual_data)

        print(f'''Successfully loaded Indicator {dimension}{ind_num} from {ind_value[0]} Database\n''')

        ### Merging DF ###

        if not isinstance(full_df, pd.DataFrame):
            full_df = ind_x
        else:
            full_df = full_df.merge(ind_x, left_index = True, right_index = True, how = 'left')

    ### Scaling and Weighting ###

    print (color.BOLD + "Scaling & Weighting Data...." + color.END)

    # The weight is the third element of every indicator entry.
    weight_list = [values[2] for values in dimension_dict.values()]

    full_df = scale_and_weight(full_df, weight_list)

    print(f"""\n**Successfully loaded , merged, scaled, and weighted Dimension {dimension} Data**\n""")
    print(color.BOLD  + "Overall Data Coverage:\n    " + color.END)
    print(1- full_df.isna().sum()/len(full_df))

    print("\n\n\n    ---------------------------------------------------------------------------\n\n\n    \n    ")

    return full_df
    

# Fragility Definition

In [39]:
# Indicator definitions, grouped by dimension key and numbered within each dimension.
# Each non-empty entry is read by dim_X_complete as:
#   [0] source label  - 'WB' is fetched through indicator_returner; any other
#                       label is looked up in `manual_data`
#   [1] indicator name - the WB search text, or the name of a manual indicator
#   [2] weight         - passed on to scale_and_weight
#   [3] optional, 'WB' entries only - positional selector forwarded as `specify`
# Empty lists are placeholders; dim_X_complete filters them out.
indicator_dictionary = {
    'G': {
        1:['V-DEM', 'Size of Regime Support Group', 3],
        2:['OWiD', 'Public Administration Index', 2],
        3:['WB', 'Control of Corruption: Estimate', 2],
        4:['WB', 'Rule of Law: Estimate', 2],
        5:['WB', 'Tax Revenue', 2, slice(1,2)],
        6:['WB', 'Proportion of Seats Held by Women', 1],
        7:['OWiD', 'Freedom of Expression Index', 1]
    },
    'S': {
        1:['WB', 'Gini Index', -3],
        2:['WB', 'Inflation, Consumer Prices', -2],
        3:['WB', 'Unemployment, Total', -2, slice(1,2)],
        4:['WB', 'Women Business and the Law Index', 2],
        5:['ACLED', 'Protest Count', -2],
        6:['WB', 'Age Dependency Ratio', -1, slice(0,1)],
        7:['WB', 'Ease of Doing Business Score', 1]
    },
    'I': {
        1:['WB', 'GDP per Capita', 3, slice(0,1)],
        2:['WB', 'Poverty Gap at $2.15 a Day', -3],
        3:['WB', 'Human Capital Index', 2, slice(0,1)],
        4:['WB', 'Women who Believe a Husband is Justified in Beating his Wife', -2, slice(4,5)],
        5:['WB', 'Current Health Expenditure per Capita, PPP', 2],
        6:[],
        7:[]
    },
    'C': {
        1:['ACLED', 'Battle Related Fatalities', -3],
        2:[],
        3:['OWiD', 'State Control over Territory', 2],
        4:['WB', 'Intentional homicides', -2, slice(2,3)],
        5:[],
        6:[],
        7:[]
    },
    'E': {
        1:['ND', 'GAIN Index', 3],
        2:[],
        3:[],
        4:[],
        5:[],
        6:[],
        7:[]
    },
    'R': {
        1:['ACLED', 'Violence in Neighbouring States', -3],
        2:['UNHCR', 'Refugee In-Flow', -2],
        3:[],
        4:[],
        5:[],
        6:[],
        7:[]
    }
    
}


# Display

In [None]:
# Widen pandas' column display, point wbgapi at database 2, then list the
# series whose search text matches 'Financial'.
pd.set_option('max_colwidth', 400)
wb.db = 2
indicator_to_df('Financial')

In [44]:
# Build the weighted table for dimension "R".
# NOTE(review): despite its name, `s_df` holds dimension "R" here, not "S".
s_df = dim_X_complete(indicator_dictionary, "R", manual_data)




    --------------------------Dimension R------------------------------

    
[1mLoading Data.....
[0m
Successfully loaded Indicator R1 from ACLED Database

Successfully loaded Indicator R2 from UNHCR Database

[1mScaling & Weighting Data....[0m

**Successfully loaded , merged, scaled, and weighted Dimension R Data**

[1mOverall Data Coverage:
    [0m
ind_R1           0.959538
ind_R2           0.884393
weighted_mean    1.000000
dtype: float64



    ---------------------------------------------------------------------------


    
    


In [46]:
# Ascending sort: rows with the lowest weighted_mean come first.
s_df.sort_values('weighted_mean')

Unnamed: 0_level_0,ind_R1,ind_R2,weighted_mean
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Russia,-1.500000,0.283868,-0.608066
Poland,-1.444998,0.455685,-0.494657
Romania,-1.445759,0.940800,-0.252480
Slovakia,-1.444332,0.945877,-0.249227
Hungary,-1.445759,0.980175,-0.232792
...,...,...,...
Fr. S. Antarctic Lands,1.500000,,1.500000
Puerto Rico,1.500000,,1.500000
Greenland,1.500000,,1.500000
Falkland Is.,1.500000,,1.500000


In [None]:
def display_tables_and_maps(dimensions, indicators = False, table = True, geomap = True):
    """
    Placeholder for displaying tables and/or maps for the given dimensions.

    The original cell stopped at a body-less ``for dimension in dimensions:``
    loop, which is a SyntaxError when the cell runs. Until the display logic
    is written, calling this function fails explicitly.

    Raises
    ------
    NotImplementedError
        Always.
    """

    # TODO: iterate over `dimensions` and honour `indicators`, `table` and `geomap`.
    raise NotImplementedError("display_tables_and_maps has not been implemented yet.")

In [None]:
# Show the dimension keys defined in indicator_dictionary.
indicator_dictionary.keys()

In [None]:
# def display_all_dimensions(indicator_dictionary, manual_data):

#     world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

#
#     for key in indicator_dictionary.keys():

#         dimension_df = dim_X_complete(indicator_dictionary, key, manual_data)
#         geo_merge = world.merge(dimension_df, left_on = 'name', right_index = True)

#         geo_merge.plot(column='weighted_mean', cmap='RdYlGn', missing_kwds={'color': 'black'}, figsize=(10, 8))
#         plt.title(f'Map for Dimension {key}')
#         plt.show()

In [None]:
def map_individual_indicators(dimension_df):
    """
    Draw one world map per indicator column of a dimension table.

    Parameters
    ----------
    dimension_df : pandas.DataFrame
        Indexed by country name, with a `weighted_mean` column (which is not
        plotted) and one column per indicator.

    Notes
    -----
    Maps are laid out two per row. The grid has at least four rows (the
    original fixed layout) and grows when there are more than eight
    indicators, which previously raised an IndexError. Unused panels are
    hidden.
    """

    just_ind_df = dimension_df.drop(columns='weighted_mean')

    # NOTE(review): `gpd.datasets` was deprecated in geopandas 0.14 and removed
    # in 1.0; newer versions need the Natural Earth file loaded from elsewhere.
    world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
    geo_merge = world.merge(just_ind_df, left_on = 'name', right_index = True)

    ncols = 2
    n_maps = len(just_ind_df.columns)
    # Ceiling division without importing math; never fewer than four rows.
    nrows = max(4, (n_maps + ncols - 1) // ncols)

    # 2.5 units of height per row reproduces the original (15, 10) for four rows.
    fig, axes = plt.subplots(nrows, ncols, figsize=(15, 2.5 * nrows))

    axes = axes.flatten()

    for ax, column in zip(axes, just_ind_df.columns):
        geo_merge.plot(column=column, cmap='RdYlGn', missing_kwds={'color': 'black'}, ax=ax)
        ax.set_title(f'Map for {column}')
        ax.axis('off')

    # Hide the panels that did not receive a map.
    for ax in axes[n_maps:]:
        ax.axis('off')

In [None]:
# Build the dimension 'S' table and draw one map per indicator column.
map_individual_indicators(dim_X_complete(indicator_dictionary, 'S', manual_data))