In [2]:
import pandas as pd
import numpy as np
import os

In [3]:
# Seed handed to the random number generator so runs are repeatable.
RANDOM_SEED = 7

# Project root under the Google Drive mount point, and its 'results' subdirectory.
BASE_DIR1 = '/content/drive/My Drive/Mali'
RESULTS_DIR = os.path.join(BASE_DIR1, 'results')

In [1]:
# Mount Google Drive at /content/drive. BASE_DIR1 points below this mount
# point, so this cell has to run before anything is read from BASE_DIR1.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import sys

# Make the project folder importable so the local `utils` module can be found.
# The membership check keeps this cell idempotent: re-running it does not pile
# duplicate entries onto sys.path.
if BASE_DIR1 not in sys.path:
    sys.path.append(BASE_DIR1)

from utils import merge_on_lat_lon, assign_groups, run_randomized_cv, run_spatial_cv

In [None]:
def process_mali():
    """Build one merged feature table from the LSMS survey CSV files.

    Reads the geovariable, household (module F), plot-geovariable, community
    (CD and CF1) and consumption files from ``<BASE_DIR1>/mali_2015/LSMS``,
    selects a fixed set of columns from each, and left-joins them onto the
    cluster table loaded from ``clusters.csv``.

    Returns:
        pandas.DataFrame: the result of ``merge_on_lat_lon`` on the cluster
        table and the geovariables, extended by left joins on ``case_id`` /
        ``ea_id``; those two key columns are dropped before returning.

    Raises:
        FileNotFoundError: if one of the CSV files does not exist.
        KeyError: if one of the selected columns is missing from its file.
    """
    np.random.seed(RANDOM_SEED)
    lsms_dir = os.path.join(BASE_DIR1, 'mali_2015', 'LSMS')
    consumption_file = 'aci2015_agregatconso.csv'
    geovariables_file = 'eaci_geovariables_2015.csv'

    def read_lsms(filename):
        # Load a single CSV from the LSMS directory.
        return pd.read_csv(os.path.join(lsms_dir, filename))

    # NOTE(review): several file names (hh_mod_f, plotgeovariablesihs4, com_cd,
    # com_cf1) and column names below look like Malawi IHS4 names -- confirm
    # they exist in the Mali data set.
    df_geo = read_lsms(geovariables_file)
    df_hhf = read_lsms('hh_mod_f.csv')
    df_plot = read_lsms('plotgeovariablesihs4.csv')
    df_com = read_lsms('com_cd.csv')
    df_com2 = read_lsms('com_cf1.csv')
    # Only used as a household -> enumeration-area key mapping (case_id -> ea_id).
    df_tie = read_lsms(consumption_file)[['case_id', 'ea_id']]

    hhf_input = df_hhf[['case_id', 'hh_f10', 'hh_f08']]
    com_input = df_com[['ea_id', 'com_cd01', 'com_cd16', 'com_cd18a', 'com_cd20a', 'com_cd22a', 'com_cd24a',
                       'com_cd27a', 'com_cd36a', 'com_cd40a', 'com_cd49a', 'com_cd51a', 'com_cd60a', 'com_cd67a',
                       'com_cd69a']]
    com2_input = df_com2[['ea_id', 'com_cf08a']]
    plot_input = df_plot[['case_id', 'dist_hh']]

    # Chain rename/dropna instead of using inplace=True on a column selection:
    # the result is the same, but no SettingWithCopyWarning is triggered and
    # df_geo is clearly left untouched.
    geo_input = (
        df_geo[['case_id', 'dist_admarc', 'dist_agmrkt', 'dist_auction', 'dist_boma', 'dist_borderpost',
                'dist_popcenter', 'dist_road', 'af_bio_1', 'af_bio_8', 'af_bio_12', 'af_bio_13', 'af_bio_16',
                'lat_modified', 'lon_modified']]
        .rename(columns={'lat_modified': 'cluster_lat', 'lon_modified': 'cluster_lon'})
        .dropna()
    )

    # NOTE(review): COUNTRIES_DIR is not defined in the cells visible here and the
    # path points at 'malawi_2016' -- confirm both before running this for Mali.
    df_cons = pd.read_csv(os.path.join(COUNTRIES_DIR, 'malawi_2016', 'processed', 'clusters.csv'))

    # merge_on_lat_lon comes from the project's utils module; presumably it joins
    # on the cluster_lat / cluster_lon columns -- verify against its definition.
    df_merge = merge_on_lat_lon(df_cons, geo_input)

    # Household-level tables join on case_id; community tables join on ea_id,
    # which only becomes available once df_tie has been merged in.
    df_merge = pd.merge(df_merge, hhf_input, on='case_id', how='left')
    df_merge = pd.merge(df_merge, df_tie, on='case_id', how='left')
    df_merge = pd.merge(df_merge, com_input, on='ea_id', how='left')
    df_merge = pd.merge(df_merge, com2_input, on='ea_id', how='left')
    df_merge = pd.merge(df_merge, plot_input, on='case_id', how='left')
    return df_merge.drop(['case_id', 'ea_id'], axis=1)