In [1]:
#https://github.com/nealjean/predicting-poverty/blob/master/scripts/ProcessSurveyData.R
#https://github.com/nealjean/predicting-poverty/blob/master/scripts/Fig1.R

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import gdal
import sys
sys.path.append("../Code")
sys.path.append("../Src")
from img_lib import RasterGrid

# Prepare LSMS Data from Uganda 2011-2012

## Load and Merge LSMS DataFiles

In [3]:
# Data Source : http://microdata.worldbank.org/index.php/catalog/2059/datafile/F70

In [4]:
#hh_data_2013=pd.read_csv("../Data/HH/WB/UGA_2013_UNPS_v01_M_CSV/agsec1.csv")

In [2]:
hh_data_2011_cons=pd.read_stata("../Data/HH/WB/UGA_2011_UNPS_v01_M_STATA/UNPS 2011-12 Consumption Aggregate.dta")[["HHID","welfare","poor"]]
hh_data_2011_coords=pd.read_stata("../Data/HH/WB/UGA_2011_UNPS_v01_M_STATA/UNPS_Geovars_1112.dta")[["HHID","lat_mod","lon_mod","urban"]]
hh_data_2011_weights=pd.read_stata("../Data/HH/WB/UGA_2011_UNPS_v01_M_STATA/GSEC1.dta")[["HHID","comm","mult"]]
hh_data_2011_assets=pd.read_stata("../Data/HH/WB/UGA_2011_UNPS_v01_M_STATA/GSEC9A.dta")[["HHID","h9q3","h9q4"]]

In [3]:
hh_data_2011=hh_data_2011_cons.merge(hh_data_2011_coords).merge(hh_data_2011_weights).merge(hh_data_2011_assets)

In [4]:
hh_data_2011.head()

Unnamed: 0,HHID,welfare,poor,lat_mod,lon_mod,urban,comm,mult,h9q3,h9q4
0,1013000201,134629.4375,0.0,-0.530627,32.327492,Rural,10130002,1329.515991,2.0,Iron sheets
1,1013000202,27919.814453,1.0,,,Rural,10130002,7844.594727,1.0,Iron sheets
2,1013000204,39011.269531,0.0,-0.530627,32.327492,Rural,10130002,2659.031982,1.0,Iron sheets
3,1013000206,66898.429688,0.0,0.289081,32.56065,Rural,10130002,891.622498,1.0,Iron sheets
4,1013000210,112471.296875,0.0,-0.530627,32.327492,Rural,10130002,1624.854736,1.0,Iron sheets


## Normalize the consumption variable 

In [7]:
# ProcessSurveyData.R : data.frame(hhid = HHID, cons = welfare*118.69/(30*946.89*mean(c(66.68, 71.55))))

In [8]:
hh_data_2011["cons"] = hh_data_2011["welfare"]*118.69/(30*946.89*np.mean([66.68, 71.55])) # elucidate

## Aggregate by Cluster 

In [9]:
len(hh_data_2011["lat_mod"].unique()) 

644

In [10]:
len(hh_data_2011["comm"].unique()) #The comm variable has less unique locations 

320

In [11]:
hh_data_2011_cluster=hh_data_2011.groupby(["lat_mod","lon_mod"])[["cons","poor"]].agg(["mean","count"]).reset_index()

In [12]:
hh_data_2011_cluster.columns=["gpsLatitude","gpsLongitude","cons","n","poor","n2"]

### Filter clusters with less than 3 HHs interviewed 

In [13]:
hh_data_2011_cluster_minHH=hh_data_2011_cluster[hh_data_2011_cluster["n"]>2]

In [14]:
hh_data_2011_cluster_minHH["poor_majority"]= np.where(hh_data_2011_cluster_minHH['poor']>=0.5, 1, 0)
hh_data_2011_cluster["poor_majority"]= np.where(hh_data_2011_cluster['poor']>=0.5, 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


## Assign GRID coordinates to each cluster to match with Satellite imageries

In [15]:
GRID = RasterGrid()

In [16]:
def get_lon_idx(lon):
    lon_idx = np.where(GRID.top_left_x_coords < lon)[0][-1]
    return lon_idx
def get_lat_idx(lat):
    lat_idx = np.where(GRID.top_left_y_coords > lat)[0][-1]
    return lat_idx

In [17]:
hh_data_2011_cluster_minHH["i"]=hh_data_2011_cluster_minHH["gpsLongitude"].apply(get_lon_idx)
hh_data_2011_cluster_minHH["j"]=hh_data_2011_cluster_minHH["gpsLatitude"].apply(get_lat_idx)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [18]:
hh_data_2011_cluster_minHH.to_csv("../Data/Intermediate_files/hh_data_2011_cluster_minHH.csv")