### Notation:
- SAL- small area
- PP- police precinct
- AEA- Albers Equal Area Conic
- CPS- crime per SAL 

In [1]:
from random import shuffle, randint
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.collections import PatchCollection
from mpl_toolkits.basemap import Basemap
from shapely.geometry import Polygon, Point, MultiPoint, MultiPolygon, LineString, mapping, shape
from descartes import PolygonPatch
import random
import fiona
import numpy as np
import csv
from fiona import collection

import geopandas as gpd
from geopandas.tools import sjoin # rtree index in-build, used with inner, intersection
import pandas as pd

from collections import defaultdict

In [None]:
def find_intersections(o):
    """Group small-area indices by the police precinct they were joined to.

    ``o`` is a joined table exposing ``pp_index`` and ``sal_index`` columns.
    Rows whose two indices are equal are skipped (such a pair shows up as an
    intersection with itself). Returns a plain ``dict`` mapping each precinct
    index to the list of small-area indices paired with it, in row order.
    """
    from collections import defaultdict

    sals_by_precinct = defaultdict(list)

    # walk both index columns in lockstep, one joined row at a time
    for pp_id, sal_id in zip(o.pp_index.values, o.sal_index.values):
        if pp_id == sal_id:
            continue
        sals_by_precinct[pp_id].append(sal_id)

    # keep only precincts that actually have small areas attached
    return {pp_id: sal_ids
            for pp_id, sal_ids in sals_by_precinct.items()
            if sal_ids}
    
    
def calculate_join_indices(g1_reind, g2_reind):
    """Pair every police precinct with the small areas it intersects.

    ``g1_reind`` is spatially inner-joined against ``g2_reind`` using the
    "intersects" predicate. The join result keeps only one geometry per row
    and carries no intersection area, so only the paired indices are
    extracted here (via ``find_intersections``); the areas themselves are
    computed in a later step from that mapping.

    Background: for a police region A (crime records) and a small area C
    (population data), the population is later scaled by the fraction of the
    small area that falls inside the police region.

    Returns the dict produced by ``find_intersections``.
    """
    # the join leaves a duplicate index column for the right-hand frame,
    # so it is dropped before the pairs are collected
    joined = sjoin(g1_reind, g2_reind, how="inner", op="intersects").drop(
        'index_right', axis=1)

    # sorting is not needed: the pairs end up in a dictionary
    return find_intersections(joined)
    
def calculate_inclusion_indices(g1_reind, g2_reind):
    """Pair every police precinct with the small areas it fully contains.

    Same as the "intersects" variant, but the spatial join uses the
    "contains" predicate (PP contains SAL). Returns the dict produced by
    ``find_intersections``.
    """
    contained = sjoin(g1_reind, g2_reind, op="contains").drop(
        'index_right', axis=1)
    return find_intersections(contained)
    
# descriptive SAL columns copied verbatim into every intersection record
_SAL_INFO_COLUMNS = ['DC_NAME', 'MN_NAME', 'MP_NAME', 'PR_NAME', 'SP_NAME',
                     'MN_CODE', 'MP_CODE', 'PR_CODE', 'SAL_CODE', 'SP_CODE']

# column order of the resulting table
_JOIN_COLUMNS = ['geometry', 'id1', 'id2', 'area_pp', 'area_sal',
                 'area_inter', 'popu_inter', 'popu_sal', 'murd_cnt',
                 'province'] + _SAL_INFO_COLUMNS


def _intersection_record(crim, index1, sal_id, g2_reind):
    """Build the output record for one (precinct row, small area) pair.

    Returns ``None`` when the two geometries do not share a positive area.
    """
    # filter the SAL table once per pair instead of once per field
    sal_rows = g2_reind[g2_reind.sal_index == sal_id]
    sal_geom = sal_rows['geometry'].values[0]
    pp_geom = crim['geometry']

    # the intersection is computed once and reused for area and geometry
    overlap = sal_geom.intersection(pp_geom)
    area_int = overlap.area
    if not area_int > 0:
        return None

    area_popu = sal_geom.area
    popu_count = sal_rows['PPL_CNT'].values[0]

    record = {
        'geometry': overlap,
        'id1': index1,
        'id2': sal_id,
        'area_pp': pp_geom.area,
        'area_sal': area_popu,
        'area_inter': area_int,
        # SAL population scaled by the share of the SAL inside the precinct
        'popu_inter': (area_int / area_popu) * popu_count,
        'popu_sal': popu_count,
        'murd_cnt': crim['murd_cnt'],
        'province': crim['province'],
    }
    for col in _SAL_INFO_COLUMNS:
        record[col] = sal_rows[col].values[0]
    return record


def calculate_join(dict_over_ind, g1_reind, g2_reind):
    """Intersect each police precinct with its pre-computed candidate SALs.

    Parameters
    ----------
    dict_over_ind : dict
        Maps a precinct index (``pp_index``) to a list of small-area indices
        (``sal_index``), as produced by ``find_intersections``.
    g1_reind : GeoDataFrame
        Precinct rows; the columns ``pp_index``, ``geometry``, ``murd_cnt``
        and ``province`` are read.
    g2_reind : GeoDataFrame
        Small-area rows; the columns ``sal_index``, ``geometry``,
        ``PPL_CNT`` and the name/code columns are read.

    Returns
    -------
    (df_t, area_total, data_aggreg)
        ``df_t`` is a GeoDataFrame with one row per intersection of positive
        area, ``area_total`` is the sum of those areas and ``data_aggreg``
        is the list of record dicts the frame was built from.

    Precincts missing from ``dict_over_ind`` are skipped. A pair that fails
    (e.g. unknown SAL index, geometry error) is reported with a warning and
    skipped on its own; the remaining pairs of that precinct are still
    processed instead of being silently dropped.
    """
    import warnings

    area_total = 0
    data_aggreg = []

    for _, crim in g1_reind.iterrows():
        index1 = crim.pp_index

        for sal_id in dict_over_ind.get(index1, []):
            try:
                record = _intersection_record(crim, index1, sal_id, g2_reind)
            except Exception as exc:
                # best effort: one bad pair must not abort the whole run,
                # but it should not vanish without a trace either
                warnings.warn("skipping precinct %r / SAL %r: %r"
                              % (index1, sal_id, exc))
                continue

            if record is None:
                continue

            area_total += record['area_inter']
            data_aggreg.append(record)

    df_t = gpd.GeoDataFrame(data_aggreg, columns=_JOIN_COLUMNS)
    return df_t, area_total, data_aggreg

In [None]:
# this function adds the remaining columns, calculates fractions etc
def compute_final_col(df_temp):
    """Add population shares and murder estimates to the intersection table.

    ``df_temp`` has one row per intersection, with ``id1`` (police precinct),
    ``id2`` (small area), ``popu_inter`` and ``murd_cnt``. The result gains:

    * ``popu_frac_per_pp``  - sum of ``popu_inter`` over the precinct,
    * ``murd_est_per_int``  - precinct murders scaled by the row's share of
      that sum,
    * ``ratio_per_int``     - the share itself,
    * ``murd_est_per_sal``  - ``murd_est_per_int`` summed over the small area,

    and ``id1``/``id2`` are renamed to ``index_PP``/``index_SAL``.
    """
    # population per police precinct, attached back to every intersection
    pop_per_pp = df_temp.groupby(by=['id1'])['popu_inter'].sum().reset_index()
    merged = pd.merge(df_temp, pop_per_pp, on='id1', how='outer')
    merged = merged.rename(columns={'popu_inter_y': 'popu_frac_per_pp',
                                    'popu_inter_x': 'popu_inter'})

    # share of the precinct population that lives in this intersection
    share = merged['popu_inter'] / merged['popu_frac_per_pp']
    merged['murd_est_per_int'] = share * merged['murd_cnt']

    # murder estimate per small area (id2 is the SAL id)
    per_sal = merged.groupby(by=['id2'])['murd_est_per_int'].sum().reset_index()
    per_sal = per_sal.rename(columns={'murd_est_per_int': 'murd_est_per_sal'})

    merged['ratio_per_int'] = share

    data_complete = pd.merge(merged, per_sal, on='id2', how='outer')
    return data_complete.rename(columns={'id1': 'index_PP', 'id2': 'index_SAL'})


Main functions to find intersection. Files loaded in are the AEA projected shapefiles.

In [124]:
# Paths of the two input shapefiles: small areas (SAL) with population and
# police precincts (PP) with murder counts.
salSHP_upd = 'shapefiles/updated/sal_population_aea.shp'
polSHP_upd = 'shapefiles/updated/polPrec_murd2015_prov_aea.shp'

geo_pol = gpd.GeoDataFrame.from_file(polSHP_upd)
geo_sal = gpd.GeoDataFrame.from_file(salSHP_upd)

# Expose the row index as an explicit column; the join helpers read these
# columns as `pp_index` and `sal_index`.
geo_pol_reind = geo_pol.reset_index().rename(columns={'index':'pp_index'})
geo_sal_reind = geo_sal.reset_index().rename(columns={'index':'sal_index'})

#dict_int = calculate_join_indices(geo_pol_reind,geo_sal_reind)

test on a subset:

In [None]:
# Two Free State precincts against all Free State small areas.
# NOTE(review): the first mask is built from geo_pol but applied to
# geo_pol_reind; this relies on both frames sharing the same index.
gt1= geo_pol_reind[geo_pol.province=="Free State"].head(n=2)
# reset_index() here adds another 'index' column next to sal_index
gt2 = geo_sal_reind[geo_sal_reind.PR_NAME=="Free State"].reset_index()
d = calculate_join_indices(gt1, gt2)

Running the intersections on pre-computed indices:

In [None]:
from timeit import default_timer as timer

#start = timer() 

#df_inc, sum_area_inc, data_inc = calculate_join(dict_inc, geo_pol_reind, geo_sal_reind)
#end = timer()
#print("1st", end - start)  

# Time the full intersection run over the pre-computed index pairs.
# NOTE(review): dict_int is only assigned in a commented-out line of an
# earlier cell (calculate_join_indices on the full frames); it must exist
# before this cell is run.
start = timer() 
df_int, sum_area_int, data_int = calculate_join(dict_int, geo_pol_reind, geo_sal_reind)
end = timer()
print("2nd", end - start)  

find pol precincts within WC boundary

In [None]:
# Load the province boundaries and pick out the precincts touching one of them.
za_province = gpd.read_file('za-provinces.topojson',driver='GeoJSON')#.set_index('id')
# NOTE(review): 27700 is the EPSG code of the British National Grid and the
# value lacks the usual 'epsg:' prefix - confirm the intended CRS. Assigning
# .crs only labels the data, it does not reproject it.
za_province.crs={'init': '27700'}

# NOTE(review): .ix is deprecated/removed in newer pandas; row 8 is assumed
# to be the Western Cape - verify against the file.
wc_boundary = za_province.ix[8].geometry # WC
#pp_WC = geo_pol[geo_pol.geometry.within(wc_boundary)]
# precincts whose geometry intersects the boundary geometry
pp_WC_in = geo_pol[geo_pol.geometry.intersects(wc_boundary)]
#.unary_union, sal_wc_union_bound = sal_WC_in.unary_union
# split the hits by the province recorded on the precinct itself
pp_WC_overlaps = pp_WC_in[pp_WC_in.province!="Western Cape"]
pp_WC_pol_annot = pp_WC_in[pp_WC_in.province=="Western Cape"]

In [None]:
#pp_test = pp_WC_in[pp_WC_in['compnt_nm'].isin(['atlantis','philadelphia','kraaifontein','brackenfell','kuilsriver','kleinvleveerste river','macassar','somerset west','fish hoek'])]
#pp_test = pp_WC_in[pp_WC_in['compnt_nm'].isin(['beaufort west','doring bay','murraysburg', 'strandfontein','nuwerus','lutzville'])]
%matplotlib inline
#pp_WC_overlaps.plot()

Adding final columns:

In [None]:
# There are 101,546 intersections 
df_int_aea = compute_final_col(df_int) # add final calculations
# NOTE(review): written to 'pp_int_intersections2.csv', while a later cell
# reads 'pp_int_intersections.csv' - confirm which file is the current one.
df_int_aea.to_csv('data/pp_int_intersections2.csv')

Some intersections are multipolygons (PP and SAL intersect in multiple areas):

In [None]:
df_int_aea.head(n=3).values[2][0]

There are curious cases of intersections which form polygons. For example, a Free State police precinct, 'dewetsdorp', with a murder count of 1 (yet a high rate of stock theft: 52 in 2014) intersects SAL 4990011 (part of SP Mangaung NU) in two lines:

In [123]:
geo_sal_reind[geo_sal_reind.sal_index==28532].geometry.values[0]

NameError: name 'geo_sal_reind' is not defined

In [None]:
geo_pol_reind[geo_pol_reind.pp_index ==358].geometry.values[0]

In [None]:
# cascaded_union is used in this cell, so it is imported here rather than
# relying on a later cell having been run first
from shapely.ops import cascaded_union

# geometry of police precinct 358 and of small area 28532
a = geo_pol_reind[geo_pol_reind.pp_index == 358].geometry.values[0]
b = geo_sal_reind[geo_sal_reind.sal_index == 28532].geometry.values[0]
# reuse the two geometries instead of looking them up a second time
c = [a, b]
cascaded_union(c)


In [None]:
from shapely.ops import cascaded_union
cascaded_union(b)

In [None]:
geo_sal_reind[geo_sal_reind.sal_index==28532]

In [None]:
df_int_aea.to_file('data/pp_int_intersections.shp')

In [2]:
# When reading from a file:

import pandas as pd
df_int_aea = pd.read_csv('data/pp_int_intersections.csv')   

# when reading from file a column Unnamed is added. Needs to be removed.
# (any column whose lower-cased name starts with 'unnamed' is dropped)
cols = [c for c in df_int_aea.columns if c.lower()[:7] != 'unnamed']

df_int_aea=df_int_aea[cols]

In [3]:
df_int_aea.head(n=2)

Unnamed: 0,geometry,index_PP,index_SAL,area_pp,area_sal,area_inter,popu_inter,popu_sal,murd_cnt,province,...,SP_NAME,MN_CODE,MP_CODE,PR_CODE,SAL_CODE,SP_CODE,popu_frac_per_pp,murd_est_per_int,ratio_per_int,murd_est_per_sal
0,MULTIPOLYGON (((179538.273068833 -3355106.7785...,0,28532,55396370.0,198138700.0,17779.878339,0.00323,36,25,Free State,...,Mangaung NU,499,499002,4,4990011,499002001,81883.390378,9.862935e-07,3.945174e-08,0.011461
1,POLYGON ((176023.3021776396 -3358404.039455506...,26,28532,280920400.0,198138700.0,3081.161637,0.00056,36,28,Free State,...,Mangaung NU,499,499002,4,4990011,499002001,99917.86418,1.568782e-07,5.602792e-09,0.011461


In [4]:
data_prov = df_int_aea[['PR_NAME','province','murd_est_per_int']]
data_prov.groupby('province')['murd_est_per_int'].sum()

province
0                   0
Eastern Cape     3051
Free State        943
Gauteng          3671
Kwazulu/Natal    3759
Limpopo           777
Mpumalanga        831
North West        853
Northern Cape     411
Western Cape     3186
Name: murd_est_per_int, dtype: float64

In [None]:
data_prov.groupby('PR_NAME')['murd_est_per_int'].sum()

In [None]:
# check over small areas- sum of all the crimes should be 17482
# NOTE(review): df_inc_aea is not assigned in the cells shown here - confirm
# which table this check is meant to run on.
# One estimate per small area: for a repeated index_SAL the last row wins.
pom = dict(zip(df_inc_aea['index_SAL'], df_inc_aea['murd_est_per_sal']))
s = sum(pom.values())
print(s)

## measuring the error of the 'CPS' estimate
The lower (LB) and upper (UB) bounds are computed, wherever possible, in the following way:
UB: base the calculation of the population per PP only on the SALs included entirely within the PP. If that is not possible, set it to NaN.
LB: find all SALs intersecting a given PP, but base the PP population estimate on the population of the entire SAL, not on the population of the intersection.

As a result, each intersection will have a triplet of values associated with it: (LB, actual estimate, UB/NaN). The bounds are not additive; that is, the estimates apply only at the level of the SAL, and will not be maintained when summed over, e.g., SP or MN.

For modifying/selecting entries for the bound estimation, we discard the last 4 columns
with precomputed values.

In [4]:
# keep the first 20 columns by position; .iloc replaces the removed .ix indexer
df_int = df_int_aea.iloc[:, :20]

In [5]:
# this function adds the remaining columns, calculates fractions etc
def compute_final_col_bounds(df_aea):
    """Recompute population shares and murder estimates for a bounds table.

    Works like the main final-column computation, but on a table whose id
    columns are already called ``index_PP`` and ``index_SAL``. Adds
    ``popu_frac_per_pp``, ``murd_est_per_int``, ``ratio_per_int`` and
    ``murd_est_per_sal`` and returns the merged table.
    """
    # recalculate the population per police precinct
    pop_per_pp = df_aea.groupby(by=['index_PP'])['popu_inter'].sum().reset_index()
    merged = pd.merge(df_aea, pop_per_pp, on='index_PP', how='outer')
    merged = merged.rename(columns={'popu_inter_y': 'popu_frac_per_pp',
                                    'popu_inter_x': 'popu_inter'})

    # share of the precinct population attributed to each intersection
    share = merged['popu_inter'] / merged['popu_frac_per_pp']
    merged['murd_est_per_int'] = share * merged['murd_cnt']

    # murder estimate summed per small area
    per_sal = merged.groupby(by=['index_SAL'])['murd_est_per_int'].sum().reset_index()
    per_sal = per_sal.rename(columns={'murd_est_per_int': 'murd_est_per_sal'})

    merged['ratio_per_int'] = share

    data_complete = pd.merge(merged, per_sal, on='index_SAL', how='outer')
    return data_complete

create new tables for the LB and UB

In [6]:
# Collect the rows for the lower-bound (all intersections) and upper-bound
# (fully included small areas only) tables.
list_lb =[]
list_ub = []
for i,entry in df_int.iterrows():#f_inc_aea:
    # NOTE(review): exact float comparison - an area ratio that is only
    # approximately 1 is not treated as 'completely' included.
    if (entry.area_inter/entry.area_sal==1): # select those included 'completely'
        list_ub.append(entry)
        
    # for the lower bound the whole SAL population is attributed to the
    # intersection; the same row object may already sit in list_ub
    entry.popu_inter = entry.popu_sal # this is actually already true for the above if() case
    list_lb.append(entry)
        
df_int_aea_ub_p=gpd.GeoDataFrame(list_ub)
df_int_aea_lb_p=gpd.GeoDataFrame(list_lb)


In [8]:
# Lower bound: recompute the estimates and suffix the result columns with _lb.
df_int_aea_lb = compute_final_col_bounds(df_int_aea_lb_p)\
      .rename(columns={'murd_est_per_int':'murd_est_per_int_lb',\
      'ratio_per_int':'ratio_per_int_lb','murd_est_per_sal':'murd_est_per_sal_lb'})
# complete
# Upper bound: same computation on the fully-included rows, suffixed with _ub.
df_int_aea_ub = compute_final_col_bounds(df_int_aea_ub_p)\
      .rename(columns={'murd_est_per_int':'murd_est_per_int_ub',\
      'ratio_per_int':'ratio_per_int_ub','murd_est_per_sal':'murd_est_per_sal_ub'})

In [9]:
#check if numbers add up per province level (invariant for inclusion):
data_prov = df_int_aea_ub[['PR_NAME','province','murd_est_per_int_ub']]
data_prov.groupby('province')['murd_est_per_int_ub'].sum()

province
0                   0
Eastern Cape     3047
Free State        940
Gauteng          3670
Kwazulu/Natal    3758
Limpopo           777
Mpumalanga        830
North West        846
Northern Cape     410
Western Cape     3186
Name: murd_est_per_int_ub, dtype: float64

In [9]:
temp_ub = df_int_aea_ub.groupby(by=['SP_CODE'])['murd_est_per_int_ub'].sum().reset_index()
temp_lb = df_int_aea_lb.groupby(by=['SP_CODE'])['murd_est_per_int_lb'].sum().reset_index()
temp_est = df_int_aea.groupby(by=['SP_CODE'])['murd_est_per_int'].sum().reset_index()
temp = pd.merge(temp_lb, temp_est, on='SP_CODE', how='outer')
df_bounds = pd.merge(temp, temp_ub, on='SP_CODE', how='outer')

At the level of SP (and probably others) some bounds are inverted... UB < LB (2,242 out of 21,589)

In [11]:
#mn_bounds_def = mn_bounds[~mn_bounds.UB_murder.isnull()]
df_inv_bounds = df_bounds[df_bounds.murd_est_per_int_ub<df_bounds.murd_est_per_int_lb]

In [12]:
df_inv_bounds.tail()

Unnamed: 0,SP_CODE,murd_est_per_int_lb,murd_est_per_int,murd_est_per_int_ub
21524,987105001,0.868886,0.486138,0.157929
21573,987147001,1.319127,1.119528,0.709489
21577,987151001,0.829423,0.62239,0.444365
21584,987158001,0.288028,0.304164,0.260411
21586,987160001,0.118429,0.085994,0.046607


In [10]:
temp_ub = df_int_aea_ub.groupby(by=['SAL_CODE'])['murd_est_per_int_ub'].sum().reset_index()
temp_lb = df_int_aea_lb.groupby(by=['SAL_CODE'])['murd_est_per_int_lb'].sum().reset_index()
temp_est = df_int_aea.groupby(by=['SAL_CODE'])['murd_est_per_int'].sum().reset_index()

#       .rename(columns={'popu_inter_y':'popu_frac_per_pp', 'popu_inter_x':'popu_inter'})

In [62]:
temp = pd.merge(temp_lb, temp_est, on='SAL_CODE', how='outer')
df_bounds = pd.merge(temp, temp_ub, on='SAL_CODE', how='outer')

In [None]:
# distinct MN_NAME values, first as a set and then as a list
mn_names_set = set(df_int_aea_lb.MN_NAME)
mn_names = list(mn_names_set)

In [66]:
df_bounds.head(n=2)

Unnamed: 0,SAL_CODE,murd_est_per_int_lb,murd_est_per_int,murd_est_per_int_ub
0,1600001,2.549157,1.969173,
1,1600002,0.455872,0.586146,0.690684


In [76]:
# Keep rows that have an upper bound and a positive estimate, ordered by the
# estimate. The comparison is parenthesised because `&` binds tighter than
# `>`; sort_values replaces the removed DataFrame.sort.
df_bound_nonan = df_bounds[
    df_bounds.murd_est_per_int_ub.notnull() & (df_bounds.murd_est_per_int > 0)
].sort_values(['murd_est_per_int'])

Plotting the lower and upper bounds:

In [27]:
import warnings
warnings.filterwarnings('ignore')
import mpld3
from mpld3 import plugins
from mpld3.utils import get_id
#import numpy as np
import collections

from mpld3 import enable_notebook
enable_notebook()

In [96]:
def make_labels_points(dataf):
    """Build one tooltip label per row of a bounds table.

    Each label has the form ``"<SAL_CODE>  <estimate>, err: <ub - lb>"``.
    Returns the list of labels together with the number of rows.
    """
    n_rows = len(dataf)

    lower = np.array(dataf['murd_est_per_int_lb'])
    upper = np.array(dataf['murd_est_per_int_ub'])
    estimate = np.array(dataf['murd_est_per_int'])
    codes = np.array(dataf['SAL_CODE'])
    spread = upper - lower  # error

    labels = []
    for k in range(n_rows):
        head = str(codes[k]) + "  " + str(estimate[k])
        labels.append(head + ", err: " + str(spread[k]))

    return labels, n_rows

In [108]:
def make_scatter(dataf, outname, outtitle):
    """Write an interactive scatter plot of the estimates to an HTML file.

    Parameters
    ----------
    dataf : DataFrame
        Bounds table with ``SAL_CODE``, ``murd_est_per_int_lb``,
        ``murd_est_per_int`` and ``murd_est_per_int_ub`` columns.
    outname : str
        Path of the HTML file to write.
    outtitle : str
        Title shown above the plot.

    Every row becomes one point (x: running row number, y: estimate) whose
    colour and size scale with the width of the bound interval; hovering a
    point shows the label built by ``make_labels_points``.
    """
    lower = np.array(dataf['murd_est_per_int_lb'])
    upper = np.array(dataf['murd_est_per_int_ub'])
    estimate = np.array(dataf['murd_est_per_int'])
    spread = upper - lower  # error

    fig, ax = plt.subplots(subplot_kw=dict(axisbg='#EEEEEE'))
    N = len(dataf)
    scatter = ax.scatter(range(1, N + 1), estimate, c=100 * spread,
                         s=1000 * spread, alpha=0.3, cmap=plt.cm.jet,
                         color='blue', label='...')
    ax.set_title(outtitle, size=15)

    seqc, L = make_labels_points(dataf)
    labels12 = ['(SAL id, est: {0}'.format(seqc[i]) for i in range(L)]

    tooltip = plugins.PointLabelTooltip(scatter, labels=labels12)
    plugins.connect(fig, tooltip)

    ax.set_xlabel('SAL')
    ax.set_ylabel('murder rate', labelpad = 20)

    html_str = mpld3.fig_to_html(fig)
    # the context manager closes the file even if the write fails
    with open(outname, "w") as html_file:
        html_file.write(html_str)

In [109]:
make_scatter(df_bound_nonan.head(n=8000), 'bounds.html', "SAL estimation bounds")

In [116]:
df_bound_nonan[df_bound_nonan.SAL_CODE==3760001]

Unnamed: 0,SAL_CODE,murd_est_per_int_lb,murd_est_per_int,murd_est_per_int_ub
23425,3760001,0.007108,0.033304,1


In [117]:
df_int_aea_ub[df_int_aea_ub.SAL_CODE==3760001]

Unnamed: 0,geometry,index_PP,index_SAL,area_pp,area_sal,area_inter,popu_inter,popu_sal,murd_cnt,province,...,SP_NAME,MN_CODE,MP_CODE,PR_CODE,SAL_CODE,SP_CODE,popu_frac_per_pp,murd_est_per_int_ub,ratio_per_int_ub,murd_est_per_sal_ub
34727,POLYGON ((-63778.88587880091 -3367458.14169631...,551,23678,2075843000.0,623949.221161,623949.221161,25,25,1,Northern Cape,...,Belmont SP,376,376007,3,3760001,376007001,25,1,1,1


In [118]:
df_int_aea_lb[df_int_aea_lb.SAL_CODE==3760001]

Unnamed: 0,geometry,index_PP,index_SAL,area_pp,area_sal,area_inter,popu_inter,popu_sal,murd_cnt,province,...,SP_NAME,MN_CODE,MP_CODE,PR_CODE,SAL_CODE,SP_CODE,popu_frac_per_pp,murd_est_per_int_lb,ratio_per_int_lb,murd_est_per_sal_lb
56059,POLYGON ((-63778.88587880091 -3367458.14169631...,551,23678,2075843000.0,623949.221161,623949.221161,25,25,1,Northern Cape,...,Belmont SP,376,376007,3,3760001,376007001,3517,0.007108,0.007108,0.007108


In [122]:
df_int_aea_lb[df_int_aea_lb.index_PP==551]

Unnamed: 0,geometry,index_PP,index_SAL,area_pp,area_sal,area_inter,popu_inter,popu_sal,murd_cnt,province,...,SP_NAME,MN_CODE,MP_CODE,PR_CODE,SAL_CODE,SP_CODE,popu_frac_per_pp,murd_est_per_int_lb,ratio_per_int_lb,murd_est_per_sal_lb
55778,MULTIPOLYGON (((-68566.15538511297 -3393601.23...,551,23554,2075843000.0,578896500.0,2520.749,609,609,1,Northern Cape,...,Thembelihle NU,374,374002,3,3740011,374002001,3517,0.173159,0.173159,0.427418
55794,POLYGON ((-66737.06515200126 -3393564.16996531...,551,24662,2075843000.0,190850700.0,37167.5,478,478,1,Northern Cape,...,Letsemeng NU,460,460004,4,4600037,460004001,3517,0.135911,0.135911,0.600005
55796,POLYGON ((-63443.14436415431 -3387426.23596726...,551,24648,2075843000.0,314182400.0,47235.89,171,171,1,Northern Cape,...,Letsemeng NU,460,460004,4,4600004,460004001,3517,0.048621,0.048621,0.048621
55799,POLYGON ((-44493.3621372392 -3351749.539519794...,551,23614,2075843000.0,1049865000.0,871206600.0,233,233,1,Northern Cape,...,Siyancuma NU,376,376002,3,3760018,376002001,3517,0.06625,0.06625,0.202719
55802,POLYGON ((-59950.77325334126 -3380883.27168252...,551,24656,2075843000.0,551528100.0,158177.5,288,288,1,Northern Cape,...,Letsemeng NU,460,460004,4,4600015,460004001,3517,0.081888,0.081888,0.268638
55806,POLYGON ((-47553.77612615489 -3357547.14320729...,551,24647,2075843000.0,352007000.0,36017.31,164,164,1,Northern Cape,...,Letsemeng NU,460,460004,4,4600003,460004001,3517,0.046631,0.046631,0.15652
55832,POLYGON ((-110781.5374389185 -3372630.60518009...,551,23553,2075843000.0,2911161000.0,14887.74,329,329,1,Northern Cape,...,Thembelihle NU,374,374002,3,3740005,374002001,3517,0.093546,0.093546,0.568243
56003,MULTIPOLYGON (((-91211.75984021231 -3349104.35...,551,23626,2075843000.0,855312500.0,491248300.0,326,326,1,Northern Cape,...,Siyancuma NU,376,376002,3,3760031,376002001,3517,0.092693,0.092693,0.277815
56008,MULTIPOLYGON (((-68555.59171155476 -3393588.14...,551,23551,2075843000.0,520783100.0,193609600.0,164,164,1,Northern Cape,...,Thembelihle NU,374,374002,3,3740003,374002001,3517,0.046631,0.046631,0.087328
56010,MULTIPOLYGON (((-92877.05129965383 -3387927.47...,551,23548,2075843000.0,71010080.0,102565.8,135,135,1,Northern Cape,...,Hopetown SP,374,374001,3,3740001,374001006,3517,0.038385,0.038385,0.071886


In [119]:
df_int_aea[df_int_aea.index_PP==551]

Unnamed: 0,geometry,index_PP,index_SAL,area_pp,area_sal,area_inter,popu_inter,popu_sal,murd_cnt,province,...,SP_NAME,MN_CODE,MP_CODE,PR_CODE,SAL_CODE,SP_CODE,popu_frac_per_pp,murd_est_per_int,ratio_per_int,murd_est_per_sal
12624,MULTIPOLYGON (((-68566.15538511297 -3393601.23...,551,23554,2075843000.0,578896500.0,2520.749,0.002652,609,1,Northern Cape,...,Thembelihle NU,374,374002,3,3740011,374002001,750.649435,4e-06,4e-06,0.174968
12640,POLYGON ((-66737.06515200126 -3393564.16996531...,551,24662,2075843000.0,190850700.0,37167.5,0.093089,478,1,Northern Cape,...,Letsemeng NU,460,460004,4,4600037,460004001,750.649435,0.000124,0.000124,0.000388
12644,POLYGON ((-63443.14436415431 -3387426.23596726...,551,24648,2075843000.0,314182400.0,47235.89,0.025709,171,1,Northern Cape,...,Letsemeng NU,460,460004,4,4600004,460004001,750.649435,3.4e-05,3.4e-05,3.4e-05
12646,POLYGON ((-44493.3621372392 -3351749.539519794...,551,23614,2075843000.0,1049865000.0,871206600.0,193.349754,233,1,Northern Cape,...,Siyancuma NU,376,376002,3,3760018,376002001,750.649435,0.257577,0.257577,0.272762
12651,POLYGON ((-59950.77325334126 -3380883.27168252...,551,24656,2075843000.0,551528100.0,158177.5,0.082598,288,1,Northern Cape,...,Letsemeng NU,460,460004,4,4600015,460004001,750.649435,0.00011,0.00011,0.033179
12655,POLYGON ((-47553.77612615489 -3357547.14320729...,551,24647,2075843000.0,352007000.0,36017.31,0.01678,164,1,Northern Cape,...,Letsemeng NU,460,460004,4,4600003,460004001,750.649435,2.2e-05,2.2e-05,0.062639
27095,POLYGON ((-110781.5374389185 -3372630.60518009...,551,23553,2075843000.0,2911161000.0,14887.74,0.001683,329,1,Northern Cape,...,Thembelihle NU,374,374002,3,3740005,374002001,750.649435,2e-06,2e-06,0.114321
39336,POLYGON ((-61954.17405946804 -3341200.14737824...,551,23617,2075843000.0,807266400.0,277659700.0,83.579976,243,1,Northern Cape,...,Siyancuma NU,376,376002,3,3760021,376002001,750.649435,0.111344,0.111344,0.190883
55613,POLYGON ((-65630.1324002433 -3349495.455576321...,551,23607,2075843000.0,240731500.0,240696300.0,199.970746,200,1,Northern Cape,...,Siyancuma NU,376,376002,3,3760010,376002001,750.649435,0.266397,0.266397,0.266407
55660,MULTIPOLYGON (((-91211.75984021231 -3349104.35...,551,23626,2075843000.0,855312500.0,491248300.0,187.237923,326,1,Northern Cape,...,Siyancuma NU,376,376002,3,3760031,376002001,750.649435,0.249435,0.249435,0.298794


### Add gender data:

In [None]:
full_pop = pd.read_csv('data/sal_pop.csv')

In [None]:
def get_ratio(i, full_pop):
    """Return the share of women in row ``i`` of ``full_pop``.

    The row's ``Female`` and ``Male`` values are converted to ``int`` and the
    result is ``Female / (Male + Female)``. ``0`` is returned when the ratio
    cannot be computed: a value that is not a whole number (including NaN),
    a zero total, or a row position outside the table.
    """
    try:
        row = full_pop.iloc[i]  # fetch the row once instead of three times
        female = int(row.Female)
        male = int(row.Male)
        # float() forces true division regardless of the interpreter version
        return float(female) / (male + female)
    except (ValueError, TypeError, ZeroDivisionError, OverflowError, IndexError):
        return 0
    

In [None]:
wom_ratio = [get_ratio(i,full_pop) for i in range(len(full_pop))]

In [None]:
full_pop['wom_ratio'] = wom_ratio

In [None]:
full_pop.drop('Male', axis=1, inplace=True)

In [None]:
data_full = pd.merge(df_int_aea, full_pop, on='SAL_CODE')

In [None]:
data_full.head()

WARDS:

In [None]:
wardsShp =gpd.GeoDataFrame.from_file('../maps/data/Wards2011_aea.shp')

In [None]:
wardsShp.head(n=2)

In [None]:
za_province = gpd.GeoDataFrame.from_file('../south_africa_adm1.shp')#.set_index('id')


In [None]:
%matplotlib inline

In [None]:
#import matplotlib.pyplot as plt
from matplotlib.collections import PatchCollection
from descartes import PolygonPatch
import fiona
from shapely.geometry import Polygon, MultiPolygon, shape


def _read_multipolygon(path):
    """Load all feature geometries of a vector file into one MultiPolygon."""
    # the context manager closes the dataset once the geometries are copied
    with fiona.open(path) as source:
        return MultiPolygon([shape(feature['geometry']) for feature in source])


# the three layers drawn below: provinces, wards and small areas (SAL)
mp = _read_multipolygon('../south_africa_adm1.shp')
mpW = _read_multipolygon('../wards_delimitation/Wards_demarc/Wards2011.shp')
mpS = _read_multipolygon('shapefiles/oryginal/SAL_SA_2013.shp')

# define map extent
# NOTE(review): this latitude range differs from the -35..-22 box used by the
# other map cells, and `m` is not used for drawing in this cell - confirm.
lllon = 21
lllat = -18
urlon = 34
urlat = -8

# set up Basemap instance
m = Basemap(
    projection = 'merc',
    llcrnrlon = lllon, llcrnrlat = lllat, urcrnrlon = urlon, urcrnrlat = urlat,
    resolution='h')

# colour map and layer size, kept for per-polygon colouring; the patches
# below currently use the default face colour
# available colour maps are here: http://wiki.scipy.org/Cookbook/Matplotlib/Show_colormaps
cm = plt.get_cmap('RdBu')
num_colours = len(mpW.geoms)

fig = plt.figure(figsize=(16, 16))
ax = fig.add_subplot(111)
# axis limits: province bounds plus a 20% margin on every side
minx, miny, maxx, maxy = mp.bounds
w, h = maxx - minx, maxy - miny
ax.set_xlim(minx - 0.2 * w, maxx + 0.2 * w)
ax.set_ylim(miny - 0.2 * h, maxy + 0.2 * h)
ax.set_aspect(1)

# provinces first (no edge colour), then wards and small areas with a grey
# outline; .geoms iterates the member polygons on old and new shapely alike
patches = [PolygonPatch(p, alpha=1., zorder=1) for p in mp.geoms]
for layer in (mpW, mpS):
    patches.extend(
        PolygonPatch(p, ec='#4C4C4C', alpha=1., zorder=1) for p in layer.geoms)

ax.add_collection(PatchCollection(patches, match_original=True))

ax.set_xticks([])
ax.set_yticks([])
plt.title("SAL on Wards")
plt.show()

In [None]:
# define map extent
lllon = 15
lllat = -35
urlon = 33
urlat = -22

# set up Basemap instance
m = Basemap(
    projection = 'merc',
    llcrnrlon = lllon, llcrnrlat = lllat, urcrnrlon = urlon, urcrnrlat = urlat,
    resolution='h')

In [None]:
fig = plt.figure(figsize=(16, 16))
m.drawmapboundary(fill_color=None, linewidth=0)
m.drawcoastlines(color='#4C4C4C', linewidth=0.5)
m.drawcountries()
m.fillcontinents(color='#F2E6DB',lake_color='#DDF2FD')
#m.readshapefile('../wards_delimitation/Wards_demarc/Wards2011.sbh','Wards',drawbounds=False)
m.readshapefile('../maps/data/test','wards',drawbounds=False)

In [None]:
from itertools import chain

# read the layer extent; the context manager closes the dataset even if
# reading the bounds fails
with fiona.open('../maps/data/test.shp') as shp:
    bds = shp.bounds
extra = 0.01
# lower-left and upper-right corners, flattened to [minx, miny, maxx, maxy]
ll = (bds[0], bds[1])
ur = (bds[2], bds[3])
coords = list(chain(ll, ur))
# width and height of the extent
w, h = coords[2] - coords[0], coords[3] - coords[1]

In [None]:
m = Basemap(
    projection='tmerc',
    lon_0=24.000,
    lat_0=-24.0000,
    ellps = 'WGS84',
    llcrnrlon=coords[0] - extra * w,
    llcrnrlat=coords[1] - extra + 0.01 * h,
    urcrnrlon=coords[2] + extra * w,
    urcrnrlat=coords[3] + extra + 0.01 * h,
    lat_ts=0,
    resolution='i',
    suppress_ticks=True)
m.readshapefile(
    '../maps/data/test',
    'wards',
    color='none',
    zorder=2)

clean the utf problems

In [None]:
# Copy wards_sel.shp to wards_sel_cleaned.shp, passing every non-empty string
# attribute through unidecode on the way.
from unidecode import unidecode

with fiona.open(
        '../maps/data/wards_sel.shp', 'r') as source:

    # Create an output shapefile with the same schema,
    # coordinate systems. ISO-8859-1 encoding.
    # (the sink is opened with the source's metadata unchanged)
    with fiona.open(
            '../maps/data/wards_sel_cleaned.shp', 'w',
            **source.meta) as sink:

        # Identify all the str type properties.
        str_prop_keys = [
            k for k, v in sink.schema['properties'].items()
                if v.startswith('str')]

        for rec in source:

            # Transliterate and update each of the str properties.
            for key in str_prop_keys:
                val = rec['properties'][key]
                # empty / missing values are left untouched
                if val:
                    rec['properties'][key] = unidecode(val)

            # Write out the transformed record.
            sink.write(rec)

In [None]:
salSHP = 'shapefiles/updated/sal_population_4326.shp'
warSHP = '../wards_delimitation/Wards_demarc/Wards2011.shp'

geo_war = gpd.GeoDataFrame.from_file(warSHP)
geo_sal = gpd.GeoDataFrame.from_file(salSHP)

In [None]:
import pyepsg

pyepsg.get(geo_war.crs['init'].split(':')[1])

In [None]:
pyepsg.get(geo_sal.crs['init'].split(':')[1])

 To plot the data on a folium map, we need to convert to a geographic coordinate system with the WGS84 datum (EPSG: 4326). We also need to create a GeoJSON object out of the GeoDataFrame.
 AND, as it turns out (after many hours of tripping over the problem), we need to SIMPLIFY the geometries. They are too big for web maps.

In [None]:
warSHP = '../maps/data/Wards2011.shp'

geo_war = gpd.GeoDataFrame.from_file(warSHP)
#geo_sal = gpd.GeoDataFrame.from_file(salSHP_upd)


In [None]:
geo_war.head(n=2)

In [None]:
geo_war_sub = geo_war.iloc[:,[2,3,7,8,9]].reset_index().head(n=2)

In [None]:
#g = geo_war_sub.simplify(0.05, preserve_topology=False)

In [None]:
geo_war_sub.head(n=3)

In [None]:
geo_war_sub.to_file('../maps/data/wards_sel.shp')

In [None]:
# NOTE(review): `g` is only assigned in a commented-out statement (the
# simplify() call) in the cells shown here - it must be defined before this
# cell is run.
geo_war_sub['geometry'].replace(g,inplace=True)
#data['index_rank'].replace(index_dict, inplace=True)

In [None]:
geo_war_sub_sim.head(n=2)

In [None]:
salSHP = 'shapefiles/updated/sal_population.shp'
geo_sal = gpd.GeoDataFrame.from_file(salSHP)

In [None]:
#geo_sal.head(n=2)
geo_sal_sub = geo_sal.iloc[:,[7,11,15,16,20,23]].reset_index()#.head()

In [None]:
geo_sal_sub.to_file('../maps/data/sal_sub.shp')

In [None]:
#gjsonSal = geo_sal.to_crs(epsg='4326').to_json()# no need to convert, as it already is in 4326
#gjsonSal = geo_sal.to_json()
#gjsonWar = geo_war.to_json()
# Serialize g via its to_json() method.
# NOTE(review): in the visible cells `g` is only assigned by a commented-out simplify()
# call; unless it is defined elsewhere this raises NameError — confirm.
gj = g.to_json()

In [None]:
import folium
#import pandas as pd

# Bounding box corners (lower-left / upper-right longitude and latitude, presumably in
# degrees); the map below is centred on the midpoint of this box.
lllon = 15
lllat = -35
urlon = 33
urlat = -22
#state_geo = r'shapefiles/updated/sal_population.json'
#ward_path = r'../maps/data/test.geojson'

#state_geo = r'shapefiles/oryginal/SAL_SA_2013.json'
# GeoJSON path; it is not drawn in this cell because the geo_json calls below are commented out.
state_geo = r'../maps/data/sal.json'
#state_geo = r'temp_1E-7.topojson'

#Let Folium determine the scale
# NOTE(review): the name `map` shadows Python's builtin map() for the rest of the session.
map = folium.Map(location=[(lllat+urlat)/2, (lllon+urlon)/2], tiles='Mapbox Bright',zoom_start=6)
#,              tiles='cartodbpositron')
#map.geo_json(geo_path=state_geo)
#map.geo_json(geo_path=state_geoW)
#map.geo_json(geo_path=ward_path)

# Export the map to test.html (assumes a folium release that still provides create_map — TODO confirm).
map.create_map(path='test.html')

In [None]:
state_geo

In [None]:
# Bounding box corners (lower-left / upper-right longitude and latitude); the map is
# centred on the midpoint of this box.
lllon = 15
lllat = -35
urlon = 33
urlat = -22

import folium
#map = folium.Map(location=[-33.9249, 18.4241], zoom_start=10)

# Base map with the 'cartodbpositron' tile set; every overlay attempt below is commented out.
mapa = folium.Map([(lllat+urlat)/2, (lllon+urlon)/2],
                  zoom_start=7,
                  tiles='cartodbpositron')

#pSal = folium.features.GeoJson(gjsonSal)
#pWae = folium.features.GeoJson(gjsonWar)

#mapa.add_children(pSal)
#mapa.add_children(pWar)
#mapa.geo_json(gj)
#test = folium.folium.Map.geo_json(gj)
#ice_map.geo_json(geo_path=topo_path, topojson='objects.antarctic_ice_shelf')
#mapa.add_children(test)
# Export the map to test.html (same output path as the previous map cell).
mapa.create_map(path='test.html')

In [None]:
# Load the test shapefile into a GeoDataFrame.
testshp = '../maps/data/test.shp'
geo_test = gpd.GeoDataFrame.from_file(testshp)

In [None]:
import pyepsg
# Look up the test layer's CRS from the code after ':' in crs['init'] (assumes an 'epsg:NNNN'-style value).
pyepsg.get(geo_test.crs['init'].split(':')[1])

In [None]:
# Serialize the test layer with GeoDataFrame.to_json().
gjson = geo_test.to_json()

In [None]:
import folium
# Path handed to geo_json below as the overlay source.
geo_path = r'../maps/data/test.json'
map_osm = folium.Map(location=[-24.5236, 24.6750],zoom_start=6)
map_osm.geo_json(geo_path=geo_path)
# Export the map to osm.html (assumes a folium release that still provides geo_json/create_map — TODO confirm).
map_osm.create_map(path='osm.html')

Analytics based on intersections:

In [None]:
def find_intersections(o):
    """Group joined small-area indices by police-precinct index.

    Parameters
    ----------
    o : object with ``pp_index`` and ``sal_index`` attributes
        Two equal-length columns (each exposing ``.values``), e.g. the
        columns of a spatially joined DataFrame. Row ``i`` pairs one precinct
        index with one small-area index.

    Returns
    -------
    dict
        Maps each ``pp_index`` value to the list of ``sal_index`` values
        paired with it, in row order. Pairs whose two indices are equal are
        skipped, so a precinct that only has such pairs is absent from the
        result.
    """
    from collections import defaultdict

    grouped = defaultdict(list)

    for pp, sal in zip(o.pp_index.values, o.sal_index.values):
        # Equal indices are treated as a shape intersecting itself and dropped.
        # NOTE(review): when the two columns index different layers an equal
        # value may be a coincidence rather than a self-match — confirm the
        # filter is wanted in that case.
        if pp != sal:
            grouped[pp].append(sal)

    # A key only exists once something was appended to it, so no empty lists
    # need filtering; return a plain dict so lookups of unknown keys raise.
    return dict(grouped)
    
    
def calculate_join_indices(g1_reind, g2_reind):
    """Spatially join two layers and return their intersection lookup.

    Runs an inner ``intersects`` sjoin of ``g1_reind`` against ``g2_reind``,
    drops the ``index_right`` column from the joined frame, and passes the
    result to ``find_intersections``, whose dictionary is returned unchanged.
    """
    joined = sjoin(g1_reind, g2_reind, how="inner", op="intersects")

    # The joined frame carries a duplicate right-hand index column; remove it.
    joined.drop('index_right', axis=1, inplace=True)

    return find_intersections(joined)
    

In [None]:
#warSHP = '../maps/data/Wards2011_aea.shp'

#geo_war = gpd.GeoDataFrame.from_file(warSHP)

#salSHP = 'shapefiles/updated/sal_population_aea.shp'
#geo_sal = gpd.GeoDataFrame.from_file(salSHP)
#geo_sal = geo_sal.reset_index()

#geo_war_sub = geo_war.iloc[:,[2,3,7,8,9]].reset_index()#.head(n=2)
# Inner spatial join of the ward subset with the SAL layer: one output row per
# (ward, SAL) pair whose geometries intersect.
out = sjoin(geo_war_sub, geo_sal, how ="inner", op = "intersects")
        

In [None]:
# Keep a positional subset of the joined columns, turn the row index into a column,
# and name the two index columns explicitly: 'index' -> index_ward, 'index_right' -> index_sal.
out_sub = out.iloc[:,[2,3,5,6,15,23,24,28]].reset_index().rename(columns={'index':'index_ward','index_right':'index_sal'})

In [None]:
# Give the ward subset's former row index the name index_ward.
geo_war_sub = geo_war_sub.rename(columns={'index':'index_ward'})#head(n=2)
#head(n=2)
# Rebuild the SAL subset from a different positional column selection and expose its
# row index as index_sal.
geo_sal_sub = geo_sal.iloc[:,[5,11,16,17,19,21,24]].reset_index().rename(columns={'index':'index_sal'})                               

In [None]:
from collections import defaultdict

# The two index columns of the ward/SAL join: row i links one ward to one SAL.
paired_ind = [out_sub.index_ward, out_sub.index_sal]

# Map each ward index to the list of SAL indices joined to it, in row order.
dict_temp = defaultdict(list)
for ward_idx, sal_idx in zip(paired_ind[0].values, paired_ind[1].values):
    # Pairs with equal indices are dropped (carried over from the self-join case).
    # NOTE(review): ward and SAL indices come from different layers, so an equal
    # value may be a coincidence rather than a self-match — confirm the filter is wanted.
    if ward_idx != sal_idx:
        dict_temp[ward_idx].append(sal_idx)

# Plain-dict copy: a key only exists once something was appended to it, so there
# are no empty lists to filter out.
dict_int_ward = dict(dict_temp)

#dict_int_ward

In [None]:
def calculate_join_ward_sal(dict_over_ind, g1_reind, g2_reind):
    """Build one row per (ward, small area) pair describing their overlap.

    Parameters
    ----------
    dict_over_ind : dict
        Maps a ward's ``index_ward`` value to the ``index_sal`` values of the
        small areas (SALs) to intersect with that ward.
    g1_reind : GeoDataFrame
        Wards; the columns ``index_ward``, ``WARD_ID`` and ``geometry`` are
        read.
    g2_reind : GeoDataFrame
        Small areas; the columns ``index_sal``, ``geometry``, ``MP_NAME``,
        ``PR_NAME``, ``SAL_CODE`` and ``SP_NAME`` are read.

    Returns
    -------
    GeoDataFrame
        Columns ``geometry`` (the intersection shape), ``id1`` (ward index),
        ``ward_id``, ``id2`` (SAL index), ``area_int``, ``area_sal``,
        ``int_percent`` (``area_int / area_sal``) and the four SAL attribute
        columns.

    Notes
    -----
    Wards with no entry in ``dict_over_ind`` contribute no rows. A pair that
    cannot be processed (for example a SAL index missing from ``g2_reind``, or
    a geometry operation that raises) is reported through ``warnings.warn``
    and skipped; the remaining pairs of that ward are still processed.
    Areas are computed in the units of the layers' CRS — presumably an
    equal-area projection is intended; confirm with the caller.
    """
    import warnings

    extra_info_col = ['MP_NAME', 'PR_NAME', 'SAL_CODE', 'SP_NAME']
    cols = ['geometry', 'id1', 'ward_id', 'id2', 'area_int', 'area_sal',
            'int_percent'] + extra_info_col
    data_aggreg = []

    for _, row in g1_reind.iterrows():
        ward_index = row.index_ward

        # .get() skips wards that have no intersecting SALs without raising.
        for sal_index in dict_over_ind.get(ward_index, ()):
            try:
                # Filter the SAL table once and reuse the result for both the
                # geometry and the attribute columns.
                sal_rows = g2_reind[g2_reind.index_sal == sal_index]
                sal_geom = sal_rows['geometry']

                overlap = sal_geom.intersection(row['geometry'])
                area_int = overlap.area.values[0]
                area_sal = sal_geom.values[0].area

                record = {
                    'geometry': overlap.values[0],
                    'id1': ward_index,
                    'ward_id': row.WARD_ID,
                    'id2': sal_index,
                    'area_int': area_int,
                    'area_sal': area_sal,
                    'int_percent': area_int / area_sal,
                }
                for name in extra_info_col:
                    record[name] = sal_rows[name].values[0]
            except Exception as err:
                # Best effort per pair: report and move on instead of silently
                # discarding the rest of the ward. KeyboardInterrupt and
                # SystemExit are deliberately not caught.
                warnings.warn(
                    "skipping ward index %r / SAL index %r: %s: %s"
                    % (ward_index, sal_index, type(err).__name__, err)
                )
                continue

            data_aggreg.append(record)

    #df_t.to_file('shapefiles/sal_ward.shp')
    return gpd.GeoDataFrame(data_aggreg, columns=cols)
           

In [None]:
from timeit import default_timer as timer

# Time the ward/SAL overlay and print the elapsed timer difference.
start = timer() 
df = calculate_join_ward_sal(dict_int_ward,geo_war_sub, geo_sal_sub)
end = timer()
print("time: ", end - start)  

In [None]:
df.head()

In [None]:
# Dump the overlay table to df.csv in the working directory.
df.to_csv('df.csv')

In [None]:
# Keep only the pairs where the intersection is smaller than the SAL's own area
# (int_percent < 1), i.e. the SAL is not reported as fully covered by the ward.
df_nc = df[df.int_percent<1]
#df.groupby(by=['ward_id']).sum()

In [None]:
# Group the partially covered pairs by PR_NAME and ward_id (no aggregation is applied here).
s = df_nc.groupby(by=['PR_NAME','ward_id'])

In [None]:
type(s)

In [None]:
#There are 4277 wards
len(geo_war)

In [None]:
# all wards have intersections
len(set(df_nc.ward_id))

In [None]:
#84907 SAL areas
len(geo_sal_sub)

In [None]:
# half of the intersect
len(set(df_nc.SAL_CODE))

40515 out of 84907 SALs intersect ward borders.
Let's see whether the intersections generated from PP and SAL fit better.

In [None]:
#trying the intersections
geo_int_p = pd.read_csv('data/pp_int_intersections.csv')

In [None]:
geo_war_sub.crs

In [None]:
#geo_int.head(n=2)
geo_int = gpd.GeoDataFrame(geo_int_p, crs=geo_war_sub.crs)

In [None]:
#geo_int.head(n=2)
cols = [c for c in geo_int.columns if c.lower()[:7] != 'unnamed']
geo_int = geo_int[cols]

In [None]:
geo_int.head(n=2)
# Reorder the first three columns to positions (1, 2, 0) and expose the row index as index_int.
geo_int_sub = geo_int.iloc[:,[1,2,0]].reset_index().rename(columns={'index':'index_int'})                            

In [None]:
geo_sal_sub.head(n=1)

In [None]:
geo_int_sub.geometry.head()

In [None]:
geo_war_sub.head(n=2)

In [None]:
# Trial join of only the first ward against the intersections layer.
# This rebinds `out`, replacing the ward/SAL join result from the earlier cell.
out = sjoin(geo_war_sub.head(n=1), geo_int_sub, how ="inner", op = "intersects")


In [None]:
geo_war_sub.head(n=2)

In [None]:
type(geo_int)

In [None]:
geo_int.crs

In [None]:
# Load data/pp_test2.csv with pandas and wrap it in a GeoDataFrame (no CRS is passed).
test = gpd.GeoDataFrame(pd.read_csv('data/pp_test2.csv'))

In [None]:
# Dump the ward subset to auch.csv in the working directory.
geo_war_sub.to_csv('auch.csv')

In [None]:
test.plot()

In [None]:
# Draw the first geometry of df_int on a fresh axes with a thin grey outline.
# The keyword was misspelled 'edgecolr', so the grey edge colour was not passed
# to plot_multipolygon's `edgecolor` parameter.
f,ax = plt.subplots(1)
gpd.plotting.plot_multipolygon(ax, df_int.head(n=2).geometry.values[0], linewidth = 0.1, edgecolor='grey')
plt.show()

In [None]:
df_int.head(n=2).geometry.values[0]