Notebook to test the Fuzzywuzzy library https://github.com/seatgeek/fuzzywuzzy for geolocation of the survey data.

In [None]:
import pandas as pd
import geopandas as gpd
import os
from fuzzywuzzy import fuzz
import numpy as np

In [None]:
# Root of the data directory; the AA_DATA_DIR environment variable must be set (KeyError otherwise)
data_dir = os.environ['AA_DATA_DIR']

# Survey locations (second round, per the file name)
df_srv = pd.read_csv(os.path.join(data_dir, 'exploration', 'bangladesh', 'CDP_Survey', 'secondround_locations.csv'))
# Remove the '*' characters from the mouza names and cast the column to str.
# The pattern is a raw string: in a normal string literal '\*' is an invalid escape sequence,
# which newer Python versions report with a warning.
df_srv['C04_mouza_name'] = df_srv['C04_mouza_name'].replace(r'\*', '', regex=True).astype(str)

# Shapefile with the mauza features that the survey locations are matched against
df_shp = gpd.read_file(os.path.join(data_dir, 'exploration', 'bangladesh', 'ADM_Shp', 'selected_distict_mauza.shp'))

To reduce duplicates we need to use the full combination of admin names.
In the shapefile even the combination of Mauza, Union, Upazilla and District is still not always unique.
In many cases this is due to issues in the input shapefile. It does not affect the statistical analysis much, as the duplicated features would be in the same area.

In [None]:
# Count how often each (Mauza, Union, Upazila, District) name combination occurs in the
# shapefile and keep only the combinations that occur more than once.
duplicate_mozas = (
    df_shp[['MAUZNAME', 'UNINAME', 'THANAME', 'DISTNAME']]
    .value_counts()
    # blank out the combinations that occur only once ...
    .where(lambda counts: counts > 1)
    # ... and discard them
    .dropna()
)
print(duplicate_mozas)

We create a new string that is the combination of Mauza, Union, Upazilla and District names, both for the survey data and for the shapefile 

In [None]:
# Columns holding the Mauza, Union, Upazila and District names in each dataset
cols_shp = ['MAUZNAME', 'UNINAME', 'THANAME', 'DISTNAME']
cols_srv = ['C04_mouza_name', 'C04_union_name', 'C04_upazila_name', 'C04_district_name']

# Shapefile: cast the name columns to str, then join them row by row with commas
df_shp[cols_shp] = df_shp[cols_shp].astype(str)
df_shp['shp_full_name'] = df_shp[cols_shp].agg(lambda names: ','.join(names), axis=1)

# Survey: same treatment, so both full names share the same layout
df_srv[cols_srv] = df_srv[cols_srv].astype(str)
df_srv['srv_full_name'] = df_srv[cols_srv].agg(lambda names: ','.join(names), axis=1)

In [None]:
def get_best_fuzzy_match(srv_name, df_shp, threshold=70, scorer=None):
    """Return the shapefile feature whose full name best matches ``srv_name``.

    Every ``shp_full_name`` in ``df_shp`` is scored against ``srv_name``. The
    feature with the highest score is returned, provided that score reaches
    ``threshold``. If several features share the highest score, the one with
    the largest ``Shape_Area`` is selected.

    Parameters
    ----------
    srv_name : str
        Full name from the survey data.
    df_shp : pandas.DataFrame
        Shapefile attributes; the columns ``OBJECTID``, ``Shape_Area`` and
        ``shp_full_name`` are read.
    threshold : number, default 70
        Minimum score required to accept a match.
    scorer : callable, optional
        Function called as ``scorer(shp_full_name, srv_name)`` that returns a
        numeric score. Defaults to ``fuzz.ratio`` from
        https://github.com/seatgeek/fuzzywuzzy

    Returns
    -------
    dict
        Always has the keys ``srv_full_name``, ``shp_id``, ``shp_full_name``,
        ``n_matches`` and ``matching_score``. When no feature reaches the
        threshold, ``n_matches`` is 0, ``shp_id`` and ``shp_full_name`` are
        NaN and ``matching_score`` is the best (insufficient) score found,
        or NaN if ``df_shp`` has no rows.
    """
    if scorer is None:
        scorer = fuzz.ratio

    # Result used when nothing matches. It carries the same keys as a successful
    # match so that a table built from these dicts always has the 'shp_id' and
    # 'shp_full_name' columns, even if no name could be matched at all.
    no_match = {
            'srv_full_name': srv_name,
            'shp_id': float('nan'),
            'shp_full_name': float('nan'),
            'n_matches': 0,
            'matching_score': float('nan')
            }
    if len(df_shp) == 0:
        # nothing to compare against
        return no_match

    # calculate all fuzzy scores relative to srv_name
    scores = df_shp['shp_full_name'].apply(lambda x: scorer(x, srv_name))
    # create a dataframe to perform some operations
    data = pd.DataFrame(
            {'shp_id': df_shp['OBJECTID'],
            'shp_area': df_shp['Shape_Area'],
            'shp_full_name': df_shp['shp_full_name'],
            'fuzzy_score': scores}
            )
    max_score = data['fuzzy_score'].max()
    # written as "not >=" so that a NaN score is also treated as no match
    if not max_score >= threshold:
        no_match['matching_score'] = max_score
        return no_match

    # get only results with maximum scores
    best = data.loc[data['fuzzy_score'] == max_score]
    n_matches = len(best)
    # if there are multiple choices we get the largest one
    largest = best.loc[best['shp_area'].idxmax()]
    return {
            'srv_full_name': srv_name,
            'shp_id': largest['shp_id'],
            'shp_full_name': largest['shp_full_name'],
            'n_matches': n_matches,
            'matching_score': max_score
            }

Using the function defined above we can now match the survey names (srv_name) with the corresponding name in the shapefile and the OBJECTID of the shapefile feature.
We only match the set of unique survey names to reduce computing time.

In [None]:
# Match every distinct survey name only once; the survey rows are joined back on 'srv_full_name' afterwards
set_srv_full_name = df_srv['srv_full_name'].unique()
# TODO put in function
# Collect one result dict per name and build the frame in a single step:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every iteration.
matching_df = pd.DataFrame(
    [get_best_fuzzy_match(srv_full_name, df_shp) for srv_full_name in set_srv_full_name]
)

We can finally do a left join on the survey data to get the OBJECTID of the associated shapefile feature

In [None]:
# Attach the matching result to every survey row; 'srv_full_name' is the key on both sides
matched_srv = df_srv.merge(matching_df, on='srv_full_name', how='left')

# check total #of rows
print(f'total rows:{len(matched_srv)}')

# check multiple matches (rows whose best score was shared by more than one feature)
print('multiple matches: {}'.format(int((matched_srv['n_matches'] > 1).sum())))

# check no matches
print('No matches: {}'.format(int((matched_srv['n_matches'] == 0).sum())))


We can finally merge back the shapefile and count the number of people per union and district to check whether it matches the inputs.
This is done using the RECONSTRUCTED geolocation and is useful to cross-check the quality of the geolocation.

In [None]:
# Bring in the shapefile attributes of the matched feature; shapefile columns whose
# names clash with the survey table get the '_shp' suffix
matched_srv_shp = matched_srv.merge(
    df_shp,
    left_on='shp_id',
    right_on='OBJECTID',
    how='left',
    suffixes=['', '_shp'],
)
# the shapefile's copy of the full name and the geometry are not needed for the checks below
matched_srv_shp = matched_srv_shp.drop(columns=['shp_full_name_shp', 'geometry'])

# QA: interviewee counts based on the admin names of the matched shapefile feature
print('Number of interviewees by district')
print(matched_srv_shp.groupby('DISTNAME').size())

print('Number of interviewees by union')
print(matched_srv_shp.groupby('UNINAME').size())

# Save the survey rows with their matching result (without the shapefile attributes)
matched_srv.to_csv(
    os.path.join(data_dir, 'exploration', 'bangladesh', 'CDP_Survey', 'secondround_locations_matched.csv')
)
#writer = pd.ExcelWriter(f'{dir_path}/mauza_data/household_locations_impactevaluation_matched.xlsx', engine='xlsxwriter')
#matched_srv_shp.to_excel(writer,sheet_name='matched_data')
#matched_srv_shp.groupby('DISTNAME').size().to_excel(writer,sheet_name='QA_HH_District')
#matched_srv_shp.groupby('UNINAME').size().to_excel(writer,sheet_name='QA_HH_Union')
#writer.save()