# Regional Features


Given lat,lon coordinates calculate closest Region, District & Ward customer belongs to

In [16]:
import pandas as pd
import numpy as np

from pysal.lib.cg import KDTree, RADIUS_EARTH_KM

## Load Data

### Customer Coordinates

In [17]:
# Read customer coordinates
customer_coords = pd.read_csv('../../data/raw/training.csv', index_col=0, usecols=(0, 30, 31))
customer_coords.rename(columns={'Latitude': 'lat', 'Longitude': 'lon'}, inplace=True)
customer_coords.head()

Unnamed: 0_level_0,lat,lon
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
5086,-4.460442,29.811396
1258,-6.176438,39.244871
331,-6.825702,37.652798
6729,-3.372049,35.808307
8671,-7.179645,31.039095


### Region, district and ward coordinates

In [18]:
# Regions
regions_df = pd.read_csv('../../data/processed/regions_coords.csv', index_col=0)
regions_df.rename(columns={'latitude': 'lat', 'longitude': 'lon'}, inplace=True)
regions_list = list(regions_df.index)

# Districts
district_df = pd.read_csv('../../data/processed/district_coords.csv', index_col=0)
district_df.rename(columns={'latitude': 'lat', 'longitude': 'lon'}, inplace=True)
districts_list = list(district_df.index)

# Districts
wards_df = pd.read_csv('../../data/processed/ward_coords.csv', index_col=0)
wards_df.rename(columns={'latitude': 'lat', 'longitude': 'lon'}, inplace=True)
wards_list = list(wards_df.index)

regions_df.head()

Unnamed: 0_level_0,lat,lon
region,Unnamed: 1_level_1,Unnamed: 2_level_1
Arusha,-3.352761,36.615069
Dar es Salaam,-6.819873,39.225717
Dodoma,-6.06938,35.872637
Iringa,-8.501689,35.025756
Kagera,-1.907718,31.572764


## Create KDTree and select closest

In [19]:
class ArcTreeMatcher():
    """
    
    """
    
    def __init__(self, return_dist=True, n=1):
        self.return_dist = return_dist
        self.n = n
       
        
    def _check_df(self, df):
        assert 'lat' in list(df), '"lat" column not found'
        assert 'lon' in list(df), '"lon" column not found'
    
    
    def create_tree(self, dest_df):
        print('creating new tree')
        self._check_df(dest_df)
         # Create tree
        self.tree = KDTree(dest_df[['lat','lon']].values, distance_metric='ARC', radius=RADIUS_EARTH_KM)
        
    
    def match_closest(self, lat, lon):
        
        coords = np.array([lat,lon])
        
        dist, idx = self.tree.query(coords, self.n)
        
        if self.return_dist:
            return idx, dist
        else:
            return idx
        
    def do_match(self, source_df, dest_df, prefix='dest'):
        # Check input data
        self._check_df(source_df)
        self._check_df(dest_df)
        
        self.create_tree(dest_df)
        
        df = source_df.copy()
        
        df[f'{prefix}_id'], df[f'{prefix}_distance'] = zip(*map(self.match_closest, source_df.lat, source_df.lon))
    
        return df
        

In [20]:
tree_matcher = ArcTreeMatcher(return_dist=True, n=1)

### Region

In [21]:
customer_coords = tree_matcher.do_match(customer_coords, regions_df, prefix='region')
customer_coords['region'] = customer_coords.region_id.apply(lambda id: regions_list[id])
customer_coords.head()

creating new tree


Unnamed: 0_level_0,lat,lon,region_id,region_distance,region
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5086,-4.460442,29.811396,5,24.63922,Kigoma
1258,-6.176438,39.244871,21,24.238709,Zanzibar
331,-6.825702,37.652798,11,33.490434,Morogoro
6729,-3.372049,35.808307,0,89.724488,Arusha
8671,-7.179645,31.039095,15,40.15018,Rukwa


### District

In [22]:
customer_coords = tree_matcher.do_match(customer_coords, district_df, prefix='district')
customer_coords['district'] = customer_coords.district_id.apply(lambda id: districts_list[id])
customer_coords.head()

creating new tree


Unnamed: 0_level_0,lat,lon,region_id,region_distance,region,district_id,district_distance,district
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
5086,-4.460442,29.811396,5,24.63922,Kigoma,28,24.600008,Kasulu
1258,-6.176438,39.244871,21,24.238709,Zanzibar,123,24.238709,Zanzibar
331,-6.825702,37.652798,11,33.490434,Morogoro,70,1.700766,Morogoro Urban
6729,-3.372049,35.808307,0,89.724488,Arusha,27,16.123696,Karatu
8671,-7.179645,31.039095,15,40.15018,Rukwa,91,29.2643,Nkansi


### Ward

In [24]:
customer_coords = tree_matcher.do_match(customer_coords, wards_df, prefix='ward')
customer_coords['ward'] = customer_coords.ward_id.apply(lambda id: wards_list[id])
customer_coords.head()

creating new tree


Unnamed: 0_level_0,lat,lon,region_id,region_distance,region,district_id,district_distance,district,ward_id,ward_distance,ward
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
5086,-4.460442,29.811396,5,24.63922,Kigoma,28,24.600008,Kasulu,73,25.115583,Heru Ushingo
1258,-6.176438,39.244871,21,24.238709,Zanzibar,123,24.238709,Zanzibar,248,2.650153,Magogoni
331,-6.825702,37.652798,11,33.490434,Morogoro,70,1.700766,Morogoro Urban,246,0.332536,Mafiga
6729,-3.372049,35.808307,0,89.724488,Arusha,27,16.123696,Karatu,343,5.875602,Mto wa Mbu
8671,-7.179645,31.039095,15,40.15018,Rukwa,91,29.2643,Nkansi,118,218.027062,Kaliua
