In [7]:
%load_ext autoreload
%autoreload 2

# Mobile Money Agent Features

All features derived from `mobilemoney_agents_for_upload_win.csv` are calculated here, and then added to the `Transformer` in `src/features/mobile_money.py`

In [3]:
import pandas as pd
import numpy as np

from pysal.lib.cg import KDTree, RADIUS_EARTH_KM

import warnings
warnings.simplefilter('ignore')

In [4]:
train_df = pd.read_csv('../../data/raw/training.csv', index_col=0)
train_df.head()

Unnamed: 0_level_0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8_1,Q8_2,Q8_3,...,Q17,Q18,Q19,Latitude,Longitude,mobile_money,savings,borrowing,insurance,mobile_money_classification
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5086,98,2,3,1,1,2,2,0,0,0,...,-1,4,4,-4.460442,29.811396,0,0,0,0,0
1258,40,1,1,3,5,1,1,1,0,0,...,4,1,4,-6.176438,39.244871,1,1,1,0,3
331,18,2,4,6,3,2,1,0,0,0,...,-1,1,1,-6.825702,37.652798,1,0,0,0,2
6729,50,1,1,3,1,1,1,0,0,0,...,-1,1,4,-3.372049,35.808307,1,0,1,0,3
8671,34,1,1,1,1,2,1,0,1,0,...,-1,1,4,-7.179645,31.039095,1,1,0,1,3


In [5]:
mm_df = pd.read_csv('../../data/raw/FSDT_FinAccessMapping/mobilemoney_agents_for_upload_win.csv')
print(mm_df.shape)
mm_df.head()

(45429, 112)


Unnamed: 0,region,district,ward,latitude,longitude,agent_name,till_operator,agent_type,standalone_or_other_business,other_business_type,...,other_local_services,other_local_services_details,branding_but_no_service,branding_but_no_service_mpesa,branding_but_no_service_tigo_pesa,branding_but_no_service_ezy_pesa,branding_but_no_service_airtel_money,branding_but_no_service_max_malipo,branding_but_no_service_selcom,branding_but_no_service_button Pay
0,Ruvuma,Tunduru,Kalulu,-10.28298,40.18363,Yusuph Shop,employee,Aggregated,Standalone agent,,...,,,Yes,Yes,,,,,,
1,Ruvuma,Tunduru,Kalulu,-10.28457,40.18295,Abdulazak Shop,owner,Aggregated,Other business,Duka,...,,,No,,,,,,,
2,Kilimanjaro,Moshi Urban,Bondeni,-3.36025,37.33803,Hussein Ramadhani Shop,employee,Independent,Standalone agent,,...,,,,,,,,,,
3,Dar es Salaam,Kinondoni,Manzese,-6.80213,39.22978,Mechmaster Ltd,employee,Aggregated,Standalone agent,,...,,,,,,,,,,
4,Kilimanjaro,Moshi Urban,Kiusa,-3.35613,37.33382,Bernadeta Mtui,owner,Independent,Other business,Duka,...,,,,,,,,,,


In [6]:
useful_cols_base = [
    'region',
    'district',
    'ward',
    'latitude',
    'longitude',
    'agent_type',
    'standalone_or_other_business',
    'other_business_type',
    'trading_hours'
]


other_cols = [
    'provider_tigo_pesa',
    'deposits_tigo_pesa',
    'withdrawls_tigo_pesa',
    'tigo_pesa_training',
    'tigo_pesa_visible_guidance',
    'provider_airtel_cash',
    'deposits_airtel_cash',
    'withdrawls_airtel_cash',
    'airtel_cash_training',
    'airtel_cash_log_books',
    'provider_ezy_pesa',
     'deposits_ezy_pesa',
    'withdrawls_ezy_pesa',
]

# pick more as this progresses
# list(mm_df)

### Feature: No. of Mobile Money Agents within X km


In [28]:
from sklearn.base import BaseEstimator, TransformerMixin

class MMAgentsInVicinity(BaseEstimator, TransformerMixin):

    def __init__(self, mm_coords, radius=5):
        self.tree = self.create_tree(mm_coords)
        self.radius = radius

    def create_tree(self, coords):
        return KDTree(coords, distance_metric='ARC', radius=RADIUS_EARTH_KM)

    def fit(self, X, y=None, **fit_params):
        return self

    def transform(self, X, y=None):
        x_coords = X[['Latitude', 'Longitude']]

        agents_in_radius = x_coords.apply(
            lambda coords: len(self.tree.query_ball_point(coords, r=self.radius)), axis=1)

        return pd.DataFrame(agents_in_radius)
    
    
class ColumnExtractor(BaseEstimator, TransformerMixin):
    
    def __init__(self, columns=None):
        self.columns = columns
    
    def fit(self, X, y=None, **fit_params):
        return self
    
    def transform(self,X, y=None, **transform_params):
        assert self.columns is not None, 'ColumnExtractor initialized without list of columns'
        return X[self.columns]
        

## Test the transformers

In [31]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

def extract_targets(train_df):
    X = train_df.drop(['mobile_money', 'savings', 'borrowing', 'insurance', 'mobile_money_classification'], axis=1)
    y = train_df['mobile_money_classification']
    return X, y

X, y = extract_targets(train_df)
X_train, X_test, y_train, y_test = train_test_split(X, y)

CONTINUOUS_COLUMNS = ['Q1']

mm_coords = mm_df[['latitude','longitude']].values

pipeline = Pipeline([
    ('features', FeatureUnion([
        ('continuous', Pipeline([
            ('extract', ColumnExtractor(CONTINUOUS_COLUMNS)),
            ('scale', StandardScaler())
        ])),
        ('agents_in_vicinity', MMAgentsInVicinity(mm_coords, radius=5))
    ])), 
    ('clf',LogisticRegression())
])
pipeline.fit(X_train, y_train)


print(log_loss(y_test, pipeline.predict_proba(X_test)))

1.2332691551110588
