***Objective***

The objective of this competition is to create a machine learning model to predict which individuals are most likely to have or use a bank account. The models and solutions developed can provide an indication of the state of financial inclusion in Kenya, Rwanda, Tanzania and Uganda, while providing insights into some of the key demographic factors that might drive individuals’ financial outcomes.


In [1]:
#load libraries
import pandas as pd
import numpy as np
%matplotlib inline
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder  
from sklearn.preprocessing import StandardScaler  
from scipy import sparse
from catboost import CatBoostClassifier, Pool
import lightgbm as lgb
import xgboost as xgb
from math import sqrt
from sklearn.metrics import mean_squared_error
from scipy.stats import uniform, randint
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV, RandomizedSearchCV
import random

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [14]:
#load data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sample_submission = pd.read_csv("sample_submission.csv")
description_data = pd.read_csv("VariableDescription.csv")

In [5]:
train.head()

Unnamed: 0,country,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,uniqueid_1,Yes,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,2018,uniqueid_2,No,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,2018,uniqueid_3,Yes,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,2018,uniqueid_4,No,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,2018,uniqueid_5,No,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed


In [6]:
test.head()

Unnamed: 0,country,year,uniqueid,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,uniqueid_6056,Urban,Yes,3,30,Male,Head of Household,Married/Living together,Secondary education,Formally employed Government
1,Kenya,2018,uniqueid_6060,Urban,Yes,7,51,Male,Head of Household,Married/Living together,Vocational/Specialised training,Formally employed Private
2,Kenya,2018,uniqueid_6065,Rural,No,3,77,Female,Parent,Married/Living together,No formal education,Remittance Dependent
3,Kenya,2018,uniqueid_6072,Rural,No,6,39,Female,Head of Household,Married/Living together,Primary education,Remittance Dependent
4,Kenya,2018,uniqueid_6073,Urban,No,3,16,Male,Child,Single/Never Married,Secondary education,Remittance Dependent


In [7]:
sample_submission.head() 

Unnamed: 0,uniqueid,bank_account
0,uniqueid_7867 x Kenya,1.0
1,uniqueid_6722 x Kenya,0.0
2,uniqueid_6714 x Kenya,1.0
3,uniqueid_8103 x Kenya,1.0
4,uniqueid_8657 x Kenya,1.0


In [8]:
description_data.head()

Unnamed: 0,Variable Definitions,Unnamed: 1
0,country,Country interviewee is in.
1,year,Year survey was done in.
2,uniqueid,Unique identifier for each interviewee
3,location_type,"Type of location: Rural, Urban"
4,cellphone_access,"If interviewee has access to a cellphone: Yes, No"


In [9]:
train.isnull().sum()

country                   0
year                      0
uniqueid                  0
bank_account              0
location_type             0
cellphone_access          0
household_size            0
age_of_respondent         0
gender_of_respondent      0
relationship_with_head    0
marital_status            0
education_level           0
job_type                  0
dtype: int64

In [15]:
#uniqueid + " x " + country name
train['uniqueid'] = train['uniqueid'] + " x " + train['country']
test['uniqueid'] = test['uniqueid'] + " x " + test['country']

In [16]:
train.head()

Unnamed: 0,country,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,uniqueid_1 x Kenya,Yes,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,2018,uniqueid_2 x Kenya,No,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,2018,uniqueid_3 x Kenya,Yes,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,2018,uniqueid_4 x Kenya,No,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,2018,uniqueid_5 x Kenya,No,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed


In [None]:
#non categorical = age_of_respondent

In [18]:
test_uniqueid = test['uniqueid']
test_uniqueid.reset_index(drop=True, inplace=True)

In [25]:
train = train[['country','year','bank_account','location_type','cellphone_access','household_size',
               'age_of_respondent','gender_of_respondent','relationship_with_head','marital_status',
               'education_level','job_type']]

In [26]:
test = test[['country','year','location_type','cellphone_access','household_size',
               'age_of_respondent','gender_of_respondent','relationship_with_head','marital_status',
               'education_level','job_type']]

In [27]:
train_categorical = ['country','year','location_type','cellphone_access','household_size',
               'gender_of_respondent','relationship_with_head','marital_status',
               'education_level','job_type']
test_categorical = ['country','year','location_type','cellphone_access','household_size',
               'gender_of_respondent','relationship_with_head','marital_status',
               'education_level','job_type']

In [42]:
train["bank_account"] = train["bank_account"].astype('category')
train["bank_account"] = train["bank_account"].cat.codes

In [43]:
train.head()

Unnamed: 0,country,year,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,1,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,2018,0,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,2018,1,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,2018,0,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,2018,0,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed


In [44]:
#Catboost
#split training data set
X_train1 = train.drop(['bank_account'], axis=1)
y_train1 = train.bank_account

#split test dataset
X_test1 = test
#y_test1 = df_test_cat.YIELD

def column_index(df, query_cols):
    cols = df.columns.values
    sidx = np.argsort(cols)
    return sidx[np.searchsorted(cols, query_cols, sorter=sidx)]

categorical_features_indices = column_index(X_train1, train_categorical)
categorical_features_indices1 = column_index(X_test1, test_categorical)


In [None]:
from catboost import CatBoostClassifier
from sklearn import metrics
#import cb as catboost

def auc(m, train, test): 
    return (metrics.roc_auc_score(y_train,m.predict_proba(train)[:,1]),
                            metrics.roc_auc_score(y_test,m.predict_proba(test)[:,1]))

params = {'depth': [4, 7, 10],
          'learning_rate' : [0.03, 0.1, 0.15],
         'l2_leaf_reg': [1,4,9],
         'iterations': [300]}

cb = CatBoostClassifier()
cb_model = GridSearchCV(cb, params, scoring="roc_auc", cv = 3)
#cb_model.fit(X_train1, y_train1)

#Without Categorical features
#clf = cb.CatBoostClassifier(eval_metric="AUC", depth=10, iterations= 500, l2_leaf_reg= 9, learning_rate= 0.15)
#clf.fit(X_train1,y_train1)
#auc(clf, train, test)

#With Categorical features
clf = CatBoostClassifier(eval_metric="AUC",one_hot_max_size=31, depth=10, iterations= 500, l2_leaf_reg= 9, learning_rate= 0.15)
clf.fit(X_train1,y_train1, cat_features= categorical_features_indices)
auc(clf, X_train1, y_train1)

0:	total: 44.3ms	remaining: 22.1s
1:	total: 310ms	remaining: 1m 17s
2:	total: 625ms	remaining: 1m 43s
3:	total: 697ms	remaining: 1m 26s
4:	total: 736ms	remaining: 1m 12s
5:	total: 974ms	remaining: 1m 20s
6:	total: 1.18s	remaining: 1m 23s
7:	total: 1.41s	remaining: 1m 26s
8:	total: 1.62s	remaining: 1m 28s
9:	total: 1.68s	remaining: 1m 22s
10:	total: 1.8s	remaining: 1m 20s
11:	total: 1.97s	remaining: 1m 19s
12:	total: 2.19s	remaining: 1m 22s
13:	total: 2.33s	remaining: 1m 20s
14:	total: 2.38s	remaining: 1m 16s
15:	total: 2.46s	remaining: 1m 14s
16:	total: 2.5s	remaining: 1m 11s
17:	total: 2.57s	remaining: 1m 8s
18:	total: 2.72s	remaining: 1m 8s
19:	total: 2.79s	remaining: 1m 7s
20:	total: 2.83s	remaining: 1m 4s
21:	total: 2.88s	remaining: 1m 2s
22:	total: 2.92s	remaining: 1m
23:	total: 2.98s	remaining: 59.2s
24:	total: 3.02s	remaining: 57.4s
25:	total: 3.24s	remaining: 59.1s
26:	total: 3.28s	remaining: 57.5s
27:	total: 3.35s	remaining: 56.6s
28:	total: 3.39s	remaining: 55s
29:	total: 3.6

In [None]:
#light gbm
#split training data set
X_train2 = train.drop(['bank_account'], axis=1)
y_train2 = train.bank_account

#split test dataset
X_test2 = test

In [None]:
import lightgbm as lgb
from sklearn import metrics

def auc2(m, train, test): 
    return (metrics.roc_auc_score(y_train,m.predict(train)),
                            metrics.roc_auc_score(y_test,m.predict(test)))

lg = lgb.LGBMClassifier(silent=False)
param_dist = {"max_depth": [25,50, 75],
              "learning_rate" : [0.01,0.05,0.1],
              "num_leaves": [300,900,1200],
              "n_estimators": [200]
             }
grid_search = GridSearchCV(lg, n_jobs=-1, param_grid=param_dist, cv = 3, scoring="roc_auc", verbose=5)
grid_search.fit(X_train2,y_train2)
grid_search.best_estimator_

d_train = lgb.Dataset(X_train2, label=y_train2)
params = {"max_depth": 50, "learning_rate" : 0.01, "num_leaves": 900,  "n_estimators": 300}

# Without Categorical Features
#model2 = lgb.train(params, d_train)
#auc2(model2, train, test)

#With Catgeorical Features
model2 = lgb.train(params, d_train, categorical_feature = train_categorical)
auc2(model2, train, test)