***Objective***

The objective of this competition is to create a machine learning model to predict which individuals are most likely to have or use a bank account. The models and solutions developed can provide an indication of the state of financial inclusion in Kenya, Rwanda, Tanzania and Uganda, while providing insights into some of the key demographic factors that might drive individuals’ financial outcomes.


In [1]:
#load libraries
import pandas as pd
import numpy as np
%matplotlib inline
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder  
from sklearn.preprocessing import StandardScaler  
from scipy import sparse
from catboost import CatBoostClassifier, Pool
import lightgbm as lgb
import xgboost as xgb
from math import sqrt
from sklearn.metrics import mean_squared_error
from scipy.stats import uniform, randint
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV, RandomizedSearchCV
import random

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [14]:
#load data: the four competition csv files, read in the order of the names on the left
train, test, sample_submission, description_data = map(
    pd.read_csv,
    ("train.csv", "test.csv", "sample_submission.csv", "VariableDescription.csv"),
)

In [5]:
#preview the first five rows of the training data
train.head(n=5)

Unnamed: 0,country,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,uniqueid_1,Yes,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,2018,uniqueid_2,No,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,2018,uniqueid_3,Yes,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,2018,uniqueid_4,No,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,2018,uniqueid_5,No,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed


In [6]:
#preview the first five rows of the test data (no bank_account column here)
test.head(n=5)

Unnamed: 0,country,year,uniqueid,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,uniqueid_6056,Urban,Yes,3,30,Male,Head of Household,Married/Living together,Secondary education,Formally employed Government
1,Kenya,2018,uniqueid_6060,Urban,Yes,7,51,Male,Head of Household,Married/Living together,Vocational/Specialised training,Formally employed Private
2,Kenya,2018,uniqueid_6065,Rural,No,3,77,Female,Parent,Married/Living together,No formal education,Remittance Dependent
3,Kenya,2018,uniqueid_6072,Rural,No,6,39,Female,Head of Household,Married/Living together,Primary education,Remittance Dependent
4,Kenya,2018,uniqueid_6073,Urban,No,3,16,Male,Child,Single/Never Married,Secondary education,Remittance Dependent


In [7]:
#preview the expected submission layout
sample_submission.head(n=5)

Unnamed: 0,uniqueid,bank_account
0,uniqueid_7867 x Kenya,1.0
1,uniqueid_6722 x Kenya,0.0
2,uniqueid_6714 x Kenya,1.0
3,uniqueid_8103 x Kenya,1.0
4,uniqueid_8657 x Kenya,1.0


In [8]:
#preview the variable definitions shipped with the data
description_data.head(n=5)

Unnamed: 0,Variable Definitions,Unnamed: 1
0,country,Country interviewee is in.
1,year,Year survey was done in.
2,uniqueid,Unique identifier for each interviewee
3,location_type,"Type of location: Rural, Urban"
4,cellphone_access,"If interviewee has access to a cellphone: Yes, No"


In [9]:
#count missing values per column of the training data
train.isna().sum()

country                   0
year                      0
uniqueid                  0
bank_account              0
location_type             0
cellphone_access          0
household_size            0
age_of_respondent         0
gender_of_respondent      0
relationship_with_head    0
marital_status            0
education_level           0
job_type                  0
dtype: int64

In [15]:
#uniqueid + " x " + country name
def with_country_suffix(frame):
    """Return the `uniqueid` column of `frame` tagged as "<uniqueid> x <country>".

    Values that already end with their own row's " x <country>" suffix are
    returned unchanged, so re-running this cell does not append the suffix a
    second time.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns 'uniqueid' and 'country'.

    Returns
    -------
    pandas.Series
        Tagged ids, indexed like `frame`.
    """
    ids = frame['uniqueid']
    suffix = " x " + frame['country']
    #row-wise check: is this id already tagged with this row's country?
    already_tagged = pd.Series(
        [str(uid).endswith(str(tail)) for uid, tail in zip(ids, suffix)],
        index=ids.index,
    )
    #keep ids that are already tagged, append the suffix to all others
    return ids.where(already_tagged, ids + suffix)

train['uniqueid'] = with_country_suffix(train)
test['uniqueid'] = with_country_suffix(test)

In [16]:
#check that uniqueid now carries the " x <country>" suffix
train.head(n=5)

Unnamed: 0,country,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,uniqueid_1 x Kenya,Yes,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,2018,uniqueid_2 x Kenya,No,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,2018,uniqueid_3 x Kenya,Yes,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,2018,uniqueid_4 x Kenya,No,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,2018,uniqueid_5 x Kenya,No,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed


In [None]:
#non-categorical (numeric) feature: age_of_respondent -- the only kept feature that is not in the categorical column lists

In [18]:
#keep the test ids as a standalone Series with a fresh 0..n-1 index
#(presumably needed later to build the submission file -- the column is dropped from `test` below).
#The non-inplace reset_index returns a new object instead of mutating the
#Series that was pulled out of `test`.
test_uniqueid = test['uniqueid'].reset_index(drop=True)

In [19]:
#keep the target (bank_account) and the model features; uniqueid is left out
train = train.loc[:, [
    'country', 'year', 'bank_account', 'location_type', 'cellphone_access',
    'household_size', 'age_of_respondent', 'gender_of_respondent',
    'relationship_with_head', 'marital_status', 'education_level', 'job_type',
]]

In [21]:
#keep the same feature columns as train (test has no bank_account); uniqueid is left out
test = test.loc[:, [
    'country', 'year', 'location_type', 'cellphone_access',
    'household_size', 'age_of_respondent', 'gender_of_respondent',
    'relationship_with_head', 'marital_status', 'education_level', 'job_type',
]]

In [22]:
#columns treated as categorical features: every kept feature except age_of_respondent
train_categorical = [
    'country',
    'year',
    'location_type',
    'cellphone_access',
    'household_size',
    'gender_of_respondent',
    'relationship_with_head',
    'marital_status',
    'education_level',
    'job_type',
]
#test uses exactly the same categorical columns; list() makes an independent copy
test_categorical = list(train_categorical)

In [None]:
#cast the categorical feature columns of train to pandas' category dtype.
#(The previous line referenced an undefined `obj_df` and a `body_style` column
#that does not exist in this dataset, so the cell could not run.)
#astype returns a new frame, so re-running this cell is safe.
#NOTE(review): `test` is not cast here -- confirm whether it needs the same cast
#(with shared categories) for the models used later.
train = train.astype({col: 'category' for col in train_categorical})

In [None]:
#Catboost
#split training data set into features and the bank_account target
X_train1 = train.drop(['bank_account'], axis=1)
#encode the Yes/No answers as 1/0, the encoding shown in sample_submission;
#any other value becomes NaN so that unexpected labels are visible
y_train1 = train['bank_account'].map({'Yes': 1, 'No': 0})

#test dataset: it has no target column, so every column is a feature
X_test1 = test.copy()

def column_index(df, query_cols):
    """Return the integer positions of `query_cols` within `df.columns`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column positions are looked up.
    query_cols : str or iterable of column labels
        Column label(s) to locate.

    Returns
    -------
    numpy.ndarray of int (or a single numpy integer when `query_cols` is a str)
        Positions in the same order as `query_cols`. If a label occurs more
        than once in `df.columns`, the position of its first occurrence is used.

    Raises
    ------
    KeyError
        If any requested label is not a column of `df`. A sorted-search lookup
        would instead silently return the position of a different column.
    """
    #label -> position of its first occurrence
    first_position = {}
    for position, name in enumerate(df.columns):
        first_position.setdefault(name, position)

    #a bare string is one label, not an iterable of characters
    single = isinstance(query_cols, str)
    wanted = [query_cols] if single else list(query_cols)

    missing = [name for name in wanted if name not in first_position]
    if missing:
        raise KeyError("columns not found in DataFrame: {}".format(missing))

    positions = np.array([first_position[name] for name in wanted], dtype=np.intp)
    return positions[0] if single else positions

#positions of the categorical columns inside the train / test feature frames
categorical_features_indices = column_index(X_train1, train_categorical)
categorical_features_indices1 = column_index(X_test1, test_categorical)


#bank_account is a binary target, so use the classifier imported at the top of the
#notebook with the Logloss objective (CatBoostRegressor was never imported)
model=CatBoostClassifier(iterations=500, depth=10, learning_rate=0.01, loss_function='Logloss')

In [None]:
#cts

In [None]:
#combine