In [84]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.simplefilter('ignore')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Light GBM Baseline

In [2]:
import numpy as np
import pandas as pd
import lightgbm as lgb

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [3]:
! brew install libomp

Updating Homebrew...
[34m==>[0m [1mAuto-updated Homebrew![0m
Updated 3 taps (homebrew/core, homebrew/cask and osgeo/osgeo4mac).
[34m==>[0m [1mUpdated Formulae[0m
[1marpack [32m✔[0m[0m                                 qwt
[1mosgeo/osgeo4mac/osgeo-libgeotiff [32m✔[0m[0m       rancid
[1mosgeo/osgeo4mac/osgeo-libspatialite [32m✔[0m[0m    re2
[1mosgeo/osgeo4mac/osgeo-proj [32m✔[0m[0m             remind
[1msuperlu [32m✔[0m[0m                                sdcc
calicoctl                                sdl2_ttf
corsixth                                 serd
cql                                      ship
cracklib                                 shyaml
dynare                                   smali
etl                                      source-to-image
gtk-gnutella                             spin
hypre                                    sratom
kubernetes-helm                          srt
lilv                                     sslscan
lv2                            

## Load Data and Create Features / Labels

In [19]:
df = pd.read_csv('../../data/raw/training.csv', index_col=0)
# df.replace(-1, np.NaN, inplace=True)

X = df.drop(['mobile_money','savings','borrowing','insurance','mobile_money_classification'], axis=1)
y = df['mobile_money_classification']

X.head()

Unnamed: 0_level_0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8_1,Q8_2,Q8_3,...,Q12,Q13,Q14,Q15,Q16,Q17,Q18,Q19,Latitude,Longitude
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5086,98,2,3,1,1,2,2,0,0,0,...,2,-1,2,-1,1,-1,4,4,-4.460442,29.811396
1258,40,1,1,3,5,1,1,1,0,0,...,1,4,1,5,4,4,1,4,-6.176438,39.244871
331,18,2,4,6,3,2,1,0,0,0,...,2,-1,2,-1,1,-1,1,1,-6.825702,37.652798
6729,50,1,1,3,1,1,1,0,0,0,...,1,2,2,-1,4,-1,1,4,-3.372049,35.808307
8671,34,1,1,1,1,2,1,0,1,0,...,2,-1,1,1,1,-1,1,4,-7.179645,31.039095


## Setup scoring function and baseline parameters

In [40]:
from pprint import pprint
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import log_loss

seed = 2019

np.random.seed(seed)

def zindi_score(y_test, probs):
    """
    Score as we are being scored on LB
    """
    losses = []
    for c in [0,1,2,3]:
        labels = [1 if i == c else 0 for i in y_test]
        loss = log_loss(labels, probs[:, c])
        losses.append(loss)
    return sum(losses)


n_classes = y.nunique()

# Params that won't be changed
model_params = {
    'boosting_type': 'gbdt',
    'objective': 'multiclassova',
    'metric': ['multiclass'],
    'num_class': n_classes,
    'verbose': 0,
    'seed': 2019
}

# Some default params to start with
starter_params = {
    'max_depth': 8,
    'num_leaves': 2**8,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}
# Combine parameters sets into one
lgbm_params = {**model_params, **starter_params}


X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=seed,)

d_train = lgb.Dataset(X_train, y_train)
d_val = lgb.Dataset(X_val, y_val, reference=d_train)

print('Train size: ', X_train.shape)
print('Val size: ', X_val.shape)
print('\nLGBM Params:')
pprint(lgbm_params)

Train size:  (5320, 31)
Val size:  (1774, 31)

LGBM Params:
{'bagging_fraction': 0.8,
 'bagging_freq': 5,
 'boosting_type': 'gbdt',
 'feature_fraction': 0.9,
 'learning_rate': 0.05,
 'max_depth': 8,
 'metric': ['multiclass'],
 'num_class': 4,
 'num_leaves': 256,
 'objective': 'multiclassova',
 'seed': 2019,
 'verbose': 0}


## Define Training Method and Fixed Params

In [61]:
CAT_VARS = ['Q2','Q3','Q4','Q5','Q6','Q7','Q8_1',
            'Q8_2','Q8_3','Q8_4','Q8_5','Q8_6',
            'Q8_7','Q8_8','Q8_9','Q8_10','Q8_11',
            'Q9','Q10','Q11','Q12','Q13','Q14',
            'Q15','Q16','Q17','Q18','Q19']
CONT_VARS = ['Q1']
LOC_VARS = ['Latitude','Longitude']

NUM_BOOST_ROUND=100
EARLY_STOPPING_ROUNDS=10

def train_evaluate(X, y, params):
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=1234)

    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)

    model = lgb.train(params, train_data,
                      num_boost_round=NUM_BOOST_ROUND,
                      early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                      verbose_eval=False,
                      valid_sets=[valid_data], 
                      valid_names=['valid'])
    
    return model

## Train Model

In [62]:
model = train_evaluate(X, y, lgbm_params)

print('Best score = ',model.best_score['valid']['multi_logloss'])

val_probs = model.predict(X_val)

print('Zindi Score = ',zindi_score(y_val, val_probs))

Best score =  0.6781779620183997
Zindi Score =  2.1598427697918883


**Not great...but this is the baseline...**

## Create Submission

In [45]:
from src.utils import make_sub

In [46]:
X_test = pd.read_csv('../../data/raw/test.csv', index_col=0)

test_probs = model.predict(X_test)

sub_df = make_sub(test_probs)

sub_df.head()

Unnamed: 0,no_financial_services,other_only,mm_only,mm_plus
2352,0.316213,0.316162,0.35012,0.636292
8208,0.316216,0.324592,0.411304,0.569396
2785,0.316731,0.31804,0.369359,0.629162
2967,0.432228,0.527281,0.322743,0.375517
1697,0.316802,0.316177,0.356312,0.62525


In [47]:
sub_df.to_csv('../../data/submissions/lgbm_baseline.csv')

### Zindi Results

Got a score of 2.09365447154372 on Zindi. 