## Create a model for each of the labels

**Labels:**
* No Fin. Services
* Other Only
* MM Only
* MM Plus

1. Create a model predicting each label individually --> 4 models
2. Save the output probabilities for each model

In [1]:
import os
import pandas as pd
import numpy as np

import warnings
warnings.simplefilter('ignore')

### Load Data

In [2]:
df = pd.read_csv('../../data/raw/training.csv', index_col=0)
df.head()

Unnamed: 0_level_0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8_1,Q8_2,Q8_3,...,Q17,Q18,Q19,Latitude,Longitude,mobile_money,savings,borrowing,insurance,mobile_money_classification
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5086,98,2,3,1,1,2,2,0,0,0,...,-1,4,4,-4.460442,29.811396,0,0,0,0,0
1258,40,1,1,3,5,1,1,1,0,0,...,4,1,4,-6.176438,39.244871,1,1,1,0,3
331,18,2,4,6,3,2,1,0,0,0,...,-1,1,1,-6.825702,37.652798,1,0,0,0,2
6729,50,1,1,3,1,1,1,0,0,0,...,-1,1,4,-3.372049,35.808307,1,0,1,0,3
8671,34,1,1,1,1,2,1,0,1,0,...,-1,1,4,-7.179645,31.039095,1,1,0,1,3


### Modelling Functions

In [3]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import log_loss, accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator

def create_model():
    """
    Return the best model found by tpot
    """
    return make_pipeline(
    StackingEstimator(
        estimator=GradientBoostingClassifier(learning_rate=0.001, 
                                             max_depth=8, 
                                             max_features=0.1, 
                                             min_samples_leaf=7, 
                                             min_samples_split=8, 
                                             n_estimators=100, 
                                             subsample=1.0)
    ),LogisticRegression(C=0.5, dual=False, penalty="l1"))


def score(y_test, probs):
    """
    Score as we are being scored on LB
    """
    losses = []
    for c in [0,1,2,3]:
        labels = [1 if i == c else 0 for i in y_test]
        loss = log_loss(labels, probs[:, c])
        losses.append(loss)
    return sum(losses)

#### Replicate Current best model

In [4]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer

In [5]:
label='mobile_money_classification'

X = df.drop(['mobile_money','savings','borrowing','insurance','mobile_money_classification'], axis=1)
y = df[label]

model = create_model()
cross_val_score(model, X, y, scoring=make_scorer(score, needs_proba=True))

array([1.3797664 , 1.38754655, 1.39385613])

My current score is around 1.347 so this seems to be reproducing that result

#### Evaluate a classifier on each label

In [6]:
from sklearn.model_selection import cross_validate
from pprint import pprint

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for label in ['mobile_money','savings','borrowing','insurance']:

    X = df.drop(['mobile_money','savings','borrowing','insurance','mobile_money_classification'], axis=1)
    y = df[label]

    model = create_model()
    cv_scores = cross_validate(model, X, y, 
                               cv=skf,
                               scoring=['precision','recall','accuracy'])
    
    print(label)
    pprint(cv_scores)
    print('\n')

mobile_money
{'fit_time': array([1.43078399, 1.65842009, 1.85728788, 1.99999714, 1.55141401]),
 'score_time': array([0.05396318, 0.04777908, 0.04056025, 0.04024982, 0.04374385]),
 'test_accuracy': array([0.91120507, 0.93305144, 0.93305144, 0.92036646, 0.91889986]),
 'test_precision': array([0.94      , 0.96626181, 0.96128171, 0.95411606, 0.95770805]),
 'test_recall': array([0.89694656, 0.91094148, 0.91603053, 0.89949109, 0.89312977]),
 'train_accuracy': array([0.92986784, 0.92405286, 0.92475771, 0.92757709, 0.92864693]),
 'train_precision': array([0.96291301, 0.95658028, 0.95694585, 0.95932773, 0.95879397]),
 'train_recall': array([0.90839695, 0.90394402, 0.90489822, 0.90776081, 0.91030534])}


savings
{'fit_time': array([6.75235724, 4.67682505, 7.368613  , 6.86605906, 6.75174499]),
 'score_time': array([0.04002309, 0.04163003, 0.0406332 , 0.04969811, 0.04028082]),
 'test_accuracy': array([0.65398168, 0.65821001, 0.66032417, 0.64693446, 0.65373766]),
 'test_precision': array([0.6380471

**Results**

* `mobile_money` - **great**
* `savings`, `borrowing` - **average** (low recall)
* `insurance` - **bad** (very low recall)

### Train classifier on each label, predict on test set and save P(y=1)

In [7]:
test_data = pd.read_csv('../../data/raw/test.csv', index_col=0)
test_data.head()

Unnamed: 0_level_0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8_1,Q8_2,Q8_3,...,Q12,Q13,Q14,Q15,Q16,Q17,Q18,Q19,Latitude,Longitude
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2352,30,2,1,3,6,2,1,0,0,1,...,1,6,1,4,5,-1,1,4,-2.596421,32.913584
8208,53,1,1,3,1,2,1,0,1,0,...,2,-1,1,4,1,-1,1,4,-1.778303,34.06264
2785,23,2,1,3,5,1,1,0,0,0,...,1,4,1,4,1,-1,1,1,-1.851789,31.536866
2967,30,2,3,3,4,1,2,0,1,0,...,2,-1,2,-1,1,-1,1,4,-1.235656,34.229012
1697,28,2,1,5,4,2,1,0,0,0,...,1,4,1,6,4,3,1,2,-7.780934,35.691878


In [8]:
# Each column contains probability of that class
test_probs = np.empty(shape=(test_data.shape[0], 4))

for i,label in enumerate(['mobile_money','savings','borrowing','insurance']):
    
    print(label)

    X = df.drop(['mobile_money','savings','borrowing','insurance','mobile_money_classification'], axis=1)
    y = df[label]

    print('\tTraining...')
    model = create_model()
    model.fit(X, y)
    
    print('\tPredicting...')
    probs = model.predict_proba(test_data)
    
    test_probs[:, i] = probs[:, -1]
    
    print('\tDone')
    

mobile_money
	Training...
	Predicting...
	Done
savings
	Training...
	Predicting...
	Done
borrowing
	Training...
	Predicting...
	Done
insurance
	Training...
	Predicting...
	Done


In [9]:
test_probs[:5]

array([[0.999587  , 0.63311995, 0.44652924, 0.0707161 ],
       [0.93866033, 0.67240169, 0.64214163, 0.15301042],
       [0.99467771, 0.76470902, 0.66600757, 0.11461147],
       [0.06387194, 0.80666656, 0.52821159, 0.07809082],
       [0.99945984, 0.8120697 , 0.69361752, 0.11636067]])

## Train MLP To Predict `mobile_money_classification` from other flags

In  `mlp_classes.ipynb` I showed that this MLP can predict the class from individual flags with 100% accuracy.

In [13]:
from sklearn.neural_network import MLPClassifier

X = df[['mobile_money','savings','borrowing','insurance']]
y = df['mobile_money_classification']

model = MLPClassifier(hidden_layer_sizes=(100,4))
model.fit(X, y)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 4), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

## Predict classes using MLP

In [20]:
test_preds = model.predict_proba(test_probs)

test_preds[:5]

array([[6.63884315e-08, 4.76610955e-05, 4.22159589e-04, 9.99530113e-01],
       [7.91814895e-09, 1.24745553e-04, 3.29338986e-05, 9.99842313e-01],
       [2.90695940e-09, 8.28000218e-05, 1.99307069e-05, 9.99897266e-01],
       [1.37462711e-05, 9.99195078e-01, 2.35215064e-08, 7.91151870e-04],
       [1.50650785e-09, 9.07094310e-05, 1.06795005e-05, 9.99898610e-01]])

In [21]:
# Read submission for structure
test_sub = pd.read_csv('../../data/raw/sample_submission.csv', index_col=0)

sub_df = pd.DataFrame(test_preds, columns=list(test_sub), index=test_sub.index)

sub_df.head()

Unnamed: 0_level_0,no_financial_services,other_only,mm_only,mm_plus
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2352,6.638843e-08,4.8e-05,0.0004221596,0.99953
8208,7.918149e-09,0.000125,3.29339e-05,0.999842
2785,2.906959e-09,8.3e-05,1.993071e-05,0.999897
2967,1.374627e-05,0.999195,2.352151e-08,0.000791
1697,1.506508e-09,9.1e-05,1.06795e-05,0.999899


In [22]:
sub_df.to_csv('../../data/submissions/wild_shot_2.csv')

## Hack - Get from individual flags to desired class

[THIS DID NOT WORK!]

### Manually Create Classification Labels from Individual Probabilities

In [69]:
from sklearn.preprocessing import MinMaxScaler

no_fs = test_probs.sum(axis=1)

other_only = test_probs[:, 1:].sum(axis=1)

mm_only = test_probs[:, 0] + (1-test_probs[:, 1:].mean(axis=1))

mm_plus = test_probs[:, 0] + test_probs[:, 1:].mean(axis=1)

### Make submission

In [77]:
# Read submission for structure
test_sub = pd.read_csv('../data/raw/sample_submission.csv', index_col=0)

def minmax_scale(arr):
    return MinMaxScaler().fit_transform(arr.reshape(-1, 1)).flatten()

probs_df = pd.DataFrame(columns=list(test_sub))
probs_df['no_financial_services'] = minmax_scale(no_fs)
probs_df['other_only'] = minmax_scale(no_fs)
probs_df['mm_only'] = minmax_scale(mm_only)
probs_df['mm_plus'] = minmax_scale(mm_plus)

probs_df.index = test_sub.index

In [78]:
probs_df.head()

Unnamed: 0_level_0,no_financial_services,other_only,mm_only,mm_plus
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2352,0.520162,0.520162,0.825414,0.692043
8208,0.604957,0.604957,0.680328,0.720478
2785,0.651357,0.651357,0.705072,0.774446
2967,0.383657,0.383657,0.025442,0.265074
1697,0.680221,0.680221,0.685475,0.795112


In [79]:
probs_df.to_csv('../data/submissions/wild_shot.csv')

**Result**

I was bad...