# TPOT Best Model

Using the best model selected by TPOT, fine tune and perform some feature engineering


In [75]:
import numpy as np
import pandas as pd

from src.models.tpot import get_best_model
from src.data.utils import extract_targets, load_data

In [76]:
train_data, test_data = load_data()  
train_data.head()

Unnamed: 0_level_0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8_1,Q8_2,Q8_3,...,mm_total_withdrawals,mm_total_deposits,mm_wdraw_dep_ratio,mm_n_closest_10_km,region_distance,region,district_distance,district,ward_distance,ward
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5086,98,2,3,1,1,2,2,0,0,0,...,4.0,6.0,0.666667,12.0,24.63922,Kigoma,24.600008,Kasulu,25.115583,Heru Ushingo
1258,40,1,1,3,5,1,1,1,0,0,...,20.0,40.0,0.5,278.0,24.238709,Zanzibar,24.238709,Zanzibar,2.650153,Magogoni
331,18,2,4,6,3,2,1,0,0,0,...,30.0,36.0,0.833333,781.0,33.490434,Morogoro,1.700766,Morogoro Urban,0.332536,Mafiga
6729,50,1,1,3,1,1,1,0,0,0,...,59.0,67.0,0.880597,61.0,89.724488,Arusha,16.123696,Karatu,5.875602,Mto wa Mbu
8671,34,1,1,1,1,2,1,0,1,0,...,35.0,23.0,1.521739,2.0,40.15018,Rukwa,29.2643,Nkansi,218.027062,Kaliua


### Target Encode region, district and ward

Get the mean of each `mobile_money`, `savings`,  `borrowing` and `insurance` for each region, district and ward from the training set and replace these categories with those values in both training and testing set

In [77]:
def get_means(level):
    """
    Get the target means for each category in 
    training data grouped by level
    """
    grouped = train_data.groupby(level)

    means = grouped.agg({'mobile_money':'mean', 'savings': 'mean', 'borrowing': 'mean', 'insurance': 'mean'})

    # Add the count of people in that area
    means['count'] = grouped.apply(len)
    
    # Rename the columns
    means.columns = [f'{level}_{col}' for col in list(means)]
    
    return means

def merge_means(df, means):
    """
    Merge the train/test data with regional 
    target means and impute missing values
    """
    
    return df.merge(means, how='left', left_on=level, right_index=True).drop(level, axis=1).fillna(means.mean())


In [78]:
means.mean()

ward_mobile_money     0.648527
ward_savings          0.502264
ward_borrowing        0.448394
ward_insurance        0.167489
ward_count           19.435616
dtype: float64

In [79]:
for level in ['region','district','ward']:
    means = get_means(level=level)
    train_data = merge_means(train_data, means)
    test_data = merge_means(test_data, means)

## Train Best TPOT Model

In [80]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import log_loss

In [81]:
model = get_best_model()
model

Pipeline(memory=None,
         steps=[('stackingestimator',
                 StackingEstimator(estimator=GradientBoostingClassifier(criterion='friedman_mse',
                                                                        init=None,
                                                                        learning_rate=0.001,
                                                                        loss='deviance',
                                                                        max_depth=8,
                                                                        max_features=0.1,
                                                                        max_leaf_nodes=None,
                                                                        min_impurity_decrease=0.0,
                                                                        min_impurity_split=None,
                                                                        min_samples_leaf=7,
                     

In [82]:
X_train, y_train = extract_targets(train_data)
X_test = test_data

X_train.replace({'Yes': 1}, inplace=True)
X_test.replace({'Yes': 1}, inplace=True)

In [85]:
print(np.mean(cv_score))

-0.762440437376498


In [113]:
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split

tpot = TPOTClassifier(
    generations=50, 
    population_size=20, 
    verbosity=2,
    n_jobs=-1, 
    scoring='neg_log_loss'
)
tpot.fit(X_train, y_train)

HBox(children=(IntProgress(value=0, description='Optimization Progress', max=1020, style=ProgressStyle(descrip…

Generation 1 - Current best internal CV score: -0.7564270402642099
Generation 2 - Current best internal CV score: -0.7529371870894669
Generation 3 - Current best internal CV score: -0.747768070610554
Generation 4 - Current best internal CV score: -0.747768070610554
Generation 5 - Current best internal CV score: -0.747768070610554
Generation 6 - Current best internal CV score: -0.747768070610554
Generation 7 - Current best internal CV score: -0.7414832709363521
Generation 8 - Current best internal CV score: -0.7414832709363521
Generation 9 - Current best internal CV score: -0.7414832709363521
Generation 10 - Current best internal CV score: -0.7414832709363521
Generation 11 - Current best internal CV score: -0.7395425508735591
Generation 12 - Current best internal CV score: -0.7395425508735591
Generation 13 - Current best internal CV score: -0.7395425508735591
Generation 14 - Current best internal CV score: -0.7395425508735591
Generation 15 - Current best internal CV score: -0.7395425508

TPOTClassifier(config_dict=None, crossover_rate=0.1, cv=5,
               disable_update_check=False, early_stop=None, generations=50,
               max_eval_time_mins=5, max_time_mins=None, memory=None,
               mutation_rate=0.9, n_jobs=-1, offspring_size=None,
               periodic_checkpoint_folder=None, population_size=20,
               random_state=None, scoring='neg_log_loss', subsample=1.0,
               template='RandomTree', use_dask=False, verbosity=2,
               warm_start=False)

In [130]:
tpot.export('tpot_baseline.py')

In [129]:
LogisticRegression(
    VarianceThreshold(
        MaxAbsScaler(
            DecisionTreeClassifier(
                GaussianNB(
                    XGBClassifier(
                        DecisionTreeClassifier(
                            DecisionTreeClassifier(input_matrix, 
                                                   DecisionTreeClassifier__criterion=entropy, 
                                                   DecisionTreeClassifier__max_depth=2, 
                                                   DecisionTreeClassifier__min_samples_leaf=17, 
                                                   DecisionTreeClassifier__min_samples_split=4), 
                            DecisionTreeClassifier__criterion=entropy, 
                            DecisionTreeClassifier__max_depth=2, 
                            DecisionTreeClassifier__min_samples_leaf=19, 
                            DecisionTreeClassifier__min_samples_split=17), 
                        XGBClassifier__learning_rate=0.01, 
                        XGBClassifier__max_depth=2, 
                        XGBClassifier__min_child_weight=12, 
                        XGBClassifier__n_estimators=100, 
                        XGBClassifier__nthread=1, 
                        XGBClassifier__subsample=0.15000000000000002)), 
                DecisionTreeClassifier__criterion=entropy, 
                DecisionTreeClassifier__max_depth=2, 
                DecisionTreeClassifier__min_samples_leaf=19, 
                DecisionTreeClassifier__min_samples_split=17)
        ), 
        VarianceThreshold__threshold=0.005), 
    LogisticRegression__C=1.0, 
    LogisticRegression__dual=False, 
    LogisticRegression__penalty=l1)

NameError: name 'VarianceThreshold' is not defined

## Get top 10 estimators

In [128]:
estimator_tuples = [(est, info['internal_cv_score']) for est, info in tpot.evaluated_individuals_.items()]
estimator_tuples = sorted(estimator_tuples, key=lambda x: x[1], reverse=True)
top_10_estimators = estimator_tuples[:10]

[('LogisticRegression(VarianceThreshold(MaxAbsScaler(DecisionTreeClassifier(GaussianNB(XGBClassifier(DecisionTreeClassifier(DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=entropy, DecisionTreeClassifier__max_depth=2, DecisionTreeClassifier__min_samples_leaf=17, DecisionTreeClassifier__min_samples_split=4), DecisionTreeClassifier__criterion=entropy, DecisionTreeClassifier__max_depth=2, DecisionTreeClassifier__min_samples_leaf=19, DecisionTreeClassifier__min_samples_split=17), XGBClassifier__learning_rate=0.01, XGBClassifier__max_depth=2, XGBClassifier__min_child_weight=12, XGBClassifier__n_estimators=100, XGBClassifier__nthread=1, XGBClassifier__subsample=0.15000000000000002)), DecisionTreeClassifier__criterion=entropy, DecisionTreeClassifier__max_depth=2, DecisionTreeClassifier__min_samples_leaf=19, DecisionTreeClassifier__min_samples_split=17)), VarianceThreshold__threshold=0.005), LogisticRegression__C=1.0, LogisticRegression__dual=False, LogisticRegression__p

In [131]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import MaxAbsScaler
from sklearn.tree import DecisionTreeClassifier
from tpot.builtins import StackingEstimator
from xgboost import XGBClassifier

# Average CV score on the training set was:-0.7326581163228116
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=DecisionTreeClassifier(criterion="entropy", max_depth=2, min_samples_leaf=17, min_samples_split=4)),
    StackingEstimator(estimator=DecisionTreeClassifier(criterion="entropy", max_depth=2, min_samples_leaf=19, min_samples_split=17)),
    StackingEstimator(estimator=XGBClassifier(learning_rate=0.01, max_depth=2, min_child_weight=12, n_estimators=100, nthread=1, subsample=0.15000000000000002)),
    StackingEstimator(estimator=GaussianNB()),
    StackingEstimator(estimator=DecisionTreeClassifier(criterion="entropy", max_depth=2, min_samples_leaf=19, min_samples_split=17)),
    MaxAbsScaler(),
    VarianceThreshold(threshold=0.005),
    LogisticRegression(C=1.0, dual=False, penalty="l1")
)

In [132]:
cv_score = cross_val_score(best_model, 
                           X_train,
                           y_train, 
                           cv=StratifiedKFold(random_state=420, shuffle=True), 
                           scoring='neg_log_loss')
print(cv_score)



[-0.74571421 -0.75856724 -0.75915338]




In [133]:
best_model.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('minmaxscaler', MinMaxScaler(copy=True, feature_range=(0, 1))),
                ('stackingestimator-1',
                 StackingEstimator(estimator=BernoulliNB(alpha=1.0,
                                                         binarize=0.0,
                                                         class_prior=None,
                                                         fit_prior=True))),
                ('maxabsscaler-1', MaxAbsScaler(copy=True)),
                ('selectpercentile',
                 SelectPercentile(percentile=92,
                                  score_func=<function f_classif at 0x11e98c1e0>)),...
                                                                        validation_fraction=0.1,
                                                                        verbose=0,
                                                                        warm_start=False))),
                ('maxabsscaler-2', MaxAbsScaler(copy=True)),

In [138]:
from sklearn.model_selection import cross_val_predict

cv_preds = cross_val_predict(best_model, X_train, y_train)



In [None]:
from skl

## Make Submission

In [134]:
from src.utils import make_sub

In [135]:
probs = model.predict_proba(X_test)
sub_df = make_sub(probs)
sub_df.head()

Unnamed: 0,no_financial_services,other_only,mm_only,mm_plus
2352,0.000317,0.001405,0.042567,0.955711
8208,0.050109,0.040072,0.104052,0.805766
2785,0.004545,0.022643,0.081706,0.891106
2967,0.32254,0.626212,0.01994,0.031308
1697,0.000566,0.003893,0.149355,0.846186


In [136]:
sub_df.to_csv('../../data/submissions/tpot_target_encoding.csv')