# Dependencies 

In [1]:
import pandas as pd
import re
import sys

from typing import List
from catboost import CatBoostClassifier
from collections import namedtuple, Counter
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import RandomOverSampler
from category_encoders.woe import WOEEncoder
from category_encoders.cat_boost import CatBoostEncoder
from category_encoders.helmert import HelmertEncoder
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score

sys.path.append('..')

from customer_segmentation.constants import PATH_MODELS, RANDOM_STATE, PATH_SUBMISSIONS
from customer_segmentation.experiment_tracking import Tracking
from customer_segmentation import (read_demographic_data,
                                   cat_features_fillna,
                                   compute_metrics,
                                   load_catboost_model,
                                   load_pipeline,
                                   serialize_object_load,
                                   new_experiment,
                                   new_run,
                                   apply_runs_to_experiment,
                                   preprocessing_baseline,
                                   show_metrics_baseline,
                                   kaggle_submission,
                                   n_best_models_from_experiments,
                                   load_trained_model)

# Notebook Specific Helper Functions 

In [2]:
def reformat_data_to_model(df: pd.DataFrame,
                           features_used: List[str],
                           dtypes: pd.Series = None) -> pd.DataFrame:
    """Apply `features used` to `df`, fill the NA values with function
    `cat_features fillna` and apply `dtypes` when necessary
    """
    df_copy = df[features_used].copy()

    cat_features = df_copy.select_dtypes(include=['category', 'object']).columns
    
    df_copy_transformed = cat_features_fillna(
        df_copy,
        cat_features=cat_features
    )
    
    if dtypes is not None:
        df_copy_transformed = df_copy_transformed.astype(dtypes)
    
    return df_copy_transformed


def show_sample(df: pd.DataFrame, n: int = 10) -> None:
    """Displays a sample of `n`, with all columns"""
    with pd.option_context('display.max_columns', None):
        display(df.sample(n).style.hide_index().highlight_null())


def adjust_column_names_for_feature_encoding(columns: List[str]):
    """Correct the column names to avoid collisions
    by adding a _number as a suffix.
    
    Translate numbers into letters, with A being 0,
    B being 1 and so on.
    
    chr(65) -> A
    ord(A) -> 65
    """
    return [column[:-1] + chr(int(re.search('_(\\d)$', column).group(1)) + 65) if re.search('_\\d$', column)
            else column
            for column in columns]

# Constants 

In [3]:
TARGET = 'RESPONSE'
TARGET_CUSTOMER = 'is_customer'

# Read Train And Test Data 

In [4]:
df_mailout_train = read_demographic_data('mailout_train.csv')
df_mailout_test = read_demographic_data('mailout_test.csv')

# Read Customers And German Populations Samples (20%) 

In [5]:
df_customers = read_demographic_data('demographic_data_customers_sample.csv')
df_german_population = read_demographic_data('demographic_data_german_population_sample.csv')

# Load Saved  Models

When we show metrics in local there were different of the ones in `kaggle`

This can be because we are training in different sets and it would be good to check if metrics in training set behaves like the submissions  

In [6]:
catboost_first_model = load_catboost_model('catboost_first_model.cbm')
catboost_less_features_model = load_catboost_model('catboost_less_features_model.cbm')
catboost_with_clusters_model = load_catboost_model('catboost_with_clusters_model.cbm')

X_train_dtypes = serialize_object_load('X_train_dtypes.pkl')
pipeline_kmeans = load_pipeline('pipeline_kmeans.joblib')

# Reformat Data For Each Model

In [7]:
df_mailout_train_first_model_transformed = reformat_data_to_model(
    df_mailout_train,
    features_used=catboost_first_model.feature_names_
)

df_mailout_train_less_features_model_transformed = reformat_data_to_model(
    df_mailout_train,
    features_used=catboost_less_features_model.feature_names_
)

df_mailout_train_with_clusters_transformed = reformat_data_to_model(
    df_mailout_train,
    features_used=catboost_first_model.feature_names_,
    dtypes=X_train_dtypes
)

cluster_colnames = ['cluster_0', 'custer_1', 'cluster_2']

df_mailout_train_with_clusters_transformed[cluster_colnames] = (
    pd.DataFrame(pipeline_kmeans.transform(df_mailout_train_with_clusters_transformed))
)

# Apply Models 

In [8]:
df_mailout_train['score_first_model'] = (
    catboost_first_model
    .predict_proba(df_mailout_train_first_model_transformed)[:, 1]
)

df_mailout_train['score_less_features_model'] = (
    catboost_less_features_model
    .predict_proba(df_mailout_train_less_features_model_transformed)[:, 1]
)

df_mailout_train['score_with_clusters_model'] = (
    catboost_with_clusters_model
    .predict_proba(df_mailout_train_with_clusters_transformed)[:, 1]
)

# Metrics For Each Model 

In [9]:
y_true = df_mailout_train['RESPONSE'].fillna(0)

auc_first_model = roc_auc_score(y_true, df_mailout_train["score_first_model"])
auc_less_features_model = roc_auc_score(y_true, df_mailout_train["score_less_features_model"])
auc_with_clusters_model = roc_auc_score(y_true, df_mailout_train["score_with_clusters_model"])

print(f'AUC First Model: {auc_first_model}')
print(f'AUC Less Features Model: {auc_less_features_model}')
print(f'AUC With Clusters Model: {auc_with_clusters_model}')

AUC First Model: 0.7533230096806949
AUC Less Features Model: 0.7520571254910786
AUC With Clusters Model: 0.749795284227538


# Combining The Models

Combining both scopres from `first_model` and `less_features_model` it results in a better model 

In [10]:
auc_combining_models = roc_auc_score(
    y_true, ((df_mailout_train['score_first_model'] + df_mailout_train['score_less_features_model'])
             / 2)
)

print(f'AUC Combining Models: {auc_combining_models}')

AUC Combining Models: 0.753635753890973


# So We Have A Baseline Of 0.75

We will use the columns used by the first model because them are all of the possible used columns since it eliminates the ID variable and some features that are not present when predicting

In [11]:
df_mailout_train_base = (
    df_mailout_train.loc[:, catboost_first_model.feature_names_ + [TARGET]].fillna(value={TARGET: 0})
)

df_mailout_test_base = df_mailout_test.loc[:, catboost_first_model.feature_names_]

# Training Data Sample 

In [12]:
show_sample(df_mailout_train_base)

AGER_TYP,AKT_DAT_KL,ALTER_HH,ALTER_KIND1,ALTER_KIND2,ALTER_KIND3,ALTER_KIND4,ALTERSKATEGORIE_FEIN,ANZ_HAUSHALTE_AKTIV,ANZ_HH_TITEL,ANZ_KINDER,ANZ_PERSONEN,ANZ_STATISTISCHE_HAUSHALTE,ANZ_TITEL,ARBEIT,BALLRAUM,CAMEO_DEU_2015,CAMEO_DEUG_2015,CAMEO_INTL_2015,CJT_GESAMTTYP,CJT_KATALOGNUTZER,CJT_TYP_1,CJT_TYP_2,CJT_TYP_3,CJT_TYP_4,CJT_TYP_5,CJT_TYP_6,D19_BANKEN_ANZ_12,D19_BANKEN_ANZ_24,D19_BANKEN_DATUM,D19_BANKEN_DIREKT,D19_BANKEN_GROSS,D19_BANKEN_LOKAL,D19_BANKEN_OFFLINE_DATUM,D19_BANKEN_ONLINE_DATUM,D19_BANKEN_ONLINE_QUOTE_12,D19_BANKEN_REST,D19_BEKLEIDUNG_GEH,D19_BEKLEIDUNG_REST,D19_BILDUNG,D19_BIO_OEKO,D19_BUCH_CD,D19_DIGIT_SERV,D19_DROGERIEARTIKEL,D19_ENERGIE,D19_FREIZEIT,D19_GARTEN,D19_GESAMT_ANZ_12,D19_GESAMT_ANZ_24,D19_GESAMT_DATUM,D19_GESAMT_OFFLINE_DATUM,D19_GESAMT_ONLINE_DATUM,D19_GESAMT_ONLINE_QUOTE_12,D19_HANDWERK,D19_HAUS_DEKO,D19_KINDERARTIKEL,D19_KONSUMTYP,D19_KONSUMTYP_MAX,D19_KOSMETIK,D19_LEBENSMITTEL,D19_LETZTER_KAUF_BRANCHE,D19_LOTTO,D19_NAHRUNGSERGAENZUNG,D19_RATGEBER,D19_REISEN,D19_SAMMELARTIKEL,D19_SCHUHE,D19_SONSTIGE,D19_SOZIALES,D19_TECHNIK,D19_TELKO_ANZ_12,D19_TELKO_ANZ_24,D19_TELKO_DATUM,D19_TELKO_MOBILE,D19_TELKO_OFFLINE_DATUM,D19_TELKO_ONLINE_DATUM,D19_TELKO_ONLINE_QUOTE_12,D19_TELKO_REST,D19_TIERARTIKEL,D19_VERSAND_ANZ_12,D19_VERSAND_ANZ_24,D19_VERSAND_DATUM,D19_VERSAND_OFFLINE_DATUM,D19_VERSAND_ONLINE_DATUM,D19_VERSAND_ONLINE_QUOTE_12,D19_VERSAND_REST,D19_VERSI_ANZ_12,D19_VERSI_ANZ_24,D19_VERSI_DATUM,D19_VERSI_OFFLINE_DATUM,D19_VERSI_ONLINE_DATUM,D19_VERSI_ONLINE_QUOTE_12,D19_VERSICHERUNGEN,D19_VOLLSORTIMENT,D19_WEIN_FEINKOST,DSL_FLAG,EINGEFUEGT_AM,EINGEZOGENAM_HH_JAHR,EWDICHTE,EXTSEL992,FINANZ_ANLEGER,FINANZ_HAUSBAUER,FINANZ_MINIMALIST,FINANZ_SPARER,FINANZ_UNAUFFAELLIGER,FINANZ_VORSORGER,FINANZTYP,FIRMENDICHTE,GEBAEUDETYP,GEBAEUDETYP_RASTER,GEBURTSJAHR,GEMEINDETYP,GFK_URLAUBERTYP,GREEN_AVANTGARDE,HEALTH_TYP,HH_DELTA_FLAG,HH_EINKOMMEN_SCORE,INNENSTADT,KBA05_ALTER1,KBA05_ALTER2,KBA05_ALTER3,KBA05_ALTER4,KBA05_ANHANG,KBA05_ANTG1,KBA05_ANTG2,KBA05_ANTG3,KBA05_ANTG4,KBA05_AUTOQUOT,KBA05_BAUMAX,KBA05_CCM1,KBA05_CCM2,KBA05_CCM3,KBA05_CCM4,KBA05_DIESEL,KBA05_FRAU,KBA05_GBZ,KBA05_HERST1,KBA05_HERST2,KBA05_HERST3,KBA05_HERST4,KBA05_HERST5,KBA05_HERSTTEMP,KBA05_KRSAQUOT,KBA05_KRSHERST1,KBA05_KRSHERST2,KBA05_KRSHERST3,KBA05_KRSKLEIN,KBA05_KRSOBER,KBA05_KRSVAN,KBA05_KRSZUL,KBA05_KW1,KBA05_KW2,KBA05_KW3,KBA05_MAXAH,KBA05_MAXBJ,KBA05_MAXHERST,KBA05_MAXSEG,KBA05_MAXVORB,KBA05_MOD1,KBA05_MOD2,KBA05_MOD3,KBA05_MOD4,KBA05_MOD8,KBA05_MODTEMP,KBA05_MOTOR,KBA05_MOTRAD,KBA05_SEG1,KBA05_SEG10,KBA05_SEG2,KBA05_SEG3,KBA05_SEG4,KBA05_SEG5,KBA05_SEG6,KBA05_SEG7,KBA05_SEG8,KBA05_SEG9,KBA05_VORB0,KBA05_VORB1,KBA05_VORB2,KBA05_ZUL1,KBA05_ZUL2,KBA05_ZUL3,KBA05_ZUL4,KBA13_ALTERHALTER_30,KBA13_ALTERHALTER_45,KBA13_ALTERHALTER_60,KBA13_ALTERHALTER_61,KBA13_ANTG1,KBA13_ANTG2,KBA13_ANTG3,KBA13_ANTG4,KBA13_ANZAHL_PKW,KBA13_AUDI,KBA13_AUTOQUOTE,KBA13_BAUMAX,KBA13_BJ_1999,KBA13_BJ_2000,KBA13_BJ_2004,KBA13_BJ_2006,KBA13_BJ_2008,KBA13_BJ_2009,KBA13_BMW,KBA13_CCM_0_1400,KBA13_CCM_1000,KBA13_CCM_1200,KBA13_CCM_1400,KBA13_CCM_1401_2500,KBA13_CCM_1500,KBA13_CCM_1600,KBA13_CCM_1800,KBA13_CCM_2000,KBA13_CCM_2500,KBA13_CCM_2501,KBA13_CCM_3000,KBA13_CCM_3001,KBA13_FAB_ASIEN,KBA13_FAB_SONSTIGE,KBA13_FIAT,KBA13_FORD,KBA13_GBZ,KBA13_HALTER_20,KBA13_HALTER_25,KBA13_HALTER_30,KBA13_HALTER_35,KBA13_HALTER_40,KBA13_HALTER_45,KBA13_HALTER_50,KBA13_HALTER_55,KBA13_HALTER_60,KBA13_HALTER_65,KBA13_HALTER_66,KBA13_HERST_ASIEN,KBA13_HERST_AUDI_VW,KBA13_HERST_BMW_BENZ,KBA13_HERST_EUROPA,KBA13_HERST_FORD_OPEL,KBA13_HERST_SONST,KBA13_HHZ,KBA13_KMH_0_140,KBA13_KMH_110,KBA13_KMH_140,KBA13_KMH_140_210,KBA13_KMH_180,KBA13_KMH_210,KBA13_KMH_211,KBA13_KMH_250,KBA13_KMH_251,KBA13_KRSAQUOT,KBA13_KRSHERST_AUDI_VW,KBA13_KRSHERST_BMW_BENZ,KBA13_KRSHERST_FORD_OPEL,KBA13_KRSSEG_KLEIN,KBA13_KRSSEG_OBER,KBA13_KRSSEG_VAN,KBA13_KRSZUL_NEU,KBA13_KW_0_60,KBA13_KW_110,KBA13_KW_120,KBA13_KW_121,KBA13_KW_30,KBA13_KW_40,KBA13_KW_50,KBA13_KW_60,KBA13_KW_61_120,KBA13_KW_70,KBA13_KW_80,KBA13_KW_90,KBA13_MAZDA,KBA13_MERCEDES,KBA13_MOTOR,KBA13_NISSAN,KBA13_OPEL,KBA13_PEUGEOT,KBA13_RENAULT,KBA13_SEG_GELAENDEWAGEN,KBA13_SEG_GROSSRAUMVANS,KBA13_SEG_KLEINST,KBA13_SEG_KLEINWAGEN,KBA13_SEG_KOMPAKTKLASSE,KBA13_SEG_MINIVANS,KBA13_SEG_MINIWAGEN,KBA13_SEG_MITTELKLASSE,KBA13_SEG_OBEREMITTELKLASSE,KBA13_SEG_OBERKLASSE,KBA13_SEG_SONSTIGE,KBA13_SEG_SPORTWAGEN,KBA13_SEG_UTILITIES,KBA13_SEG_VAN,KBA13_SEG_WOHNMOBILE,KBA13_SITZE_4,KBA13_SITZE_5,KBA13_SITZE_6,KBA13_TOYOTA,KBA13_VORB_0,KBA13_VORB_1,KBA13_VORB_1_2,KBA13_VORB_2,KBA13_VORB_3,KBA13_VW,KK_KUNDENTYP,KKK,KOMBIALTER,KONSUMNAEHE,KONSUMZELLE,LP_FAMILIE_FEIN,LP_FAMILIE_GROB,LP_LEBENSPHASE_FEIN,LP_LEBENSPHASE_GROB,LP_STATUS_FEIN,LP_STATUS_GROB,MIN_GEBAEUDEJAHR,MOBI_RASTER,MOBI_REGIO,NATIONALITAET_KZ,ONLINE_AFFINITAET,ORTSGR_KLS9,OST_WEST_KZ,PLZ8_ANTG1,PLZ8_ANTG2,PLZ8_ANTG3,PLZ8_ANTG4,PLZ8_BAUMAX,PLZ8_GBZ,PLZ8_HHZ,PRAEGENDE_JUGENDJAHRE,REGIOTYP,RELAT_AB,RETOURTYP_BK_S,RT_KEIN_ANREIZ,RT_SCHNAEPPCHEN,RT_UEBERGROESSE,SEMIO_DOM,SEMIO_ERL,SEMIO_FAM,SEMIO_KAEM,SEMIO_KRIT,SEMIO_KULT,SEMIO_LUST,SEMIO_MAT,SEMIO_PFLICHT,SEMIO_RAT,SEMIO_REL,SEMIO_SOZ,SEMIO_TRADV,SEMIO_VERT,SHOPPER_TYP,SOHO_KZ,STRUKTURTYP,TITEL_KZ,UMFELD_ALT,UMFELD_JUNG,UNGLEICHENN_FLAG,VERDICHTUNGSRAUM,VERS_TYP,VHA,VHN,VK_DHT4A,VK_DISTANZ,VK_ZG11,W_KEIT_KIND_HH,WOHNDAUER_2008,WOHNLAGE,ZABEOTYP,ANREDE_KZ,ALTERSKATEGORIE_GROB,RESPONSE
,,,,,,,,,,,,,,,,,,,6,5,5,5,5,5,5,5,,,10,,,,10,10,,,,,,,,,,,,,,,10,10,10,,,,,,9,,,,,,,,,,,,,,,10,,10,10,,,,,,10,10,10,,,,,10,10,10,,,,,,,,,,5,3,3,4,5,3,4,,,,,,5,,,,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,9,,,,,,,5,2,,,,,2,,,,,,,,,,,,,3,4,5,,6,3,6,6,7,3,5,5,5,4,7,2,3,1,,,,,,,,,,,,,,,,,,3,1,1,0
2.0,1.0,9.0,,,,,9.0,5.0,,,1.0,6.0,,4.0,7.0,7C,7.0,54.0,2,5,2,1,5,5,5,5,,,9,6.0,,,10,9,,,,7.0,,,,,,,,,,,9,9,9,,,,,9.0,8,,,D19_UNBEKANNT,,,,,,,7.0,,7.0,,,10,,10,10,,,,,,9,9,10,,,,,10,10,10,,,6.0,,1.0,1992-02-10 00:00:00,1997.0,4.0,,2,5,3,1,1,4,2,4.0,1.0,4.0,,12.0,3,,1.0,,5,3.0,,2.0,3.0,5.0,1.0,,3.0,3.0,,2.0,3.0,1.0,4.0,4.0,,,1.0,2.0,4.0,4.0,1.0,1.0,4.0,1.0,3.0,4.0,5.0,1.0,1.0,2.0,2.0,1.0,1.0,5.0,,5.0,3.0,2.0,3.0,1.0,,3.0,4.0,2.0,2.0,1.0,3.0,1.0,1.0,3.0,2.0,3.0,5.0,,,3.0,,1.0,5.0,3.0,,3.0,4.0,3.0,1.0,2.0,1.0,2.0,5.0,1.0,4.0,3.0,1.0,296.0,3.0,2.0,3.0,2.0,3.0,3.0,4.0,,1.0,3.0,4.0,5.0,,5.0,3.0,3.0,3.0,3.0,1.0,1.0,1.0,1.0,1.0,5.0,4.0,1.0,3.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,2.0,5.0,5.0,5.0,5.0,1.0,2.0,1.0,3.0,4.0,3.0,3.0,1.0,4.0,4.0,5.0,1.0,,,1.0,3.0,1.0,4.0,4.0,2.0,2.0,2.0,,5.0,,1.0,1.0,1.0,5.0,3.0,5.0,1.0,2.0,,3.0,5.0,3.0,3.0,4.0,4.0,3.0,1.0,2.0,2.0,2.0,3.0,4.0,3.0,1.0,4.0,3.0,1.0,5.0,1.0,2.0,2.0,4.0,3.0,4.0,2.0,5.0,3.0,4.0,3.0,2.0,2.0,1.0,,3.0,4,3.0,,1.0,1.0,6.0,2.0,1,1,1992.0,1.0,2.0,1.0,1,7.0,O,1.0,4.0,3.0,1.0,3.0,2.0,3.0,3.0,6.0,3.0,5,3,4,1.0,5,7,3,6,6,1,7,1,1,3,1,1,1,1,,,3.0,,1.0,5.0,,26.0,2.0,,2.0,10.0,10.0,7.0,6.0,9.0,4.0,3,2,4,0
,1.0,14.0,,,,,14.0,2.0,,,2.0,3.0,,3.0,6.0,3D,3.0,25.0,2,5,2,1,5,4,5,5,,,10,,,,10,10,,,,,,,,,,,,,2.0,2.0,3,10,10,,,,,4.0,1,,,D19_VERSICHERUNGEN,,,,,,,,,,,,10,,10,10,,,,,,10,10,10,,,2.0,2.0,3,10,10,,1.0,6.0,,1.0,1992-02-10 00:00:00,1997.0,1.0,,3,2,4,2,2,3,6,5.0,1.0,5.0,,30.0,12,1.0,1.0,1.0,2,8.0,2.0,4.0,3.0,1.0,1.0,3.0,2.0,,,3.0,1.0,4.0,3.0,2.0,1.0,2.0,4.0,5.0,1.0,3.0,3.0,4.0,3.0,4.0,3.0,3.0,3.0,4.0,2.0,3.0,2.0,2.0,4.0,3.0,1.0,2.0,1.0,3.0,1.0,2.0,1.0,3.0,2.0,4.0,,4.0,1.0,2.0,1.0,1.0,4.0,3.0,2.0,2.0,,1.0,,1.0,3.0,3.0,2.0,4.0,1.0,4.0,3.0,5.0,3.0,3.0,3.0,3.0,2.0,1.0,,695.0,3.0,3.0,1.0,3.0,3.0,4.0,4.0,3.0,,2.0,2.0,2.0,,3.0,3.0,2.0,3.0,,4.0,3.0,3.0,3.0,4.0,5.0,4.0,3.0,3.0,5.0,5.0,5.0,4.0,4.0,3.0,2.0,3.0,2.0,3.0,3.0,3.0,4.0,3.0,2.0,3.0,3.0,4.0,4.0,,1.0,2.0,3.0,2.0,5.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,2.0,3.0,2.0,2.0,2.0,4.0,3.0,3.0,1.0,2.0,2.0,3.0,4.0,2.0,,3.0,3.0,1.0,3.0,3.0,3.0,3.0,4.0,4.0,2.0,4.0,4.0,3.0,3.0,3.0,2.0,2.0,3.0,3.0,1.0,5.0,3.0,2.0,2.0,4.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,2.0,1.0,3,3.0,,1.0,1.0,11.0,3.0,9,4,1992.0,4.0,4.0,1.0,3,4.0,O,3.0,2.0,1.0,,1.0,4.0,4.0,9.0,2.0,1.0,3,2,4,5.0,3,3,6,1,3,6,7,6,2,3,5,3,4,6,,,1.0,,5.0,5.0,,,2.0,,1.0,8.0,10.0,7.0,4.0,9.0,7.0,1,1,4,0
2.0,1.0,19.0,,,,,13.0,1.0,,,3.0,2.0,,1.0,4.0,2D,2.0,14.0,3,1,2,3,4,4,5,3,1.0,1.0,2,3.0,,,10,2,10.0,,,,,,,,,,,,3.0,3.0,2,2,2,7.0,6.0,,,2.0,3,,,D19_SONSTIGE,,,,6.0,,,3.0,3.0,6.0,,,9,,10,10,,6.0,,3.0,3.0,5,10,5,5.0,,,,9,9,10,,6.0,1.0,,1.0,1992-02-10 00:00:00,1994.0,3.0,6.0,2,2,5,1,3,5,2,4.0,3.0,4.0,1958.0,50.0,5,1.0,2.0,,1,5.0,1.0,3.0,3.0,4.0,1.0,4.0,1.0,,,4.0,1.0,1.0,2.0,5.0,4.0,4.0,1.0,3.0,5.0,4.0,2.0,2.0,2.0,1.0,3.0,5.0,4.0,2.0,1.0,3.0,3.0,3.0,1.0,4.0,4.0,3.0,4.0,1.0,4.0,1.0,4.0,5.0,1.0,,3.0,2.0,4.0,3.0,,4.0,1.0,1.0,4.0,4.0,1.0,3.0,1.0,3.0,5.0,2.0,,2.0,2.0,4.0,5.0,2.0,3.0,4.0,2.0,3.0,3.0,1.0,,1500.0,4.0,4.0,1.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,,3.0,3.0,3.0,3.0,,3.0,3.0,3.0,3.0,3.0,2.0,3.0,5.0,3.0,5.0,4.0,2.0,2.0,3.0,3.0,4.0,5.0,3.0,4.0,3.0,2.0,2.0,4.0,3.0,4.0,3.0,3.0,5.0,4.0,2.0,4.0,3.0,3.0,2.0,3.0,3.0,1.0,4.0,5.0,2.0,3.0,2.0,2.0,2.0,2.0,3.0,,3.0,3.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0,4.0,3.0,3.0,3.0,4.0,5.0,2.0,3.0,4.0,3.0,3.0,3.0,4.0,3.0,2.0,3.0,,3.0,3.0,5.0,4.0,3.0,4.0,2.0,4.0,3.0,3.0,4.0,4.0,3.0,,4.0,2.0,3.0,4,5.0,,11.0,5.0,39.0,12.0,10,5,1992.0,3.0,5.0,1.0,4,2.0,W,3.0,3.0,1.0,,1.0,5.0,5.0,9.0,4.0,1.0,5,1,4,4.0,6,6,1,5,7,3,4,2,4,4,1,5,2,3,3.0,,2.0,,5.0,2.0,,,1.0,,1.0,3.0,4.0,2.0,3.0,9.0,7.0,1,2,3,0
2.0,1.0,9.0,,,,,9.0,4.0,,,1.0,4.0,,4.0,1.0,7B,7.0,41.0,3,5,2,2,5,5,5,4,,,10,,,,10,10,,,5.0,6.0,6.0,,6.0,,,,,,1.0,2.0,3,6,3,10.0,6.0,,,3.0,2,,,D19_VOLLSORTIMENT,,,,,,,6.0,1.0,6.0,,,10,,10,10,,,,1.0,2.0,3,6,3,10.0,,,,10,10,10,,,3.0,,1.0,1992-02-12 00:00:00,1994.0,6.0,55.0,1,5,2,1,1,5,5,1.0,3.0,3.0,1936.0,11.0,10,1.0,2.0,,5,2.0,2.0,2.0,3.0,5.0,,1.0,2.0,,,2.0,5.0,1.0,3.0,3.0,4.0,1.0,2.0,3.0,5.0,4.0,2.0,,,1.0,3.0,5.0,5.0,1.0,1.0,3.0,2.0,1.0,3.0,2.0,4.0,5.0,1.0,1.0,4.0,2.0,4.0,3.0,1.0,2.0,2.0,1.0,4.0,1.0,,2.0,1.0,2.0,2.0,3.0,,2.0,,,3.0,2.0,3.0,4.0,1.0,5.0,,4.0,5.0,1.0,3.0,1.0,4.0,3.0,1.0,224.0,4.0,1.0,5.0,2.0,3.0,2.0,3.0,3.0,3.0,5.0,,3.0,,3.0,3.0,4.0,2.0,,3.0,4.0,4.0,3.0,5.0,2.0,3.0,4.0,5.0,2.0,1.0,3.0,5.0,5.0,5.0,4.0,1.0,1.0,1.0,3.0,3.0,2.0,2.0,5.0,4.0,3.0,3.0,3.0,1.0,1.0,1.0,3.0,3.0,3.0,3.0,3.0,1.0,1.0,4.0,5.0,2.0,2.0,2.0,1.0,2.0,3.0,4.0,4.0,3.0,1.0,,2.0,4.0,3.0,3.0,2.0,,2.0,4.0,2.0,3.0,2.0,4.0,3.0,2.0,1.0,5.0,4.0,4.0,1.0,5.0,1.0,4.0,5.0,1.0,5.0,4.0,1.0,1.0,5.0,1.0,1.0,3.0,3.0,3.0,3.0,2.0,3.0,1.0,3.0,2.0,4,1.0,1.0,1.0,1.0,8.0,2.0,3,2,1992.0,1.0,3.0,1.0,1,9.0,W,1.0,4.0,3.0,1.0,5.0,2.0,3.0,4.0,3.0,4.0,5,2,5,1.0,5,7,3,6,7,1,5,5,1,2,2,5,1,4,1.0,,3.0,,4.0,4.0,,1.0,2.0,1.0,4.0,3.0,2.0,1.0,6.0,9.0,3.0,3,2,4,0
3.0,1.0,10.0,,,,,9.0,3.0,,,2.0,2.0,,3.0,1.0,2D,2.0,14.0,2,4,1,2,5,5,5,5,,,10,,,,10,10,,,,3.0,,,,,,,,,2.0,2.0,2,9,2,10.0,,,,2.0,1,,,D19_BEKLEIDUNG_REST,,,,,,,7.0,3.0,7.0,1.0,1.0,5,,10,10,,3.0,,1.0,1.0,2,9,2,10.0,,,1.0,6,10,10,,5.0,6.0,,1.0,1995-08-10 00:00:00,2004.0,5.0,55.0,1,5,4,1,2,4,5,3.0,8.0,3.0,1939.0,22.0,3,,1.0,,4,4.0,1.0,4.0,3.0,3.0,3.0,2.0,2.0,,,4.0,,4.0,3.0,2.0,1.0,1.0,4.0,4.0,2.0,3.0,2.0,4.0,3.0,3.0,4.0,3.0,3.0,2.0,2.0,2.0,2.0,3.0,4.0,2.0,,2.0,2.0,4.0,1.0,1.0,2.0,1.0,3.0,3.0,,4.0,1.0,1.0,2.0,2.0,4.0,3.0,3.0,1.0,,2.0,1.0,,4.0,3.0,2.0,2.0,4.0,2.0,4.0,3.0,4.0,2.0,4.0,3.0,1.0,1.0,,1250.0,4.0,3.0,1.0,2.0,1.0,2.0,3.0,4.0,4.0,4.0,1.0,2.0,2.0,3.0,4.0,3.0,3.0,,4.0,3.0,,3.0,4.0,2.0,4.0,2.0,3.0,5.0,3.0,3.0,3.0,4.0,4.0,4.0,3.0,2.0,2.0,2.0,4.0,3.0,4.0,3.0,3.0,2.0,4.0,5.0,3.0,2.0,3.0,3.0,2.0,4.0,3.0,3.0,1.0,3.0,4.0,5.0,1.0,2.0,3.0,3.0,3.0,2.0,4.0,4.0,3.0,2.0,2.0,2.0,3.0,4.0,3.0,,3.0,3.0,3.0,3.0,3.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,5.0,3.0,4.0,3.0,,3.0,3.0,2.0,2.0,4.0,3.0,3.0,5.0,2.0,1.0,2.0,2.0,3.0,3.0,1.0,4,4.0,,10.0,5.0,31.0,10.0,1,1,1995.0,3.0,4.0,1.0,3,5.0,O,3.0,2.0,1.0,,1.0,5.0,5.0,3.0,1.0,1.0,5,2,5,1.0,2,4,5,3,5,5,6,6,3,5,4,3,4,7,1.0,,3.0,,4.0,5.0,,10.0,2.0,5.0,4.0,1.0,1.0,1.0,4.0,9.0,3.0,1,1,3,0
2.0,1.0,,,,,,9.0,2.0,,,4.0,2.0,,3.0,6.0,3D,3.0,25.0,5,4,1,1,5,5,5,5,2.0,2.0,4,3.0,3.0,,10,4,10.0,,3.0,3.0,,,,5.0,3.0,2.0,6.0,,5.0,6.0,1,1,1,9.0,,2.0,3.0,1.0,1,3.0,,D19_HAUS_DEKO,7.0,,,7.0,,2.0,6.0,,,2.0,2.0,4,2.0,10,10,,,3.0,5.0,5.0,1,1,1,8.0,,2.0,3.0,2,10,10,,2.0,3.0,,1.0,1992-02-12 00:00:00,1999.0,1.0,56.0,1,3,5,1,1,5,6,5.0,1.0,5.0,1938.0,30.0,3,,1.0,1.0,4,7.0,,1.0,4.0,5.0,1.0,4.0,,,,3.0,1.0,2.0,5.0,3.0,1.0,2.0,3.0,4.0,2.0,3.0,4.0,1.0,4.0,4.0,3.0,3.0,2.0,4.0,1.0,1.0,3.0,2.0,2.0,4.0,1.0,5.0,2.0,3.0,2.0,1.0,,3.0,5.0,,3.0,3.0,2.0,3.0,1.0,3.0,1.0,4.0,3.0,,,1.0,,3.0,5.0,2.0,1.0,3.0,4.0,1.0,2.0,3.0,3.0,4.0,2.0,2.0,3.0,1.0,,581.0,4.0,3.0,1.0,1.0,1.0,2.0,3.0,5.0,5.0,3.0,3.0,3.0,3.0,2.0,3.0,1.0,2.0,4.0,4.0,3.0,3.0,3.0,4.0,2.0,4.0,2.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,4.0,2.0,3.0,4.0,3.0,2.0,2.0,3.0,4.0,3.0,3.0,1.0,4.0,2.0,1.0,4.0,4.0,4.0,1.0,3.0,3.0,2.0,3.0,2.0,2.0,2.0,3.0,2.0,3.0,5.0,3.0,1.0,,4.0,,4.0,,3.0,3.0,2.0,2.0,3.0,3.0,2.0,4.0,1.0,3.0,3.0,3.0,4.0,4.0,3.0,3.0,3.0,2.0,2.0,3.0,3.0,1.0,3.0,3.0,3.0,3.0,3.0,5.0,5.0,3.0,2.0,1.0,2.0,3.0,1.0,2.0,4,3.0,,11.0,5.0,38.0,12.0,9,4,1992.0,2.0,5.0,1.0,5,4.0,W,2.0,3.0,2.0,1.0,1.0,3.0,3.0,3.0,4.0,1.0,4,4,5,1.0,3,7,5,2,3,4,7,4,1,1,3,3,1,7,3.0,,2.0,,2.0,5.0,1.0,,1.0,,1.0,1.0,1.0,1.0,2.0,9.0,3.0,3,1,4,0
,1.0,21.0,,,,,14.0,1.0,,,4.0,2.0,,3.0,5.0,4A,4.0,22.0,4,1,3,2,4,2,3,2,,1.0,7,6.0,6.0,,8,10,,5.0,,3.0,6.0,,6.0,6.0,,,,,2.0,3.0,2,8,2,10.0,6.0,6.0,6.0,1.0,4,6.0,,D19_SCHUHE,7.0,,6.0,6.0,6.0,3.0,6.0,1.0,6.0,,,8,6.0,8,10,,6.0,,2.0,2.0,2,8,2,10.0,5.0,,,8,8,10,,6.0,6.0,,1.0,1992-02-12 00:00:00,1994.0,3.0,48.0,2,1,5,2,2,4,6,4.0,1.0,4.0,1963.0,22.0,3,1.0,2.0,,2,5.0,2.0,2.0,3.0,4.0,3.0,4.0,1.0,,,5.0,1.0,2.0,3.0,3.0,3.0,4.0,2.0,3.0,3.0,3.0,4.0,3.0,1.0,3.0,5.0,3.0,3.0,3.0,1.0,2.0,2.0,2.0,2.0,4.0,2.0,5.0,3.0,3.0,2.0,1.0,4.0,3.0,4.0,1.0,1.0,3.0,4.0,1.0,,2.0,2.0,3.0,3.0,3.0,,2.0,1.0,1.0,4.0,3.0,3.0,3.0,4.0,2.0,2.0,3.0,3.0,4.0,2.0,2.0,3.0,1.0,,1300.0,3.0,3.0,1.0,3.0,3.0,4.0,4.0,2.0,3.0,2.0,2.0,3.0,2.0,3.0,3.0,4.0,2.0,3.0,3.0,4.0,3.0,3.0,4.0,4.0,1.0,5.0,3.0,5.0,3.0,3.0,3.0,3.0,3.0,4.0,4.0,4.0,4.0,3.0,2.0,3.0,2.0,3.0,5.0,5.0,1.0,5.0,3.0,1.0,3.0,3.0,3.0,3.0,3.0,3.0,1.0,3.0,2.0,3.0,4.0,2.0,2.0,1.0,2.0,3.0,,3.0,3.0,2.0,2.0,,3.0,3.0,,3.0,3.0,3.0,3.0,4.0,2.0,5.0,5.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0,3.0,4.0,3.0,3.0,3.0,3.0,4.0,2.0,3.0,4.0,4.0,3.0,,3.0,3.0,2.0,3,2.0,,5.0,3.0,23.0,6.0,10,5,1992.0,4.0,3.0,1.0,4,5.0,W,2.0,3.0,1.0,1.0,1.0,5.0,5.0,9.0,3.0,1.0,2,1,4,3.0,7,6,3,5,7,3,6,3,3,4,1,4,3,4,2.0,,2.0,,4.0,4.0,,,1.0,3.0,4.0,1.0,1.0,1.0,4.0,9.0,3.0,1,2,3,0
2.0,1.0,,,,,,8.0,1.0,,,2.0,1.0,,2.0,6.0,9A,9.0,52.0,5,4,1,1,5,5,5,5,,,10,,,,10,10,,,,,,,6.0,,,,,,1.0,2.0,4,8,9,,6.0,3.0,,2.0,1,7.0,,D19_HAUS_DEKO,,6.0,,,,,6.0,5.0,6.0,,,9,,10,9,,6.0,,1.0,1.0,4,8,10,,,,1.0,7,10,10,,5.0,6.0,,1.0,1992-02-12 00:00:00,1994.0,2.0,40.0,1,2,5,1,2,5,2,4.0,1.0,4.0,1934.0,50.0,10,1.0,1.0,,1,8.0,,2.0,4.0,4.0,1.0,4.0,,,,4.0,1.0,3.0,2.0,4.0,,2.0,1.0,4.0,2.0,3.0,5.0,1.0,2.0,3.0,3.0,3.0,2.0,5.0,2.0,1.0,2.0,3.0,3.0,3.0,2.0,5.0,4.0,3.0,2.0,2.0,1.0,3.0,4.0,4.0,2.0,3.0,3.0,,1.0,3.0,3.0,4.0,3.0,,,,2.0,2.0,4.0,3.0,,3.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,2.0,1.0,,463.0,3.0,4.0,1.0,3.0,3.0,2.0,3.0,2.0,3.0,3.0,4.0,3.0,5.0,3.0,2.0,3.0,4.0,1.0,3.0,3.0,2.0,2.0,1.0,1.0,1.0,3.0,4.0,3.0,2.0,3.0,3.0,2.0,3.0,4.0,3.0,3.0,2.0,3.0,3.0,2.0,3.0,2.0,5.0,3.0,1.0,3.0,3.0,1.0,3.0,4.0,4.0,2.0,1.0,1.0,1.0,3.0,3.0,2.0,4.0,2.0,2.0,3.0,2.0,4.0,4.0,,1.0,1.0,4.0,3.0,3.0,3.0,,3.0,,4.0,2.0,3.0,2.0,2.0,5.0,5.0,3.0,5.0,3.0,2.0,2.0,5.0,5.0,1.0,3.0,1.0,3.0,2.0,5.0,5.0,3.0,3.0,1.0,5.0,3.0,3.0,2.0,3.0,3.0,3.0,4.0,4.0,3.0,4,5.0,,10.0,5.0,40.0,12.0,10,5,1992.0,3.0,5.0,1.0,3,2.0,W,3.0,2.0,,,1.0,3.0,3.0,2.0,5.0,2.0,3,3,5,3.0,1,7,4,3,3,4,7,1,1,1,3,6,4,6,3.0,,3.0,,4.0,5.0,,,1.0,1.0,2.0,6.0,3.0,2.0,4.0,9.0,7.0,1,1,4,0
,1.0,19.0,3.0,7.0,,,17.0,1.0,,2.0,3.0,1.0,,1.0,4.0,3D,3.0,25.0,4,1,4,4,3,1,2,3,,,10,,,,10,10,,,,5.0,,,6.0,,,,,,3.0,4.0,5,8,5,10.0,,2.0,2.0,1.0,1,7.0,,D19_TECHNIK,7.0,,,6.0,6.0,3.0,6.0,3.0,2.0,,,9,6.0,9,10,,6.0,,3.0,4.0,5,8,5,10.0,5.0,,,8,8,10,,6.0,6.0,,1.0,1992-02-12 00:00:00,1994.0,2.0,38.0,3,1,4,3,3,2,6,5.0,1.0,5.0,1976.0,50.0,9,1.0,2.0,,2,6.0,1.0,3.0,2.0,5.0,1.0,4.0,1.0,,,4.0,1.0,1.0,4.0,4.0,1.0,2.0,1.0,5.0,3.0,3.0,5.0,1.0,1.0,3.0,2.0,3.0,3.0,4.0,1.0,2.0,2.0,2.0,3.0,4.0,1.0,5.0,1.0,3.0,3.0,2.0,1.0,4.0,3.0,2.0,1.0,1.0,3.0,2.0,,2.0,1.0,3.0,4.0,2.0,,1.0,1.0,1.0,2.0,3.0,3.0,4.0,2.0,1.0,2.0,1.0,4.0,4.0,2.0,3.0,2.0,1.0,,1126.0,3.0,4.0,1.0,2.0,2.0,4.0,4.0,3.0,,4.0,,2.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,2.0,1.0,4.0,4.0,5.0,3.0,2.0,1.0,3.0,4.0,5.0,5.0,3.0,3.0,3.0,2.0,2.0,3.0,4.0,3.0,5.0,1.0,4.0,,1.0,2.0,3.0,3.0,4.0,3.0,3.0,1.0,3.0,2.0,4.0,3.0,2.0,2.0,2.0,2.0,3.0,4.0,3.0,3.0,2.0,2.0,3.0,,3.0,2.0,3.0,2.0,1.0,3.0,3.0,3.0,4.0,1.0,3.0,3.0,5.0,2.0,2.0,3.0,4.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,4.0,3.0,3.0,5.0,4.0,3.0,4.0,3.0,3.0,2.0,3.0,5.0,1.0,2,3.0,,11.0,5.0,35.0,11.0,10,5,1992.0,5.0,5.0,1.0,5,2.0,W,3.0,2.0,1.0,,1.0,5.0,4.0,15.0,2.0,1.0,4,4,2,4.0,4,3,7,1,1,7,3,7,5,5,6,7,5,6,3.0,,2.0,,1.0,4.0,,,1.0,4.0,1.0,2.0,3.0,3.0,1.0,9.0,7.0,1,1,2,0


# Testing Data Sample 

In [13]:
show_sample(df_mailout_test_base)

AGER_TYP,AKT_DAT_KL,ALTER_HH,ALTER_KIND1,ALTER_KIND2,ALTER_KIND3,ALTER_KIND4,ALTERSKATEGORIE_FEIN,ANZ_HAUSHALTE_AKTIV,ANZ_HH_TITEL,ANZ_KINDER,ANZ_PERSONEN,ANZ_STATISTISCHE_HAUSHALTE,ANZ_TITEL,ARBEIT,BALLRAUM,CAMEO_DEU_2015,CAMEO_DEUG_2015,CAMEO_INTL_2015,CJT_GESAMTTYP,CJT_KATALOGNUTZER,CJT_TYP_1,CJT_TYP_2,CJT_TYP_3,CJT_TYP_4,CJT_TYP_5,CJT_TYP_6,D19_BANKEN_ANZ_12,D19_BANKEN_ANZ_24,D19_BANKEN_DATUM,D19_BANKEN_DIREKT,D19_BANKEN_GROSS,D19_BANKEN_LOKAL,D19_BANKEN_OFFLINE_DATUM,D19_BANKEN_ONLINE_DATUM,D19_BANKEN_ONLINE_QUOTE_12,D19_BANKEN_REST,D19_BEKLEIDUNG_GEH,D19_BEKLEIDUNG_REST,D19_BILDUNG,D19_BIO_OEKO,D19_BUCH_CD,D19_DIGIT_SERV,D19_DROGERIEARTIKEL,D19_ENERGIE,D19_FREIZEIT,D19_GARTEN,D19_GESAMT_ANZ_12,D19_GESAMT_ANZ_24,D19_GESAMT_DATUM,D19_GESAMT_OFFLINE_DATUM,D19_GESAMT_ONLINE_DATUM,D19_GESAMT_ONLINE_QUOTE_12,D19_HANDWERK,D19_HAUS_DEKO,D19_KINDERARTIKEL,D19_KONSUMTYP,D19_KONSUMTYP_MAX,D19_KOSMETIK,D19_LEBENSMITTEL,D19_LETZTER_KAUF_BRANCHE,D19_LOTTO,D19_NAHRUNGSERGAENZUNG,D19_RATGEBER,D19_REISEN,D19_SAMMELARTIKEL,D19_SCHUHE,D19_SONSTIGE,D19_SOZIALES,D19_TECHNIK,D19_TELKO_ANZ_12,D19_TELKO_ANZ_24,D19_TELKO_DATUM,D19_TELKO_MOBILE,D19_TELKO_OFFLINE_DATUM,D19_TELKO_ONLINE_DATUM,D19_TELKO_ONLINE_QUOTE_12,D19_TELKO_REST,D19_TIERARTIKEL,D19_VERSAND_ANZ_12,D19_VERSAND_ANZ_24,D19_VERSAND_DATUM,D19_VERSAND_OFFLINE_DATUM,D19_VERSAND_ONLINE_DATUM,D19_VERSAND_ONLINE_QUOTE_12,D19_VERSAND_REST,D19_VERSI_ANZ_12,D19_VERSI_ANZ_24,D19_VERSI_DATUM,D19_VERSI_OFFLINE_DATUM,D19_VERSI_ONLINE_DATUM,D19_VERSI_ONLINE_QUOTE_12,D19_VERSICHERUNGEN,D19_VOLLSORTIMENT,D19_WEIN_FEINKOST,DSL_FLAG,EINGEFUEGT_AM,EINGEZOGENAM_HH_JAHR,EWDICHTE,EXTSEL992,FINANZ_ANLEGER,FINANZ_HAUSBAUER,FINANZ_MINIMALIST,FINANZ_SPARER,FINANZ_UNAUFFAELLIGER,FINANZ_VORSORGER,FINANZTYP,FIRMENDICHTE,GEBAEUDETYP,GEBAEUDETYP_RASTER,GEBURTSJAHR,GEMEINDETYP,GFK_URLAUBERTYP,GREEN_AVANTGARDE,HEALTH_TYP,HH_DELTA_FLAG,HH_EINKOMMEN_SCORE,INNENSTADT,KBA05_ALTER1,KBA05_ALTER2,KBA05_ALTER3,KBA05_ALTER4,KBA05_ANHANG,KBA05_ANTG1,KBA05_ANTG2,KBA05_ANTG3,KBA05_ANTG4,KBA05_AUTOQUOT,KBA05_BAUMAX,KBA05_CCM1,KBA05_CCM2,KBA05_CCM3,KBA05_CCM4,KBA05_DIESEL,KBA05_FRAU,KBA05_GBZ,KBA05_HERST1,KBA05_HERST2,KBA05_HERST3,KBA05_HERST4,KBA05_HERST5,KBA05_HERSTTEMP,KBA05_KRSAQUOT,KBA05_KRSHERST1,KBA05_KRSHERST2,KBA05_KRSHERST3,KBA05_KRSKLEIN,KBA05_KRSOBER,KBA05_KRSVAN,KBA05_KRSZUL,KBA05_KW1,KBA05_KW2,KBA05_KW3,KBA05_MAXAH,KBA05_MAXBJ,KBA05_MAXHERST,KBA05_MAXSEG,KBA05_MAXVORB,KBA05_MOD1,KBA05_MOD2,KBA05_MOD3,KBA05_MOD4,KBA05_MOD8,KBA05_MODTEMP,KBA05_MOTOR,KBA05_MOTRAD,KBA05_SEG1,KBA05_SEG10,KBA05_SEG2,KBA05_SEG3,KBA05_SEG4,KBA05_SEG5,KBA05_SEG6,KBA05_SEG7,KBA05_SEG8,KBA05_SEG9,KBA05_VORB0,KBA05_VORB1,KBA05_VORB2,KBA05_ZUL1,KBA05_ZUL2,KBA05_ZUL3,KBA05_ZUL4,KBA13_ALTERHALTER_30,KBA13_ALTERHALTER_45,KBA13_ALTERHALTER_60,KBA13_ALTERHALTER_61,KBA13_ANTG1,KBA13_ANTG2,KBA13_ANTG3,KBA13_ANTG4,KBA13_ANZAHL_PKW,KBA13_AUDI,KBA13_AUTOQUOTE,KBA13_BAUMAX,KBA13_BJ_1999,KBA13_BJ_2000,KBA13_BJ_2004,KBA13_BJ_2006,KBA13_BJ_2008,KBA13_BJ_2009,KBA13_BMW,KBA13_CCM_0_1400,KBA13_CCM_1000,KBA13_CCM_1200,KBA13_CCM_1400,KBA13_CCM_1401_2500,KBA13_CCM_1500,KBA13_CCM_1600,KBA13_CCM_1800,KBA13_CCM_2000,KBA13_CCM_2500,KBA13_CCM_2501,KBA13_CCM_3000,KBA13_CCM_3001,KBA13_FAB_ASIEN,KBA13_FAB_SONSTIGE,KBA13_FIAT,KBA13_FORD,KBA13_GBZ,KBA13_HALTER_20,KBA13_HALTER_25,KBA13_HALTER_30,KBA13_HALTER_35,KBA13_HALTER_40,KBA13_HALTER_45,KBA13_HALTER_50,KBA13_HALTER_55,KBA13_HALTER_60,KBA13_HALTER_65,KBA13_HALTER_66,KBA13_HERST_ASIEN,KBA13_HERST_AUDI_VW,KBA13_HERST_BMW_BENZ,KBA13_HERST_EUROPA,KBA13_HERST_FORD_OPEL,KBA13_HERST_SONST,KBA13_HHZ,KBA13_KMH_0_140,KBA13_KMH_110,KBA13_KMH_140,KBA13_KMH_140_210,KBA13_KMH_180,KBA13_KMH_210,KBA13_KMH_211,KBA13_KMH_250,KBA13_KMH_251,KBA13_KRSAQUOT,KBA13_KRSHERST_AUDI_VW,KBA13_KRSHERST_BMW_BENZ,KBA13_KRSHERST_FORD_OPEL,KBA13_KRSSEG_KLEIN,KBA13_KRSSEG_OBER,KBA13_KRSSEG_VAN,KBA13_KRSZUL_NEU,KBA13_KW_0_60,KBA13_KW_110,KBA13_KW_120,KBA13_KW_121,KBA13_KW_30,KBA13_KW_40,KBA13_KW_50,KBA13_KW_60,KBA13_KW_61_120,KBA13_KW_70,KBA13_KW_80,KBA13_KW_90,KBA13_MAZDA,KBA13_MERCEDES,KBA13_MOTOR,KBA13_NISSAN,KBA13_OPEL,KBA13_PEUGEOT,KBA13_RENAULT,KBA13_SEG_GELAENDEWAGEN,KBA13_SEG_GROSSRAUMVANS,KBA13_SEG_KLEINST,KBA13_SEG_KLEINWAGEN,KBA13_SEG_KOMPAKTKLASSE,KBA13_SEG_MINIVANS,KBA13_SEG_MINIWAGEN,KBA13_SEG_MITTELKLASSE,KBA13_SEG_OBEREMITTELKLASSE,KBA13_SEG_OBERKLASSE,KBA13_SEG_SONSTIGE,KBA13_SEG_SPORTWAGEN,KBA13_SEG_UTILITIES,KBA13_SEG_VAN,KBA13_SEG_WOHNMOBILE,KBA13_SITZE_4,KBA13_SITZE_5,KBA13_SITZE_6,KBA13_TOYOTA,KBA13_VORB_0,KBA13_VORB_1,KBA13_VORB_1_2,KBA13_VORB_2,KBA13_VORB_3,KBA13_VW,KK_KUNDENTYP,KKK,KOMBIALTER,KONSUMNAEHE,KONSUMZELLE,LP_FAMILIE_FEIN,LP_FAMILIE_GROB,LP_LEBENSPHASE_FEIN,LP_LEBENSPHASE_GROB,LP_STATUS_FEIN,LP_STATUS_GROB,MIN_GEBAEUDEJAHR,MOBI_RASTER,MOBI_REGIO,NATIONALITAET_KZ,ONLINE_AFFINITAET,ORTSGR_KLS9,OST_WEST_KZ,PLZ8_ANTG1,PLZ8_ANTG2,PLZ8_ANTG3,PLZ8_ANTG4,PLZ8_BAUMAX,PLZ8_GBZ,PLZ8_HHZ,PRAEGENDE_JUGENDJAHRE,REGIOTYP,RELAT_AB,RETOURTYP_BK_S,RT_KEIN_ANREIZ,RT_SCHNAEPPCHEN,RT_UEBERGROESSE,SEMIO_DOM,SEMIO_ERL,SEMIO_FAM,SEMIO_KAEM,SEMIO_KRIT,SEMIO_KULT,SEMIO_LUST,SEMIO_MAT,SEMIO_PFLICHT,SEMIO_RAT,SEMIO_REL,SEMIO_SOZ,SEMIO_TRADV,SEMIO_VERT,SHOPPER_TYP,SOHO_KZ,STRUKTURTYP,TITEL_KZ,UMFELD_ALT,UMFELD_JUNG,UNGLEICHENN_FLAG,VERDICHTUNGSRAUM,VERS_TYP,VHA,VHN,VK_DHT4A,VK_DISTANZ,VK_ZG11,W_KEIT_KIND_HH,WOHNDAUER_2008,WOHNLAGE,ZABEOTYP,ANREDE_KZ,ALTERSKATEGORIE_GROB
,1.0,19.0,,,,,15.0,2.0,,,4.0,2.0,,4.0,1.0,1C,1.0,14.0,6,1,2,3,3,4,5,3,,,8,,,,10,10,,6.0,,4.0,,,3.0,,,5.0,,,2.0,4.0,4,6,4,10.0,,6.0,,1.0,1,,,D19_SCHUHE,7.0,,6.0,7.0,6.0,2.0,6.0,4.0,6.0,,,9,6.0,9,10,,,,2.0,4.0,4,9,4,10.0,6.0,,,10,10,10,,,5.0,,1.0,1996-08-13 00:00:00,1994.0,4.0,36.0,4,2,4,2,1,3,6,5.0,1.0,5.0,1967.0,11.0,1,1.0,2.0,,1,3.0,2.0,3.0,4.0,1.0,1.0,4.0,,,,4.0,1.0,5.0,1.0,3.0,,2.0,3.0,3.0,1.0,2.0,3.0,4.0,4.0,3.0,5.0,3.0,2.0,3.0,3.0,2.0,2.0,3.0,4.0,2.0,2.0,4.0,4.0,4.0,1.0,2.0,,1.0,3.0,3.0,2.0,3.0,1.0,,3.0,2.0,4.0,3.0,3.0,1.0,,1.0,,1.0,2.0,3.0,3.0,2.0,3.0,3.0,5.0,3.0,3.0,2.0,4.0,4.0,1.0,,,420.0,3.0,3.0,1.0,2.0,2.0,4.0,4.0,3.0,3.0,3.0,1.0,2.0,2.0,3.0,4.0,3.0,4.0,4.0,3.0,3.0,3.0,4.0,3.0,2.0,4.0,1.0,3.0,4.0,3.0,3.0,2.0,2.0,3.0,3.0,2.0,2.0,3.0,3.0,5.0,3.0,4.0,3.0,2.0,3.0,4.0,3.0,,1.0,3.0,2.0,2.0,4.0,4.0,4.0,1.0,5.0,5.0,4.0,2.0,2.0,3.0,3.0,2.0,2.0,2.0,5.0,3.0,2.0,2.0,,1.0,4.0,1.0,4.0,5.0,4.0,2.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,,3.0,1.0,4.0,3.0,2.0,2.0,4.0,3.0,4.0,3.0,4.0,4.0,3.0,1.0,4.0,4.0,,3,4.0,,11.0,5.0,39.0,12.0,10,5,1994.0,4.0,5.0,1.0,5,8.0,O,4.0,1.0,,,1.0,4.0,3.0,13.0,,4.0,3,2,3,3.0,7,5,2,7,7,4,4,7,4,4,4,4,7,4,1.0,,3.0,,5.0,5.0,,10.0,1.0,4.0,,2.0,1.0,2.0,6.0,9.0,3.0,1,2,2
3.0,1.0,8.0,,,,,8.0,2.0,,,1.0,2.0,,2.0,7.0,3D,3.0,25.0,2,5,2,1,5,5,3,3,,,10,,,,10,10,,,,,,6.0,6.0,,,,,,,,9,9,10,,,,,3.0,2,,,D19_UNBEKANNT,,,,,,,7.0,1.0,,,,10,,10,10,,,,,,9,9,10,,,,,10,10,10,,,6.0,,1.0,1994-11-02 00:00:00,1997.0,2.0,36.0,4,4,4,1,1,5,6,3.0,1.0,3.0,1934.0,50.0,2,,3.0,,4,6.0,4.0,2.0,3.0,2.0,3.0,2.0,,,,5.0,,3.0,4.0,2.0,,2.0,3.0,4.0,1.0,4.0,4.0,2.0,3.0,2.0,5.0,3.0,4.0,4.0,1.0,2.0,2.0,1.0,3.0,3.0,1.0,3.0,1.0,2.0,2.0,3.0,1.0,3.0,3.0,3.0,,3.0,2.0,1.0,,1.0,3.0,3.0,3.0,1.0,,,1.0,1.0,2.0,3.0,4.0,4.0,3.0,,1.0,4.0,4.0,5.0,1.0,4.0,1.0,,,633.0,4.0,5.0,1.0,3.0,4.0,3.0,2.0,2.0,2.0,4.0,2.0,2.0,,2.0,3.0,5.0,2.0,5.0,4.0,,3.0,4.0,1.0,3.0,5.0,1.0,2.0,4.0,3.0,4.0,4.0,4.0,3.0,3.0,5.0,4.0,4.0,2.0,1.0,2.0,4.0,3.0,1.0,2.0,5.0,3.0,,1.0,2.0,3.0,1.0,5.0,3.0,3.0,1.0,3.0,2.0,5.0,2.0,2.0,2.0,1.0,1.0,1.0,3.0,,3.0,1.0,2.0,3.0,,5.0,3.0,3.0,5.0,2.0,3.0,3.0,3.0,2.0,1.0,3.0,4.0,1.0,3.0,3.0,2.0,2.0,3.0,5.0,3.0,3.0,3.0,2.0,4.0,1.0,2.0,2.0,4.0,2.0,1.0,2.0,3.0,3.0,3.0,5.0,4.0,,4.0,4,6.0,,1.0,1.0,3.0,1.0,4,2,1994.0,5.0,4.0,1.0,2,1.0,O,4.0,2.0,,,1.0,4.0,3.0,1.0,7.0,1.0,2,3,3,2.0,4,4,4,4,4,5,2,4,6,7,7,3,6,5,,,1.0,,4.0,4.0,,,1.0,5.0,4.0,6.0,3.0,2.0,6.0,9.0,7.0,6,2,1
2.0,1.0,,,,,,8.0,13.0,,,2.0,13.0,,4.0,4.0,9C,9.0,51.0,2,5,2,2,5,5,5,5,,,10,,,,10,10,,,,,,,,,,,,6.0,1.0,2.0,5,5,9,,,3.0,,3.0,2,,6.0,D19_HAUS_DEKO,6.0,6.0,,,,,,3.0,,,1.0,7,,10,10,,5.0,,1.0,1.0,5,5,9,,,,,9,10,10,,6.0,6.0,,1.0,1992-02-10 00:00:00,2002.0,5.0,,1,5,2,1,1,5,6,4.0,1.0,4.0,1931.0,22.0,3,,2.0,,6,6.0,3.0,3.0,2.0,3.0,,,,,2.0,2.0,4.0,3.0,5.0,2.0,,,1.0,1.0,2.0,3.0,2.0,4.0,5.0,3.0,1.0,3.0,3.0,2.0,2.0,2.0,2.0,3.0,4.0,3.0,,2.0,4.0,5.0,1.0,1.0,,3.0,5.0,3.0,,4.0,2.0,,,,4.0,3.0,2.0,2.0,,,,,4.0,1.0,2.0,3.0,1.0,4.0,3.0,5.0,2.0,2.0,4.0,1.0,1.0,3.0,2.0,517.0,3.0,2.0,4.0,1.0,2.0,3.0,4.0,3.0,5.0,2.0,5.0,5.0,5.0,3.0,1.0,1.0,3.0,1.0,1.0,2.0,3.0,3.0,1.0,3.0,5.0,3.0,3.0,2.0,4.0,5.0,5.0,4.0,2.0,1.0,2.0,2.0,3.0,3.0,4.0,4.0,2.0,2.0,5.0,3.0,5.0,4.0,5.0,3.0,4.0,2.0,4.0,1.0,,,1.0,1.0,1.0,3.0,3.0,2.0,1.0,2.0,3.0,4.0,1.0,3.0,2.0,3.0,5.0,3.0,2.0,2.0,5.0,2.0,1.0,4.0,1.0,2.0,5.0,3.0,4.0,5.0,1.0,2.0,5.0,5.0,4.0,2.0,4.0,4.0,1.0,1.0,4.0,1.0,1.0,2.0,,1.0,5.0,1.0,3.0,5.0,2.0,1.0,2.0,2.0,2.0,6.0,1.0,4,2.0,,10.0,5.0,31.0,10.0,1,1,1992.0,1.0,1.0,1.0,2,5.0,O,1.0,3.0,3.0,2.0,4.0,2.0,4.0,1.0,3.0,3.0,5,2,5,1.0,5,7,1,6,6,1,7,1,4,2,1,5,3,2,3.0,,3.0,,3.0,3.0,,,2.0,5.0,4.0,6.0,8.0,5.0,5.0,9.0,3.0,3,2,4
2.0,1.0,7.0,,,,,6.0,1.0,,,2.0,1.0,,4.0,2.0,8A,8.0,51.0,6,5,1,2,5,5,5,5,,,10,,,,10,10,,,,6.0,,,,,,3.0,,3.0,3.0,3.0,5,5,9,,,3.0,,2.0,1,,,D19_SONSTIGE,,,,,,6.0,3.0,3.0,,,,10,,10,10,,,,2.0,2.0,5,5,9,,,,,10,10,10,,,,,1.0,1992-02-12 00:00:00,1997.0,5.0,56.0,1,3,5,1,1,5,5,1.0,1.0,3.0,,21.0,6,,2.0,,4,5.0,3.0,2.0,4.0,3.0,1.0,1.0,2.0,,,3.0,5.0,2.0,3.0,3.0,1.0,4.0,3.0,3.0,4.0,2.0,4.0,3.0,2.0,3.0,3.0,4.0,3.0,3.0,2.0,2.0,1.0,2.0,2.0,4.0,2.0,3.0,1.0,3.0,3.0,2.0,3.0,3.0,4.0,3.0,,1.0,3.0,1.0,1.0,1.0,3.0,3.0,5.0,2.0,,,2.0,,3.0,3.0,4.0,3.0,3.0,3.0,2.0,3.0,3.0,3.0,3.0,2.0,3.0,2.0,1.0,1400.0,3.0,3.0,1.0,3.0,3.0,2.0,2.0,3.0,3.0,4.0,3.0,,3.0,3.0,2.0,4.0,3.0,2.0,3.0,,4.0,3.0,4.0,3.0,2.0,3.0,5.0,5.0,3.0,3.0,2.0,2.0,3.0,4.0,3.0,3.0,3.0,4.0,3.0,3.0,1.0,4.0,3.0,5.0,2.0,5.0,3.0,1.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0,3.0,4.0,2.0,2.0,2.0,2.0,3.0,3.0,3.0,3.0,1.0,,4.0,3.0,2.0,2.0,,3.0,2.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,5.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,5.0,2.0,3.0,3.0,3.0,2.0,3.0,3.0,3.0,1.0,5.0,4.0,4,1.0,1.0,2.0,2.0,19.0,5.0,9,4,1992.0,1.0,3.0,1.0,3,6.0,W,2.0,3.0,2.0,1.0,1.0,5.0,5.0,1.0,6.0,3.0,5,2,3,1.0,3,3,6,2,3,4,5,4,2,3,3,3,2,6,2.0,,3.0,,3.0,3.0,,1.0,1.0,1.0,3.0,1.0,1.0,1.0,6.0,9.0,4.0,1,1,4
1.0,1.0,4.0,,,,,4.0,2.0,,,,4.0,,2.0,2.0,8C,8.0,54.0,1,5,2,2,5,5,5,5,,,10,,,,10,10,,,,,,,,,,,,,,,9,9,10,,6.0,6.0,,9.0,8,,,D19_UNBEKANNT,7.0,,,6.0,,,6.0,,,,,10,,10,10,,,,,,9,9,10,,,,,10,10,10,,,,,1.0,1992-02-10 00:00:00,1994.0,5.0,38.0,1,3,4,1,2,5,2,3.0,3.0,3.0,,22.0,11,,1.0,,6,4.0,1.0,3.0,1.0,5.0,,,3.0,3.0,,2.0,3.0,1.0,4.0,3.0,3.0,,1.0,2.0,3.0,1.0,5.0,1.0,4.0,3.0,1.0,3.0,1.0,5.0,1.0,3.0,2.0,1.0,1.0,4.0,3.0,5.0,3.0,3.0,2.0,1.0,,4.0,4.0,1.0,2.0,4.0,2.0,,,1.0,1.0,4.0,3.0,4.0,1.0,,,2.0,4.0,2.0,,3.0,4.0,2.0,1.0,3.0,3.0,3.0,3.0,1.0,3.0,3.0,2.0,759.0,4.0,4.0,4.0,4.0,4.0,3.0,2.0,2.0,3.0,5.0,3.0,,3.0,4.0,3.0,1.0,4.0,2.0,2.0,3.0,,3.0,1.0,2.0,2.0,3.0,3.0,2.0,3.0,3.0,3.0,4.0,4.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,4.0,2.0,3.0,2.0,3.0,3.0,1.0,4.0,3.0,4.0,2.0,3.0,3.0,1.0,4.0,3.0,3.0,3.0,2.0,1.0,1.0,2.0,4.0,,5.0,2.0,1.0,,3.0,4.0,3.0,4.0,,3.0,5.0,3.0,3.0,3.0,3.0,3.0,2.0,2.0,3.0,2.0,2.0,3.0,2.0,3.0,5.0,3.0,3.0,4.0,3.0,1.0,2.0,3.0,3.0,2.0,4.0,3.0,2.0,2.0,3.0,4.0,3.0,3.0,,4.0,4,3.0,,,,,,1,1,1992.0,1.0,2.0,3.0,1,5.0,W,1.0,4.0,3.0,2.0,4.0,2.0,3.0,1.0,6.0,2.0,2,3,3,2.0,7,6,1,7,4,3,6,2,3,4,1,4,3,5,3.0,,3.0,,3.0,5.0,,4.0,2.0,5.0,3.0,10.0,7.0,4.0,,9.0,4.0,3,2,3
,6.0,17.0,8.0,12.0,15.0,,15.0,1.0,,3.0,4.0,1.0,,3.0,5.0,2C,2.0,14.0,3,1,2,2,4,4,5,4,,,10,,,,10,10,,,5.0,6.0,,,6.0,,,,,,,2.0,6,6,7,,,6.0,6.0,3.0,2,,6.0,D19_SONSTIGE,,,7.0,,,6.0,5.0,,,,,9,6.0,10,10,,,,,2.0,7,9,7,,6.0,,,10,10,10,,,5.0,,1.0,1992-02-10 00:00:00,2003.0,1.0,50.0,3,1,5,2,2,4,6,4.0,1.0,4.0,1966.0,40.0,9,,3.0,,2,7.0,2.0,3.0,5.0,2.0,1.0,3.0,,,,5.0,1.0,4.0,2.0,3.0,2.0,3.0,3.0,5.0,2.0,3.0,2.0,4.0,3.0,4.0,4.0,2.0,2.0,2.0,3.0,2.0,2.0,2.0,4.0,2.0,1.0,3.0,2.0,4.0,1.0,3.0,2.0,3.0,2.0,3.0,2.0,3.0,1.0,,2.0,2.0,4.0,2.0,3.0,2.0,,,,2.0,1.0,4.0,4.0,3.0,3.0,4.0,2.0,3.0,4.0,4.0,1.0,3.0,2.0,,,1124.0,3.0,4.0,1.0,3.0,3.0,3.0,3.0,,3.0,2.0,1.0,,1.0,3.0,4.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,4.0,3.0,3.0,2.0,2.0,5.0,3.0,2.0,3.0,4.0,4.0,5.0,4.0,4.0,3.0,2.0,2.0,3.0,4.0,4.0,2.0,2.0,3.0,4.0,,1.0,2.0,3.0,3.0,4.0,,,1.0,3.0,3.0,3.0,3.0,2.0,1.0,2.0,1.0,3.0,4.0,,3.0,2.0,,2.0,4.0,3.0,,3.0,,2.0,5.0,3.0,3.0,3.0,2.0,3.0,3.0,3.0,2.0,2.0,3.0,3.0,2.0,4.0,4.0,3.0,4.0,2.0,3.0,3.0,3.0,2.0,4.0,3.0,4.0,2.0,3.0,3.0,3.0,4.0,5.0,,2.0,3,4.0,,11.0,5.0,37.0,12.0,9,4,1992.0,4.0,5.0,1.0,4,3.0,W,3.0,2.0,,,1.0,5.0,5.0,10.0,2.0,2.0,3,4,4,5.0,2,4,5,3,3,6,4,6,5,5,4,4,4,5,,,1.0,,3.0,4.0,1.0,,2.0,,1.0,1.0,1.0,1.0,1.0,9.0,7.0,1,1,3
,1.0,6.0,,,,,,4.0,,,1.0,2.0,,2.0,7.0,4E,4.0,25.0,2,5,2,1,5,5,5,5,,,10,,,,10,10,,,6.0,6.0,7.0,,6.0,,,,,,,,8,8,8,,6.0,,,3.0,2,6.0,,D19_BEKLEIDUNG_GEH,7.0,,,,6.0,,6.0,1.0,7.0,,,10,,10,10,,,,,,8,8,8,,,,,9,10,10,,6.0,,,1.0,1992-02-10 00:00:00,1993.0,4.0,56.0,1,5,3,2,2,5,5,3.0,1.0,4.0,,22.0,4,,2.0,,6,6.0,2.0,2.0,2.0,5.0,2.0,,4.0,1.0,,3.0,,1.0,4.0,3.0,3.0,2.0,4.0,3.0,3.0,4.0,2.0,3.0,3.0,2.0,2.0,4.0,3.0,2.0,1.0,2.0,3.0,1.0,2.0,3.0,3.0,5.0,2.0,2.0,2.0,1.0,2.0,3.0,3.0,1.0,3.0,4.0,3.0,1.0,2.0,3.0,2.0,3.0,3.0,2.0,1.0,1.0,1.0,3.0,4.0,3.0,2.0,3.0,3.0,5.0,1.0,3.0,3.0,2.0,4.0,1.0,4.0,2.0,1.0,413.0,5.0,2.0,5.0,4.0,4.0,2.0,3.0,3.0,1.0,3.0,2.0,3.0,,2.0,4.0,1.0,3.0,5.0,3.0,5.0,3.0,3.0,1.0,5.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0,2.0,2.0,2.0,4.0,4.0,5.0,3.0,2.0,3.0,2.0,3.0,3.0,4.0,1.0,4.0,2.0,3.0,3.0,3.0,3.0,1.0,2.0,3.0,3.0,3.0,2.0,2.0,2.0,1.0,1.0,3.0,5.0,,1.0,3.0,2.0,1.0,5.0,2.0,5.0,3.0,4.0,3.0,3.0,3.0,2.0,3.0,3.0,5.0,3.0,3.0,3.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0,4.0,3.0,5.0,4.0,2.0,4.0,3.0,3.0,2.0,3.0,4.0,3.0,3.0,,3.0,4,2.0,,1.0,1.0,6.0,2.0,1,1,1992.0,1.0,2.0,1.0,1,5.0,W,1.0,4.0,2.0,1.0,5.0,3.0,3.0,3.0,7.0,3.0,5,3,5,2.0,5,7,2,6,6,1,7,1,1,3,1,4,1,3,3.0,,2.0,,2.0,3.0,,,2.0,5.0,4.0,4.0,2.0,1.0,6.0,9.0,2.0,3,2,4
,,,,,,,,,,,,,,,,,,,6,5,5,5,5,5,5,5,,,10,,,,10,10,,,,,,,,,,,,,,,10,10,10,,,,,,9,,,,,,,,,,,,,,,10,,10,10,,,,,,10,10,10,,,,,10,10,10,,,,,,,,,,5,3,3,4,5,3,4,,,,,,5,,,,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,9,,,,,,,5,2,,,,,2,,,,,,,,,,,,,3,4,5,,6,3,6,6,7,3,5,5,5,4,7,2,3,1,,,,,,,,,,,,,,,,,,3,2,1
2.0,1.0,19.0,,,,,13.0,1.0,,,3.0,1.0,,2.0,3.0,7C,7.0,54.0,6,3,2,3,4,4,5,4,,,10,,,,10,10,,,3.0,,,6.0,,,,,,,1.0,2.0,1,1,7,,,,,2.0,1,,,D19_BEKLEIDUNG_GEH,,,,7.0,,,6.0,5.0,5.0,,,10,,10,10,,,,1.0,2.0,1,1,7,,,,,10,10,10,,,,,1.0,1996-06-12 00:00:00,2004.0,2.0,,2,3,5,1,1,5,2,4.0,3.0,4.0,1957.0,40.0,1,,1.0,,1,5.0,2.0,3.0,3.0,4.0,1.0,4.0,,,,4.0,1.0,2.0,3.0,4.0,,2.0,3.0,5.0,1.0,3.0,5.0,2.0,2.0,3.0,3.0,2.0,3.0,5.0,2.0,2.0,3.0,2.0,3.0,4.0,1.0,5.0,2.0,3.0,3.0,2.0,,4.0,3.0,2.0,2.0,1.0,3.0,,1.0,2.0,3.0,3.0,3.0,2.0,,1.0,,2.0,2.0,3.0,2.0,3.0,3.0,4.0,2.0,3.0,3.0,4.0,2.0,3.0,3.0,1.0,1.0,464.0,4.0,3.0,1.0,4.0,4.0,3.0,2.0,3.0,2.0,2.0,,1.0,3.0,4.0,3.0,5.0,3.0,3.0,2.0,1.0,4.0,3.0,4.0,2.0,1.0,3.0,2.0,3.0,3.0,3.0,3.0,2.0,3.0,3.0,5.0,5.0,3.0,3.0,2.0,3.0,3.0,3.0,2.0,5.0,1.0,3.0,1.0,1.0,1.0,3.0,4.0,2.0,3.0,3.0,3.0,2.0,1.0,3.0,5.0,2.0,3.0,3.0,1.0,4.0,1.0,1.0,3.0,1.0,1.0,3.0,5.0,2.0,3.0,2.0,4.0,3.0,3.0,1.0,4.0,5.0,3.0,2.0,1.0,5.0,2.0,2.0,3.0,5.0,3.0,4.0,3.0,2.0,2.0,2.0,4.0,5.0,3.0,3.0,3.0,4.0,3.0,2.0,3.0,4.0,4.0,3.0,2.0,3.0,,4,5.0,,10.0,5.0,37.0,12.0,9,4,1996.0,4.0,5.0,1.0,4,3.0,W,3.0,2.0,1.0,1.0,1.0,3.0,3.0,8.0,,3.0,4,1,5,4.0,4,6,4,5,4,3,4,3,4,4,1,4,4,4,3.0,,2.0,,3.0,5.0,,,1.0,,,3.0,5.0,4.0,6.0,9.0,7.0,1,2,3
,1.0,21.0,,,,,16.0,1.0,,,4.0,1.0,,3.0,4.0,5A,5.0,31.0,1,3,4,4,3,2,2,3,,,10,,,,10,10,,,,,,,,,,,,,,1.0,7,10,7,,6.0,,,3.0,2,,,D19_VOLLSORTIMENT,,,,,,,,1.0,,,,10,,10,10,,,,,1.0,7,10,7,,,,,10,10,10,,,5.0,,1.0,1995-08-14 00:00:00,1996.0,2.0,,5,2,4,2,1,3,6,4.0,1.0,4.0,1970.0,30.0,9,,2.0,,4,6.0,3.0,3.0,4.0,2.0,2.0,1.0,1.0,,,5.0,,5.0,2.0,2.0,1.0,1.0,2.0,5.0,1.0,3.0,3.0,3.0,3.0,3.0,5.0,3.0,3.0,4.0,2.0,2.0,2.0,2.0,5.0,1.0,1.0,3.0,2.0,3.0,1.0,2.0,2.0,2.0,2.0,4.0,1.0,4.0,1.0,2.0,2.0,2.0,4.0,3.0,2.0,1.0,1.0,2.0,1.0,1.0,2.0,3.0,3.0,3.0,3.0,2.0,2.0,5.0,3.0,3.0,2.0,3.0,1.0,,,241.0,3.0,4.0,1.0,5.0,5.0,2.0,1.0,2.0,2.0,2.0,5.0,2.0,5.0,5.0,2.0,3.0,1.0,3.0,2.0,2.0,2.0,,1.0,5.0,5.0,3.0,2.0,3.0,5.0,5.0,4.0,4.0,2.0,2.0,3.0,3.0,5.0,2.0,2.0,4.0,3.0,2.0,4.0,3.0,5.0,2.0,3.0,2.0,1.0,4.0,5.0,2.0,2.0,2.0,1.0,4.0,3.0,3.0,3.0,2.0,2.0,2.0,1.0,5.0,2.0,3.0,2.0,2.0,2.0,5.0,5.0,2.0,3.0,,1.0,2.0,1.0,3.0,4.0,4.0,4.0,3.0,3.0,2.0,5.0,4.0,4.0,2.0,5.0,2.0,2.0,3.0,5.0,1.0,3.0,2.0,2.0,1.0,5.0,2.0,2.0,1.0,1.0,2.0,4.0,5.0,3.0,,,3,6.0,,11.0,5.0,34.0,11.0,9,4,1995.0,4.0,5.0,1.0,3,3.0,O,2.0,2.0,,,1.0,3.0,2.0,12.0,,2.0,1,4,5,5.0,7,5,5,5,7,4,4,7,4,4,4,3,7,4,1.0,,2.0,,5.0,5.0,,,1.0,,,7.0,8.0,5.0,2.0,9.0,3.0,6,2,2


# Machine Learning Algorithms

We will use a bunch of algorithms, hyperparameters and feature and data selection

## MLflow Experiment Tracking

We will track the differet runs divided them in 3 main experiments

- Training with customers vs German population data
- Training with train-specific data
- Training with both approaches described above

For each of the experiments we will try different hyperparameters, categorical encoding and feature selection

When we´ve got the best model, we will create a ML project for made the model easy accessible by anyone from source code or directly from github

We will use two of the three parts of `MLflow`, tracking and projects

![MLflow](https://secure.meetupstatic.com/photos/event/9/4/7/a/600_483818010.jpeg)

#  Create The Experiments

In [14]:
experiment_id_customers_vs_german_population = new_experiment(name='Customers-vs-German-Population')
experiment_id_training_data_provided = new_experiment(name='Training-Data-Provided')
experiment_id_mixed_data = new_experiment(name='Mixed-Data')

# First Experiment

We already developed all of the models needed for this experiment, do we only have to track them

In [15]:
zip_run_info = (
    zip(['All possible features', 'Less features', 'With clustering'],
        [catboost_first_model, catboost_less_features_model, catboost_with_clusters_model],
        ['catboost_first_model', 'catboost_less_features_model', 'catboost_with_clusters_model'],
        [auc_first_model, auc_less_features_model, auc_with_clusters_model])
)

Trackings = [
    Tracking(
        run_name=run_name,
        tags=dict(
            model_name=name,
            target=TARGET_CUSTOMER,
            num_features_used=len(model.feature_names_),
            top_feature_importances=(
                Counter(
                    dict(
                        zip(model.feature_names_,
                            model.get_feature_importance())
                    )
                ).most_common(3)
            )
        ),
        params=model.get_params(),
        metrics=dict(AUC=auc),
        model=model,
        model_name=name
    )
    for run_name, model, name, auc in zip_run_info
]


run_ids_customers_vs_german_population = (
    apply_runs_to_experiment(experiment_id_customers_vs_german_population,
                             Trackings)
)

# Second Experiment 

## First Run

We will train with `train-specific data`, there are few examples and it´s probably to perform worse than previous approach, but maybe not because this is the correct target, consisting in a mailout campaign 

`HelmertEncoder` encoder works very well when categorical features are ordinal (looking at the feature attributes file this is our case) but at this particular case it gets a poor perform, because the bad point about this encoder is that it creates a lot of new columns and that is a problem because of the cardinality of our features and the number of them

We will also test `CatBoostEncoder` and `WOEEncoder`, usually cboth perform very well when encoding categorical features

Oversampling to positive class will be done because we have very few examples and the classes are highly unbalanced

In [16]:
df_mailout_train_base_renamed = df_mailout_train_base.copy()

df_mailout_train_base_renamed.columns = adjust_column_names_for_feature_encoding(
    df_mailout_train_base_renamed.columns
)

cat_features = df_mailout_train_base_renamed.select_dtypes(include=['category', 'object']).columns

cat_features_numerical = [
    column
    for column in cat_features
    if df_mailout_train_base_renamed[column].dropna().str.match('^\\d+$').all()
]

features, labels = preprocessing_baseline(df_mailout_train_base_renamed,
                                          cat_features=set(cat_features).difference(cat_features_numerical),
                                          target=TARGET,
                                          test_size=.15,
                                          valid_size=0)

X_train, X_test, _ = features
y_train, y_test, _ = labels

params_xgb = dict(n_estimators=15,
                  max_depth=3,
                  colsample_bytree=.5,
                  min_child_weight=20,
                  gamma=30,
                  random_state=RANDOM_STATE,
                  n_jobs=-1)

params_catboost = dict(num_trees=15,
                       max_depth=3,
                       min_child_samples=25,
                       one_hot_max_size=4,
                       cat_features=cat_features,
                       random_state=RANDOM_STATE,
                       eval_metric='AUC',
                       verbose=0)

pipeline_xgb_oversampling_woe = make_pipeline(
    RandomOverSampler(sampling_strategy=.02, random_state=RANDOM_STATE),
    WOEEncoder(cols=cat_features_numerical),
    CatBoostEncoder(cols=set(cat_features).difference(cat_features_numerical)),
    XGBClassifier(**params_xgb)
)

pipeline_catboost_oversampling = make_pipeline(
    RandomOverSampler(sampling_strategy=.02, random_state=RANDOM_STATE),
    CatBoostClassifier(**params_catboost)
)

X_train_catboost = cat_features_fillna(X_train, cat_features_numerical)

pipeline_xgb_oversampling_woe.fit(X_train, y_train)
pipeline_catboost_oversampling.fit(X_train_catboost, y_train);

# Cross Validation

4-fold cross validation to be sure about the AUC metric

In [17]:
cross_val_score_xgb_oversampling_woe = cross_val_score(
    pipeline_xgb_oversampling_woe,
    X_train,
    y_train,
    cv=StratifiedKFold(n_splits=4, shuffle=True, random_state=RANDOM_STATE),
    scoring='roc_auc',
    n_jobs=-1
)

cross_val_score_catboost_oversampling = cross_val_score(
    pipeline_catboost_oversampling,
    X_train_catboost,
    y_train,
    cv=StratifiedKFold(n_splits=4, shuffle=True, random_state=RANDOM_STATE),
    scoring='roc_auc',
    n_jobs=-1
)

print(f'{cross_val_score_xgb_oversampling_woe.mean()}±{cross_val_score_xgb_oversampling_woe.std()}')
print(f'{cross_val_score_catboost_oversampling.mean()}±{cross_val_score_catboost_oversampling.std()}')

0.686208787087107±0.03104046105766505
0.7443360451574116±0.013728413546340953


# Log Experiment Results 

In [18]:
auc_xgb_oversampling_woe = roc_auc_score(
    y_test, pipeline_xgb_oversampling_woe.predict_proba(X_test)[:, 1]
)

auc_catboost_oversampling = roc_auc_score(
    y_test, pipeline_catboost_oversampling.predict_proba(cat_features_fillna(X_test,
                                                                             cat_features_numerical))[:, 1]
)

zip_run_info = (
    zip(['Oversamplig, WoE and CatBoost encoders with XGBoost', 'Oversampling with CatBoost'],
        [pipeline_xgb_oversampling_woe, pipeline_catboost_oversampling],
        ['pipeline_xgb_oversampling_woe', 'pipeline_catboost_oversampling'],
        [auc_xgb_oversampling_woe, auc_catboost_oversampling])
)

Trackings = [
    Tracking(
        run_name=run_name,
        tags=dict(
            model_name=name,
            target=TARGET,
            num_features_used=len(model.steps[-1][1].feature_names_
                                  if name == 'pipeline_catboost_oversampling'
                                  else model.steps[-2][1].get_feature_names()),
            top_feature_importances=(
                Counter(
                    dict(
                        zip(model.steps[-1][1].feature_names_
                            if name == 'pipeline_catboost_oversampling'
                            else model.steps[-2][1].get_feature_names(),
                            model.steps[-1][1].feature_importances_)
                    )
                ).most_common(3)
            )
        ),
        params={param: value
                for param, value in (pipeline_catboost_oversampling
                                     .steps[-1][1]
                                     .get_params()
                                     .items())
                if param != 'cat_features'},
        metrics=dict(AUC=auc),
        model=model,
        model_name=name
    )
    for run_name, model, name, auc in zip_run_info
]


run_ids_training_data_provided = (
    apply_runs_to_experiment(experiment_id_training_data_provided,
                             Trackings)
)

# Kaggle Submission

![Kaggle CatBoost Oversampling](../images/kaggle_catboost_oversampling.png)

In [19]:
mailout_test_lnr = df_mailout_test['LNR']

df_mailout_test_base_renamed = df_mailout_test_base.copy()

df_mailout_test_base_renamed.columns = adjust_column_names_for_feature_encoding(
    df_mailout_test_base_renamed.columns
)

df_mailout_test_base_renamed_cat_fillna = cat_features_fillna(
    df_mailout_test_base_renamed,
    cat_features=cat_features
)

kaggle_submission(
    column_lnr=mailout_test_lnr,
    y_pred=pipeline_catboost_oversampling.predict_proba(df_mailout_test_base_renamed_cat_fillna)[:, 1],
    submission_filename='catboost_oversampling',
   submission_message='Oversampling catboost trained with customer training data [75/15/0]'
)

100%|██████████| 1.09M/1.09M [00:06<00:00, 181kB/s] 


Successfully submitted to Udacity+Arvato: Identify Customer Segments


## Second Run

We will train again with `train-specific data` but now with only the most important features got in previous executions

Now as we have less features, we are goiong to test `HelmertEncoder`

In [20]:
feature_importances_catboost_oversampling = dict(
    zip(pipeline_catboost_oversampling.steps[-1][1].feature_names_,
        pipeline_catboost_oversampling.steps[-1][1].feature_importances_)
)


feature_importances_catboost_oversampling_relevant = [
    feature
    for feature, importance in feature_importances_catboost_oversampling.items()
    if importance > .01
]

params_catboost = dict(num_trees=30,
                       max_depth=3,
                       min_child_samples=25,
                       one_hot_max_size=4,
                       cat_features=set(cat_features).intersection(feature_importances_catboost_oversampling_relevant),
                       random_state=RANDOM_STATE,
                       eval_metric='AUC',
                       verbose=0)

pipeline_xgb_oversampling_helmert_less_features = make_pipeline(
    RandomOverSampler(sampling_strategy=.02, random_state=RANDOM_STATE),
    HelmertEncoder(cols=set(cat_features).intersection(feature_importances_catboost_oversampling_relevant)),
    XGBClassifier(**params_xgb)
)

pipeline_catboost_oversampling_less_features = make_pipeline(
    RandomOverSampler(sampling_strategy=.02, random_state=RANDOM_STATE),
    CatBoostClassifier(**params_catboost)
)

pipeline_xgb_oversampling_helmert_less_features.fit(
    X_train[feature_importances_catboost_oversampling_relevant],
    y_train)

pipeline_catboost_oversampling_less_features.fit(
    X_train_catboost[feature_importances_catboost_oversampling_relevant], 
    y_train);

# Cross Validation

Again we run 4-fold cross validation to be sure about the AUC metric

In [21]:
cross_val_score_xgb_oversampling_helmert_less_features = cross_val_score(
    pipeline_xgb_oversampling_helmert_less_features,
    X_train[feature_importances_catboost_oversampling_relevant],
    y_train,
    cv=StratifiedKFold(n_splits=4, shuffle=True, random_state=RANDOM_STATE),
    scoring='roc_auc',
    n_jobs=-1
)

cross_val_score_catboost_oversampling_less_features = cross_val_score(
    pipeline_catboost_oversampling_less_features,
    X_train_catboost[feature_importances_catboost_oversampling_relevant],
    y_train,
    cv=StratifiedKFold(n_splits=4, shuffle=True, random_state=RANDOM_STATE),
    scoring='roc_auc',
    n_jobs=-1
)

print(f'{cross_val_score_xgb_oversampling_helmert_less_features.mean()}'
      f'±{cross_val_score_xgb_oversampling_helmert_less_features.std()}')

print(f'{cross_val_score_catboost_oversampling_less_features.mean()}'
      f'±{cross_val_score_catboost_oversampling_less_features.std()}')

0.7477824217935833±0.023980377908516193
0.7577927037640589±0.012358095436621712


# Log Experiment Results 

In [22]:
auc_xgb_oversampling_helmert_less_features = roc_auc_score(
    y_test,
    pipeline_xgb_oversampling_helmert_less_features
    .predict_proba(X_test[feature_importances_catboost_oversampling_relevant])[:, 1]
)

auc_catboost_oversampling_less_features = roc_auc_score(
    y_test,
    pipeline_catboost_oversampling_less_features
    .predict_proba(
        cat_features_fillna(X_test[feature_importances_catboost_oversampling_relevant],
                            set(cat_features).intersection(feature_importances_catboost_oversampling_relevant))
    )[:, 1]
)

zip_run_info = (
    zip(['Less features, oversamplig, Hermert encoder with XGBoost', 'Less features, oversampling with CatBoost'],
        [pipeline_xgb_oversampling_helmert_less_features, pipeline_catboost_oversampling_less_features],
        ['pipeline_xgb_oversampling_helmert_less_features', 'pipeline_catboost_oversampling_less_features'],
        [auc_xgb_oversampling_helmert_less_features, auc_catboost_oversampling_less_features])
)

Trackings = [
    Tracking(
        run_name=run_name,
        tags=dict(
            model_name=name,
            target=TARGET,
            num_features_used=len(model.steps[-1][1].feature_names_
                                  if name == 'pipeline_catboost_oversampling_less_features'
                                  else model.steps[-2][1].get_feature_names()),
            top_feature_importances=(
                Counter(
                    dict(
                        zip(model.steps[-1][1].feature_names_
                            if name == 'pipeline_catboost_oversampling_less_features'
                            else model.steps[-2][1].get_feature_names(),
                            model.steps[-1][1].feature_importances_)
                    )
                ).most_common(3)
            )
        ),
        params={param: value
                for param, value in (pipeline_catboost_oversampling
                                     .steps[-1][1]
                                     .get_params()
                                     .items())
                if param != 'cat_features'},
        metrics=dict(AUC=auc),
        model=model,
        model_name=name
    )
    for run_name, model, name, auc in zip_run_info
]


run_ids_training_data_provided = (
    run_ids_training_data_provided
    + apply_runs_to_experiment(experiment_id_training_data_provided,
                               Trackings)
)

# Kaggle Submission

In [23]:
mailout_test_lnr = df_mailout_test['LNR']

df_mailout_test_base_renamed_less_features = df_mailout_test_base.copy()

df_mailout_test_base_renamed_less_features.columns = adjust_column_names_for_feature_encoding(
    df_mailout_test_base_renamed_less_features.columns
)

df_mailout_test_base_renamed_less_features_cat_fillna = cat_features_fillna(
    df_mailout_test_base_renamed_less_features,
    cat_features=set(cat_features).intersection(feature_importances_catboost_oversampling_relevant)
)

kaggle_submission(
    column_lnr=mailout_test_lnr,
    y_pred=pipeline_catboost_oversampling_less_features.predict_proba(
        df_mailout_test_base_renamed_less_features_cat_fillna[feature_importances_catboost_oversampling_relevant]
    )[:, 1],
    submission_filename='catboost_oversampling_less_features',
    submission_message='Oversampling catboost trained with columns subset of customer training data [75/15/0]'
)

100%|██████████| 1.09M/1.09M [00:06<00:00, 189kB/s]


Successfully submitted to Udacity+Arvato: Identify Customer Segments


# Third Experiment

We will train with all data Arvato provided us

Since the catboost has always worked better for us than the other algorithms we are going to train catboost with different configurations

First we have to join all data from both data sources

**Note**: Just because this is the last part of the project we can modify the column names of customers and German population because it's not used anywhere else

In [24]:
df_customers.columns = adjust_column_names_for_feature_encoding(
    df_customers.columns
)

df_german_population.columns = adjust_column_names_for_feature_encoding(
    df_german_population.columns
)

df_training_all = pd.concat(
    [df_customers[feature_importances_catboost_oversampling_relevant].assign(RESPONSE=1),
     df_german_population[feature_importances_catboost_oversampling_relevant].assign(RESPONSE=0),
     df_mailout_train_base_renamed[feature_importances_catboost_oversampling_relevant + [TARGET]]]
)

cat_features = df_training_all.select_dtypes(include=['category', 'object']).columns

features, labels = preprocessing_baseline(df_training_all,
                                          cat_features=cat_features,
                                          target=TARGET,
                                          test_size=0,
                                          valid_size=0)

X_train, _, _ = features
y_train, _, _ = labels

class_weights = (1, sum(y_train == 0) / sum(y_train == 1))

params_catboost = dict(num_trees=100,
                       max_depth=3,
                       min_child_samples=25,
                       one_hot_max_size=4,
                       class_weights=class_weights,
                       cat_features=cat_features,
                       random_state=RANDOM_STATE,
                       eval_metric='AUC',
                       verbose=0)

catboost_all_data_100 = CatBoostClassifier(**params_catboost)

params_catboost['num_trees'] = 40
catboost_all_data_40 = CatBoostClassifier(**params_catboost)

params_catboost['num_trees'] = 20
catboost_all_data_20 = CatBoostClassifier(**params_catboost)

catboost_all_data_100.fit(X_train, y_train, verbose=False)
catboost_all_data_40.fit(X_train, y_train, verbose=False)
catboost_all_data_20.fit(X_train, y_train, verbose=False)

<catboost.core.CatBoostClassifier at 0x7f43a9f9f780>

# Cross Validation

Again we run 4-fold cross validation to be sure about the AUC metric

In [25]:
cross_val_score_catboost_all_data_100 = cross_val_score(
    catboost_all_data_100,
    X_train,
    y_train,
    cv=StratifiedKFold(n_splits=4, shuffle=True, random_state=RANDOM_STATE),
    scoring='roc_auc',
    n_jobs=-1
)

cross_val_score_catboost_all_data_40 = cross_val_score(
    catboost_all_data_40,
    X_train,
    y_train,
    cv=StratifiedKFold(n_splits=4, shuffle=True, random_state=RANDOM_STATE),
    scoring='roc_auc',
    n_jobs=-1
)

cross_val_score_catboost_all_data_20 = cross_val_score(
    catboost_all_data_20,
    X_train,
    y_train,
    cv=StratifiedKFold(n_splits=4, shuffle=True, random_state=RANDOM_STATE),
    scoring='roc_auc',
    n_jobs=-1
)

print(f'{cross_val_score_catboost_all_data_100.mean()}'
      f'±{cross_val_score_catboost_all_data_100.std()}')

print(f'{cross_val_score_catboost_all_data_40.mean()}'
      f'±{cross_val_score_catboost_all_data_40.std()}')

print(f'{cross_val_score_catboost_all_data_20.mean()}'
      f'±{cross_val_score_catboost_all_data_20.std()}')

0.9195893124984393±0.0011780211076430536
0.9185903924815262±0.0011183835318481548
0.9170698947191096±0.001000244272439499


# Log Experiment Results 

In [26]:
auc_catboost_all_data_100 = roc_auc_score(
    y_test,
    catboost_all_data_100
    .predict_proba(
        cat_features_fillna(
            X_test[feature_importances_catboost_oversampling_relevant],
            set(cat_features).intersection(feature_importances_catboost_oversampling_relevant)
        )
    )[:, 1]
)

auc_catboost_all_data_40 = roc_auc_score(
    y_test,
    catboost_all_data_40
    .predict_proba(
        cat_features_fillna(
            X_test[feature_importances_catboost_oversampling_relevant],
            set(cat_features).intersection(feature_importances_catboost_oversampling_relevant)
        )
    )[:, 1]
)

auc_catboost_all_data_20 = roc_auc_score(
    y_test,
    catboost_all_data_20
    .predict_proba(
        cat_features_fillna(
            X_test[feature_importances_catboost_oversampling_relevant],
            set(cat_features).intersection(feature_importances_catboost_oversampling_relevant)
        )
    )[:, 1]
)

zip_run_info = (
    zip(['CatBoost all data 100 trees', 'CatBoost all data 40 trees', 'CatBoost all data 20 trees'],
        [catboost_all_data_100, catboost_all_data_40, catboost_all_data_20],
        ['catboost_all_data_100', 'catboost_all_data_40', 'catboost_all_data_20'],
        [auc_catboost_all_data_100, auc_catboost_all_data_40, auc_catboost_all_data_20])
)

Trackings = [
    Tracking(
        run_name=run_name,
        tags=dict(
            model_name=name,
            target=TARGET_CUSTOMER,
            num_features_used=len(model.feature_names_),
            top_feature_importances=(
                Counter(
                    dict(
                        zip(model.feature_names_,
                            model.feature_importances_)
                    )
                ).most_common(3)
            )
        ),
        params={param: value
                for param, value in (pipeline_catboost_oversampling
                                     .steps[-1][1]
                                     .get_params()
                                     .items())
                if param != 'cat_features'},
        metrics=dict(AUC=auc),
        model=model,
        model_name=name
    )
    for run_name, model, name, auc in zip_run_info
]

run_ids_mixed_data = (
    apply_runs_to_experiment(experiment_id_mixed_data,
                             Trackings)
)

# Kaggle Submission

# Which One Is The Best Model?

MLFlow allow us to get the best model within all combination of experiments we had run

We could try more parameters, algorithms and even data sets, but we have achieved a good result and it is important to have the model ready to be deployed

In [28]:
experiment_ids = [experiment_id_customers_vs_german_population,
                  experiment_id_training_data_provided,
                  experiment_id_mixed_data]


best_model_tracking = n_best_models_from_experiments(
    experiment_ids=experiment_ids, n=5, order_by=['metrics.AUC DESC']
)

best_model_tracking[['metrics.AUC', 'tags.mlflow.runName', 'params.num_trees', 'tags.num_features_used']]

Unnamed: 0,metrics.AUC,tags.mlflow.runName,params.num_trees,tags.num_features_used
0,0.769046,"Less features, oversampling with CatBoost",15.0,24
1,0.763107,"Less features, oversamplig, Hermert encoder wi...",15.0,1566
2,0.757247,Oversampling with CatBoost,15.0,365
3,0.753323,All possible features,,365
4,0.752057,Less features,,274


# Save Model Artifact Uri

We will use this artifact uri to make the model easy reproducible and deployable

In [29]:
best_model_artifact_uri = best_model_tracking['artifact_uri'].iloc[0]
best_model_model_name = best_model_tracking['tags.model_name'].iloc[0]

# Check MLFlow Model

In [30]:
model = load_trained_model(model_artifact_uri=best_model_artifact_uri,
                           model_model_name=best_model_model_name)

try:
    # This is model
    feature_importances_loaded_from_mlflow = dict(
        zip(model.feature_names_,
            model.feature_importances_)
    )
except AttributeError:
    # This is a pipeline
    feature_importances_loaded_from_mlflow = dict(
        zip(model.steps[-1][1].feature_names_,
            model.steps[-1][1].feature_importances_)
    )

feature_importances_loaded_from_mlflow

{'ANZ_KINDER': 0.20415072237784812,
 'D19_BANKEN_ANZ_12': 0.0837984349685739,
 'D19_BANKEN_ANZ_24': 0.4560739514707707,
 'D19_GESAMT_ONLINE_QUOTE_12': 0.9259713853611119,
 'D19_KONSUMTYP_MAX': 11.05821110969325,
 'D19_SOZIALES': 53.186382346877494,
 'D19_TELKO_REST': 1.1470836953730479,
 'EINGEFUEGT_AM': 2.2986116340657046,
 'EINGEZOGENAM_HH_JAHR': 10.646877982874827,
 'KBA05_HERST4': 2.648555184788434,
 'KBA05_KRSHERST2': 0.5631352580307166,
 'KBA05_KW2': 2.3691624662999216,
 'KBA05_MODTEMP': 1.5011674999932865,
 'KBA05_SEG1': 0.0010255933045166688,
 'KBA05_SEG2': 1.799807097504104,
 'KBA13_CCM_1800': 0.1998770352964555,
 'KBA13_KMH_110': 2.921490731720909,
 'KBA13_KMH_250': 2.878949128418511,
 'KBA13_VORB_D': 2.3630119506105243,
 'KK_KUNDENTYP': 0.0,
 'LP_LEBENSPHASE_GROB': 0.805456977952714,
 'SEMIO_FAM': 1.462451999813438,
 'SEMIO_PFLICHT': 0.0,
 'VERS_TYP': 0.47874781320380977}

# Conclusions

This was a very cool project very interesting to deal with it and to be able to apply different techniques

As future work I leave some points that would also be good to address:

- More exploration of variables and their knowledge
- Classic techniques as partial pooling
- Test other categorical encoders
- H2O models with MOJO approach to be deployable
- Sagemaker as a platform to implement and industrialize

## EXTRA (Just For Fun)

We're going to create an ensemble of all the predictions we've sent to kaggle

Bad result because we have very few sumbissions and for a good ensemble we need a lot non-correlated weak learners

In [31]:
df_ensemble_submissions = (
    pd
    .concat([pd.read_csv(file)
             for file in PATH_SUBMISSIONS.glob('*.csv')])
    .groupby('LNR')
    .agg(['mean', 'median'])
    .reset_index()
)

df_ensemble_submissions.columns = ['LNR', 'mean', 'median']

df_ensemble_submissions_mean = df_ensemble_submissions[['LNR', 'mean']].rename(columns=dict(mean=TARGET))
df_ensemble_submissions_median = df_ensemble_submissions[['LNR', 'median']].rename(columns=dict(median=TARGET))

kaggle_submission(
    column_lnr=df_ensemble_submissions_mean['LNR'],
    y_pred=df_ensemble_submissions_mean[TARGET],
    submission_filename='final_ensemble_mean',
    submission_message='Ensemble of all the Kaggle submissions with the mean'
)

kaggle_submission(
    column_lnr=df_ensemble_submissions_median['LNR'],
    y_pred=df_ensemble_submissions_median[TARGET],
    submission_filename='final_ensemble_median',
    submission_message='Ensemble of all the Kaggle submissions with the median'
)

100%|██████████| 1.04M/1.04M [00:05<00:00, 191kB/s]


Successfully submitted to Udacity+Arvato: Identify Customer Segments


100%|██████████| 1.05M/1.05M [00:06<00:00, 177kB/s]


Successfully submitted to Udacity+Arvato: Identify Customer Segments
