In [None]:
# default_exp example
# default_cls_lvl 2

In [None]:
%load_ext autotime
%load_ext autoreload
%autoreload 2

# Community Learning
> In diesem Notebook wird ein Ensemble von XGBoost-Modellen erstellt, um die Vorhersage zu verbessern.

Nachdem wir nun das Basismodell bestimmt haben, werden wir die Daten nach Regionen aufteilen und für jede Region ein eigenes Basismodell trainieren. Anschliessend wird eine Vorhersage sowohl mit dem Modell der eigenen Region als auch mit dem Modell der anderen Region erstellt, um so die Vorhersagekraft weiter zu verbessern.


## Aufteilung der Daten

Um die Daten möglichst konsistent zu teilen, ermitteln wir für jede Kunden-ID den ursprünglichen Wohnort. Wechselt ein Kunde im Untersuchungszeitraum die Region, so wird nur der ursprüngliche Wohnort ausgewertet. So stellen wir sicher, dass wir durch die Aufteilung keine Daten verlieren. Im Bild weiter unten sind die verschiedenen Regionen zu sehen. Wir werden versuchen, Spanien in nördliche und südliche Regionen aufzuteilen.
![image.png](data/images/spain.png)


In [None]:
#export
import pandas as pd
import numpy as np
import pickle
import xgboost as xgb

import community_learning.base_model as base_model

from fastscript import call_parse
from itertools import compress
from tqdm.notebook import tqdm

time: 436 ms


In [None]:
#export
# Lookup table from province name (the raw `nomprov` value) to the training
# region, either "north" or "south". It is consulted by add_region_to_nomprov,
# which falls back to '----' for names that are missing here.
# Note that the "UNKNOWN" province is assigned to "north".
region = {
"ALAVA":"north",
"ALBACETE":"south",
"ALICANTE":"south",
"ALMERIA":"south",
"ASTURIAS":"north",
"AVILA":"north",
"BADAJOZ":"south",
"BALEARS, ILLES":"north",
"BARCELONA":"north",
"BIZKAIA":"north",
"BURGOS":"north",
"CACERES":"north",
"CADIZ":"south",
"CANTABRIA":"north",
"CASTELLON":"north",
"CEUTA":"south",
"CIUDAD REAL":"south",
"CORDOBA":"south",
"CORUÑA, A":"north",
"CUENCA":"north",
"GIPUZKOA":"north",
"GIRONA":"north",
"GRANADA":"south",
"GUADALAJARA":"north",
"HUELVA":"south",
"HUESCA":"north",
"JAEN":"south",
"LEON":"north",
"LERIDA":"north",
"LUGO":"north",
"MADRID":"south",
"MALAGA":"south",
"MELILLA":"south",
"MURCIA":"south",
"NAVARRA":"north",
"OURENSE":"north",
"PALENCIA":"north",
"PALMAS, LAS":"north",
"PONTEVEDRA":"north",
"RIOJA, LA":"north",
"SALAMANCA":"north",
"SANTA CRUZ DE TENERIFE":"north",
"SEGOVIA":"north",
"SEVILLA":"south",
"SORIA":"north",
"TARRAGONA":"north",
"TERUEL":"north",
"TOLEDO":"north",
"UNKNOWN":"north",
"VALENCIA":"north",
"VALLADOLID":"north",
"ZAMORA":"north",
"ZARAGOZA":"north"
}

time: 7.51 ms


In [None]:
#export 
def load_provice_data(path:str='data/raw/train_ver2.csv'):
    """Load the raw data needed to work out the regional split.

    Only the columns ``ncodpers`` and ``nomprov`` are parsed, and the file is
    read in chunks of 100000 rows that are concatenated into one frame.

    Args:
        path: location of the raw training CSV.

    Returns:
        pd.DataFrame with the columns ``ncodpers`` and ``nomprov``.
    """
    reader = pd.read_csv(path, chunksize=100000, header=0, usecols=['ncodpers', 'nomprov'])
    try:
        # pd.concat consumes the chunk iterator directly; no intermediate list needed.
        return pd.concat(reader)
    finally:
        # Release the underlying file handle even when parsing a chunk fails.
        reader.close()


time: 9.02 ms


In [None]:
data = load_provice_data()

time: 9.49 s


In [None]:
assert 'ncodpers' in data
assert 'nomprov' in data

time: 7.96 ms


In [None]:
#export
def add_region_to_nomprov(df:pd.DataFrame):
    """Reduce the frame to one row per customer and attach a ``region`` column.

    Rows are grouped by ``ncodpers`` and ``GroupBy.first`` keeps one value per
    column and customer, so the result is indexed by ``ncodpers``. Each
    ``nomprov`` is then looked up in the module-level ``region`` mapping;
    provinces without an entry get the marker '----'.
    """
    per_customer = df.groupby(by='ncodpers').first()

    def _province_to_region(province):
        # Unknown or missing province names fall back to the '----' marker.
        return region.get(province, '----')

    per_customer['region'] = per_customer['nomprov'].map(_province_to_region)
    return per_customer

time: 18.7 ms


In [None]:
data1 = add_region_to_nomprov(data)
assert data['ncodpers'].unique().shape[0] == data1.shape[0]
data1.shape[0]

956645

time: 1.18 s


In [None]:
#export
def load_data(path_train='data/interim/03_train.csv',
              path_test='data/interim/03_test.csv'):
    """Read the train and the test CSV file.

    Returns:
        Tuple ``(train, test)`` holding one DataFrame per file.
    """
    # The train file is read first, then the test file.
    train, test = (pd.read_csv(csv_path) for csv_path in (path_train, path_test))
    return train, test

time: 15.1 ms


In [None]:
train_org, test = load_data()

time: 2.23 s


In [None]:
#export 
def add_region_train_test(train:pd.DataFrame,
                    test:pd.DataFrame,
                    region_df:pd.DataFrame):
    """Attach the columns of ``region_df`` to the train and the test frame.

    Both frames are left-joined on ``id`` (left) against ``ncodpers`` (right),
    so every input row is kept; rows whose id is not found in ``region_df``
    get missing values in the added columns.

    Returns:
        Tuple ``(train, test)`` of the merged frames.
    """
    def _with_region(frame):
        return pd.merge(frame, region_df, how='left', left_on='id', right_on='ncodpers')

    return (_with_region(train), _with_region(test))

time: 8.17 ms


In [None]:
pd.set_option('display.max_columns', None)
train, test = add_region_train_test(train_org, test, data1)
len(train_org) == len(train)

True

time: 464 ms


In [None]:
# Every customer must belong to exactly one region, so the two id lists may
# not share a single id.
south_ids = train.loc[train.region == 'south', 'id'].unique().tolist()
north_ids = train.loc[train.region == 'north', 'id'].unique().tolist()
# A set-based check replaces the per-id membership test against a list, which
# was quadratic in the number of customers.
assert set(south_ids).isdisjoint(north_ids)

HBox(children=(FloatProgress(value=0.0, max=97028.0), HTML(value='')))


time: 46.5 s


In [None]:
product_dict = base_model.get_product_dict(train_org)
product_reverse_dict = base_model.get_product_reverse_dict(train_org)    
train = base_model.encode_products(train_org)

time: 139 ms


In [None]:
region_df = load_provice_data()
region_df = add_region_to_nomprov(region_df)

time: 9.78 s


In [None]:
# Attach the region information to the encoded training data and the test data.
train1, test1 = add_region_train_test(train, test, region_df)

# Build the X/y split separately for the rows of each region.
train_X_south, train_y_south = base_model.x_y_split(train1.loc[train1.region == 'south',].copy().reset_index())
train_X_north, train_y_north = base_model.x_y_split(train1.loc[train1.region == 'north',].copy().reset_index())
# Each split may contain only a single region ...
assert len(train_X_south.groupby(by='region')) == 1
assert len(train_X_north.groupby(by='region')) == 1
# ... and both splits together must account for every training row.
assert len(train_X_south) + len(train_X_north) == len(train)

time: 795 ms


In [None]:
model_south = base_model.runXGB(train_X_south, train_y_south, base_model.feature_cols, use_gpu=True)
model_north = base_model.runXGB(train_X_north, train_y_north, base_model.feature_cols, use_gpu=True)

time: 19.1 s


In [None]:
#export
def get_two_region_data(source_train:str='data/interim/03_train.csv',
                               source_test:str='data/interim/03_test.csv',
                               source_raw:str='data/raw/train_ver2.csv'):
    """Load train/test data, attach the region and build the per-region splits.

    Args:
        source_train: CSV with the prepared training data.
        source_test: CSV with the prepared test data.
        source_raw: raw CSV from which the province of each customer is read.

    Returns:
        dict with the product dictionaries, the feature/target column lists,
        the merged ``train``/``test`` frames, the south/north subsets of both,
        and the X/y splits for south, north and the complete training data.
        Rows whose region is neither 'south' nor 'north' stay in ``train`` and
        ``test`` but are part of no regional subset.
    """
    print('load data')
    train_org, test = load_data(source_train, source_test)

    print('prepare data')
    data = {
        'product_dict': base_model.get_product_dict(train_org),
        'product_reverse_dict': base_model.get_product_reverse_dict(train_org),
    }
    train = base_model.encode_products(train_org)
    region_df = add_region_to_nomprov(load_provice_data(source_raw))

    data['feature_cols'] = base_model.feature_cols
    data['target_cols'] = base_model.target_cols

    train, test = add_region_train_test(train, test, region_df)
    data['train'] = train
    data['test'] = test

    in_south = train.region == 'south'
    in_north = train.region == 'north'
    data['train_south'] = train.loc[in_south,]
    data['train_north'] = train.loc[in_north,]

    # x_y_split gets its own fresh slice, not the frame stored under
    # 'train_south' / 'train_north'.
    data['train_X_south'], data['train_y_south'] = base_model.x_y_split(train.loc[in_south,])
    data['train_X_north'], data['train_y_north'] = base_model.x_y_split(train.loc[in_north,])
    data['train_X'], data['train_y'] = base_model.x_y_split(train)

    # The test subsets are re-indexed from 0.
    for name in ('south', 'north'):
        data['test_' + name] = test.loc[test.region == name].reset_index(drop=True)

    return data

time: 15.8 ms


In [None]:
data = get_two_region_data()

load data
prepare data
time: 18.6 s


In [None]:
data.keys()

dict_keys(['product_dict', 'product_reverse_dict', 'feature_cols', 'target_cols', 'train', 'test', 'train_south', 'train_north', 'train_X_south', 'train_y_south', 'train_X_north', 'train_y_north', 'train_X', 'train_y', 'test_south', 'test_north'])

time: 8.48 ms


## Simple Ensemble

In [None]:
#export
def get_two_region_base_models(data:dict,
                               dest_model_south:str='data/results/model_south.dat',
                               dest_model_north:str='data/results/model_north.dat',
                               use_gpu=False):
    """Train one model per region plus one on the complete training data.

    Args:
        data: dict holding the ``train_X*`` / ``train_y*`` splits and
            ``feature_cols`` (as built by get_two_region_data).
        dest_model_south: currently unused; the models are not written to disk.
        dest_model_north: currently unused; the models are not written to disk.
        use_gpu: forwarded to ``base_model.runXGB``.

    Returns:
        dict with the keys ``model_south``, ``model_north`` and ``model_all``.
    """
    training_plan = (
        ('train model south', 'model_south', 'train_X_south', 'train_y_south'),
        ('train model north', 'model_north', 'train_X_north', 'train_y_north'),
        ('train model all', 'model_all', 'train_X', 'train_y'),
    )

    models = {}
    for message, model_key, x_key, y_key in training_plan:
        print(message)
        models[model_key] = base_model.runXGB(
            data[x_key],
            data[y_key],
            data['feature_cols'],
            use_gpu=use_gpu)

    return models

time: 18.5 ms


In [None]:
models = get_two_region_base_models(data, use_gpu=True)

train model south
train model north
train model all
time: 1min 21s


In [None]:
#export
def get_prediction(model, data, feature_cols):
    """Predict with ``model`` on the ``feature_cols`` columns of ``data``.

    The selected columns are wrapped in an ``xgb.DMatrix`` and the result of
    ``model.predict`` on that matrix is returned unchanged.
    """
    features = data[feature_cols]
    return model.predict(xgb.DMatrix(features))

time: 7.76 ms


In [None]:
preds = get_prediction(models['model_south'], data['test_south'], data['feature_cols'])
assert data['test_south'][data['target_cols']].shape[0] == preds.shape[0]

time: 4.25 s


In [None]:
#export
def evaluate_predictions(preds, test_data, target_cols, product_reverse_dict, top_n=7):
    """Evaluate predictions with the mean of ``base_model.apk`` over all rows.

    Args:
        preds: 2-D array of scores, one row per row of ``test_data``; the
            column position is translated through ``product_reverse_dict``.
        test_data: frame holding the 0/1 ``target_cols``. It is modified in
            place: the columns ``added_products``, ``truth_list`` and ``apk``
            are added (or overwritten).
        target_cols: names of the target columns in ``test_data``.
        product_reverse_dict: maps a column position of ``preds`` to a
            product name.
        top_n: number of best-scored products kept per row (default 7).

    Returns:
        Mean of the per-row ``apk`` values.

    Raises:
        ValueError: if ``preds`` and ``test_data`` differ in their row count.
    """
    # Column positions ordered by descending score, cut to the best `top_n`.
    ranked = np.fliplr(np.argsort(preds, axis=1))[:, :top_n]
    added_products = [[product_reverse_dict[position] for position in row]
                      for row in ranked.tolist()]
    if len(added_products) != len(test_data):
        raise ValueError(
            f"preds has {len(added_products)} rows but test_data has {len(test_data)}")

    # Names of the target columns that are set (truthy) in each row.
    truth_lists = [list(compress(target_cols, flags))
                   for flags in test_data[target_cols].values]

    # Attach by position: the Series reuse the index of test_data, so frames
    # that were filtered without reset_index() no longer end up with NaN or
    # misaligned predictions.
    test_data['added_products'] = pd.Series(added_products, index=test_data.index)
    test_data['truth_list'] = pd.Series(truth_lists, index=test_data.index)
    test_data['apk'] = [base_model.apk(truth, added)
                        for truth, added in zip(truth_lists, added_products)]
    return test_data['apk'].mean()

time: 10.9 ms


In [None]:
evaluate_predictions(preds, data['test_south'], data['target_cols'], data['product_reverse_dict'])

0.0267273495497508

time: 29.6 s


In [None]:
data = get_two_region_data()
models = get_two_region_base_models(data, use_gpu=True)

load data
prepare data
train model south
train model north
time: 34.4 s


In [None]:
data['test']

Unnamed: 0,id,ind_empleado,pais_residencia,sexo,age,ind_nuevo,antiguedad,indrel,indrel_1mes,tiprel_1mes,indresi,indext,conyuemp,canal_entrada,indfall,tipodom,cod_prov,ind_actividad_cliente,renta,segmento,ind_ahor_fin_ult1,ind_aval_fin_ult1,ind_cco_fin_ult1,ind_cder_fin_ult1,ind_cno_fin_ult1,ind_ctju_fin_ult1,ind_ctma_fin_ult1,ind_ctop_fin_ult1,ind_ctpp_fin_ult1,ind_deco_fin_ult1,ind_deme_fin_ult1,ind_dela_fin_ult1,ind_ecue_fin_ult1,ind_fond_fin_ult1,ind_hip_fin_ult1,ind_plan_fin_ult1,ind_pres_fin_ult1,ind_reca_fin_ult1,ind_tjcr_fin_ult1,ind_valo_fin_ult1,ind_viv_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_recibo_ult1,fecha_dato_month,fecha_dato_year,month_int,fecha_alta_month,fecha_alta_year,fecha_alta_day,fecha_alta_month_int,fecha_alta_day_int,ult_fec_cli_1t_month,ult_fec_cli_1t_year,ult_fec_cli_1t_day,ult_fec_cli_1t_month_int,id_shift,ind_cco_fin_ult1_s,ind_cder_fin_ult1_s,ind_cno_fin_ult1_s,ind_ctju_fin_ult1_s,ind_ctma_fin_ult1_s,ind_ctop_fin_ult1_s,ind_ctpp_fin_ult1_s,ind_deco_fin_ult1_s,ind_deme_fin_ult1_s,ind_dela_fin_ult1_s,ind_ecue_fin_ult1_s,ind_fond_fin_ult1_s,ind_hip_fin_ult1_s,ind_plan_fin_ult1_s,ind_pres_fin_ult1_s,ind_reca_fin_ult1_s,ind_tjcr_fin_ult1_s,ind_valo_fin_ult1_s,ind_viv_fin_ult1_s,ind_nomina_ult1_s,ind_nom_pens_ult1_s,ind_recibo_ult1_s,nomprov,region,added_products,truth_list
0,15889,3,0,0,56,0,255,1,1,0,1,0,0,5,0,1,28,1,326124,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,5,1,17,1,0,16,1,46,1,5,1,61,15889,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,MADRID,south,"[ind_recibo_ult1, ind_tjcr_fin_ult1, ind_fond_...",[ind_tjcr_fin_ult1]
1,15890,1,0,0,63,0,256,1,1,0,1,0,0,5,0,1,28,1,71461,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,1,17,1,0,16,1,46,1,5,1,61,15890,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,1,1,MADRID,south,"[ind_cco_fin_ult1, ind_dela_fin_ult1, ind_reca...",[]
2,15892,3,0,1,62,0,256,1,1,0,1,0,0,5,0,1,28,1,430477,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,1,17,1,0,16,1,46,1,5,1,61,15892,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,1,1,0,0,0,1,MADRID,south,"[ind_nom_pens_ult1, ind_ctop_fin_ult1, ind_nom...",[]
3,15893,0,0,0,63,0,256,1,1,0,1,0,0,5,0,1,28,1,430477,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,1,17,10,2,3,34,1033,1,5,1,61,15893,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,MADRID,south,"[ind_cco_fin_ult1, ind_tjcr_fin_ult1, ind_reci...",[]
4,15894,1,0,0,60,0,256,1,1,0,1,0,0,5,0,1,28,1,281757,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,1,17,1,0,16,1,46,1,5,1,61,15894,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,1,0,1,1,1,MADRID,south,"[ind_cno_fin_ult1, ind_fond_fin_ult1, ind_valo...",[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
702430,1454615,0,0,0,46,0,8,1,1,0,1,0,-1,21,0,1,8,1,75445,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,1,17,9,20,18,249,7588,1,5,1,61,1454615,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,BARCELONA,north,,[]
702431,1454616,0,0,1,21,0,8,1,1,0,1,0,-1,157,0,1,9,1,132889,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,1,17,9,20,18,249,7588,1,5,1,61,1454616,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,BURGOS,north,,[]
702432,1454617,0,0,1,21,0,8,1,1,0,1,0,-1,157,0,1,11,1,58476,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,1,17,9,20,18,249,7588,1,5,1,61,1454617,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,CADIZ,south,,[]
702433,1454618,0,0,0,20,0,8,1,1,1,1,0,-1,157,0,1,28,0,75904,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,1,17,9,20,18,249,7588,1,5,1,61,1454618,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,MADRID,south,,[]


time: 52.6 ms


In [None]:
# Result table with one column per model and one row per test split. The
# value 1 is only a placeholder for a score that has not been computed yet.
results = {'model_south':[1,1,1], 'model_north':[1,1,1], 'model_all':[1,1,1], 'model_south_north_combine':[1,1,1]} 
  
# Build the DataFrame; the index labels match the keys of the test splits.
results_df = pd.DataFrame(results, index = ['test_south', 'test_north', 'test']) 
results_df

Unnamed: 0,model_south,model_north,model_all,model_south_north_combine
test_south,1,1,1,1
test_north,1,1,1,1
test,1,1,1,1


time: 24.9 ms


In [None]:
#export
def fill_results(model_label, data_label, data, models, results_df):
    """Score one model on one data split and store the score in ``results_df``.

    The split is copied before it is evaluated. The score is written to row
    ``data_label`` / column ``model_label`` of ``results_df``, which is
    modified in place and returned.
    """
    model = models[model_label]
    split = data[data_label].copy()
    predictions = get_prediction(model, split, data['feature_cols'])
    results_df.loc[data_label, model_label] = evaluate_predictions(
        predictions, split, data['target_cols'], data['product_reverse_dict'])
    return results_df

time: 8.34 ms


In [None]:
def fill_results_ensemble(model1_label, model2_label, data_label, data, models, results_df,
                          result_label='model_south_north_combine'):
    """Score the summed predictions of two models and store the score.

    Args:
        model1_label: key of the first model in ``models``.
        model2_label: key of the second model in ``models``.
        data_label: key of the data split in ``data``; also the row label
            written in ``results_df``.
        data: dict with the data splits, ``feature_cols``, ``target_cols``
            and ``product_reverse_dict``.
        models: dict of trained models.
        results_df: result table; modified in place.
        result_label: column of ``results_df`` that receives the score.
            Defaults to 'model_south_north_combine'.

    Returns:
        ``results_df`` with the score stored at ``[data_label, result_label]``.
    """
    model1 = models[model1_label]
    model2 = models[model2_label]
    # Evaluate on a copy of the split, not on the frame stored in `data`.
    dat = data[data_label].copy()
    preds1 = get_prediction(model1, dat, data['feature_cols'])
    preds2 = get_prediction(model2, dat, data['feature_cols'])
    # The ensemble is the plain sum of both models' predictions.
    preds = preds1 + preds2
    result = evaluate_predictions(preds, dat, data['target_cols'], data['product_reverse_dict'])
    results_df.loc[data_label, result_label] = result
    return results_df

time: 20.2 ms


In [None]:
# Evaluate the combined south + north predictions on every test split and
# show the table after each step.
for data_label in ('test_south', 'test_north', 'test'):
    results_df = fill_results_ensemble('model_south', 'model_north', data_label, data, models, results_df)
    print(results_df)


            model_south  model_north  model_all  model_south_north_combine
test_south            1            1          1                   0.026649
test_north            1            1          1                   1.000000
test                  1            1          1                   1.000000
            model_south  model_north  model_all  model_south_north_combine
test_south            1            1          1                   0.026649
test_north            1            1          1                   0.018006
test                  1            1          1                   1.000000
            model_south  model_north  model_all  model_south_north_combine
test_south            1            1          1                   0.026649
test_north            1            1          1                   0.018006
test                  1            1          1                   0.022984
time: 2min 13s


In [None]:
# Evaluate every single model on every test split (model by model, split by
# split) and show the table after each step.
for model_label in ('model_south', 'model_north', 'model_all'):
    for data_label in ('test_south', 'test_north', 'test'):
        results_df = fill_results(model_label, data_label, data, models, results_df)
        print(results_df)

            model_south  model_north  model_all  model_south_north_combine
test_south     0.026727            1          1                   0.026649
test_north     1.000000            1          1                   0.018006
test           1.000000            1          1                   0.022984
            model_south  model_north  model_all  model_south_north_combine
test_south     0.026727            1          1                   0.026649
test_north     0.017951            1          1                   0.018006
test           1.000000            1          1                   0.022984
            model_south  model_north  model_all  model_south_north_combine
test_south     0.026727            1          1                   0.026649
test_north     0.017951            1          1                   0.018006
test           0.023006            1          1                   0.022984
            model_south  model_north  model_all  model_south_north_combine
test_south     0.026727  

In [None]:
# Add the scores of the distributed LGBM training as hard-coded values, one
# per row of results_df.
# NOTE(review): presumably copied from the 06_Distributed_ML notebook - confirm the source.
results_df['distributed_lgbm_training'] = [0.026443885421329187, 0.017835593289980797, 0.022793803380779395]
# Persist the final comparison table.
results_df.to_csv('data/final/results_df.csv')
results_df

Unnamed: 0,model_south,model_north,model_all,model_south_north_combine,distributed_lgbm_training
test_south,0.026727,0.026502,0.02669,0.026649,0.026444
test_north,0.017951,0.018017,0.017983,0.018006,0.017836
test,0.023006,0.022904,0.022998,0.022984,0.022794


time: 13.7 ms


## Export

In [None]:
from nbdev.export import *
notebook2script()

Converted 01_data_preprocess.ipynb.
Converted 02_data_Cleaning.ipynb.
Converted 03_features.ipynb.
Converted 04_base_model - Versuch CCA.ipynb.
Converted 04_base_model.ipynb.
Converted 05_xgboost_simple_ensemble.ipynb.
This cell doesn't have an export destination and was ignored:
e
Converted 06_Distributed_ML.ipynb.
Converted index.ipynb.
time: 116 ms
