In [None]:
# default_exp base_model
# default_cls_lvl 2

In [None]:
# Notebook-only setup: report the run time of every cell and reload edited
# modules automatically before each execution.
%load_ext autotime
%load_ext autoreload
%autoreload 2

# Base model
> Dieses Modul stellt die wesentlichen Funktionen für Training, Prediction und Evaluation bereit.

Das Ziel dieser Kaggle Challenge war es, die neuen Produkte für die Periode 2016-06-28 vorherzusagen. Dazu wurde das Datenset in ein Testset (Grunddaten der Periode 2016-06-28) und ein Trainingsset (Daten von 2015-01-28 bis 2016-05-28) aufgeteilt. Leider kennen wir die wahren Werte der Periode 2016-06-28 nicht, weshalb wir diesen Datenpunkt ignorieren und Trainings- und Testset wie folgt aufteilen.

![image.png](docs/images/train_test.png)

In [None]:
#export
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import cross_validate
from community_learning.features import target_cols
from fastscript import *
from tqdm.notebook import tqdm
from itertools import compress

time: 625 ms


In [None]:
#export
def load_data(path='data/interim/03_train.csv'):
    """Read the CSV file at `path` and return its contents as a DataFrame."""
    frame = pd.read_csv(path)
    return frame

time: 8.5 ms


In [None]:
# Work on a small random sample to keep the notebook fast; the fixed seed
# makes "Restart & Run All" draw the same rows every time.
train = load_data().sample(1000, random_state=0)

time: 26.7 s


## Train - Test Split

In [None]:
#export 
def train_test_split(df:pd.DataFrame, test_month:int=17):
    """Split `df` by its `month_int` column into a training and a test set.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing a `month_int` column.
    test_month : int, optional
        Value of `month_int` that forms the test set (default 17). All rows
        with a smaller `month_int` form the training set; rows with a larger
        value end up in neither set.

    Returns
    -------
    (train_data, test_data) : tuple of pd.DataFrame
        Independent copies, so later edits do not touch `df`.
    """
    month = df['month_int']
    test_data = df[month == test_month].copy()
    train_data = df[month < test_month].copy()
    return (train_data, test_data)


time: 9.23 ms


In [None]:
# Sanity check: month 17 must appear only in the test split.
train_data, test_data = train_test_split(train)
assert (train_data.month_int != 17).all()
assert (test_data.month_int == 17).all()

time: 13.4 ms


In [None]:
#export
def x_y_split(df:pd.DataFrame, target_cols=target_cols):
    """Separate `df` into features and targets.

    Returns a tuple `(X, Y)`: `X` is `df` without the `target_cols` columns,
    `Y` is a copy of just the `target_cols` columns.
    """
    targets = df[target_cols].copy()
    features = df.drop(columns=target_cols)
    return (features, targets)

time: 14.2 ms


In [None]:
X_train, Y_train = x_y_split(train_data, target_cols)
X_test, Y_test = x_y_split(test_data, target_cols)

# `!=` between the two sets would pass even if target columns leaked into X,
# so check that no target column is left among the features.
assert set(X_train.columns).isdisjoint(target_cols)
assert set(Y_train.columns) == set(target_cols)
assert set(X_test.columns).isdisjoint(target_cols)
assert set(Y_test.columns) == set(target_cols)

time: 14.6 ms


## XGB functions

In [None]:
#export
def runXGB(train_X, train_y, test_X, target_col, seed_val=0):
    """Train a binary XGBoost model for one target column and predict `test_X`.

    Parameters
    ----------
    train_X : pd.DataFrame
        Training features.
    train_y : pd.DataFrame
        Training targets; only the `target_col` column is used as label.
    test_X : pd.DataFrame
        Features to predict on. Must contain an `id` column, which becomes
        the `id` column of the returned frame.
    target_col : str
        Name of the column in `train_y` to model.
    seed_val : int, optional
        Random seed passed to XGBoost.

    Returns
    -------
    pd.DataFrame
        Columns `id` and `'pred_' + target_col`, one row per row of `test_X`.
    """
    param = {
        'objective': 'binary:logistic',
        'eta': 0.05,  # learning rate
        'max_depth': 8,
        'eval_metric': "logloss",
        'min_child_weight': 1,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'seed': seed_val,
    }
    num_rounds = 50

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=np.array(train_y[target_col]))
    model = xgb.train(plst, xgtrain, num_rounds)
    # Predict on the `test_X` argument; the previous code read the notebook
    # global `X_test` here and ignored the data passed in by the caller.
    y_pred = model.predict(xgb.DMatrix(test_X))
    result_xgb_df = pd.DataFrame(index=test_X.id, columns=['pred_' + target_col], data=y_pred)
    # Turn the `id` index back into a regular column.
    return result_xgb_df.reset_index()

time: 12.4 ms


In [None]:
# Smoke test: fit and predict a single product (the fifth target column).
result_xgb_df = runXGB(X_train.copy(), Y_train.copy(), X_test.copy(), target_cols[4])

time: 12.8 s


In [None]:
# Expect exactly one prediction per test row.
assert len(result_xgb_df) == len(X_test)

time: 11.6 ms


In [None]:
#export 
def predict_all_products(train_X, train_y, test_X, target_col):
    """Fit one model per product and collect all predictions in one DataFrame.

    Parameters
    ----------
    train_X, train_y : pd.DataFrame
        Training features and targets, forwarded to `runXGB`.
    test_X : pd.DataFrame
        Features to predict on; its index becomes the index of the result.
    target_col : iterable of str
        Names of the target columns; one model is trained per name.

    Returns
    -------
    pd.DataFrame
        One `'pred_' + col` column per entry of `target_col`, indexed like
        `test_X`.
    """
    result_xgb = pd.DataFrame(index=test_X.index)

    for col in tqdm(target_col):
        # Use the arguments, not the notebook globals X_train / Y_train / X_test,
        # so the function works on whatever data the caller passes in.
        result_xgb_df = runXGB(train_X, train_y, test_X, col)
        # `.values` assigns by position, because the frame returned by runXGB
        # does not carry the index of `test_X`.
        result_xgb['pred_' + col] = result_xgb_df['pred_' + col].values

    return result_xgb

time: 10.6 ms


In [None]:
# Train one model per target column and gather the test-set predictions.
results = predict_all_products(X_train, Y_train, X_test, target_cols)

HBox(children=(FloatProgress(value=0.0, max=24.0), HTML(value='')))


time: 21.9 s


In [None]:
# Expect one prediction column per target column.
assert len(Y_train.columns) == len(results.columns)

time: 9.46 ms


## Evaluate Results

Für die [Evaluierung](https://www.kaggle.com/c/santander-product-recommendation/overview/evaluation) wird die Mean Average Precision @ 7 (MAP@7) verwendet. Die unten stehende Formel haben wir von [jturkewitz](https://github.com/jturkewitz/SideProjects/blob/4c437b02d5e017636c84cc22eb3ff71f8eea1308/Kaggle/Santander_Prod/santander_prod.py#L272) übernommen.

$$
MAP@7 =  \dfrac{1} {\vert U \vert} \sum^{\vert U \vert}_{u=1} \dfrac {1} {min(m,7)} \sum^{min(n,7)}_{k=1} P(k)
$$

In [None]:
#export
def apk(actual, predicted, k=7):
    """
    Average precision at k between a collection of relevant items and a
    ranked list of predictions.

    Parameters
    ----------
    actual : list
             Items that should have been predicted (order is irrelevant)
    predicted : list
                Predicted items, best first (order matters)
    k : int, optional
        Only the first `k` predictions are scored

    Returns
    -------
    score : double
            The average precision at k; 0.0 when `actual` is empty
    """
    top_k = predicted[:k] if len(predicted) > k else predicted

    hits = 0.0
    precision_sum = 0.0
    for rank, item in enumerate(top_k, start=1):
        # A prediction counts at most once: repeats of an earlier item are skipped.
        if item not in actual or item in top_k[:rank - 1]:
            continue
        hits += 1.0
        precision_sum += hits / rank

    if not actual:
        return 0.0

    return precision_sum / min(len(actual), k)

#%%
def get_top7_preds_string(row):
    """Return the labels of the 7 largest values in `row`, largest first.

    `row` is a pandas Series, e.g. one row of prediction scores indexed by
    column name. It is left untouched: the previous in-place sort reordered
    the caller's Series, and mutating the object handed in by
    `DataFrame.apply` is not supported by pandas.
    Fewer than 7 labels are returned when `row` has fewer than 7 entries.
    """
    ranked = row.sort_values()  # ascending copy, so the top scores sit at the end
    return ranked.index[-7:][::-1].tolist()

time: 10.1 ms


In [None]:
# Number of rows in the prediction frame.
len(results)

56

time: 17 ms


In [None]:
#export 
def get_results(results:pd.DataFrame, Y_test:pd.DataFrame,  target_cols:list):
    """Score the predictions in `results` against `Y_test` with MAP@7.

    Parameters
    ----------
    results : pd.DataFrame
        Must hold one `'pred_' + col` column per entry of `target_cols`.
        Modified in place: the columns `added_products`, `truth_list` and
        `apk` are added.
    Y_test : pd.DataFrame
        True target values, one column per entry of `target_cols`.
    target_cols : list
        Names of the target columns.

    Returns
    -------
    Mean of the per-row average precision at 7.
    """
    prefix = 'pred_'
    pred_cols = [prefix + col for col in target_cols]

    # Top-7 predicted column names per row, with the prefix stripped again.
    top7 = results[pred_cols].apply(get_top7_preds_string, axis=1)
    results['added_products'] = top7.map(lambda names: [name[len(prefix):] for name in names])
    # Names of the target columns whose true value is truthy in that row.
    results['truth_list'] = Y_test[target_cols].apply(lambda x: list(compress(target_cols, x.values)), axis=1)
    results['apk'] = results.apply(lambda x: apk(x['truth_list'], x['added_products']), axis=1)
    return results['apk'].mean()


time: 10 ms


In [None]:
# MAP@7 of the base model on the test split.
get_results(results, Y_test, target_cols)

0.0

time: 30 ms


In [None]:
# Lift the display limits only for this one print; `option_context` restores
# the previous settings afterwards, even if the column lookup raises.
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    print(results['truth_list'])

3870944     []
1129551     []
4725151     []
7062525     []
6301145     []
1888785     []
9608704     []
7613810     []
9777585     []
1618516     []
963294      []
4214997     []
5470676     []
6548175     []
6902376     []
7827475     []
6474756     []
7578104     []
9867207     []
2332207     []
10075653    []
9965412     []
3445539     []
6107015     []
654088      []
364135      []
7880182     []
8289804     []
4113126     []
6322435     []
1564061     []
474955      []
6133955     []
3655053     []
6417222     []
4828190     []
8873971     []
9864507     []
4235603     []
9329153     []
167735      []
7724527     []
4571223     []
948473      []
979381      []
5612213     []
9068731     []
2261127     []
5063819     []
9119919     []
907675      []
9843746     []
2399617     []
3503968     []
1233984     []
6746297     []
Name: truth_list, dtype: object


KeyError: 'truth_list'

time: 125 ms


In [None]:
# Inspect the recommended products for the second row of `results`.
results['added_products'].iloc[1]

['ind_recibo_ult1',
 'ind_aval_fin_ult1',
 'ind_valo_fin_ult1',
 'ind_ecue_fin_ult1',
 'ind_tjcr_fin_ult1',
 'ind_cco_fin_ult1',
 'ind_plan_fin_ult1']

time: 12.9 ms


In [None]:
@call_parse
def get_base_model_results():
    """Placeholder entry point; the body is not implemented yet."""
    pass

time: 25.1 ms


In [None]:
# Convert the project notebooks to Python modules (see the "Converted ..." log).
# Import only the one function used instead of `*`, so the notebook namespace
# is not flooded with unrelated names.
from nbdev.export import notebook2script
notebook2script()

Converted 01_data_preprocess.ipynb.
Converted 02_data_Cleaning.ipynb.
Converted 03_target_vars.ipynb.
Converted 04_base_model.ipynb.
Converted index.ipynb.
time: 50.9 ms
