In [1]:
import torch 
import numpy as np
import pandas as pd
import torch.nn 
import matplotlib.pyplot as plt
import warnings
from pd.nn.model import Conv

from pd.metric import amex_metric
from pd.data.loader import CustomerData, DataLoader
from pd.params import *
from pd.pred import pred_test_npy
from sklearn.model_selection import train_test_split


In [2]:
# Read the raw training rows and the per-customer labels from DATADIR.
# The labels frame is indexed by customer_ID so a label can be looked up
# with train_labels.loc[customer_id].
train_data = pd.read_parquet(DATADIR + "train_data.parquet")
train_labels = pd.read_csv(DATADIR + "train_labels.csv").set_index("customer_ID")


In [3]:
# Keep only the customers that have exactly 13 rows in train_data.
train_customers = train_data["customer_ID"]
train_count = train_customers.value_counts()
has_13_rows = train_count.eq(13)
train_c13 = train_count[has_13_rows].index
train_data = train_data.loc[train_customers.isin(train_c13)]


In [25]:
def get_customer_data(customer_ids, train_data, train_labels=None, test_mode=False, normalize=True):
    """Build a zero-padded, fixed-length feature tensor for each customer.

    Every column in the module-level ``featureCols`` has its NaNs replaced by
    ``col_info13[col]["mean"]``. When ``normalize`` is true, columns that are
    also in ``ContCols`` are rescaled as ``(x - q1) / (q99 - q1)`` using the
    ``q1``/``q99`` entries of ``col_info13``.

    Parameters
    ----------
    customer_ids : iterable
        Customer ids to extract; duplicates are ignored (first occurrence wins).
    train_data : pandas.DataFrame
        Frame with a ``customer_ID`` column and every column of ``featureCols``.
        It is NOT modified.
    train_labels : pandas.DataFrame or None
        Indexed by customer id; required unless ``test_mode`` is true.
    test_mode : bool
        When true, labels are not looked up and ``labels_array`` stays zero.
    normalize : bool
        Whether to rescale the columns listed in ``ContCols``.

    Returns
    -------
    d : numpy.ndarray, float32, shape (n_customers, 13, len(featureCols))
        Each customer's rows are written to the END of the second axis;
        leading positions stay zero when the customer has fewer than 13 rows.
    labels_array : numpy.ndarray, shape (n_customers, 1)
    id_dict : dict
        Maps row index in ``d`` to the customer id.

    Raises
    ------
    KeyError
        If a requested customer id is absent from ``train_data`` (or from
        ``train_labels`` when labels are looked up).
    """
    seq_len = 13
    cols = list(featureCols)

    # Work on a private copy of the needed columns. The original assigned the
    # filled/normalized columns back into the caller's frame, so re-running the
    # cell normalized already-normalized values a second time.
    frame = train_data[["customer_ID", *cols]].copy()
    for c in cols:
        frame[c] = frame[c].fillna(col_info13[c]["mean"])
        if normalize and c in ContCols:
            frame[c] = (frame[c] - col_info13[c]["q1"]) / (col_info13[c]["q99"] - col_info13[c]["q1"])

    # dict.fromkeys de-duplicates while keeping first-seen order. Iterating a
    # set of strings can yield a different order in every interpreter session,
    # which made the row order of the saved arrays irreproducible.
    unique_ids = list(dict.fromkeys(customer_ids))
    n_customers = len(unique_ids)

    customer_data = frame.groupby("customer_ID")
    labels_array = np.zeros((n_customers, 1))
    id_dict = {}
    d = np.zeros((n_customers, seq_len, len(cols)), dtype=np.float32)  # init with zeros
    for idx, c in enumerate(unique_ids):
        # Keep at most the last seq_len rows so a longer history cannot overflow
        # the fixed-size slot (assumes rows are in chronological order — TODO confirm).
        cd = customer_data.get_group(c)[cols].values[-seq_len:]
        d[idx, -cd.shape[0]:, :] = cd
        id_dict[idx] = c
        if not test_mode:
            labels_array[idx] = train_labels.loc[c]

    return d, labels_array, id_dict

# Build the per-customer feature tensor, the label array and the
# row-index -> customer-id map for the customers listed in train_c13.
d, labels_array, id_dict = get_customer_data(train_c13, train_data, train_labels=train_labels)

In [38]:
# Persist the feature tensor and the label array under OUTDIR.
# NOTE(review): id_dict (row index -> customer id) is not saved alongside them.
for filename, array in (("c13_data.npy", d), ("c13_labels.npy", labels_array)):
    np.save(OUTDIR + filename, array)

In [20]:
# For every column in col_info, count the non-zero entries in the first
# element of its stored "hist" entry, then collect the columns whose count
# is below 27.
nonzero_by_col = {c: np.count_nonzero(col_info[c]["hist"][0]) for c in col_info.keys()}
nzs = list(nonzero_by_col.values())
dist_col_27 = [c for c, nz in nonzero_by_col.items() if nz < 27]

In [21]:
def get_customer_data(customer_ids, customer_data, cols, train_labels=None, test_mode=False):
    """Stack the selected columns of each customer into a zero-padded tensor.

    NOTE: this definition shadows the earlier ``get_customer_data`` in this
    notebook and has a different signature (it takes an already-grouped frame
    and an explicit column list, and returns two values instead of three).

    Parameters
    ----------
    customer_ids : iterable
        Customer ids to extract; duplicates are ignored (first occurrence wins).
    customer_data : pandas GroupBy
        Result of ``frame.groupby("customer_ID")``; ``get_group`` is called on it.
    cols : list
        Columns to extract for each customer.
    train_labels : pandas.DataFrame or None
        Indexed by customer id; required unless ``test_mode`` is true.
    test_mode : bool
        When true, labels are not looked up and the label array stays zero.

    Returns
    -------
    d : numpy.ndarray, float32, shape (n_customers, 13, len(cols))
        Rows are written to the END of the second axis; leading positions stay
        zero for customers with fewer than 13 rows.
    labels_array : numpy.ndarray, shape (n_customers, 1)
    """
    seq_len = 13
    # Preserve first-seen order: iterating a set of strings can give a
    # different row order in every interpreter session.
    unique_ids = list(dict.fromkeys(customer_ids))

    d = np.zeros((len(unique_ids), seq_len, len(cols)), dtype=np.float32)
    labels_array = np.zeros((len(unique_ids), 1))

    for idx, c in enumerate(unique_ids):
        # Keep at most the last seq_len rows so a longer history cannot overflow
        # the fixed-size slot (assumes rows are in chronological order — TODO confirm).
        cd = customer_data.get_group(c)[cols].values[-seq_len:]
        d[idx, -cd.shape[0]:, :] = cd
        if not test_mode:
            labels_array[idx] = train_labels.loc[c]

    return d, labels_array

In [22]:
# Extract only the dist_col_27 columns for every customer in train_data.
# NOTE(review): this rebinds train_data and train_labels from DataFrames to the
# arrays returned by get_customer_data, so this cell cannot be re-run without
# reloading the data first.
train_data, train_labels = get_customer_data(train_data.customer_ID.unique(), train_data.groupby("customer_ID"), cols=dist_col_27, train_labels=train_labels)

In [31]:
# Replace NaNs with np.nan_to_num's defaults, then hold out 1/9 of the rows
# (fixed seed) as the validation split.
train_data = np.nan_to_num(train_data)
split = train_test_split(train_data, train_labels, test_size=1/9, random_state=0, shuffle=True)
X_train, X_test, y_train, y_test = split
validation_data = X_test, y_test


In [None]:
# (scratch cell: the incomplete expression `np.` was removed because it is a
# SyntaxError and stops a Restart & Run All)

In [32]:
# Wrap the training split in the project's dataset/loader classes
# (CustomerData and DataLoader are imported from pd.data.loader).
train_dataset = CustomerData(X_train, train_labels=y_train)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE)


## Train with Conv Model

In [33]:
# Train the Conv model on the per-customer tensors using the project helper.
# NOTE(review): input_dim=91 is hard-coded; presumably it has to match
# len(dist_col_27) — confirm.
from pd.nn.train_utils import train_torch_model
model_name = "Conv_col27"
model = Conv(input_dim=91)
model = train_torch_model(model, train_loader, num_epochs=100, validation_data=validation_data, 
                            output_model_name=model_name)


0, BCE loss: 0.845, amex train: 0.013, val 0.000
0, BCE loss: 0.787, amex train: 0.105, val 0.000
0, BCE loss: 0.745, amex train: 0.132, val 0.000
0, BCE loss: 0.625, amex train: 0.154, val 0.000
0, BCE loss: 0.550, amex train: 0.191, val 0.000
0, BCE loss: 0.561, amex train: 0.204, val 0.000
0, BCE loss: 0.591, amex train: 0.222, val 0.000
0, BCE loss: 0.596, amex train: 0.246, val 0.000
0, BCE loss: 0.578, amex train: 0.242, val 0.000
0, BCE loss: 0.561, amex train: 0.240, val 0.000
0, BCE loss: 0.549, amex train: 0.241, val 0.000
0, BCE loss: 0.553, amex train: 0.244, val 0.000
0, BCE loss: 0.548, amex train: 0.245, val 0.000
0, BCE loss: 0.552, amex train: 0.251, val 0.000
0, BCE loss: 0.562, amex train: 0.257, val 0.000
0, BCE loss: 0.567, amex train: 0.255, val 0.000
0, BCE loss: 0.551, amex train: 0.259, val 0.000
0, BCE loss: 0.555, amex train: 0.257, val 0.000
0, BCE loss: 0.547, amex train: 0.265, val 0.000
0, BCE loss: 0.532, amex train: 0.269, val 0.000
0, BCE loss: 0.537, 

KeyboardInterrupt: 

## Train with aggregation over the features

### Sum aggregation (summed over axis 1, then max-normalized)

In [34]:
from bes.nn.es_module import ESModule
import torch 
import torch.nn as nn
import torch.nn.functional as F

class MLP(ESModule):

    def __init__(self, input_dim, hidden_dim=128,):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.nf1 = nn.LayerNorm([hidden_dim])
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=hidden_dim)
        self.nf2 = nn.LayerNorm([hidden_dim])
        self.fc3 = nn.Linear(in_features=hidden_dim, out_features=hidden_dim)
        self.nf3 = nn.LayerNorm([hidden_dim])
        self.fc4 = nn.Linear(in_features=hidden_dim, out_features=hidden_dim)
        self.nf4 = nn.LayerNorm([hidden_dim])
        self.fc5 = nn.Linear(in_features=hidden_dim, out_features=hidden_dim)
        self.nf5 = nn.LayerNorm([hidden_dim])
        
        self.fcout = nn.Linear(in_features=hidden_dim, out_features=1)
    
    def forward(self, h, return_featues=False):
        h = F.selu(self.fc1(h))
        r = self.nf1(h)
        h = F.selu(self.fc2(r))
        h = self.nf2(h)
        h = F.selu(self.fc3(h))
        r = self.nf3(h+r)
        h = F.selu(self.fc4(r))
        h = self.nf4(h)
        h = F.selu(self.fc5(h))
        h = self.nf5(h+r)
        if return_featues:
            return torch.sigmoid(self.fcout(h)), h
        
        return torch.sigmoid(self.fcout(h))



In [42]:
# Aggregate every customer by summing over axis 1 of train_data, then rescale
# each feature column.
X = train_data.sum(axis=1)
for idx, c in enumerate(dist_col_27):
    col_max = col_info[c]["max"]
    # Skip the rescale when the stored maximum is 0: dividing would fill the
    # whole column with NaN/inf.
    if col_max != 0:
        X[:, idx] = X[:, idx]/col_max

# Second pass: divide by the largest aggregated value observed per column.
# Columns whose maximum is 0 are divided by 1 instead so they stay finite.
observed_max = X.max(axis=0)
observed_max[observed_max == 0] = 1
X = X/observed_max

In [43]:
# Hold out 1/9 of the aggregated rows (fixed seed) for validation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    train_labels,
    test_size=1/9,
    random_state=0,
    shuffle=True,
)
validation_data = X_test, y_test


In [60]:
# Rebuild the dataset/loader on the aggregated training split.
train_dataset = CustomerData(X_train, train_labels=y_train)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE)


In [61]:
# Train the MLP on the sum-aggregated features.
# input_dim is derived from the feature matrix (as the later manual training
# loop in this notebook does) instead of the hard-coded 91, so the cell keeps
# working if the column selection changes.
model_name = "mlp_col27_sum"
model = MLP(input_dim=X.shape[-1])
model = train_torch_model(model, train_loader, num_epochs=100, validation_data=validation_data,
                            output_model_name=model_name)


0, BCE loss: 0.660, amex train: -0.043, val 0.000
0, BCE loss: 0.618, amex train: 0.281, val 0.000
0, BCE loss: 0.586, amex train: 0.452, val 0.000
0, BCE loss: 0.493, amex train: 0.490, val 0.000
0, BCE loss: 0.449, amex train: 0.490, val 0.000
0, BCE loss: 0.468, amex train: 0.505, val 0.000
0, BCE loss: 0.453, amex train: 0.525, val 0.000
0, BCE loss: 0.414, amex train: 0.529, val 0.000
0, BCE loss: 0.385, amex train: 0.536, val 0.000
0, BCE loss: 0.383, amex train: 0.567, val 0.000
0, BCE loss: 0.388, amex train: 0.553, val 0.000
0, BCE loss: 0.377, amex train: 0.570, val 0.000
0, BCE loss: 0.348, amex train: 0.588, val 0.000
0, BCE loss: 0.355, amex train: 0.580, val 0.000
0, BCE loss: 0.361, amex train: 0.585, val 0.000
0, BCE loss: 0.373, amex train: 0.590, val 0.000
0, BCE loss: 0.348, amex train: 0.607, val 0.000
0, BCE loss: 0.345, amex train: 0.608, val 0.000
0, BCE loss: 0.343, amex train: 0.612, val 0.000
0, BCE loss: 0.347, amex train: 0.605, val 0.000
0, BCE loss: 0.346,

## Train a model with only c27 (extract a feature for them)

## Aggregate Model with pre-trained conv

In [62]:
# Restore previously saved Conv weights from OUTDIR into a fresh Conv().
# NOTE(review): torch.load unpickles the file — only load checkpoints you trust.
model_name = "conv_90_780_18_5"
conv = Conv()
model_param = torch.load(OUTDIR+model_name)
conv.load_state_dict(model_param)


<All keys matched successfully>

In [64]:
# Inference only: run the pre-trained conv model over the whole saved training
# tensor without autograd bookkeeping, so no graph is kept for the full dataset.
with torch.no_grad():
    pred, conv_feat = conv(
        torch.as_tensor(np.load(OUTDIR+"train_data_all.npy"), dtype=torch.float32),
        return_featues=True,
    )

In [13]:
# Scratch arithmetic; the notebook does not say what the operands represent.
3.40+11.70+13.10+1.69+6.10

35.99

In [4]:
# Load a saved feature matrix from OUTDIR (presumably the aggregated features,
# judging by the file name — the cell that wrote it is not in this notebook).
X = np.load(OUTDIR+"agg_feat.npy")

In [5]:
# Load the saved label array from OUTDIR; this rebinds train_labels.
train_labels = np.load(OUTDIR+"train_labels_all.npy")
    

In [6]:
# Hold out 1/9 of the loaded feature rows (fixed seed) for validation, then
# wrap the training part in the project's dataset/loader.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    train_labels,
    test_size=1/9,
    random_state=0,
    shuffle=True,
)
validation_data = X_test, y_test

train_loader = DataLoader(
    train_dataset := CustomerData(X_train, train_labels=y_train),
    batch_size=BATCH_SIZE,
)

    

In [6]:
from pd.nn.train_utils import train_torch_model
from bes.nn.es_module import ESModule
import torch 
import torch.nn as nn
import torch.nn.functional as F

In [7]:

class MLP(ESModule):

    def __init__(self, input_dim, hidden_dim=128,):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.nf1 = nn.LayerNorm([hidden_dim])
        
        self.fcout = nn.Linear(in_features=hidden_dim, out_features=1)
    
    def forward(self, h, return_featues=False):
        h = F.selu(self.fc1(h))
        h = self.nf1(h)
        if return_featues:
            return torch.sigmoid(self.fcout(h)), h
        
        return torch.sigmoid(self.fcout(h))

In [8]:
class MLP(ESModule):

    def __init__(self, input_dim, hidden_dim=128,):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.nf1 = nn.LayerNorm([hidden_dim])
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=hidden_dim)
        self.nf2 = nn.LayerNorm([hidden_dim])
        self.fc3 = nn.Linear(in_features=hidden_dim, out_features=hidden_dim)
        self.nf3 = nn.LayerNorm([hidden_dim])
        self.fc4 = nn.Linear(in_features=hidden_dim, out_features=hidden_dim)
        self.nf4 = nn.LayerNorm([hidden_dim])
        self.fc5 = nn.Linear(in_features=hidden_dim, out_features=hidden_dim)
        self.nf5 = nn.LayerNorm([hidden_dim])
        
        self.fcout = nn.Linear(in_features=hidden_dim, out_features=1)
    
    def forward(self, h, return_featues=False):
        h = F.selu(self.fc1(h))
        r = self.nf1(h)
        h = F.selu(self.fc2(r))
        h = self.nf2(h)
        h = F.selu(self.fc3(h))
        r = self.nf3(h+r)
        h = F.selu(self.fc4(r))
        h = self.nf4(h)
        h = F.selu(self.fc5(h))
        h = self.nf5(h+r)
        if return_featues:
            return torch.sigmoid(self.fcout(h)), h
        
        return torch.sigmoid(self.fcout(h))


In [9]:
# Manual training loop: Adam + binary cross-entropy on the sigmoid output.
model = MLP(input_dim=X.shape[-1])
optimizer = torch.optim.Adam(model.parameters(),)
criterion = torch.nn.BCELoss()

# The validation tensor does not change between batches, so build it once
# instead of on every step that triggers a validation pass.
X_test, y_test = validation_data
val_features = torch.as_tensor(X_test, dtype=torch.float32)

for epoch in range(50):
    for idx, (feat, clabel) in enumerate(train_loader):
        if len(feat.shape) == 4:  ## Reduce shape if its coming from a ratio version of the loader
            feat = feat.squeeze(dim=0)
            clabel = clabel.squeeze(dim=0)

        pred = model(feat)
        #weight = clabel.clone()
        #weight[weight==0] = 4
        #criterion.weight = weight
        loss = criterion(pred, clabel)

        # Update weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        model_metric = amex_metric(clabel.detach().numpy(), pred.detach().numpy())
        val_metrix = 0
        # Validation is only scored once the batch metric exceeds 0.78;
        # otherwise 0 is logged.
        if model_metric > 0.78:
            # No gradients are needed for scoring; without no_grad a graph over
            # the whole validation set was built on every qualifying batch.
            with torch.no_grad():
                val_pred = model(val_features)
            val_metrix = amex_metric(y_test, val_pred.numpy())

        log_message = f"{epoch}, BCE loss: {loss.item():.3f}, amex train: {model_metric:.3f}, val {val_metrix:.3f}"
        print(log_message)

0, BCE loss: 0.887, amex train: -0.070, val 0.000
0, BCE loss: 0.457, amex train: 0.719, val 0.000
0, BCE loss: 0.343, amex train: 0.772, val 0.000
0, BCE loss: 0.239, amex train: 0.791, val 0.772
0, BCE loss: 0.262, amex train: 0.791, val 0.772
0, BCE loss: 0.294, amex train: 0.772, val 0.000
0, BCE loss: 0.259, amex train: 0.785, val 0.775
0, BCE loss: 0.233, amex train: 0.783, val 0.774
0, BCE loss: 0.225, amex train: 0.789, val 0.774
0, BCE loss: 0.251, amex train: 0.769, val 0.000
0, BCE loss: 0.267, amex train: 0.774, val 0.000
0, BCE loss: 0.252, amex train: 0.781, val 0.776
0, BCE loss: 0.227, amex train: 0.793, val 0.776
0, BCE loss: 0.228, amex train: 0.781, val 0.777
0, BCE loss: 0.221, amex train: 0.801, val 0.777
0, BCE loss: 0.236, amex train: 0.784, val 0.777
0, BCE loss: 0.237, amex train: 0.791, val 0.777
0, BCE loss: 0.235, amex train: 0.778, val 0.000
0, BCE loss: 0.225, amex train: 0.792, val 0.777
0, BCE loss: 0.223, amex train: 0.786, val 0.777
0, BCE loss: 0.224,

In [10]:
import lightgbm as lgb


In [7]:
def amex_metric(y_true, y_pred):
    """Return the mean of a normalized weighted Gini and a top-4% capture rate.

    Rows whose true label is 0 get weight 20, all others weight 1.

    * Capture rate: rows are ranked by ``y_pred`` (descending); the rows whose
      cumulative weight stays within 4% of the total weight are kept, and the
      share of all positives found among them is returned.
    * Normalized Gini: the weighted Gini of the ranking by ``y_pred`` divided
      by the weighted Gini of the ranking by ``y_true``.

    NOTE: this definition shadows the ``amex_metric`` imported from
    ``pd.metric`` at the top of the notebook.

    Parameters
    ----------
    y_true, y_pred : 1-D array-likes of equal length.

    Returns
    -------
    float
    """
    pairs = np.transpose(np.array([y_true, y_pred]))

    def _ranked_by(column):
        # Rows sorted in descending order of the given column.
        order = pairs[:, column].argsort()[::-1]
        return pairs[order]

    def _weighted_gini(ranked):
        w = np.where(ranked[:, 0] == 0, 20, 1)
        random_curve = np.cumsum(w / np.sum(w))
        weighted_pos = ranked[:, 0] * w
        lorentz = np.cumsum(weighted_pos) / np.sum(weighted_pos)
        return np.sum((lorentz - random_curve) * w)

    by_pred = _ranked_by(1)

    # Share of positives captured within the top 4% of total weight.
    w_pred = np.where(by_pred[:, 0] == 0, 20, 1)
    cutoff = int(0.04 * np.sum(w_pred))
    captured = by_pred[np.cumsum(w_pred) <= cutoff]
    top_four = np.sum(captured[:, 0]) / np.sum(by_pred[:, 0])

    gini_model = _weighted_gini(by_pred)
    gini_perfect = _weighted_gini(_ranked_by(0))
    return 0.5 * (gini_model / gini_perfect + top_four)

def lgb_amex_metric(y_pred, y_true):
    """Adapter that exposes ``amex_metric`` as a LightGBM ``feval`` callback.

    ``y_true`` is an object providing ``get_label()`` (passed as ``feval`` to
    ``lgb.train`` in this notebook). Returns the tuple
    ``(metric name, value, True)``.
    """
    score = amex_metric(y_true.get_label(), y_pred)
    higher_is_better = True
    return 'amex_metric', score, higher_is_better


In [8]:
# LightGBM parameters passed to lgb.train in the next cell.
params = dict(
    objective='binary',
    metric="binary_logloss",
    boosting='dart',
    seed=42,
    num_leaves=100,
    learning_rate=0.01,
    feature_fraction=0.20,
    bagging_freq=10,
    bagging_fraction=0.50,
    n_jobs=-1,
    lambda_l2=2,
    min_data_in_leaf=40,
)

In [9]:
# Train LightGBM on the current split, reporting both binary log-loss and the
# custom amex metric on the training and validation sets every 500 rounds.
# NOTE(review): params uses 'boosting': 'dart'; LightGBM reportedly does not
# apply early stopping in dart mode, so early_stopping_rounds may have no
# effect here — confirm against the LightGBM docs for the installed version.
lgb_train = lgb.Dataset(X_train, y_train)
lgb_valid = lgb.Dataset(X_test, y_test)
model = lgb.train(
    params=params,
    train_set=lgb_train,
    num_boost_round=10500,
    valid_sets=[lgb_train, lgb_valid],
    early_stopping_rounds=100,
    verbose_eval=500,
    feval=lgb_amex_metric,
)
        

  if array.dtype == dtype:


[LightGBM] [Info] Number of positive: 105606, number of negative: 302316
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 65790
[LightGBM] [Info] Number of data points in the train set: 407922, number of used features: 258
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258888 -> initscore=-1.051758
[LightGBM] [Info] Start training from score -1.051758




[500]	training's binary_logloss: 0.31715	training's amex_metric: 0.797913	valid_1's binary_logloss: 0.323027	valid_1's amex_metric: 0.779445
[1000]	training's binary_logloss: 0.231838	training's amex_metric: 0.802909	valid_1's binary_logloss: 0.243065	valid_1's amex_metric: 0.779673
[1500]	training's binary_logloss: 0.213796	training's amex_metric: 0.809686	valid_1's binary_logloss: 0.229194	valid_1's amex_metric: 0.780176
[2000]	training's binary_logloss: 0.205736	training's amex_metric: 0.815258	valid_1's binary_logloss: 0.225705	valid_1's amex_metric: 0.778858
[2500]	training's binary_logloss: 0.201901	training's amex_metric: 0.820434	valid_1's binary_logloss: 0.225347	valid_1's amex_metric: 0.779131


KeyboardInterrupt: 

In [None]:
# Score the trained model on the held-out split.
# y_test is flattened first: amex_metric stacks y_true and y_pred with
# np.array([...]), which needs two 1-D inputs of equal length, while label
# arrays in this notebook are built with shape (n, 1). np.ravel leaves an
# already 1-D array unchanged.
val_pred = model.predict(X_test)
amex_metric(np.ravel(y_test), val_pred)
        

In [None]:
def train_and_evaluate(train, test):
    """Train LightGBM with stratified K-fold CV and write OOF/test predictions.

    Steps, in order:
      1. Label-encode the ``*_last`` categorical columns (fitted on ``train``,
         applied to ``test``).
      2. Add a ``<col>_round2`` copy, rounded to 2 decimals, of every float
         column whose name contains ``last``.
      3. For each of ``CFG.n_folds`` stratified folds: train a model, dump it
         with joblib, store the validation predictions in the out-of-fold
         array and add ``1 / CFG.n_folds`` of the test prediction to the
         running test average.
      4. Print the overall out-of-fold score and write two CSV files.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain ``customer_ID``, ``CFG.target`` and the feature columns.
        Modified in place (encoded and rounded columns are written to it).
    test : pandas.DataFrame
        Must contain ``customer_ID`` and the same feature columns. Modified in
        place in the same way.

    Returns
    -------
    None. Results are printed and written to disk.

    NOTE(review): ``CFG`` (``target``, ``n_folds``, ``seed``) is not defined in
    the visible notebook — confirm it is provided before calling. The output
    paths are hard-coded under ``/content/drive/MyDrive/Amex``.
    """
    # Imported locally: the visible notebook cells use these names without
    # importing them, so the body would raise NameError on first use.
    import gc
    import joblib
    from sklearn.model_selection import StratifiedKFold
    from sklearn.preprocessing import LabelEncoder

    # Label encode categorical features
    cat_features = [
        "B_30",
        "B_38",
        "D_114",
        "D_116",
        "D_117",
        "D_120",
        "D_126",
        "D_63",
        "D_64",
        "D_66",
        "D_68"
    ]
    cat_features = [f"{cf}_last" for cf in cat_features]
    for cat_col in cat_features:
        encoder = LabelEncoder()
        train[cat_col] = encoder.fit_transform(train[cat_col])
        test[cat_col] = encoder.transform(test[cat_col])
    # Round last float features to 2 decimal place
    num_cols = list(train.dtypes[(train.dtypes == 'float32') | (train.dtypes == 'float64')].index)
    num_cols = [col for col in num_cols if 'last' in col]
    for col in num_cols:
        train[col + '_round2'] = train[col].round(2)
        test[col + '_round2'] = test[col].round(2)
    # Get feature list
    features = [col for col in train.columns if col not in ['customer_ID', CFG.target]]
    # NOTE(review): early stopping reportedly has no effect with
    # 'boosting': 'dart' — confirm for the installed LightGBM version.
    params = {
        'objective': 'binary',
        'metric': "binary_logloss",
        'boosting': 'dart',
        'seed': 42,
        'num_leaves': 100,
        'learning_rate': 0.01,
        'feature_fraction': 0.20,
        'bagging_freq': 10,
        'bagging_fraction': 0.50,
        'n_jobs': -1,
        'lambda_l2': 2,
        'min_data_in_leaf': 40
        }
    # Create a numpy array to store test predictions
    test_predictions = np.zeros(len(test))
    # Create a numpy array to store out of folds predictions
    oof_predictions = np.zeros(len(train))
    # The fold count must match the CFG.n_folds divisor used for the test
    # average below; the previous literal n_splits=1 is rejected by
    # StratifiedKFold, which requires at least 2 splits.
    kfold = StratifiedKFold(n_splits=CFG.n_folds, shuffle=True, random_state=0)
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(train, train[CFG.target])):
        print(' ')
        print('-'*50)
        print(f'Training fold {fold} with {len(features)} features...')
        x_train, x_val = train[features].iloc[trn_ind], train[features].iloc[val_ind]
        y_train, y_val = train[CFG.target].iloc[trn_ind], train[CFG.target].iloc[val_ind]

        lgb_train = lgb.Dataset(x_train, y_train, categorical_feature = cat_features)
        lgb_valid = lgb.Dataset(x_val, y_val, categorical_feature = cat_features)
        model = lgb.train(
            params = params,
            train_set = lgb_train,
            num_boost_round = 10500,
            valid_sets = [lgb_train, lgb_valid],
            early_stopping_rounds = 100,
            verbose_eval = 500,
            feval = lgb_amex_metric
            )
        # Save best model
        joblib.dump(model, f'/content/drive/MyDrive/Amex/Models/lgbm_fold{fold}_seed{CFG.seed}.pkl')
        # Predict validation
        val_pred = model.predict(x_val)
        # Add to out of folds array
        oof_predictions[val_ind] = val_pred
        # Predict the test set
        test_pred = model.predict(test[features])
        test_predictions += test_pred / CFG.n_folds
        # Compute fold metric
        score = amex_metric(y_val, val_pred)
        print(f'Our fold {fold} CV score is {score}')
        # Free the per-fold frames and datasets before the next fold
        del x_train, x_val, y_train, y_val, lgb_train, lgb_valid
        gc.collect()
    # Compute out of folds metric
    score = amex_metric(train[CFG.target], oof_predictions)
    print(f'Our out of folds CV score is {score}')
    # Create a dataframe to store out of folds predictions
    oof_df = pd.DataFrame({'customer_ID': train['customer_ID'], 'target': train[CFG.target], 'prediction': oof_predictions})
    oof_df.to_csv(f'/content/drive/MyDrive/Amex/OOF/oof_lgbm_baseline_{CFG.n_folds}fold_seed{CFG.seed}.csv', index = False)
    # Create a dataframe to store test prediction
    test_df = pd.DataFrame({'customer_ID': test['customer_ID'], 'prediction': test_predictions})
    test_df.to_csv(f'/content/drive/MyDrive/Amex/Predictions/test_lgbm_baseline_{CFG.n_folds}fold_seed{CFG.seed}.csv', index = False)
    