In [3]:
import numpy as np
import pandas as pd 
import torch 
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader


In [4]:
from pd.params import *

In [5]:
train_data = pd.read_parquet(data_dir+"train_data.parquet")

In [13]:
train_data[["D_63", "D_64"]] = train_data[["D_63", "D_64"]].astype("category").apply(lambda x: x.cat.codes)

In [4]:
cat_cols = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']

class CustomerData(Dataset):
    def __init__(self, data, cat_cols=cat_cols, test_mode=False, train_labels=None):
        self.data = data
        self.test_mode = test_mode
        customer_indices = data["customer_ID"].reset_index().set_index("customer_ID").groupby('customer_ID').apply(lambda x : x.to_numpy().reshape(-1, )).to_dict()
        self.customer_ids =tuple(customer_indices)
        self.customer_indices = tuple(customer_indices.values())
        self.train_labels = train_labels
        self.data_columns = data.columns.to_list()
        self.cont_cols = [col for col in self.data_columns if col not in cat_cols + ["customer_ID", "S_2", "target"]]

    def __len__(self):
        return len(self.customer_indices)

    def __getitem__(self, index):
        customer_data_indices = self.customer_indices[index]
        skiprows = customer_data_indices[0]+1
        nrows = customer_data_indices[-1] - customer_data_indices[0] + 1
        customer_data = self.data.iloc[skiprows: skiprows+nrows]
        customer_id = customer_data.customer_ID.iloc[0]
        
        #customer_data.drop(["customer_ID", "S_2"], axis=1, inplace=True)
        
        customer_cont_data = customer_data[self.cont_cols].fillna(0, axis=1)
        customer_cont_tensor_data = torch.as_tensor(customer_cont_data.values, dtype=torch.float32)
        customer_cat_data = customer_data[cat_cols].values
        
        feat = customer_cont_tensor_data.mean(dim=0)
        if self.test_mode:
            return feat, index
        else:
            customer_label = torch.as_tensor(self.train_labels.loc[customer_id].values, dtype=torch.float32)
            return feat, customer_label



In [5]:
train_labels = pd.read_csv(data_dir+"train_labels.csv", engine="pyarrow")
train_dataset = CustomerData(train_data, train_labels=train_labels.set_index("customer_ID"))


In [6]:
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE)
#test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)


In [21]:
class Model(nn.Module):
    def __init__(self, input_dim, hidden_dim=177):
        super(Model, self).__init__()
        self.fc1 = nn.Linear(in_features=input_dim, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=input_dim)
        self.fc3 = nn.Linear(in_features=input_dim, out_features=1)
        
    def forward(self, feat):
        r = F.selu(self.fc1(feat))
        x = F.selu(self.fc2(r))
        x = torch.sigmoid(self.fc3(x+feat))
        return x
        

In [8]:
def amex_metric(y_true, y_pred, return_components=False) -> float:
    """Amex metric for ndarrays"""
    def top_four_percent_captured(df) -> float:
        """Corresponds to the recall for a threshold of 4 %"""
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        four_pct_cutoff = int(0.04 * df['weight'].sum())
        df['weight_cumsum'] = df['weight'].cumsum()
        df_cutoff = df.loc[df['weight_cumsum'] <= four_pct_cutoff]
        return (df_cutoff['target'] == 1).sum() / (df['target'] == 1).sum()
        
    def weighted_gini(df) -> float:
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        df['random'] = (df['weight'] / df['weight'].sum()).cumsum()
        total_pos = (df['target'] * df['weight']).sum()
        df['cum_pos_found'] = (df['target'] * df['weight']).cumsum()
        df['lorentz'] = df['cum_pos_found'] / total_pos
        df['gini'] = (df['lorentz'] - df['random']) * df['weight']
        return df['gini'].sum()

    def normalized_weighted_gini(df) -> float:
        """Corresponds to 2 * AUC - 1"""
        df2 = pd.DataFrame({'target': df.target, 'prediction': df.target})
        df2.sort_values('prediction', ascending=False, inplace=True)
        return weighted_gini(df) / weighted_gini(df2)

    df = pd.DataFrame({'target': y_true.ravel(), 'prediction': y_pred.ravel()})
    df.sort_values('prediction', ascending=False, inplace=True)
    g = normalized_weighted_gini(df)
    d = top_four_percent_captured(df)

    if return_components: return g, d, 0.5 * (g + d)
    return 0.5 * (g + d)

In [22]:
model = Model(input_dim=177)
optimizer = torch.optim.Adam(model.parameters(),)
criterion = torch.nn.BCELoss()

for epoch in range(5): 
    for cont_feat, clabel in train_loader:
        pred = model(cont_feat)
        loss = criterion(pred, clabel)
        
        # Update weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print(epoch, loss.item(), amex_metric(clabel.detach().numpy(), pred.detach().numpy()))
    

0 0.7262248992919922 -0.09603472955492569
0 0.638590931892395 0.041384118850008776
0 0.5799393653869629 0.24720089196869635
0 0.533631443977356 0.4226688861504425
0 0.5246046185493469 0.5137142898639532
0 0.49698853492736816 0.5610643057920819
0 0.46826478838920593 0.5982562644652083
0 0.44635605812072754 0.604474707869638
0 0.4197278916835785 0.618715212118027
0 0.4000680148601532 0.6158858431239643
0 0.3682839274406433 0.6225106161400954
0 0.36018434166908264 0.6224674252373543
0 0.3543711304664612 0.6122084363364506
0 0.34352144598960876 0.6237273679821989
0 0.33324134349823 0.6388435914608223
0 0.33321693539619446 0.6362082578236035
0 0.3319920599460602 0.6298962206833055
0 0.3266584277153015 0.6450599512785237
0 0.29888495802879333 0.6622379408166814
0 0.3203870952129364 0.6489243388852178
0 0.31640246510505676 0.6413691222334352
0 0.3106469511985779 0.6635717151277714
0 0.29710331559181213 0.6728484833679667
0 0.3043890595436096 0.6699597791645839
0 0.30091655254364014 0.67769804

In [48]:
torch.save(model.state_dict(), "model2")


In [11]:
test_data = pd.read_parquet(data_dir+"test_data.parquet")
test_data_size = test_data.shape[0]


In [12]:
test_dataset = CustomerData(test_data, test_mode=True)
test_loader = DataLoader(test_dataset, batch_size=test_data.shape[0])

In [23]:
for cont_feat, customer_index in test_loader:
    pred = model(cont_feat)


In [16]:
test_customer_ids = test_dataset.customer_ids

In [17]:
result = pd.DataFrame({"customer_ID":test_customer_ids, "prediction":pred.detach().numpy().reshape(-1)})

In [18]:
result.set_index("customer_ID").to_csv("first_sub.csv")

Unnamed: 0_level_0,prediction
customer_ID,Unnamed: 1_level_1
00000469ba478561f23a92a868bd366de6f6527a684c9a2e78fb826dcac3b9b7,0.954528
00001bf2e77ff879fab36aa4fac689b9ba411dae63ae397d4263dafa1daedef5,0.931634
0000210045da4f81e5f122c6bde5c2a617d03eef67f82c5e400fc98e7bd43ce8,0.970219
00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976cf6e56734528702d694,0.996782
00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9a4693dd914fca22557,0.996252
...,...
ffff952c631f2c911b8a2a8ca56ea6e656309a83d2f64c5d60460dba6dedc41e,0.970684
ffffcf5df59e5e0bba2a5ac4578a34e2b5aa64a1546cd3a4f0ca3de613b0b2ad,0.995988
ffffd61f098cc056dbd7d2a21380c4804bbfe60856f475cb095d2443a68030f1,0.995963
ffffddef1fc3643ea179c93245b68dca0f36941cd83977822e8b356988ca4d07,0.990130


In [95]:
amex_metric(labels.target.values, pred.detach().numpy())

0.695263566416433

In [74]:
amex_metric(labels.target.values, pred.detach().numpy())

0.6889942212973945

In [None]:
model.eval()

