In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
! pip install -q --upgrade numpy pandas tqdm torch catalyst==20.09

[K     |████████████████████████████████| 14.5MB 235kB/s 
[K     |████████████████████████████████| 10.5MB 59.8MB/s 
[K     |████████████████████████████████| 71kB 11.2MB/s 
[K     |████████████████████████████████| 460kB 53.4MB/s 
[K     |████████████████████████████████| 163kB 58.4MB/s 
[K     |████████████████████████████████| 317kB 61.2MB/s 
[K     |████████████████████████████████| 71kB 11.8MB/s 
[31mERROR: tensorflow 2.3.0 has requirement numpy<1.19.0,>=1.16.0, but you'll have numpy 1.19.2 which is incompatible.[0m
[31mERROR: google-colab 1.0.0 has requirement pandas~=1.0.0; python_version >= "3.0", but you'll have pandas 1.1.2 which is incompatible.[0m
[31mERROR: datascience 0.10.6 has requirement folium==0.2.1, but you'll have folium 0.8.3 which is incompatible.[0m
[31mERROR: albumentations 0.1.12 has requirement imgaug<0.2.7,>=0.2.5, but you'll have imgaug 0.2.9 which is incompatible.[0m
[?25h

In [3]:
import os
import json
import pickle
import torch
from bisect import bisect_left, bisect_right
from datetime import datetime, timedelta
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
from tqdm import tqdm

# GPU hack if you need
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Data

Columns
- `party_rk` – client unique identifier
- `account_rk` – client account unique identifier
- `financial_account_type_cd` – debit/credit card flag
- `transaction_dttm` – operation datetime
- `transaction_type_desc` – purchase/payment/...
- `transaction_amt_rur` – transaction price
- `merchant_type` - DUTY FREE STORES/FUEL DEALERS/RESTAURANTS/ etc
- `merchant_group_rk` - McDonald's/Wildberries/ etc

It's important that table is already sorted by `transaction_dttm` column!!!

In [7]:
!ls /content/drive/My\ Drive/hack_tink2

2020_hack_baseline_public.ipynb		  baseline.pth
2020_hack_baseline_public_upgraded.ipynb  mappings.json
2020_hack_model.ipynb			  party2category.pkl
avk_hackathon_data_account_x_balance.csv  party2dates.pkl
avk_hackathon_data_party_products.csv	  party2merchant_type.pkl
avk_hackathon_data_party_x_socdem.csv	  party2sum.pkl
avk_hackathon_data_story_logs.csv	  party2trans_type.pkl
avk_hackathon_data_story_texts.csv	  users_spendings_per_month.csv
avk_hackathon_data_transactions.csv


In [5]:
DATADIR = "/content/drive/My Drive/hack_tink2" # "./data"
transactions_path = f"{DATADIR}/avk_hackathon_data_transactions.csv"
# pd.read_csv(f"{DATADIR}/avk_hackathon_data_transactions.csv")

In [30]:
socdem = pd.read_csv(f'{DATADIR}/avk_hackathon_data_party_x_socdem.csv')
users_spendings_per_month = pd.read_csv(f"{DATADIR}/users_spendings_per_month.csv", index_col=0)

In [39]:
users_spendings_per_month = users_spendings_per_month.dropna()
active_users_mask = (users_spendings_per_month.values<5000)
active_users_mask = (np.sum(active_users_mask, axis=0)<=1)
inactive_users_mask = (np.sum(active_users_mask, axis=0)>1)

active_users = users_spendings_per_month.columns[active_users_mask].values.astype(int)
inactive_users = users_spendings_per_month.columns[inactive_users_mask].astype(int)


In [41]:
socdem['children_0'] = socdem['children_cnt'].apply(lambda x: int(x==0))
socdem['children_1'] = socdem['children_cnt'].apply(lambda x: int(x>= 1 and x<=2))
socdem['children_3'] = socdem['children_cnt'].apply(lambda x: int(x>2))

socdem['gender_f'] = socdem['gender_cd'].apply(lambda x: int(x=='F'))
socdem['gender_m'] = socdem['gender_cd'].apply(lambda x: int(x=='M'))
socdem['gender_nan'] = socdem['gender_cd'].apply(lambda x: int(x!='F' and x!='M'))

socdem['age_0'] = socdem['age'].apply(lambda x: int(x<=20))
socdem['age_1'] = socdem['age'].apply(lambda x: int(x>20 and x<=30))
socdem['age_2'] = socdem['age'].apply(lambda x: int(x>30 and x<=50))
socdem['age_3'] = socdem['age'].apply(lambda x: int(x>50))

socdem['marital_0'] = socdem['marital_status_desc'].apply(lambda x: int(x=='Женат/замужем' or x=='Гражданский брак'))
socdem['marital_1'] = socdem['marital_status_desc'].apply(lambda x: int(x=='Холост/не замужем' or x=='Не проживает с супругом (ой)'))
socdem['marital_2'] = socdem['marital_status_desc'].apply(lambda x: int(x=='Холост/не замужем' or x=='Не проживает с супругом (ой)'))

socdem = socdem.drop(columns=['gender_cd', 'age', 'marital_status_desc', 'children_cnt'])

## Mappings
~1 min

In [None]:
# Prepare & save mappings
mappings = defaultdict(dict)
unk_token = "<UNK>"


def create_mapping(values):
    mapping = {unk_token: 0}
    for v in values:
        if not pd.isna(v):
            mapping[str(v)] = len(mapping)

    return mapping


for col in tqdm(
    [
        "transaction_type_desc",
        "merchant_rk",
        "merchant_type",
        "merchant_group_rk",
        "category",
        "financial_account_type_cd",
    ]
):

    col_values = (
        pd.read_csv(transactions_path, usecols=[col])[col]
        .fillna(unk_token)
        .astype(str)
    )
    mappings[col] = create_mapping(col_values.unique())
    del col_values


with open(f"{DATADIR}/mappings.json", "w") as f:
    json.dump(mappings, f)

100%|██████████| 6/6 [01:10<00:00, 11.72s/it]


In [42]:
# load mappings
with open(f"{DATADIR}/mappings.json", 'r') as f:
    mappings = json.load(f)

## Parse transactions by users
~ 40 min

In [None]:
# Prepare & save client data
party2dates = defaultdict(list)  # for each party save a series of the transaction dates 
party2sum = defaultdict(list)  # for each party save a series of the transaction costs 
party2merchant_type = defaultdict(list)  # for each party save a series of the transaction_type 
party2trans_type = defaultdict(list)  # for each party save a series of the transaction merchant_type

usecols = [
    "party_rk",
    "transaction_dttm",
    "transaction_amt_rur",
    "merchant_type",
    "transaction_type_desc",
    "category"
]

for chunk in tqdm(
    pd.read_csv(transactions_path, usecols=usecols, chunksize=100_000)
):

    chunk["merchant_type"] = (
        chunk["merchant_type"].fillna(unk_token).astype(str)
    )
    chunk["transaction_type_desc"] = (
        chunk["transaction_type_desc"].fillna(unk_token).astype(str)
    )
    chunk["category"] = (
        chunk["category"].fillna(unk_token).astype(str)
    )
    chunk["transaction_amt_rur"] = chunk["transaction_amt_rur"].fillna(0)

    for i, row in chunk.iterrows():
        party2dates[row.party_rk].append(row.transaction_dttm)
        party2sum[row.party_rk].append(row.transaction_amt_rur)
        party2merchant_type[row.party_rk].append(
            mappings["merchant_type"][row.merchant_type]
        )
        party2trans_type[row.party_rk].append(
            mappings["transaction_type_desc"][row.transaction_type_desc]
        )
        party2category[row.party_rk].append(mappings["category"][row.category])

    del chunk

pickle.dump(party2dates, open(f"{DATADIR}/party2dates.pkl", "wb"))
pickle.dump(party2sum, open(f"{DATADIR}/party2sum.pkl", "wb"))
pickle.dump(party2merchant_type, open(f"{DATADIR}/party2merchant_type.pkl", "wb"))
pickle.dump(party2trans_type, open(f"{DATADIR}/party2trans_type.pkl", "wb"))
pickle.dump(party2category, open(f"{DATADIR}/party2category.pkl", "wb"))

120it [55:35, 27.80s/it]


In [43]:
# load client data
party2dates = pickle.load(open(f"{DATADIR}/party2dates.pkl", 'rb'))
party2sum = pickle.load(open(f"{DATADIR}/party2sum.pkl", 'rb'))
party2merchant_type = pickle.load(open(f"{DATADIR}/party2merchant_type.pkl", 'rb'))
party2trans_type = pickle.load(open(f"{DATADIR}/party2trans_type.pkl", 'rb'))
party2category = pickle.load(open(f"{DATADIR}/party2category.pkl", 'rb'))

## PyTorch dataset

In [None]:
from sklearn.model_selection import train_test_split

train_party, valid_party = train_test_split(
    pd.read_csv(transactions_path, usecols=['party_rk']).party_rk.unique(), 
    train_size=0.8, random_state=42
)

print(f'Train: {len(train_party)} Val: {len(valid_party)}')

Train: 40000 Val: 10000


In [44]:
# create the dataset using active users only

from sklearn.model_selection import train_test_split

train_party, valid_party = train_test_split(
    active_users, 
    train_size=0.8, random_state=42
)

print(f'Train: {len(train_party)} Val: {len(valid_party)}')

Train: 4809 Val: 1203


In [127]:
inactive_users.shape

(1, 49083)

In [129]:
# create the dataset using inactive users only

from sklearn.model_selection import train_test_split

train_party_inactive, valid_party_inactive = train_test_split(
    inactive_users[0], 
    train_size=0.8, random_state=42
)

print(f'Train: {len(train_party_inactive)} Val: {len(valid_party_inactive)}')

Train: 39266 Val: 9817


In [45]:
predict_period_len = 60  # -- days
train_predict_dates = (
    pd.date_range("2019-03-01", "2019-10-31", freq="MS")
    .strftime("%Y-%m-%d")
    .tolist()
)
valid_predict_dates = (
    pd.date_range("2019-11-01", "2019-12-31", freq="MS")
    .strftime("%Y-%m-%d")
    .tolist()
)
submission_predict_dates = (
    pd.date_range("2020-01-01", "2020-02-28", freq="2MS")
    .strftime("%Y-%m-%d")
    .tolist()
)

In [46]:
def prepare_data(party_list, mode="train"):
    """
    This function define the pipeline of the creation of train and valid samples.
    We consider each client from party_list. For each client take each 
    predict_period_start from predict_dates list. All client transaction before
    this date is our features. Next, we look at the customer's transactions in 
    the next two months. This transactions should be predicted. It will form 
    our labels vector.
    """
    data_rk = []
    data_sum = []
    data_trans_type = []
    data_merchant_type = []
    data_labels = []
    data_category = []

    for party_rk in tqdm(party_list):
        date_series = party2dates[party_rk]
        sum_series = party2sum[party_rk]
        merch_type_series = party2merchant_type[party_rk]
        trans_type_series = party2trans_type[party_rk]
        category_series = party2category[party_rk]

        if mode == "train":
            predict_dates = train_predict_dates
        elif mode == "valid":
            predict_dates = valid_predict_dates
        elif mode == "submission":
            predict_dates = submission_predict_dates
        else:
            raise Exception("Unknown mode")

        for predict_period_start in predict_dates:

            predict_period_end = datetime.strftime(
                datetime.strptime(predict_period_start, "%Y-%m-%d")
                + timedelta(days=predict_period_len),
                "%Y-%m-%d",
            )

            l, r = (
                bisect_left(date_series, predict_period_start),
                bisect_right(date_series, predict_period_end),
            )

            history_merch_type = np.array(merch_type_series[:l])
            history_sum = np.array(sum_series[:l])
            history_trans_type = trans_type_series[:l]
            history_category = category_series[:l]
            predict_merch = merch_type_series[l:r]

            if predict_merch and l or mode not in ("train", "valid"):
                # history_normalization = [np.sum(history_sum[history_merch_type==t]) for t in history_merch_type]
                # data_sum.append(history_sum / history_normalization)
                data_rk.append(party_rk)
                data_sum.append(history_sum)
                data_trans_type.append(history_trans_type)
                data_merchant_type.append(history_merch_type)
                data_category.append(history_category)
                data_labels.append(predict_merch)

    return data_rk, data_sum, data_trans_type, data_merchant_type, data_category, data_labels

In [47]:
# active users
train_rk, train_sum, train_trans_type, train_merchant_type, train_category, train_labels = prepare_data(
    train_party, mode="train"
)
valid_rk, valid_sum, valid_trans_type, valid_merchant_type, valid_category, valid_labels = prepare_data(
    valid_party, mode="valid"
)

100%|██████████| 4809/4809 [00:02<00:00, 1683.89it/s]
100%|██████████| 1203/1203 [00:00<00:00, 4515.91it/s]


In [130]:
# inactive users
train_rk_inactive, train_sum_inactive, train_trans_type_inactive, train_merchant_type_inactive, train_category_inactive, train_labels_inactive = prepare_data(
    train_party_inactive, mode="train"
)
valid_rk_inactive, valid_sum_inactive, valid_trans_type_inactive, valid_merchant_type_inactive, valid_category_inactive, valid_labels_inactive = prepare_data(
    valid_party_inactive, mode="valid"
)

100%|██████████| 39266/39266 [00:12<00:00, 3101.76it/s]
100%|██████████| 9817/9817 [00:02<00:00, 4245.32it/s]


## PyTorch loaders

In [48]:
import torch
from torch.utils.data import Dataset, DataLoader

In [62]:
MERCH_TYPE_NCLASSES = len(mappings['merchant_type'])
TRANS_TYPE_NCLASSES = len(mappings['transaction_type_desc'])
TRANS_CATEGORY_NCLASSES = len(mappings['category'])
PADDING_LEN = 300

In [78]:
class RSDataset(Dataset):
    def __init__(self, users_df, data_rk, data_sum, data_trans_type, data_merchant_type, data_category, labels):
        super(RSDataset, self).__init__()
        self.users_df = users_df.set_index(keys=['party_rk'])
        self.data_rk = data_rk
        self.data_sum = data_sum
        self.data_trans_type = data_trans_type
        self.data_merchant_type = data_merchant_type
        self.data_category = data_category
        self.labels = labels

    def __len__(self):
        return len(self.data_sum)

    def __getitem__(self, idx):
        targets = np.zeros((MERCH_TYPE_NCLASSES - 1,), dtype=np.float32)
        for m in self.labels[idx]:
            if m:  # skip UNK, UNK-token should not be predicted
                targets[m - 1] = 1.0

        item = {
            "features": {},
            "targets": targets,
        }

        sum_feature = np.array(self.data_sum[idx][-PADDING_LEN:])
        sum_feature = np.vectorize(lambda s: np.log(1 + s))(sum_feature)
        if sum_feature.shape[0] < PADDING_LEN:
            pad = np.zeros(
                (PADDING_LEN - sum_feature.shape[0],), dtype=np.float32
            )
            sum_feature = np.hstack((sum_feature, pad))
        item["features"]["sum"] = torch.from_numpy(sum_feature).float()

        for feature_name, feature_values in zip(
            ["trans_type", "merchant_type", "category"],
            [self.data_trans_type[idx], self.data_merchant_type[idx], self.data_category[idx]],
        ):

            feature_values = np.array(feature_values[-PADDING_LEN:])
            mask = np.ones(feature_values.shape[0], dtype=np.float32)
            if feature_values.shape[0] < PADDING_LEN:
                feature_values = np.append(
                    feature_values,
                    np.zeros(
                        PADDING_LEN - feature_values.shape[0], dtype=np.int64
                    ),
                )
                mask = np.append(
                    mask,
                    np.zeros(PADDING_LEN - mask.shape[0], dtype=np.float32),
                )
            item["features"][feature_name] = torch.from_numpy(feature_values).long()
            item["features"][f"{feature_name}_mask"] = torch.from_numpy(mask).float()

        party_rk = self.data_rk[idx]
        item["features"]["user_data"] = self.users_df.loc[party_rk].values

        return item

In [137]:
# active users
train_dataset = RSDataset(
    socdem, train_rk, train_sum, train_trans_type, train_merchant_type, train_category, train_labels
)
valid_dataset = RSDataset(
    socdem, valid_rk, valid_sum, valid_trans_type, valid_merchant_type, train_category, valid_labels
)


In [138]:
# inactive users
train_dataset_inactive = RSDataset(
    socdem, train_rk_inactive, train_sum_inactive, train_trans_type_inactive, train_merchant_type_inactive, train_category_inactive, train_labels_inactive
)
valid_dataset_inactive = RSDataset(
    socdem, valid_rk_inactive, valid_sum_inactive, valid_trans_type_inactive, valid_merchant_type_inactive, train_category_inactive, valid_labels_inactive
)


In [139]:
train_loader = DataLoader(
    train_dataset, batch_size=64, shuffle=True, num_workers=2
)
valid_loader = DataLoader(
    valid_dataset, batch_size=64, shuffle=False, num_workers=2
)

In [140]:
# inactive users
train_loader_inactive = DataLoader(
    train_dataset_inactive, batch_size=64, shuffle=True, num_workers=2
)
valid_loader_inactive = DataLoader(
    valid_dataset_inactive, batch_size=64, shuffle=False, num_workers=2
)

In [None]:
# sanity check
for i in tqdm(range(len(train_loader))):
    batch = next(iter(train_loader))

## Model

This is the baseline model for predicting purchases in `merchant_type` in the next 2 months

In [58]:
import torch.nn as nn
from collections import OrderedDict

In [141]:
params = {
    'merchant_type_emb_dim': 64,
    'trans_type_embedding': 3,
    'category_embedding_dim': 32,
    'transformer_nhead': 2,
    'transformer_dim_feedforward': 256,
    'transformer_dropout': 0.1,
    'dense_unit': 256,
    'num_layers': 4,
}

In [None]:
MERCH_TYPE_NCLASSES, TRANS_TYPE_NCLASSES

(458, 5)

In [142]:
class Model(nn.Module):
    def __init__(self):
        super().__init__()

        self.merchant_type_embedding = nn.Embedding(
            MERCH_TYPE_NCLASSES, params["merchant_type_emb_dim"]
        )
        self.trans_type_embedding = nn.Embedding(
            TRANS_TYPE_NCLASSES, params["trans_type_embedding"]
        )

        self.category_embedding = nn.Embedding(
            TRANS_CATEGORY_NCLASSES, params["category_embedding_dim"]
        )

        embedding_size = (
            params["merchant_type_emb_dim"]
            + params["trans_type_embedding"]
            + params["category_embedding_dim"]
            + 14  # user info dim
            + 1
        )

        transformer_blocks = []
        for i in range(params["num_layers"]):
            transformer_block = nn.TransformerEncoderLayer(
                d_model=embedding_size,
                nhead=params["transformer_nhead"],
                dim_feedforward=params["transformer_dim_feedforward"],
                dropout=params["transformer_dropout"],
            )
            transformer_blocks.append(
                (f"transformer_block_{i}", transformer_block)
            )

        self.transformer_encoder = nn.Sequential(
            OrderedDict(transformer_blocks)
        )

        self.linear = nn.Linear(
            in_features=embedding_size, out_features=params["dense_unit"]
        )
        self.scorer = nn.Linear(
            in_features=params["dense_unit"],
            out_features=MERCH_TYPE_NCLASSES - 1,
        )

    def forward(self, features):

        merchant_type_emb = self.merchant_type_embedding(features["merchant_type"].to(device))
        trans_type_emb = self.trans_type_embedding(features["trans_type"].to(device))
        category_emb = self.category_embedding(features["category"].to(device))

        merchant_type_emb = merchant_type_emb * features["merchant_type_mask"].unsqueeze(-1).to(device)
        trans_type_emb = trans_type_emb * features["trans_type_mask"].unsqueeze(-1).to(device)
        category_emb = category_emb * features["category_mask"].unsqueeze(-1).to(device)

        
        embeddings = torch.cat(
            (merchant_type_emb, trans_type_emb, category_emb, features["user_data"].unsqueeze(-2).repeat((1, 300, 1)).to(device), features["sum"].unsqueeze(-1).to(device)),
            dim=-1,
        )

        transformer_output = self.transformer_encoder(embeddings)
        pooling = torch.mean(transformer_output, dim=1)
        linear = torch.tanh(self.linear(pooling))
        merch_logits = self.scorer(linear)

        return merch_logits

### One-batch-check

In [143]:
model = Model().to(device)
criterion = nn.BCEWithLogitsLoss()
batch = next(iter(train_loader))
output = model(batch['features'])
loss = criterion(output, batch['targets'].to(device))
print(loss)

tensor(0.6982, device='cuda:0', grad_fn=<BinaryCrossEntropyWithLogitsBackward>)


## Train loop with [Catalyst](https://github.com/catalyst-team/catalyst)

[A comprehensive step-by-step guide to basic and advanced features](https://github.com/catalyst-team/catalyst#step-by-step-guide).

---



In [114]:
from catalyst import dl, utils
from catalyst.utils import metrics

## Custom metrics for this hackathon

In [115]:
from typing import List, Optional, Sequence, Tuple, Union

import numpy as np
import torch
from catalyst.utils.metrics.functional import preprocess_multi_label_metrics
from catalyst.utils.torch import get_activation_fn


def multi_label_metrics(
    outputs: torch.Tensor,
    targets: torch.Tensor,
    threshold: Union[float, torch.Tensor],
    activation: Optional[str] = None,
    eps: float = 1e-7,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Computes multi-label precision for the specified activation and threshold.

    Args:
        outputs (torch.Tensor): NxK tensor that for each of the N examples
            indicates the probability of the example belonging to each of
            the K classes, according to the model.
        targets (torch.Tensor): binary NxK tensort that encodes which of the K
            classes are associated with the N-th input
            (eg: a row [0, 1, 0, 1] indicates that the example is
            associated with classes 2 and 4)
        threshold (float): threshold for for model output
        activation (str): activation to use for model output
        eps (float): epsilon to avoid zero division
    
    Extended version of 
        https://github.com/catalyst-team/catalyst/blob/master/catalyst/utils/metrics/accuracy.py#L58

    Returns:
        computed multi-label metrics
    """
    outputs, targets, _ = preprocess_multi_label_metrics(
        outputs=outputs, targets=targets
    )
    activation_fn = get_activation_fn(activation)
    outputs = activation_fn(outputs)

    outputs = (outputs > threshold).long()

    accuracy = (targets.long() == outputs.long()).sum().float() / np.prod(
        targets.shape
    )

    intersection = (outputs.long() * targets.long()).sum(axis=1).float()
    num_predicted = outputs.long().sum(axis=1).float()
    num_relevant = targets.long().sum(axis=1).float()
    union = num_predicted + num_relevant

    # Precision = ({predicted items} && {relevant items}) / {predicted items}
    precision = intersection / (num_predicted + eps * (num_predicted == 0))
    # Recall = ({predicted items} && {relevant items}) / {relevant items}
    recall = intersection / (num_relevant + eps * (num_relevant == 0))
    # IoU = ({predicted items} && {relevant items}) / ({predicted items} || {relevant items})
    iou = (intersection + eps * (union == 0)) / (union - intersection + eps)

    return accuracy, precision.mean(), recall.mean(), iou.mean()


def precision_at_k(
    actual: torch.Tensor, 
    predicted: torch.Tensor, 
    k: int,
):
    """
    Computes precision at cutoff k for one sample

    Args:
       actual: (torch.Tensor): tensor of length K with predicted item_ids sorted by relevance
       predicted (torch.Tensor): binary tensor that encodes which of the K
           classes are associated with the N-th input
           (eg: a row [0, 1, 0, 1] indicates that the example is
           associated with classes 2 and 4)
       k (int): parameter k of precison@k

    Returns:
       Computed value of precision@k for given sample
    """
    p_at_k = 0.0
    for item in predicted[:k]:
        if actual[item]:
            p_at_k += 1
    p_at_k /= k

    return p_at_k


def average_precision_at_k(
    actual: torch.Tensor, 
    predicted: torch.Tensor, 
    k: int,
) -> float:
    """
    Computes average precision at cutoff k for one sample

    Args:
      actual: (torch.Tensor): tensor of length K with predicted item_ids sorted by relevance
      predicted (torch.Tensor): binary tensor that encodes which of the K
          classes are associated with the N-th input
          (eg: a row [0, 1, 0, 1] indicates that the example is
          associated with classes 2 and 4)
      k (int): parameter k of AP@k

    Returns:
        Computed value of AP@k for given sample
    """
    ap_at_k = 0.0
    for idx, item in enumerate(predicted[:k]):
        if actual[item]:
            ap_at_k += precision_at_k(actual, predicted, k=idx + 1)
    ap_at_k /= min(k, actual.sum().cpu().numpy())
    

    return ap_at_k


def mean_average_precision_at_k(
    output: torch.Tensor, target: torch.Tensor, top_k: Tuple[int, ...] = (1,)
) -> List[float]:
    """
    Computes mean_average_precision_at_k at set of cutoff parameters K

    Args:
       outputs (torch.Tensor): NxK tensor that for each of the N examples
           indicates the probability of the example belonging to each of
           the K classes, according to the model.
       targets (torch.Tensor): binary NxK tensort that encodes which of the K
           classes are associated with the N-th input
           (eg: a row [0, 1, 0, 1] indicates that the example is
           associated with classes 2 and 4)
       top_k (tuple): list of parameters k at which map@k will be computed


    Returns:
       List of computed values of map@k at each cutoff k from topk
    """
    max_k = max(top_k)
    batch_size = target.size(0)

    _, top_indices = output.topk(k=max_k, dim=1, largest=True, sorted=True)

    result = []
    for k in top_k:  # loop over k
        map_at_k = 0.0
        for actual_target, predicted_items in zip(
            target, top_indices
        ):  # loop over samples
            map_at_k += average_precision_at_k(
                actual_target, predicted_items, k
            )
        map_at_k = map_at_k / batch_size
        result.append(map_at_k)

    return result

In [117]:
# What is Runner?
# https://catalyst-team.github.io/catalyst/api/core.html#runner
class CustomRunner(dl.Runner):
    # def __init__(self, criterion):
    #     super().__init__()
    #     self.criterion = criterion
    #     print(self.criterion)


    def _handle_batch(self, batch):
        # model train/valid step
        features, targets = batch["features"], batch["targets"]
        logits = self.model(features)
        scores = torch.sigmoid(logits)

        
        loss = nn.BCEWithLogitsLoss()(logits, targets.to(device))
        accuracy, precision, recall, iou = multi_label_metrics(
            logits, targets, threshold=0.5, activation="Sigmoid"
        )
        map05, map10, map20, map30 = mean_average_precision_at_k(
            scores, targets, top_k=(5, 10, 20, 30)
        )
        batch_metrics = {
            "loss": loss,
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "iou": iou,
            "map05": map05,
            "map10": map10,
            "map20": map20,
            "map30": map30
        }
        
        self.input = {"features": features, "targets": targets}
        self.output = {"logits": logits, "scores": scores}
        self.batch_metrics.update(batch_metrics)

        if self.is_train_loader:
            loss.backward()
            self.optimizer.step()
            self.optimizer.zero_grad()
    
    def predict_batch(self, batch):
        # model inference step
        batch = utils.maybe_recursive_call(batch, "to", device=self.device)
        logits = self.model(batch["features"])
        scores = torch.sigmoid(logits)
        return scores

In [149]:
model_active = Model().to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer_active = torch.optim.Adam(model.parameters(), lr=0.001)
scheduler_active = torch.optim.lr_scheduler.MultiStepLR(optimizer_active, [10], gamma=0.1, last_epoch=-1)
loaders = {"train": train_loader, "valid": valid_loader}

In [None]:
# For other minimal examples, please follow the link below
# https://github.com/catalyst-team/catalyst#minimal-examples
runner_active = CustomRunner(criterion)
# model training
runner.train(
    model=model_active,
    criterion=criterion,
    optimizer=optimizer_active,
    scheduler=scheduler_active,
    loaders=loaders,
    logdir="./logs",
    num_epochs=15,
    verbose=False,
    load_best_on_end=True,
    overfit=False,  #  <<<--- DO NOT FORGET TO MAKE IT ``False`` 
                    #  (``True`` uses only one batch to check pipeline correctness)
    callbacks=[
        # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.average_precision_score.html
        # dl.AveragePrecisionCallback(input_key="targets", output_key="scores", prefix="ap"),
        # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html
        # dl.AUCCallback(input_key="targets", output_key="scores", prefix="auc"),
    ],
    main_metric="iou", # "ap/mean", 
    minimize_metric=False,
)

[2020-09-19 22:47:22,730] 
1/15 * Epoch 3 (train): accuracy=0.0452 | iou=0.0452 | loss=0.7008 | map05=0.0057 | map10=0.0059 | map20=0.0049 | map30=0.0052 | precision=0.0452 | recall=1.0000
1/15 * Epoch 3 (valid): accuracy=0.0405 | iou=0.0405 | loss=0.7039 | map05=0.0130 | map10=0.0109 | map20=0.0081 | map30=0.0079 | precision=0.0405 | recall=1.0000
[2020-09-19 22:47:22,730] 
1/15 * Epoch 3 (train): accuracy=0.0452 | iou=0.0452 | loss=0.7008 | map05=0.0057 | map10=0.0059 | map20=0.0049 | map30=0.0052 | precision=0.0452 | recall=1.0000
1/15 * Epoch 3 (valid): accuracy=0.0405 | iou=0.0405 | loss=0.7039 | map05=0.0130 | map10=0.0109 | map20=0.0081 | map30=0.0079 | precision=0.0405 | recall=1.0000
[2020-09-19 22:47:22,730] 
1/15 * Epoch 3 (train): accuracy=0.0452 | iou=0.0452 | loss=0.7008 | map05=0.0057 | map10=0.0059 | map20=0.0049 | map30=0.0052 | precision=0.0452 | recall=1.0000
1/15 * Epoch 3 (valid): accuracy=0.0405 | iou=0.0405 | loss=0.7039 | map05=0.0130 | map10=0.0109 | map20=0.00

In [None]:
torch.save(model_active.state_dict(), '/content/drive/My Drive/hack_tink2/baseline_active.pth')

In [None]:
model_inactive = Model().to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer_inactive = torch.optim.Adam(model_inactive.parameters(), lr=0.001)
scheduler_inactive = torch.optim.lr_scheduler.MultiStepLR(optimizer_inactive, [10], gamma=0.1, last_epoch=-1)
loaders = {"train": train_loader_inactive, "valid": valid_loader_inactive}

In [None]:
# For other minimal examples, please follow the link below
# https://github.com/catalyst-team/catalyst#minimal-examples
runner_inactive = CustomRunner(criterion)
# model training
runner_inactive.train(
    model=model_inactive,
    criterion=criterion,
    optimizer=optimizer_inactive,
    scheduler=scheduler_inactive,
    loaders=loaders,
    logdir="./logs",
    num_epochs=15,
    verbose=False,
    load_best_on_end=True,
    overfit=False,  #  <<<--- DO NOT FORGET TO MAKE IT ``False`` 
                    #  (``True`` uses only one batch to check pipeline correctness)
    callbacks=[
        # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.average_precision_score.html
        # dl.AveragePrecisionCallback(input_key="targets", output_key="scores", prefix="ap"),
        # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html
        # dl.AUCCallback(input_key="targets", output_key="scores", prefix="auc"),
    ],
    main_metric="iou", # "ap/mean", 
    minimize_metric=False,
)

In [None]:
torch.save(model_inactive.state_dict(), '/content/drive/My Drive/hack_tink2/baseline_inactive.pth')

## Your task

We suggest you to improve this baseline. Feel free to use any kind of model architectures, loss functions, inputs, etc. in your experiments.


YOUR TASK is to predict purchases in `merchant_type` in **January-February 2020** for all the clients (50k) from the given dataset.

SUBMISSION FORMAT: You should submit a `.csv` file in the following format. 

The submission file should contain two columns:
* `party_rk` -- client unique identifier
* `recommendations` -- list of the **top 30** predicted `merchant_type`, sorted by predicted proba (pay attention!) **separated by commas**. 

The `.csv` file separator should be **semicolon (";")**. The submission file example can be generated by the pipeline shown below.

EVALUATION: Your submission will be evaluated by metric **MAP@30**. Scores for this part of the hackathon will be given according to the value of this metric.

Good luck!

## Submission file example

In [None]:
active_rk, active_sum, active_trans_type, active_merchant_type, active_category, active_labels = prepare_data(
    active_users, mode="submission"
)

inactive_rk, inactive_sum, inactive_trans_type, inactive_merchant_type, inactive_category, inactive_labels = prepare_data(
    inactive_users[0], mode="submission"
)

train_dataset_inactive = RSDataset(
    socdem, train_rk_inactive, train_sum_inactive, train_trans_type_inactive, train_merchant_type_inactive, train_category_inactive, train_labels_inactive
)

active_dataset = RSDataset(
   socdem, active_rk, active_sum, active_trans_type, active_merchant_type, active_category, active_labels
)
active_loader = DataLoader(
    active_dataset, batch_size=64, shuffle=False, num_workers=8, drop_last=False)

inactive_dataset = RSDataset(
   socdem, inactive_rk, inactive_sum, inactive_trans_type, inactive_merchant_type, inactive_category, inactive_labels
)
inactive_loader = DataLoader(
    inactive_dataset, batch_size=64, shuffle=False, num_workers=8, drop_last=False)

In [None]:
# get predictions from the model
predictions = []
for scores in tqdm(active_runner.predict_loader(loader=active_loader), total = len(active_loader)):
    _, top_indices = scores.topk(k=30, dim=1, largest=True, sorted=True)
    top_indices += 1
    predictions += top_indices.detach().cpu().tolist()

for scores in tqdm(inactive_runner.predict_loader(loader=inactive_loader), total = len(inactive_loader)):
    _, top_indices = scores.topk(k=30, dim=1, largest=True, sorted=True)
    top_indices += 1
    predictions += top_indices.detach().cpu().tolist()

In [None]:
# inverse mapping for merchant_type in predictions
merchant_type_inverse_mapping = {k: v for v, k in mappings['merchant_type'].items()}
def inverse_mapping(x):
    return list(map(merchant_type_inverse_mapping.get, x))

predictions = list(map(inverse_mapping, predictions))

In [None]:
# create submission table
submission = pd.DataFrame({
    "party_rk" : active_rk + inactive_rk, 
    "recommendations" : predictions
})
submission['recommendations'] = submission['recommendations'].apply(lambda x: ",".join(map(str, x)))

submission.to_csv('/content/drive/My Drive/hack_tink2/submission_SpaceInside.csv', index=False, sep=";")