In [1]:
import sys

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F

from tqdm import tqdm


## Load Data

In [2]:

# Read the raw transaction log. article_id is forced to str so pandas does not parse it as a number.
transactions_csv = "transactions_train.csv"
main_df = pd.read_csv(transactions_csv, dtype={"article_id": str})
print('Training data shape:', main_df.shape)
main_df.head()

## Data pre-processing

In [9]:
main_df["t_dat"] = pd.to_datetime(main_df["t_dat"])

# An article counts as "active" when its most recent purchase is on or after the cutoff date
active_articles = main_df.groupby("article_id")["t_dat"].max().reset_index()
active_articles = active_articles[active_articles["t_dat"] >= "2019-09-01"].reset_index()
print('Active transactions shape: ', active_articles.shape)

# Retain only rows with active articles

main_df = main_df[main_df["article_id"].isin(active_articles["article_id"])].reset_index(drop=True)
print('Current shape of main_df: ', main_df.shape)

# Extract the week a transaction occurred (the most recent week is week 0)
main_df["week"] = (main_df["t_dat"].max() - main_df["t_dat"]).dt.days // 7

# Label encode the article_id.
# LabelEncoder hands out codes in sorted order, and RecSysDataset pads short purchase
# histories with code 0. The padding token therefore has to sort before every real id.
# The empty string always does; a token such as "placeholder" sorts after digit strings
# and would receive the last code, so padding would alias the first real article.
PAD_ARTICLE_ID = ""
article_ids = np.concatenate([[PAD_ARTICLE_ID], np.unique(main_df["article_id"].values)])
le = LabelEncoder()
le.fit(article_ids)
assert le.transform([PAD_ARTICLE_ID])[0] == 0, "padding token must be encoded as 0"
main_df["article_id"] = le.transform(main_df["article_id"])

# Loader / sequence configuration shared by the cells below
num_workers = 8
batch_size = 256
SEQ_LEN = 16
SEED = 0

In [15]:
# Preview the first rows of main_df
main_df.head()

## Model definition in PyTorch

In [12]:
class RecSysModel(nn.Module):
    """Scores every article for a customer from their recent purchase history.

    Each history item is embedded and compared (cosine similarity) against the
    embedding of every article. For each candidate article the best-matching
    history item is kept, together with the week feature of that item and a
    learned per-article bias. These three channels are fed through a stack of
    kernel-size-1 convolutions that produces one logit per article.

    Args:
        article_shape: ``(number_of_article_codes, embedding_dim)``.
    """

    def __init__(self, article_shape):
        super(RecSysModel, self).__init__()

        # Each input sample defines the purchase of a single article.
        # We are training our model to predict this article based on the purchase history (input to model).
        # Purchase history is defined by a list (maximum number defined by seq_len) of articles recently
        # purchased by a customer up to k weeks before the current purchased article.

        # Embedding table shared by the history items and the candidate articles
        self.artic_embedding = nn.Embedding(article_shape[0], embedding_dim=article_shape[1])
        # Learned per-article bias channel, initialised to zero
        self.artic_likelihood = nn.Parameter(torch.zeros(article_shape[0]), requires_grad=True)

        # NN model using 1-D convolutions. Train this neural network to predict the current purchased article.
        # Every BatchNorm1d must have as many features as the convolution before it has output channels.
        self.top = nn.Sequential(nn.Conv1d(3, 64, kernel_size=1), nn.LeakyReLU(), nn.BatchNorm1d(64),
                                 nn.Conv1d(64, 32, kernel_size=1), nn.LeakyReLU(), nn.BatchNorm1d(32),
                                 nn.Conv1d(32, 8, kernel_size=1), nn.LeakyReLU(), nn.BatchNorm1d(8),
                                 nn.Conv1d(8, 1, kernel_size=1))

    def forward(self, inputs):
        """Return one logit per article.

        Args:
            inputs: sequence whose first two items are ``article_hist``
                (integer codes, ``(bs, seq_len)``) and ``week_hist``
                (floats, ``(bs, seq_len)``). Further items are ignored.

        Returns:
            Tensor of shape ``(bs, n_articles)``.
        """
        article_hist, week_hist = inputs[0], inputs[1]

        # (bs, seq_len, emb_dim), unit length along the embedding axis
        x = self.artic_embedding(article_hist)
        x = F.normalize(x, dim=2)

        # Cosine similarity of every history item with every article: (bs, seq_len, n_articles)
        x = x @ F.normalize(self.artic_embedding.weight).T

        # Best-matching history item per candidate article: both (bs, n_articles)
        x, indices = x.max(axis=1)

        # Keep the value strictly inside (0, 1) so the logit below stays finite
        x = x.clamp(1e-3, 0.999)

        # inverse sigmoid - converts probabilities to a real valued num.
        x = -torch.log(1/x - 1)

        # Week feature of the best-matching history item, (bs, 1, n_articles).
        # A direct gather on the (bs, seq_len) tensor selects the same values as expanding
        # both operands to (bs, seq_len, n_articles) and averaging identical rows, without
        # allocating those large intermediates.
        max_week = week_hist.gather(1, indices).unsqueeze(1)

        # Channels: similarity logit, week feature, learned per-article bias -> (bs, 3, n_articles)
        likelihood = self.artic_likelihood[None, None, :].expand(x.shape[0], -1, -1)
        x = torch.cat([x.unsqueeze(1), max_week, likelihood], axis=1)
        x = self.top(x).squeeze(1)

        return x
    
    
# One 512-dimensional embedding per encoded article id; the model is moved to the GPU right away
n_article_codes = len(le.classes_)
model = RecSysModel((n_article_codes, 512)).cuda()

## PyTorch Dataset Class Definition

In [13]:
# Create the PyTorch dataset class
class RecSysDataset(Dataset):
    """Per-customer samples of (article history, week history, target).

    With ``create_dataset=True`` the sample frame is derived from raw
    transactions: one row per customer and target week, holding the articles
    bought in that week (``target``) and the purchases of the preceding
    ``max_week_hist`` weeks. With ``create_dataset=False`` the given frame is
    used as-is and must already carry those columns.
    """

    def __init__(self, main_df, weeks, seq_len, train=True, is_test=False, create_dataset=True):
        # Maximum number of weeks for creating purchase history
        self.max_week_hist = 5

        if create_dataset:
            # Training targets are weeks 1 .. weeks-1; otherwise only week 0 is used
            self.weeks = list(range(1, weeks)) if train else [0]
            weekly_frames = [self.create_dataset(main_df, w) for w in self.weeks]
            samples = pd.concat(weekly_frames).reset_index(drop=True)
        else:
            samples = main_df

        self.df = samples.reset_index(drop=True)
        self.seq_len = seq_len
        self.is_test = is_test

    def create_dataset(self, df, week):
        """Build one row per customer who purchased in ``week``, left-joined with their history."""
        in_history_window = df["week"].gt(week) & df["week"].le(week + self.max_week_hist)
        hist_df = (
            df[in_history_window]
            .groupby("customer_id")
            .agg({"article_id": list, "week": list})
            .reset_index()
            .rename(columns={"week": 'week_history'})
        )

        target_df = (
            df[df["week"] == week]
            .groupby("customer_id")
            .agg({"article_id": list})
            .reset_index()
            .rename(columns={"article_id": "target"})
            .assign(week=week)
        )

        # Customers without any purchase in the history window keep NaN history columns
        return target_df.merge(hist_df, on="customer_id", how="left")

    def __len__(self):
        return len(self.df.index)

    def __getitem__(self, index):
        row = self.df.iloc[index]

        if self.is_test:
            # No labels at inference time; a dummy tensor keeps the tuple layout
            target = torch.zeros(2).float()
        else:
            # Multi-hot vector over all article codes marking the purchased articles
            target = torch.zeros(len(article_ids)).float()
            for purchased in row.target:
                target[purchased] = 1.0

        # Fixed-length, left-padded history: code 0 for articles, 1.0 for weeks
        article_hist = torch.zeros(self.seq_len).long()
        week_hist = torch.ones(self.seq_len).float()

        history = row.article_id
        if isinstance(history, list):
            # Keep at most the last seq_len entries, right-aligned
            kept = min(len(history), self.seq_len)
            article_hist[-kept:] = torch.LongTensor(history[-kept:])
            # Weeks are expressed relative to the target week and scaled by 2 * max_week_hist
            week_hist[-kept:] = (torch.LongTensor(row.week_history[-kept:]) - row.week)/self.max_week_hist/2

        return article_hist, week_hist, target
    
# Smoke check: build the week-0 dataset (train=False) with a history length of 64
# and display its second sample as (article_hist, week_hist, target)
RecSysDataset(main_df, 5, 64, train=False, is_test=False, create_dataset=True)[1]

## Training and Validation

In [None]:
# Model training function
def train(model, train_loader, val_loader, criterion, optimizer, epochs):
    """Train ``model`` with mixed precision and validate after every epoch.

    Args:
        model: network taking a tuple of input tensors and returning logits.
        train_loader: yields batches whose last element is the target and whose
            preceding elements are the model inputs.
        val_loader: loader passed to ``validate`` at the end of each epoch.
        criterion: loss on ``(logits, target)``; ``dice_loss`` is added to it.
        optimizer: optimizer whose learning rate is reset by ``lr_schedule``
            at the start of each epoch.
        epochs: number of passes over ``train_loader``.

    Returns:
        The trained model (same object that was passed in).
    """
    # Seed numpy and torch; torch drives DataLoader shuffling
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    scaler = torch.cuda.amp.GradScaler()

    for e in range(epochs):

        model.train()
        tbar = tqdm(train_loader, file=sys.stdout)
        lr = lr_schedule(optimizer, e)
        losses = []
        # Defined up front so the epoch summary still works if the loader yields no batch
        avg_loss = float("nan")

        for input_data in tbar:

            # Fetch inputs: everything but the last element feeds the model
            inputs = tuple(d.cuda() for d in input_data[:-1])
            target = input_data[-1].cuda()

            optimizer.zero_grad()

            # Forward pass and calculate loss
            with torch.cuda.amp.autocast():
                model_out = model(inputs)
                loss = criterion(model_out, target) + dice_loss(model_out, target)

            # Update model weights (the loss is scaled to avoid fp16 gradient underflow)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            losses.append(loss.detach().cpu().item())
            # Running mean of the loss, reported multiplied by 100
            avg_loss = np.round(100*np.mean(losses), 4)
            tbar.set_description(f"Epoch: {e+1}; lr: {lr}; Loss: {avg_loss}")

        val_map = validate(model, val_loader)
        log_text = f"Epoch {e+1}\nTrain Loss: {avg_loss}\nValidation MAP: {val_map}\n"
        print(log_text)

    return model

# Model validation
def validate(model, val_loader, k=12):
    """Return the mean of ``calc_metric`` over all validation samples.

    Args:
        model: network taking a tuple of input tensors and returning logits.
        val_loader: yields batches whose last element is the multi-hot target.
        k: number of top-scoring articles evaluated per sample.
    """
    model.eval()
    tbar = tqdm(val_loader, file=sys.stdout)
    maps = []

    with torch.no_grad():
        for input_data in tbar:

            inputs = tuple(d.cuda() for d in input_data[:-1])
            target = input_data[-1].cuda()

            model_out = model(inputs)
            _, indices = torch.topk(model_out, k, dim=1)

            indices = indices.detach().cpu().numpy()
            target = target.detach().cpu().numpy()

            for i in range(indices.shape[0]):
                # Forward k so the metric uses the same cutoff as the top-k selection
                maps.append(calc_metric(indices[i], target[i], k=k))

    return np.mean(maps)

### Helper Functions

In [None]:
def dice_loss(y_pred, y_true):
    """Overlap-based loss on logits, averaged over the batch.

    Per sample, with P = sigmoid(y_pred) and I = sum(y_true * P), the score is
    I / (I + sum(y_true) + sum(P)); the loss is one minus the batch mean.
    NOTE(review): this is not the textbook Dice coefficient 2I / (sum(y_true) + sum(P));
    confirm the formula is intended.
    """
    probs = torch.sigmoid(y_pred)
    overlap = (y_true * probs).sum(dim=1)
    denominator = overlap + y_true.sum(dim=1) + probs.sum(dim=1)
    return 1 - (overlap / denominator).mean()

# Learning rate schedule
def lr_schedule(optimizer, epoch):
    """Set the epoch's learning rate on every param group and return it.

    Epoch 0 uses 5e-5, epochs 1-5 use 1e-3, epochs 6-8 use 1e-4 and every
    later epoch uses 1e-5.
    """
    # (exclusive upper epoch bound, learning rate); the first matching row wins
    steps = ((1, 5e-5), (6, 1e-3), (9, 1e-4))
    lr = next((rate for bound, rate in steps if epoch < bound), 1e-5)

    for group in optimizer.param_groups:
        group['lr'] = lr

    return lr

# Adam Optimizer definition
def get_optimizer(model):
    """Return an Adam optimizer over the model's trainable parameters only."""
    trainable = [p for p in model.parameters() if p.requires_grad]
    return torch.optim.Adam(trainable, lr=3e-4, betas=(0.9, 0.999), eps=1e-08)

# Calculate mAP metric on the validation set
def calc_metric(topk_preds, target_array, k=12):
    """Average precision at ``k`` for a single sample.

    Args:
        topk_preds: predicted article codes ordered from best to worst; only
            the first ``k`` are evaluated.
        target_array: multi-hot array indexed by article code (truthy = relevant).
        k: cutoff rank.

    Returns:
        Sum of precision-at-rank over the ranks that hit a relevant article,
        divided by ``min(k, number of relevant articles)``. Returns 0.0 when no
        article is relevant, instead of the NaN produced by dividing 0 by 0.
    """
    n_relevant = target_array.sum()
    if n_relevant == 0:
        return 0.0

    precisions = []
    hits = 0
    for rank, pred in enumerate(topk_preds[:k], start=1):
        if target_array[pred]:
            hits += 1
            # Precision at this rank = hits so far / predictions seen so far
            precisions.append(hits / rank)

    return np.sum(precisions) / min(k, n_relevant)

In [13]:
MODEL_NAME = "exp001"

# Dataloaders for training and validation datasets
# Validation: targets are the purchases of week 0 (the `weeks` argument is ignored when train=False)
val_dataset = RecSysDataset(main_df, 0, SEQ_LEN, train=False, is_test=False)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers,
                          pin_memory=False, drop_last=False)

# Training: targets are weeks 1-4. RecSysDataset derives its sample frame from the raw
# transactions, so it is given main_df (this notebook defines no separate train_df).
train_dataset = RecSysDataset(main_df, weeks=5, seq_len=SEQ_LEN, train=True, is_test=False)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers,
                          pin_memory=False, drop_last=True)

criterion = torch.nn.BCEWithLogitsLoss()
optimizer = get_optimizer(model)
model = train(model, train_loader, val_loader, criterion, optimizer, epochs=10)

## Finetune along with Validation Data

In [14]:
# Fine-tuning samples: target weeks 1-3 (weeks=4 -> range(1, 4)) plus the week-0 validation samples.
# The per-week sample frames are built from main_df; pd.concat replaces DataFrame.append,
# which was removed in pandas 2.0.
finetune_history = RecSysDataset(main_df, weeks=4, seq_len=SEQ_LEN, train=True, is_test=False)
finetune_df = pd.concat([finetune_history.df, val_dataset.df]).reset_index(drop=True)

# The frame is already in sample form, so create_dataset=False uses it as-is
train_dataset = RecSysDataset(finetune_df, 5, SEQ_LEN, train=True, is_test=False, create_dataset=False)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers,
                          pin_memory=False, drop_last=True)
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = get_optimizer(model)

# NOTE: val_loader samples are now part of the training data, so the reported validation MAP is optimistic
model = train(model, train_loader, val_loader, criterion, optimizer, epochs=10)

## Model Inference

In [None]:
# Read the sample submission and discard its prediction column; the remaining columns identify the customers to score
submission_csv = 'sample_submission.csv'
test_df = pd.read_csv(submission_csv).drop(columns="prediction")
print('Test dataframe shape: ', test_df.shape)
test_df.head()

In [16]:
# For inference, gather article purchase history (past 5 weeks) for each customer id on sample_submission.csv
# Use this to make predictions
def create_test_dataset(test_df, transactions=None, max_week_hist=5):
    """Attach each customer's recent purchase history to the submission frame.

    Args:
        test_df: frame with a ``customer_id`` column. It is not modified.
        transactions: encoded transactions with ``customer_id``, ``article_id``
            and ``week`` columns. Defaults to the notebook-level ``main_df``.
        max_week_hist: number of most recent weeks (0 .. max_week_hist - 1)
            collected as history.

    Returns:
        A new frame: ``test_df`` plus ``week`` (constant -1), ``article_id``
        (list of purchased codes) and ``week_history`` (list of weeks).
        Customers without purchases in the window get NaN in both list columns.
    """
    if transactions is None:
        transactions = main_df

    # The week being predicted lies one week after the newest transactions (week 0)
    week = -1
    # assign() returns a copy, so the caller's frame is left untouched
    test_df = test_df.assign(week=week)

    in_window = (transactions["week"] > week) & (transactions["week"] <= week + max_week_hist)
    hist_df = (
        transactions[in_window]
        .groupby("customer_id")
        .agg({"article_id": list, "week": list})
        .reset_index()
        .rename(columns={"week": 'week_history'})
    )

    return test_df.merge(hist_df, on="customer_id", how="left")

# Attach purchase histories to the submission customers.
# NOTE: this rebinds test_df; re-running the cell merges the history columns a second time,
# so reload test_df before running it again.
test_df = create_test_dataset(test_df)
test_df.head()

In [18]:
# test_df is already in sample form (create_dataset=False) and carries no labels (is_test=True)
test_ds = RecSysDataset(
    test_df,
    weeks=0,
    seq_len=SEQ_LEN,
    train=False,
    is_test=True,
    create_dataset=False,
)
# Order is preserved and no batch is dropped so predictions line up with the rows of test_df
test_loader = DataLoader(
    test_ds,
    batch_size=batch_size,
    shuffle=False,
    num_workers=num_workers,
    pin_memory=False,
    drop_last=False,
)

# Inference pipeline
def inference(model, loader, k=12):
    """Predict the top-``k`` articles for every sample in ``loader``.

    Args:
        model: network taking a tuple of input tensors and returning logits.
        loader: yields batches whose last element is a (dummy) target; all
            preceding elements are the model inputs.
        k: number of articles predicted per sample.

    Returns:
        One string per sample, in loader order, holding the ``k`` decoded
        article ids separated by single spaces (best first).
    """
    model.eval()
    tbar = tqdm(loader, file=sys.stdout)

    prediction_list = []

    with torch.no_grad():
        for input_data in tbar:
            # The trailing target is a placeholder at test time and is not needed
            inputs = tuple(d.cuda() for d in input_data[:-1])

            model_out = model(inputs)
            _, indices = torch.topk(model_out, k, dim=1)

            indices = indices.detach().cpu().numpy()

            # Decode the whole batch in one call, then restore the (batch, k) layout
            decoded = le.inverse_transform(indices.ravel()).reshape(indices.shape)
            for row in decoded:
                prediction_list.append(" ".join(row))

    return prediction_list

# Store one space-separated prediction string per row; test_loader is unshuffled, so the order matches test_df
test_df["prediction"] = inference(model, test_loader)

## Save Outputs

In [19]:
# Write only the two submission columns, without the index
submission = test_df[["customer_id", "prediction"]]
submission.to_csv("output.csv", index=False)