In [2]:
import os
from typing import Optional

import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
import torch.nn.functional as F
from pytorch_lightning.loggers import TensorBoardLogger
from torch import nn, optim, Tensor
from torch.utils.data import DataLoader, Dataset
from torchmetrics import AveragePrecision, AUROC

from cleora_saas_api import CLI
from emde import calculate_absolute_emde_codes


# Environment
First we define all constants that will be used in this notebook. 

In [3]:
# --- Paths ---
# NOTE(review): DATA_DIR is an absolute, machine-specific path and is not referenced by any
# other cell visible in this notebook; the file names below are relative paths used as given.
DATA_DIR = "/data1/lsienkiewicz/workshop"
TRAIN_PD_PATH = "2019-Oct_small.csv"  # CSV of user/brand interactions, read with pd.read_csv
TARGETS_PATH = "train_target.npy"  # loaded with np.load; accessed through the "targets" / "user_ids" keys
VALIDATION_TARGETS_PATH = "test_target.npy"  # targets file used for the validation dataset
USER_IDS = "user_ids.npy"  # array of user ids, loaded with np.load
CLEORA_INPUT_FILE = "cleora_input.tsv"  # TSV written by prepare_cleora_input_file
EMBEDDINGS_NPZ = "embeddings.npz"  # archive read by load_embeddings ("entity_id", "vectors")
# --- Sketch parameters ---
SKETCH_DEPTH = 20  # passed as sketch_depth to the data module
SKETCH_WIDTH = 64  # passed as sketch_width; the model input size is SKETCH_DEPTH * SKETCH_WIDTH
# --- Training parameters ---
BATCH_SIZE = 128  # DataLoader batch size
LEARNING_RATE = 0.0001  # learning rate given to the AdamW optimizer
MAX_EPOCH = 1  # max_epochs of the pl.Trainer
ACCELERATOR = "gpu"  # accelerator of the pl.Trainer
DEVICES = 1  # devices of the pl.Trainer
NUM_WORKERS = 8  # DataLoader num_workers
EXPERIMENT_NAME = "experiment_with_brands"  # name given to the TensorBoardLogger


# Calculating embeddings with Cleora
The following function prepares the input for Cleora. First we load the DataFrame with the training data. Note that Cleora can work with timestamps as well; in our case, however, we drop the timestamps for simplicity. Finally, we save the result as a TSV file, which is the input format required by Cleora.

In [4]:
def prepare_cleora_input_file(input_path=None, output_path=None):
    """Write the (user_id, brand) pairs of the training data as a TSV file for Cleora.

    Only the "user_id" and "brand" columns are kept, in that order. Every other
    column (for example "event_time") is discarded by the column selection, so the
    input file is not required to contain it.

    Args:
        input_path: CSV file to read. Defaults to TRAIN_PD_PATH.
        output_path: TSV file to write. Defaults to CLEORA_INPUT_FILE.

    Raises:
        KeyError: if the input has no "user_id" or "brand" column.
    """
    # Defaults are resolved at call time so the function always sees the current constants.
    if input_path is None:
        input_path = TRAIN_PD_PATH
    if output_path is None:
        output_path = CLEORA_INPUT_FILE

    interactions = pd.read_csv(input_path)[["user_id", "brand"]]
    # Handing pandas the path (instead of an already opened text handle) lets it manage
    # newline handling and close the file itself. The header row is written as before.
    interactions.to_csv(output_path, sep="\t", header=True, index=False)

In [5]:
# Write the Cleora input TSV (CLEORA_INPUT_FILE) from the training CSV (TRAIN_PD_PATH).
prepare_cleora_input_file()

In [6]:
# The Cleora credential is a secret and must not be stored in the notebook, so it is read
# from the environment. `CLI` and `os` are imported in the first cell.
# NOTE(review): the token that used to be pasted into this cell has been exposed and
# should be revoked.
cleora_token = os.environ.get("CLEORA_API_TOKEN")
if not cleora_token:
    raise RuntimeError(
        "Set the CLEORA_API_TOKEN environment variable to your Cleora token before running this cell."
    )

cleora = CLI()
cleora.login(cleora_token)
cleora.show_runs()

logged in successfully
runId = WTJ6z1wL0QqBpC44BVhd 
 	 name = colab_clustering | inputName = cleora_input.tsv 

runId = Wm37T8mkV0bwDCbNt1ea 
 	 name = cleora_input.tsv_D256_I4 | inputName = cleora_input.tsv 

runId = lEMzy2jkWd1SpxUqrts7 
 	 name = colab_clustering | inputName = cleora_input.tsv 

runId = oq5P1ohX0NvP1cRfVre1 
 	 name = colab_clustering | inputName = cleora_input.tsv 

runId = vja1yPJHmgCXkrKI12RL 
 	 name = colab_clustering | inputName = cleora_input.tsv 



In [8]:

# Launch a Cleora run on the TSV file named by CLEORA_INPUT_FILE (same value as the
# literal used before, now kept in sync with the constant).
# NOTE(review): the two positional arguments are presumably the embedding dimension (128)
# and the number of iterations (4) — this matches the "Iteration k/4" log lines and the
# (2012, 128) embeddings shape printed in this notebook; confirm against the cleora_saas_api docs.
cleora.run(128, 4, input_path=CLEORA_INPUT_FILE, run_name="colab_clustering")

-- Start --


-- Config to trigger run prepared --
-- Run started --
-- Logs: --
Started at 2024-03-20 12:22:07
Input file downloaded with 1669365 rows.
Number of rows in original data: 1669365
Number of rows after preprocessing: 1592254
Number of embedded nodes: 2012
Initializing Cleora.
Iteration 1/4 done
Iteration 2/4 done
Iteration 3/4 done
Iteration 4/4 done
Saving results.
-- Result download started --
-- Result download finished --


Next we open cleora.ai, which is a web app that implements the Cleora algorithm. We run the app with the default initial embeddings and an embedding dimension equal to 256. Finally, we download the output embeddings and save them at the path specified by the EMBEDDINGS_NPZ constant. (Note: the embeddings loaded below have dimension 128, as their printed shape shows.)

The following function is used to load embeddings.

In [9]:
def load_embeddings(embeddings_path: str):
    """Load entity ids and their embedding vectors from an ``.npz`` archive.

    Args:
        embeddings_path: path to an archive containing the arrays "entity_id" and "vectors".

    Returns:
        Tuple ``(entity_ids, vectors)`` of NumPy arrays read from the archive.
    """
    # np.load on an .npz archive returns a lazy NpzFile that keeps the file open; the
    # context manager closes it once both arrays have been read into memory.
    with np.load(embeddings_path) as archive:
        return archive["entity_id"], archive["vectors"]

We now explain the output of the cleora.ai app.

In [10]:
# Read the brand names and their embedding vectors, then report the array shapes.
brands_ids, embeddings = load_embeddings(EMBEDDINGS_NPZ)
print(
    f"embeddings shape: {embeddings.shape}, embeddings dtype: {embeddings.dtype}",
    f"brands_ids shape: {brands_ids.shape}",
    sep="\n",
)

embeddings shape: (2012, 128), embeddings dtype: float32
brands_ids shape: (2012,)


In [11]:
# Preview the first ten entity ids (brand names) returned by load_embeddings.
brands_ids[:10]

array(['misty', 'cyberpower', 'maxwell', 'shakira', 'panasonic',
       'samsonite', 'dauscher', 'collistar', 'tsubaki', 'gemei'],
      dtype='<U28')

Let us find the brand that corresponds to some index and then print its embedding.

In [12]:
# Index of an arbitrary brand to inspect; use the variable rather than repeating the literal.
idx = 121
brands_ids[idx]

'attribute'

In [13]:
# Embedding stored at the same index as the brand selected above (idx is set in the previous cell).
embeddings[idx]

array([-2.0500007 , -1.2169027 ,  1.8282223 , -0.01711618, -0.02667308,
        0.49978656,  1.6075191 , -0.6248146 ,  1.0568687 , -0.4983308 ,
       -0.6474467 ,  0.2727086 ,  1.5936657 ,  0.5171974 , -2.072433  ,
       -1.4648963 ,  0.2067378 , -0.60103714,  0.07396353,  0.12636733,
       -0.2800087 ,  0.18029667, -1.2833638 ,  0.49473673, -0.20840473,
       -1.6134431 ,  0.09388947,  1.1613934 , -0.3771942 ,  1.2553425 ,
       -1.3909296 ,  0.4226285 ,  0.41898215, -0.1103825 ,  1.8152021 ,
       -0.23743178, -0.48353943, -1.0445714 , -0.16908994, -1.9817761 ,
       -1.2700906 , -1.1548795 ,  0.82528865,  1.861263  ,  0.48486412,
        0.10013947,  0.2618105 ,  0.8240603 , -0.21977077, -0.26917735,
       -0.77202827, -0.316491  , -2.8853774 ,  1.04969   ,  0.62365454,
        0.36155334,  1.1546487 , -0.8376049 ,  0.5490513 ,  1.2329954 ,
       -0.90968317,  0.67430407, -1.9055952 ,  0.09410735,  0.06974255,
        2.1320164 , -0.30753285, -0.29606095,  1.0193535 , -0.09

# Implementing Dataset class

We explain here some details related to our implementation of Dataset class.

First we investigate the contents of training DataFrame.

In [14]:
# Load the raw interactions and drop every row that has a missing value in any column.
inputs_df = pd.read_csv(TRAIN_PD_PATH).dropna()
inputs_df.head(10)

Unnamed: 0,event_time,brand,user_id
0,2019-10-01 00:02:14 UTC,samsung,543272936
1,2019-10-01 00:04:37 UTC,apple,551377651
2,2019-10-01 00:05:14 UTC,xiaomi,550121407
3,2019-10-01 00:06:02 UTC,xiaomi,514591159
4,2019-10-01 00:07:07 UTC,santeri,555332717
5,2019-10-01 00:09:26 UTC,apple,524601178
6,2019-10-01 00:09:33 UTC,apple,524325294
7,2019-10-01 00:09:54 UTC,apple,551377651
8,2019-10-01 00:10:08 UTC,apple,524325294
9,2019-10-01 00:10:56 UTC,oasis,548691404


We group the training DataFrame by user and aggregate the resulting groups by applying the list constructor. This produces a Series that contains the list of interactions of every user.

In [15]:
# One entry per user_id: the list of brands taken from that user's rows of inputs_df.
brands = inputs_df.groupby("user_id", group_keys=True)["brand"].apply(list)
brands.head(10)

user_id
264649825        [kiturami, kiturami]
284344819                     [apple]
293957954                    [xiaomi]
303160429                    [garmin]
304325717    [huawei, huawei, huawei]
318611205              [huawei, zeta]
336595257          [samsung, samsung]
340041246        [lg, lg, lg, lg, lg]
348815209                   [samsung]
362327778                     [apple]
Name: brand, dtype: object

In [16]:
# Load the saved array of user ids and show the first ten.
user_ids = np.load(USER_IDS)
user_ids[0:10]

array([264649825, 284344819, 293957954, 303160429, 304325717, 318611205,
       336595257, 340041246, 348815209, 362327778])

Now we are ready to implement our custom dataset class.

In [104]:
class UsersBrandsDataset(Dataset):
    """Per-user dataset yielding a decayed brand sketch together with the user's target row.

    Items are indexed by position in the per-user brand Series built from the input
    DataFrame (one entry per ``user_id``).
    """

    def __init__(
        self,
        absolute_codes: np.ndarray,
        brands_ids: np.ndarray,
        inputs_df_path: str,
        targets_path: str,
        user_ids_path: str,
        sketch_width: int,
        sketch_depth: int,
        sketch_decay: float = 0.94,
    ):
        """
        Args:
            absolute_codes (np.ndarray): Array of shape (num_brands, sketch_depth) with the absolute
                codes of each brand; the codes are used as indices into the flat user sketch
            brands_ids (np.ndarray): Array of shape (num_brands) mapping each idx to corresponding brand
            inputs_df_path (str): path to the CSV with "user_id" and "brand" columns
            targets_path (str): path to the targets file providing "targets" and "user_ids"
            user_ids_path (str): accepted as part of the interface; not read by this class
            sketch_width (int): width of the sketch
            sketch_depth (int): depth of the sketch
            sketch_decay (float): factor the sketch is multiplied by before each brand is added
        """
        self.absolute_codes = absolute_codes
        self.sketch_depth = sketch_depth
        self.sketch_width = sketch_width
        self.sketch_decay = sketch_decay

        # brand name -> row of `absolute_codes`
        self.brand_to_ids = {brand: position for position, brand in enumerate(brands_ids)}

        # Series indexed by user_id whose values are the lists of that user's brands.
        interactions = pd.read_csv(inputs_df_path).dropna()
        self.brands = interactions.groupby("user_id")["brand"].apply(list)

        targets = np.load(targets_path)
        self.target_brands = targets["targets"]
        # user id -> row of `self.target_brands`
        target_row_of_user = {user: row for row, user in enumerate(targets["user_ids"])}

        # For every input position, the row of the target array belonging to the same user.
        self.inputs_ids_to_targets_ids = [target_row_of_user[user] for user in self.brands.index.values]

    def __len__(self):
        """Number of users present in the input DataFrame."""
        return len(self.inputs_ids_to_targets_ids)

    def __getitem__(self, idx: int):
        """Return ``(user_sketch, target)`` for the user at position ``idx``.

        ``user_sketch`` is a float32 tensor of length ``sketch_depth * sketch_width``;
        ``target`` is the matching row of the targets array.
        """
        brand_rows = [self.brand_to_ids[name] for name in self.brands.iloc[idx]]
        codes_per_brand = torch.from_numpy(self.absolute_codes[brand_rows])

        sketch = torch.zeros(self.sketch_depth * self.sketch_width, dtype=torch.float32)
        for codes in codes_per_brand:
            # Decay what has been accumulated so far, then count the current brand.
            sketch.mul_(self.sketch_decay)
            sketch[codes] += 1

        return sketch, self.target_brands[self.inputs_ids_to_targets_ids[idx]]



Since we are using PyTorch Lightning, we need to wrap our dataset in LightningDataModule.

In [105]:
class UserBrandDataModule(pl.LightningDataModule):
    """LightningDataModule that builds the training and validation ``UsersBrandsDataset``.

    Both datasets are built from the same input DataFrame and the same EMDE codes;
    they differ only in the targets file they read.
    """

    def __init__(
        self,
        brands_ids: np.ndarray,
        embeddings: np.ndarray,
        inputs_df_path: str,
        targets_path: str,
        validation_targets_path: str,
        sketch_width: int,
        sketch_depth: int,
        batch_size: int,
        num_workers: int,
        user_ids_path: Optional[str] = None,
    ) -> None:
        """
        Args:
            brands_ids: array mapping each embedding row index to its brand.
            embeddings: brand embeddings passed to ``calculate_absolute_emde_codes``.
            inputs_df_path: path to the CSV with user/brand interactions.
            targets_path: path to the targets file of the training dataset.
            validation_targets_path: path to the targets file of the validation dataset.
            sketch_width: width of the sketch.
            sketch_depth: depth of the sketch.
            batch_size: batch size of both data loaders.
            num_workers: number of worker processes of both data loaders.
            user_ids_path: forwarded to ``UsersBrandsDataset``, whose constructor requires
                the argument; optional here so that existing callers keep working.
        """
        super().__init__()
        self.brands_ids = brands_ids
        self.embeddings = embeddings
        self.inputs_df_path = inputs_df_path
        self.targets_path = targets_path
        self.validation_targets_path = validation_targets_path
        self.user_ids_path = user_ids_path
        self.sketch_depth = sketch_depth
        self.sketch_width = sketch_width
        self.batch_size = batch_size
        self.num_workers = num_workers

    def _build_dataset(self, absolute_codes, targets_path: str) -> "UsersBrandsDataset":
        """Create a dataset over the shared inputs for the given targets file."""
        return UsersBrandsDataset(
            absolute_codes=absolute_codes,
            brands_ids=self.brands_ids,
            inputs_df_path=self.inputs_df_path,
            targets_path=targets_path,
            # Previously omitted, which made the dataset constructor raise TypeError.
            user_ids_path=self.user_ids_path,
            sketch_depth=self.sketch_depth,
            sketch_width=self.sketch_width,
        )

    def setup(self, stage) -> None:
        """Compute the EMDE codes once and build both datasets for the "fit" stage."""
        if stage == "fit":
            absolute_emde_codes = calculate_absolute_emde_codes(self.sketch_depth, self.sketch_width, self.embeddings)
            self.train_data = self._build_dataset(absolute_emde_codes, self.targets_path)
            self.validation_data = self._build_dataset(absolute_emde_codes, self.validation_targets_path)

    def train_dataloader(self) -> DataLoader:
        """DataLoader over the training dataset."""
        return DataLoader(self.train_data, batch_size=self.batch_size, num_workers=self.num_workers)

    def val_dataloader(self) -> DataLoader:
        """DataLoader over the validation dataset."""
        return DataLoader(
            self.validation_data,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
        )

# Defining simple feedforward Neural Network

Below we implement a simple feedforward neural network with a binary cross-entropy loss and multilabel AUROC as the validation score.

In [106]:
class Model(pl.LightningModule):
    """Feedforward multilabel classifier trained with binary cross-entropy on logits.

    The network is three ``Linear -> ReLU -> BatchNorm1d`` blocks followed by a
    final ``Linear`` layer that outputs one logit per label.
    """

    def __init__(
        self,
        input_dim: int,
        hidden_size: int,
        output_dim: int,
        learning_rate: float,
    ) -> None:
        """
        Args:
            input_dim: size of the input vector.
            hidden_size: width of every hidden layer.
            output_dim: number of labels (output logits).
            learning_rate: learning rate of the AdamW optimizer.
        """
        super().__init__()
        self.learning_rate = learning_rate

        # Layers are created in the same order as they are applied.
        layers = []
        in_features = input_dim
        for _ in range(3):
            layers.extend([nn.Linear(in_features, hidden_size), nn.ReLU(), nn.BatchNorm1d(hidden_size)])
            in_features = hidden_size
        layers.append(nn.Linear(hidden_size, output_dim))
        self.linear_relu_stack = nn.Sequential(*layers)

        self.val_auroc = AUROC(task="multilabel", num_labels=output_dim)

    def forward(self, x) -> Tensor:
        """Return the logits for a batch of inputs."""
        return self.linear_relu_stack(x)

    def configure_optimizers(self) -> optim.Optimizer:
        """AdamW over all parameters with the configured learning rate."""
        return optim.AdamW(self.parameters(), lr=self.learning_rate)

    def _logits_and_loss(self, batch):
        """Run the network on ``batch`` and return ``(logits, labels, loss)``."""
        inputs, labels = batch
        logits = self.forward(inputs)
        return logits, labels, F.binary_cross_entropy_with_logits(logits, labels)

    def training_step(self, train_batch, batch_idx) -> Tensor:
        """Compute and log the training loss of one batch."""
        _, _, loss = self._logits_and_loss(train_batch)
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def validation_step(self, val_batch, batch_idx) -> None:
        """Log the validation loss and feed the batch to the AUROC metric."""
        logits, labels, loss = self._logits_and_loss(val_batch)
        self.val_auroc(logits, labels.long())
        self.log("val_loss", loss, prog_bar=True, on_epoch=True, logger=True)

    def on_validation_epoch_end(self) -> None:
        """Log the AUROC metric at the end of the validation epoch."""
        self.log("val_auroc", self.val_auroc, prog_bar=True, on_epoch=True, logger=True)
 

# Training and results

Now we combine all these elements together into a piece of code which trains our model.

First we need to calculate the number of target brands, since this is the output size of our model.

In [107]:
# Model output size: second dimension of the "targets" array stored in the training targets file.
num_target_brands = np.load(TARGETS_PATH)["targets"].shape[1]

Next we load embeddings and brands_ids.

In [108]:
# (Re)load the brand names and embedding vectors so this section does not rely on earlier cells.
brands_ids, embeddings = load_embeddings(embeddings_path=EMBEDDINGS_NPZ)

Now we are able to construct data module and model.

In [109]:
# Data module: training and validation datasets share the same inputs file and differ only in targets.
data = UserBrandDataModule(
    brands_ids=brands_ids,
    embeddings=embeddings,
    inputs_df_path=TRAIN_PD_PATH,
    targets_path=TARGETS_PATH,
    validation_targets_path=VALIDATION_TARGETS_PATH,
    sketch_width=SKETCH_WIDTH,
    sketch_depth=SKETCH_DEPTH,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
)

# The input is the flat user sketch (depth * width); hidden_size is the width of every hidden layer.
model = Model(
    input_dim=SKETCH_DEPTH * SKETCH_WIDTH, hidden_size=2048, output_dim=num_target_brands, learning_rate=LEARNING_RATE
)

We also want to print some useful messages concerning training progress, current loss and validation scores. In order to do this, we add a basic logger.

In [110]:
# TensorBoard logger writing under "logs", named after the experiment.
logger = TensorBoardLogger(save_dir="logs", name=EXPERIMENT_NAME)

Finally we employ PyTorch Lightning Trainer class to wrap all configurations concerning training and validation together. 

In [111]:
# Trainer configured entirely from the constants defined at the top of the notebook.
trainer = pl.Trainer(
    accelerator=ACCELERATOR,
    devices=DEVICES,
    max_epochs=MAX_EPOCH,
    logger=logger,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Now we call the fit method on the trainer, with the model and the data as arguments, in order to train and validate our pipeline.

In [112]:
# Train the model; the data module supplies both the training and the validation DataLoader.
trainer.fit(model, data)



LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name              | Type            | Params
------------------------------------------------------
0 | linear_relu_stack | Sequential      | 11.1 M
1 | val_auroc         | MultilabelAUROC | 0     
------------------------------------------------------
11.1 M    Trainable params
0         Non-trainable params
11.1 M    Total params
44.278    Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]



Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.
