In [None]:
!git clone https://github.com/Synerise/predicting-user-behavior-workshop.git
%cd predicting-user-behavior-workshop

In [1]:
!pip install lightning
!pip install torchmetrics
!pip install cleora_saas_api

Looking in indexes: https://pypi.org/simple, http://artifactory.service/api/pypi/synePy/simple
Looking in indexes: https://pypi.org/simple, http://artifactory.service/api/pypi/synePy/simple


In [28]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torch import nn, optim, Tensor
from torchmetrics import AveragePrecision, AUROC
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger

from emde import calculate_absolute_emde_codes
from cleora_saas_api import CLI


# Constants
First we define all constants that will be used in this notebook. 

In [29]:
# --- Data locations -------------------------------------------------------
DATA_DIR = "./data/"
TRAIN_PD_PATH = os.path.join(DATA_DIR, "2019-Oct_small.csv")
TARGETS_PATH = os.path.join(DATA_DIR, "train_target.npy")
VALIDATION_TARGETS_PATH = os.path.join(DATA_DIR, "test_target.npy")
USER_IDS = os.path.join(DATA_DIR, "user_ids.npy")
CLEORA_INPUT_FILE = os.path.join(DATA_DIR, "cleora_input.tsv")
# Archive read by `load_embeddings` (relative to the working directory).
EMBEDDINGS_NPZ = "embeddings.npz"

# --- Sketch geometry ------------------------------------------------------
# The model input has SKETCH_DEPTH * SKETCH_WIDTH features.
SKETCH_DEPTH = 20
SKETCH_WIDTH = 64

# --- Training -------------------------------------------------------------
BATCH_SIZE = 128
LEARNING_RATE = 0.0001
MAX_EPOCH = 1
ACCELERATOR = "gpu"
DEVICES = 1
NUM_WORKERS = 8
EXPERIMENT_NAME = "experiment_with_brands"

# --- Credentials ----------------------------------------------------------
# Secrets must never be stored in the notebook: export CLEORA_API_TOKEN in the
# environment before starting the kernel. An empty value means "not configured".
# NOTE(review): a token was previously committed here in plain text; treat it as
# leaked and revoke/rotate it.
CLEORA_API_TOKEN = os.environ.get("CLEORA_API_TOKEN", "")



# Calculating embeddings with Cleora
The following function prepares the input for Cleora. First we load the DataFrame with training data. Note that Cleora can work with timestamps as well; however, in our case we drop the timestamps for simplicity. Finally, we save the result as a TSV file, which is the input format required by Cleora.

In [30]:
train_df = pd.read_csv(TRAIN_PD_PATH)
train_df.head(10)

Unnamed: 0,event_time,brand,user_id
0,2019-10-01 00:02:14 UTC,samsung,543272936
1,2019-10-01 00:04:37 UTC,apple,551377651
2,2019-10-01 00:05:14 UTC,xiaomi,550121407
3,2019-10-01 00:06:02 UTC,xiaomi,514591159
4,2019-10-01 00:07:07 UTC,santeri,555332717
5,2019-10-01 00:09:26 UTC,apple,524601178
6,2019-10-01 00:09:33 UTC,apple,524325294
7,2019-10-01 00:09:54 UTC,apple,551377651
8,2019-10-01 00:10:08 UTC,apple,524325294
9,2019-10-01 00:10:56 UTC,oasis,548691404


In [31]:
def prepare_cleora_input_file(train_df_path=None, output_path=None):
    """Write the (user_id, brand) interactions as a tab-separated file for Cleora.

    Only the ``user_id`` and ``brand`` columns are read; every other column of the
    source CSV (timestamps, the unnamed index column) is ignored. The output keeps
    a header row and has the columns in the order ``user_id``, ``brand``.

    Args:
        train_df_path (str, optional): CSV file with the interactions.
            Defaults to ``TRAIN_PD_PATH``.
        output_path (str, optional): destination of the TSV file.
            Defaults to ``CLEORA_INPUT_FILE``.
    """
    # Defaults are resolved at call time so the notebook constants can be changed
    # after this function has been defined.
    if train_df_path is None:
        train_df_path = TRAIN_PD_PATH
    if output_path is None:
        output_path = CLEORA_INPUT_FILE

    columns = ["user_id", "brand"]
    interactions = pd.read_csv(train_df_path, usecols=columns)
    # `usecols` keeps the file's column order, so reorder explicitly before writing.
    interactions[columns].to_csv(output_path, sep="\t", header=True, index=False)

In [32]:
prepare_cleora_input_file()

In [33]:
cleora = CLI()
cleora.login(CLEORA_API_TOKEN)

logged in successfully


In [34]:

# Embedding dimension 256, 3 iterations. Reuse CLEORA_INPUT_FILE so the uploaded
# file is always the one written by prepare_cleora_input_file().
cleora.run(256, 3, input_path=CLEORA_INPUT_FILE, run_name="colab_clustering")

-- Start --


-- Config to trigger run prepared --
-- Run started --
-- Logs: --
Started at 2024-03-21 09:54:42
Downloading input file.
Input file downloaded with 1592254 rows.
Number of rows in original data: 1592254
Number of rows after preprocessing: 1592254
Initializing Cleora.
Iteration 1/3 done
Iteration 2/3 done
Saving results.
-- Result download started --
-- Result download finished --


# Exercise 1

Instead of using 3 iterations with dimension 256, set the number of iterations to 4 and the dimension to 128.

In [35]:
# @title Solution

# Embedding dimension 128, 4 iterations. Reuse CLEORA_INPUT_FILE so the uploaded
# file is always the one written by prepare_cleora_input_file().
cleora.run(128, 4, input_path=CLEORA_INPUT_FILE, run_name="colab_clustering")

-- Start --


-- Config to trigger run prepared --
-- Run started --
-- Logs: --
Started at 2024-03-21 09:55:04
Downloading input file.
Number of rows in original data: 1592254
Number of rows after preprocessing: 1592254
Number of embedded nodes: 2012
Initializing Cleora.
Iteration 1/4 done
Iteration 2/4 done
Iteration 3/4 done
Iteration 4/4 done
Saving results.
-- Result download started --
-- Result download finished --


The following function is used to load embeddings.

In [36]:
def load_embeddings(embeddings_path: str):
    """Load entity ids and their embedding vectors from an ``.npz`` archive.

    Args:
        embeddings_path (str): path to an archive holding the arrays
            ``entity_id`` and ``vectors``.

    Returns:
        tuple: ``(entity_ids, vectors)`` as NumPy arrays, in that order.

    Raises:
        KeyError: if either array is missing from the archive.
    """
    # np.load keeps the archive's file handle open until it is closed; both
    # arrays are fully read inside the `with` block, so closing it is safe.
    with np.load(embeddings_path) as archive:
        return archive["entity_id"], archive["vectors"]

We now explain the output of the cleora.ai app.

In [37]:
brands_ids, embeddings = load_embeddings(embeddings_path=EMBEDDINGS_NPZ)
print(f"embeddings shape: {embeddings.shape}, embeddings dtype: {embeddings.dtype}")
print(f"brands_ids shape: {brands_ids.shape}")

embeddings shape: (2012, 128), embeddings dtype: float32
brands_ids shape: (2012,)


In [38]:
brands_ids[:10]

array(['autoprofi', 'plantronics', 'armani', 'vlk', 'alilo', 'hobot',
       'nitecore', 'garrett', 'veston', 'silva'], dtype='<U28')

Let us find the brand that corresponds to a given index and then print its embedding.

In [39]:
idx = 2
brands_ids[idx]

'armani'

In [40]:
embeddings[idx]

array([ 1.5107129 , -0.61800104,  0.29149207,  1.3095465 , -1.1886827 ,
       -0.8919559 ,  0.8049202 , -0.22758974,  0.8502962 ,  0.34270987,
       -1.1206579 ,  0.25124353, -0.36458215, -1.1378546 ,  0.30196458,
       -1.0979255 ,  0.09139225, -1.6743965 , -0.55483943,  1.1998879 ,
        1.6132466 , -0.98391545,  0.90452343, -1.2046206 ,  0.20919831,
       -0.12920149,  0.35877952, -1.7263443 ,  1.2457936 , -0.06365689,
        1.4215299 ,  0.30695638, -0.21221425, -1.2050424 , -0.07868162,
        0.02196854, -1.5900156 ,  1.0559971 ,  1.2693866 ,  1.0931486 ,
        1.4882509 ,  0.3698465 ,  0.5585682 , -0.7387333 , -0.7295028 ,
        0.25691432, -1.1481808 , -0.9472913 , -0.14610769, -0.23547064,
        0.14512831,  0.77659047, -0.3075655 ,  0.6522249 ,  0.6909754 ,
       -1.2155725 , -0.02416053,  0.93847895,  2.286283  , -1.6497765 ,
        0.20822202, -0.3423906 , -0.9275416 , -1.3755951 ,  1.1066705 ,
       -1.2165293 , -0.80357516, -0.26762888,  0.4001226 , -0.30

# Implementing Dataset class

We explain here some details related to our implementation of Dataset class.

First we investigate the contents of training DataFrame.

In [41]:
train_df = pd.read_csv(TRAIN_PD_PATH)
train_df.head(10)

Unnamed: 0,event_time,brand,user_id
0,2019-10-01 00:02:14 UTC,samsung,543272936
1,2019-10-01 00:04:37 UTC,apple,551377651
2,2019-10-01 00:05:14 UTC,xiaomi,550121407
3,2019-10-01 00:06:02 UTC,xiaomi,514591159
4,2019-10-01 00:07:07 UTC,santeri,555332717
5,2019-10-01 00:09:26 UTC,apple,524601178
6,2019-10-01 00:09:33 UTC,apple,524325294
7,2019-10-01 00:09:54 UTC,apple,551377651
8,2019-10-01 00:10:08 UTC,apple,524325294
9,2019-10-01 00:10:56 UTC,oasis,548691404


We group the training DataFrame by user and aggregate the resulting groups by applying the list constructor. This produces a Series that contains the list of interactions of every user.

In [42]:
brands = train_df.groupby("user_id", group_keys=True)["brand"].apply(list)
brands.head(10)

user_id
264649825        [kiturami, kiturami]
284344819                     [apple]
293957954                    [xiaomi]
303160429                    [garmin]
304325717    [huawei, huawei, huawei]
318611205              [huawei, zeta]
336595257          [samsung, samsung]
340041246        [lg, lg, lg, lg, lg]
348815209                   [samsung]
362327778                     [apple]
Name: brand, dtype: object

Now we are ready to implement our custom dataset class.

In [43]:
class UsersBrandsDataset(Dataset):
    """Map-style dataset yielding one ``(user_sketch, target)`` pair per user.

    The sketch is a flat vector of ``sketch_depth * sketch_width`` floats built from
    the brands a user interacted with: for every interaction the whole sketch is
    multiplied by ``sketch_decay`` and the positions given by the brand's absolute
    codes are incremented by one.
    """

    def __init__(
        self,
        absolute_codes: np.ndarray,
        brands_ids: np.ndarray,
        train_df_path: str,
        targets_path: str,
        user_ids_path: str,
        sketch_width: int,
        sketch_depth: int,
        sketch_decay: float = 0.94,
    ):
        """
        Args:
            absolute_codes (np.ndarray): Array of shape (num_brands, sketch_depth) containing the absolute codes for each item
            brands_ids (np.ndarray): Array of shape (num_brands) mapping each idx to corresponding brand
            train_df_path (str): path to train dataframe (CSV with `user_id` and `brand` columns)
            targets_path (str): path to targets array
            user_ids_path (str): path to the array of user ids; its length defines the dataset size
            sketch_width (int): width of the sketch
            sketch_depth (int): depth of the sketch
            sketch_decay (float): Decay factor for the sketch
        """
        self.absolute_codes = absolute_codes
        self.sketch_depth = sketch_depth
        self.sketch_width = sketch_width
        self.sketch_decay = sketch_decay

        # Reverse lookup: brand name -> row of `absolute_codes`.
        self.brand_to_ids = {brand: idx for idx, brand in enumerate(brands_ids)}

        train_df = pd.read_csv(train_df_path)

        # One list of brands per user. groupby sorts its keys by default, so the
        # rows of this Series are ordered by ascending user_id.
        self.brands = train_df.groupby("user_id")["brand"].apply(list)
        self.users_ids = np.load(user_ids_path)
        self.target_brands = np.load(targets_path)

    def __len__(self):
        return len(self.users_ids)

    def __getitem__(self, idx: int):
        """Return the decayed brand sketch and the target row of the ``idx``-th user.

        Raises:
            IndexError: if ``idx`` is outside the range of users with interactions.
            KeyError: if a user's brand is missing from ``brands_ids``.
        """
        # NOTE(review): interactions are looked up by *position* in the grouped
        # Series, while the length and the targets come from the user-id and target
        # files. This presumably relies on those files following the same ascending
        # user_id order; confirm against how they were generated.
        try:
            user_brands = self.brands.iloc[idx]
        except IndexError as err:
            # Fail with a meaningful error instead of printing and then crashing
            # later on an unbound local variable.
            raise IndexError(
                f"index {idx} is out of range for {len(self.brands)} users with interactions"
            ) from err

        brand_rows = [self.brand_to_ids[brand] for brand in user_brands]
        brands_codes = torch.from_numpy(self.absolute_codes[brand_rows])
        user_sketch = torch.zeros(self.sketch_depth * self.sketch_width, dtype=torch.float32)
        # Interactions are processed in stored order; each later interaction decays
        # the contribution of all earlier ones.
        for brand_codes in brands_codes:
            user_sketch *= self.sketch_decay
            user_sketch[brand_codes] += 1

        target = self.target_brands[idx]
        return user_sketch, target



Since we are using PyTorch Lightning, we need to wrap our dataset in LightningDataModule.

In [44]:
class UserBrandDataModule(pl.LightningDataModule):
    """Lightning data module serving train and validation `UsersBrandsDataset`s.

    Both datasets are built from the same interactions file, user ids and EMDE
    codes; they differ only in the targets file they read.
    """

    def __init__(
        self,
        brands_ids: np.ndarray,
        embeddings: np.ndarray,
        train_df_path: str,
        targets_path: str,
        validation_targets_path: str,
        user_ids_path: str,
        sketch_width: int,
        sketch_depth: int,
        batch_size: int,
        num_workers: int,
    ) -> None:
        """
        Args:
            brands_ids (np.ndarray): brand names, passed on to the datasets
            embeddings (np.ndarray): brand embeddings used to compute the EMDE codes
            train_df_path (str): path to the interactions CSV
            targets_path (str): path to the training targets array
            validation_targets_path (str): path to the validation targets array
            user_ids_path (str): path to the user ids array
            sketch_width (int): width of the sketch
            sketch_depth (int): depth of the sketch
            batch_size (int): batch size of both dataloaders
            num_workers (int): worker processes of both dataloaders
        """
        super().__init__()
        self.brands_ids = brands_ids
        self.embeddings = embeddings
        self.train_df_path = train_df_path
        self.targets_path = targets_path
        self.validation_targets_path = validation_targets_path
        self.user_ids_path = user_ids_path
        self.sketch_depth = sketch_depth
        self.sketch_width = sketch_width
        self.batch_size = batch_size
        self.num_workers = num_workers

    def _build_dataset(self, absolute_codes, targets_path: str) -> Dataset:
        """Create a dataset that differs from its sibling only in the targets file."""
        return UsersBrandsDataset(
            absolute_codes=absolute_codes,
            brands_ids=self.brands_ids,
            train_df_path=self.train_df_path,
            targets_path=targets_path,
            user_ids_path=self.user_ids_path,
            sketch_depth=self.sketch_depth,
            sketch_width=self.sketch_width,
        )

    def setup(self, stage) -> None:
        """Build the datasets needed by ``stage``.

        ``"fit"`` (re)builds both datasets. ``"validate"`` builds the validation
        dataset only when it does not exist yet, so validating after a fit reuses
        the datasets (and EMDE codes) created for training. Other stages are no-ops.
        """
        if stage == "fit":
            absolute_emde_codes = calculate_absolute_emde_codes(self.sketch_depth, self.sketch_width, self.embeddings)
            self.train_data = self._build_dataset(absolute_emde_codes, self.targets_path)
            self.validation_data = self._build_dataset(absolute_emde_codes, self.validation_targets_path)
        elif stage == "validate" and not hasattr(self, "validation_data"):
            absolute_emde_codes = calculate_absolute_emde_codes(self.sketch_depth, self.sketch_width, self.embeddings)
            self.validation_data = self._build_dataset(absolute_emde_codes, self.validation_targets_path)

    def train_dataloader(self) -> DataLoader:
        # NOTE(review): no `shuffle=True`, so samples are served in dataset order
        # every epoch; confirm this is intended.
        return DataLoader(self.train_data, batch_size=self.batch_size, num_workers=self.num_workers)

    def val_dataloader(self) -> DataLoader:
        return DataLoader(
            self.validation_data,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
        )

# Defining simple feedforward Neural Network

Below we implement a simple feedforward neural network with binary cross-entropy loss and multilabel AUROC as the validation score.

In [45]:
class Net(torch.nn.Module):
    """Feedforward network: three Linear -> ReLU -> BatchNorm1d stages and a linear head."""

    def __init__(
        self,
        input_dim: int,
        hidden_size: int,
        output_dim: int,
    ) -> None:
        """
        Args:
            input_dim (int): number of input features
            hidden_size (int): width of every hidden stage
            output_dim (int): number of outputs produced by the final linear layer
        """
        super().__init__()
        # Layers are created in the same order as they are applied, so the module
        # indices inside the Sequential run 0..9.
        layers = []
        in_features = input_dim
        for _ in range(3):
            layers.extend(
                [
                    nn.Linear(in_features, hidden_size),
                    nn.ReLU(),
                    nn.BatchNorm1d(hidden_size),
                ]
            )
            in_features = hidden_size
        layers.append(nn.Linear(hidden_size, output_dim))
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x) -> torch.Tensor:
        """Run ``x`` through the stack and return the raw outputs of the last layer."""
        return self.linear_relu_stack(x)
        
    

In [46]:
class Model(pl.LightningModule):
    """Lightning wrapper around `Net` trained with binary cross-entropy on logits.

    Validation additionally accumulates a multilabel AUROC, logged as ``val_auroc``.
    """

    def __init__(
        self,
        input_dim: int,
        hidden_size: int,
        output_dim: int,
        learning_rate: float,
    ) -> None:
        """
        Args:
            input_dim (int): number of input features of the network
            hidden_size (int): width of the hidden layers
            output_dim (int): number of labels predicted per sample
            learning_rate (float): learning rate handed to AdamW
        """
        super().__init__()
        self.learning_rate = learning_rate
        self.net = Net(input_dim=input_dim, hidden_size=hidden_size, output_dim=output_dim)
        self.val_auroc = AUROC(task="multilabel", num_labels=output_dim)

    def forward(self, x) -> Tensor:
        """Return the network's logits for ``x``."""
        return self.net(x)

    def configure_optimizers(self) -> optim.Optimizer:
        """Optimize all parameters with AdamW at the configured learning rate."""
        return optim.AdamW(self.parameters(), lr=self.learning_rate)

    def _logits_and_loss(self, batch):
        """Unpack a batch and return ``(logits, targets, loss)``."""
        inputs, targets = batch
        logits = self.forward(inputs)
        return logits, targets, F.binary_cross_entropy_with_logits(logits, targets)

    def training_step(self, train_batch, batch_idx) -> Tensor:
        """Compute and log the training loss for one batch."""
        _, _, loss = self._logits_and_loss(train_batch)
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def validation_step(self, val_batch, batch_idx) -> None:
        """Log the validation loss and feed the batch into the AUROC metric."""
        logits, targets, loss = self._logits_and_loss(val_batch)
        # The metric receives integer labels; the loss above used the targets as given.
        self.val_auroc(logits, targets.long())
        self.log("val_loss", loss, prog_bar=True, on_epoch=True, logger=True)

    def on_validation_epoch_end(self) -> None:
        """Log the AUROC accumulated over the validation epoch."""
        self.log("val_auroc", self.val_auroc, prog_bar=True, on_epoch=True, logger=True)

# Training and results

Now we combine all these elements together into a piece of code which trains our model.

First we need to calculate the number of target brands, since this is the output size of our model.

In [47]:
num_target_brands = np.load(TARGETS_PATH).shape[1]

Next we load embeddings and brands_ids.

In [48]:
brands_ids, embeddings = load_embeddings(embeddings_path=EMBEDDINGS_NPZ)

Now we are able to construct data module and model.

In [49]:
# Data module: wires the brand embeddings, the interactions file and both target
# files together using the paths and sizes from the constants cell.
data = UserBrandDataModule(
    brands_ids=brands_ids,
    embeddings=embeddings,
    train_df_path=TRAIN_PD_PATH,
    targets_path=TARGETS_PATH,
    validation_targets_path=VALIDATION_TARGETS_PATH,
    user_ids_path=USER_IDS,
    sketch_width=SKETCH_WIDTH,
    sketch_depth=SKETCH_DEPTH,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
)

# The input size matches the flattened user sketch built by UsersBrandsDataset
# (sketch_depth * sketch_width values); there is one output per target brand.
model = Model(
    input_dim=SKETCH_DEPTH * SKETCH_WIDTH, hidden_size=2048, output_dim=num_target_brands, learning_rate=LEARNING_RATE
)

We also want to print some useful messages concerning training progress, current loss and validation scores. In order to do this, we add a basic logger.

In [50]:
logger = TensorBoardLogger(save_dir="logs", name=f"{EXPERIMENT_NAME}")

Finally we employ PyTorch Lightning Trainer class to wrap all configurations concerning training and validation together. 

In [51]:
# Trainer configured entirely from the constants cell; metrics are written
# through the TensorBoard logger created above.
trainer = pl.Trainer(
    accelerator=ACCELERATOR,
    devices=DEVICES,
    max_epochs=MAX_EPOCH,
    logger=logger,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Now we call the fit method on the trainer, with the model and data as arguments, in order to train and validate our pipeline.

In [52]:
trainer.fit(model, data)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name      | Type            | Params
----------------------------------------------
0 | net       | Net             | 11.1 M
1 | val_auroc | MultilabelAUROC | 0     
----------------------------------------------
11.1 M    Trainable params
0         Non-trainable params
11.1 M    Total params
44.278    Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]



Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.


# Exercise 2
Consider the following code, which adds an additional linear layer on top of the previously defined model.

    class DeepNet(torch.nn.Module):
        def __init__(
            self,
            input_dim: int,
            hidden_size: int,
            output_dim: int,
        ) -> None:
            super().__init__()
            self.linear_relu_stack = nn.Sequential(
                nn.Linear(input_dim, hidden_size),
                nn.ReLU(),
                nn.BatchNorm1d(hidden_size),
                nn.Linear(hidden_size, hidden_size),
                nn.ReLU(),
                nn.BatchNorm1d(hidden_size),
                nn.Linear(hidden_size, hidden_size),
                nn.ReLU(),
                nn.BatchNorm1d(hidden_size),
                nn.Linear(hidden_size, hidden_size),
                nn.ReLU(),
                nn.BatchNorm1d(hidden_size),
            )

        def forward(self, x) -> torch.Tensor:
            return self.linear_relu_stack(x)

Replace Net with DeepNet in the appropriate cell above and try to run the training.

**Q1:** Do you know what went wrong?
**Q2:** Can you fix it?

In [53]:
# @title Solution

# output of the sequential network has incorrect dimension!!!


class DeepNet(torch.nn.Module):
    """Deeper feedforward network: four Linear -> ReLU -> BatchNorm1d stages and a linear head.

    The final linear layer projects from ``hidden_size`` to ``output_dim``, so the
    output has one value per label.
    """

    def __init__(
        self,
        input_dim: int,
        hidden_size: int,
        output_dim: int,
    ) -> None:
        """
        Args:
            input_dim (int): number of input features
            hidden_size (int): width of every hidden stage
            output_dim (int): number of outputs produced by the final linear layer
        """
        super().__init__()
        # Layers are created in the same order as they are applied, so the module
        # indices inside the Sequential run 0..12.
        layers = []
        in_features = input_dim
        for _ in range(4):
            layers.extend(
                [
                    nn.Linear(in_features, hidden_size),
                    nn.ReLU(),
                    nn.BatchNorm1d(hidden_size),
                ]
            )
            in_features = hidden_size
        layers.append(nn.Linear(hidden_size, output_dim))
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x) -> torch.Tensor:
        """Run ``x`` through the stack and return the raw outputs of the last layer."""
        return self.linear_relu_stack(x)