# Recommendation systems, models and approaches

### Lecture 2:


1. ALS & PySpark
2. Attention models - STAMP
   1. Real-life examples
3. Transformers
4. Other approaches

# Dot product & ALS

<p align="center">
    <img src="./images/data_sparcity.webp" width=1200>
</p>

<p align="center">
    <img src="./images/Matrix factorization.webp" width=1200>
</p>

<p align="center">
    <img src="./images/pysparkALS.png" width=1200>
</p>

# Session-based - STAMP

- Can't track long-term user interests, because the model only sees the current session
- Works well for online services such as e-commerce sites and news portals, where most users either browse anonymously or have very distinct interests in different sessions

<p align="center">
    <img src="./images/stamp.png" width=1000>
</p>

<p align="center">
    <img src="./images/attention.png" width=1000>
</p>

```python
    def build_model(self) -> None:
        """Build the STAMP neural network graph.

        Creates the input placeholders, the (masked) embedding table, the
        attention layer, the two MLP cells, the score matrices used for
        training and for inference, the loss and the optimisation op.

        Nothing is returned: every result is stored as an attribute on
        ``self`` (``inputs``, ``last_inputs``, ``sequence_length``,
        ``lab_input``, ``embe_dict``, ``alph``, ``softmax_input``,
        ``softmax_input_inference``, ``loss``, ``params``, ``optimize``, ...).

        NOTE(review): the ``tf.Variable`` objects below are created without
        explicit names, so presumably their auto-generated names depend on
        creation order - keep the statement order stable if checkpoints are
        saved and reloaded. Confirm before reordering anything.
        """
        
        # Products in a session.
        # Axis 0 is the batch (see batch_size below); axis 1 is presumably the
        # position inside the session - TODO confirm against the data feeder.
        self.inputs = tf.placeholder(
            tf.int32,
            [None, None],
            name="inputs")

        # Last product in a session (one id per session)
        self.last_inputs = tf.placeholder(
            tf.int32,
            [None],
            name="last_inputs")

        # Dynamic batch size, read from the fed inputs at run time
        batch_size = tf.shape(self.inputs)[0]

        # Length of every session; used below as the divisor-like argument of
        # the mean pooler
        self.sequence_length = tf.placeholder(
            tf.int64,
            [None],
            name='sequence_length')

        # Target product for each session (one product per session); used as
        # the labels of the cross-entropy loss below
        self.lab_input = tf.placeholder(
            tf.int32,
            [None],
            name="lab_input")

        # Embeddings initialised from self.pre_embedding; they are learned
        # during training only when self.emb_up is true
        self.embe_dict = tf.Variable(
            self.pre_embedding,
            dtype=tf.float32,
            trainable=self.emb_up)

        # This is only for filtering out padding (never trained)
        self.pe_mask = tf.Variable(
            self.pre_embedding_mask,
            dtype=tf.float32,
            trainable=False)
        
        # Embeddings without padding.
        # NOTE(review): in TF1 `*=` on a Variable presumably rebinds the
        # attribute to the product tensor instead of updating the variable in
        # place, so from here on self.embe_dict is the masked embeddings -
        # confirm.
        self.embe_dict *= self.pe_mask
        
        # Create a variable for embeddings mask based on business rules
        self.business_rules_embeddings = tf.Variable(
            self.business_rules_embeddings_mask,
            dtype=tf.float32,
            trainable=False)
        
        # In the variable we have either zeros or true embeddings, so
        # we get recommendations scores that are either zero (for products
        # we don't want in recommendations) or true score
        self.business_rules_embeddings *= self.embe_dict
        
        # All-ones float tensor with the same shape as the inputs; handed to
        # the attention layer below (presumably as a position mask in which
        # nothing is masked out - confirm against FwNnAttLayer)
        sent_bitmap = tf.ones_like(tf.cast(self.inputs, tf.float32))

        # Get embedding for products in a session
        # (max_norm=1 clips each looked-up embedding to an L2 norm of at most 1)
        inputs = tf.nn.embedding_lookup(self.embe_dict, self.inputs,max_norm=1)
        
        # Get embeddings for last products
        lastinputs= tf.nn.embedding_lookup(self.embe_dict, self.last_inputs,max_norm=1)

        org_memory = inputs

        # Calculate m_s (average of embeddings see Figure 2 in paper)
        pool_out = pooler(
            org_memory,
            'mean',
            axis=1,
            sequence_length = tf.cast(tf.reshape(self.sequence_length,[batch_size, 1]), tf.float32))
        
        # NOTE(review): pool_out is added to a [-1, self.edim] tensor a few
        # lines below, so self.hidden_size presumably equals self.edim - confirm
        pool_out = tf.reshape(pool_out,[-1,self.hidden_size])

        # Apply attention
        attlayer = FwNnAttLayer(
            self.edim,
            active=self.active,
            stddev=self.stddev,
            norm_type='none')
        
        # The attention output is combined with the session mean (m_s) by
        # addition; the attention weights are kept on self.alph
        attout, alph= attlayer.forward(org_memory,lastinputs,pool_out,sent_bitmap)
        attout = tf.reshape(attout,[-1,self.edim]) + pool_out
        self.alph = tf.reshape(alph,[batch_size,1,-1])

        # MLP Cell A (weights applied to the attention output)
        self.w1 = tf.Variable(
            tf.random_normal([self.edim, self.edim], stddev=self.stddev),
            trainable=True)

        # MLP Cell B (weights applied to the last-product embedding)
        self.w2 = tf.Variable(
            tf.random_normal([self.edim, self.edim], stddev=self.stddev),
            trainable=True)
        
        attout = tf.tanh(tf.matmul(attout,self.w1))
        # attout = tf.nn.dropout(attout, self.output_keep_probs)
        
        lastinputs= tf.tanh(tf.matmul(lastinputs,self.w2))
        # lastinputs= tf.nn.dropout(lastinputs, self.output_keep_probs)
        
        # Part of trilinear: element-wise product of the two MLP outputs
        prod = attout * lastinputs
        
        # Scores of all products in embeddings dictionary.
        # Row 0 of the dictionary is skipped (presumably the padding id), so
        # score column j corresponds to dictionary row j + 1 - TODO confirm the
        # ids fed to lab_input are shifted accordingly.
        sco_mat = tf.matmul(prod,self.embe_dict[1:], transpose_b= True)
        self.softmax_input = sco_mat
        
        # For inference, it's not used for training, but it's saved during training and
        # then loaded for inference
        sco_mat_inference = tf.matmul(prod, self.business_rules_embeddings[1:], transpose_b=True)
        self.softmax_input_inference = sco_mat_inference
        
        # Loss function: cross-entropy between the scores and the target ids.
        # No reduction is applied here, so this is one loss value per session.
        self.loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=sco_mat, labels=self.lab_input)

        # Optimization of the loss function over all trainable variables,
        # delegated to optimize_normal of the parent class
        self.params = tf.trainable_variables()
        self.optimize = super(Seq2SeqAttNN, self).optimize_normal(self.loss, self.params)
```

## STAMP Git repo

https://github.com/uestcnlp/STAMP

# Transformers for recommendations

```bash
# Create a conda environment named "transformers4rec-23.04" with pinned
# package versions, resolving packages from the nvidia, rapidsai, pytorch and
# conda-forge channels.
# Each `# ...` in backticks is a command substitution whose body is only a
# comment, so it expands to nothing; it serves as an inline comment that can
# sit in front of the trailing-backslash line continuations.
conda create -n transformers4rec-23.04 -c nvidia -c rapidsai -c pytorch -c conda-forge \
    transformers4rec=23.04 `# NVIDIA Merlin` \
    nvtabular=23.04 `# NVIDIA Merlin - Used in example notebooks` \
    python=3.10 `# Compatible Python environment` \
    cudf=23.02 `# RAPIDS cuDF - GPU accelerated DataFrame` \
    cudatoolkit=11.8 pytorch-cuda=11.8 `# NVIDIA CUDA version`
```


<p align="center">
    <img src="./images/gru_based.png" width=1000>
</p>

<p align="center">
    <img src="./images/masking.png" width=1000>
</p>

```python
from transformers4rec import torch as tr
from transformers4rec.torch.ranking_metric import NDCGAt, RecallAt

# Use the testing schema that ships with the library. A schema can also be
# read from disk: tr.Schema().from_json(SCHEMA_PATH).
schema: tr.Schema = tr.data.tabular_sequence_testing_data.schema

max_sequence_length = 20
d_model = 64

# Input module: processes the tabular input features described by the schema
# and applies causal masking.
input_module = tr.TabularSequenceFeatures.from_schema(
    schema,
    max_sequence_length=max_sequence_length,
    continuous_projection=d_model,
    aggregation="concat",
    masking="causal",
)

# Transformer configuration following the XLNet architecture.
transformer_config = tr.XLNetConfig.build(
    d_model=d_model,
    n_head=4,
    n_layer=2,
    total_seq_length=max_sequence_length,
)

# Model body: inputs (with masking) -> MLP projection -> transformer block.
# The blocks are created in the same order in which they are chained.
projection = tr.MLPBlock([d_model])
transformer = tr.TransformerBlock(transformer_config, masking=input_module.masking)
body = tr.SequentialBlock(input_module, projection, transformer)

# Top-N evaluation metrics, each computed at cut-offs 20 and 40.
metrics = [
    metric_cls(top_ks=[20, 40], labels_onehot=True)
    for metric_cls in (NDCGAt, RecallAt)
]

# Prediction head with a next-item prediction task on top of the body.
next_item_task = tr.NextItemPredictionTask(weight_tying=True, metrics=metrics)
head = tr.Head(body, next_item_task, inputs=input_module)

# End-to-end model.
model = tr.Model(head)
```

## Examples

https://github.com/NVIDIA-Merlin/Transformers4Rec/tree/main/examples/tutorial