In [1]:
import torch

# Report whether PyTorch can see the Apple-Silicon Metal (MPS) backend.
print(
    "✅ MPS (Metal) backend is available and enabled."
    if torch.backends.mps.is_available()
    else "❌ MPS not available."
)

✅ MPS (Metal) backend is available and enabled.


In [2]:
# simulation from chapter 1 
# !pip install numpy pandas scikit-learn tensorflow

import numpy as np
import pandas as pd

class ZooplusSimulator:
    """
    A simulated environment for the Zooplus recommendation problem.

    This class manages:
    1. A product catalog with features (category, base popularity).
    2. A set of user personas with distinct preferences.
    3. A stochastic reward function to simulate user clicks (CTR).
    """
    # Fixed category vocabulary. Keeping it at class level lets the one-hot
    # encoding always emit one column per category, even when a small catalog
    # happens not to contain every category.
    CATEGORIES = ('Dog Food', 'Cat Food', 'Dog Toy', 'Cat Toy', 'Fish Supplies')

    def __init__(self, n_products=50, n_users=1000, seed=42):
        """
        Initializes the simulation environment.
        
        Args:
            n_products (int): The total number of products in the catalog.
            n_users (int): The total number of unique users in the simulation.
            seed (int): Random seed for reproducibility.
        """
        self.rng = np.random.default_rng(seed)
        self.n_products = n_products
        self.n_users = n_users
        
        # 1. Create the Product Catalog
        self.products = self._create_product_catalog()
        self.product_features = self._get_product_features()
        
        # 2. Create User Personas and assign each of the n_users to a persona
        self.personas = self._create_user_personas()
        self.user_to_persona_map = self._assign_users_to_personas()

    def _create_product_catalog(self):
        """Creates a pandas DataFrame of products (id, category, base popularity)."""
        product_ids = range(self.n_products)
        categories = list(self.CATEGORIES)
        
        product_data = {
            'product_id': product_ids,
            'category': self.rng.choice(categories, self.n_products),
            # Base popularity score for each product (e.g., from global sales)
            'base_popularity': self.rng.uniform(0.1, 0.5, self.n_products)
        }
        return pd.DataFrame(product_data)

    def _get_product_features(self):
        """
        One-hot encodes product categories to create a feature matrix.

        Returns:
            np.ndarray: A float matrix of shape (n_products, len(CATEGORIES)).
            Columns follow the alphabetical order of the category names, which
            is the order a plain `pd.get_dummies` produces when every category
            is present. Categories that no product uses still get an all-zero
            column, so the feature width never depends on the random draw.
        """
        category_dtype = pd.CategoricalDtype(categories=sorted(self.CATEGORIES))
        categories = self.products['category'].astype(category_dtype)
        return pd.get_dummies(categories, prefix='cat').astype(float).values

    def _create_user_personas(self):
        """Defines a dictionary of user personas and their per-category affinities."""
        return {
            'new_puppy_parent': {'Dog Food': 0.9, 'Dog Toy': 0.8, 'Cat Food': 0.1, 'Cat Toy': 0.1, 'Fish Supplies': 0.05},
            'cat_connoisseur':  {'Dog Food': 0.1, 'Dog Toy': 0.05, 'Cat Food': 0.9, 'Cat Toy': 0.85, 'Fish Supplies': 0.1},
            'budget_shopper':   {'Dog Food': 0.5, 'Dog Toy': 0.4, 'Cat Food': 0.5, 'Cat Toy': 0.4, 'Fish Supplies': 0.3},
            'fish_hobbyist':    {'Dog Food': 0.05, 'Dog Toy': 0.05, 'Cat Food': 0.1, 'Cat Toy': 0.1, 'Fish Supplies': 0.95}
        }
        
    def _assign_users_to_personas(self):
        """Randomly assigns each user ID to one of the defined personas."""
        persona_names = list(self.personas.keys())
        return {user_id: self.rng.choice(persona_names) for user_id in range(self.n_users)}

    def _sigmoid(self, x):
        """Sigmoid function to map a score to a probability."""
        return 1 / (1 + np.exp(-x))

    def get_reward(self, user_id, product_id):
        """
        Simulates a user-item interaction and returns a reward (1 for click, 0 for no-click).
        
        The click probability is a function of:
        - The user's affinity for the product's category.
        - The product's base popularity.
        - Random noise.

        Unknown users and out-of-range product ids (including negative ids)
        yield a reward of 0 without consuming any random numbers.
        """
        is_known_user = user_id in self.user_to_persona_map
        is_valid_product = 0 <= product_id < self.n_products
        if not is_known_user or not is_valid_product:
            return 0 # Invalid user or product
            
        # Get user and product info
        persona_name = self.user_to_persona_map[user_id]
        persona_prefs = self.personas[persona_name]
        
        product_info = self.products.loc[product_id]
        product_category = product_info['category']
        product_popularity = product_info['base_popularity']
        
        # Calculate affinity score
        affinity = persona_prefs.get(product_category, 0.1) # Default affinity for unknown categories
        
        # Combine scores and add noise. We scale the scores to create a reasonable logit.
        logit = 3 * affinity + 1 * product_popularity - 2.5 # The constants are chosen to center the CTR
        
        # Introduce noise: some users might click on things they don't "like"
        logit += self.rng.normal(0, 0.5)
        
        # Convert logit to a probability
        click_prob = self._sigmoid(logit)
        
        # Sample from a Bernoulli distribution to get a stochastic outcome
        reward = self.rng.binomial(1, click_prob)
        
        return reward

    def get_user(self):
        """Returns a random user_id from the population."""
        return self.rng.integers(0, self.n_users)

# --- Example Usage ---
# Build a simulator with a fixed seed so the walkthrough below is repeatable.
sim = ZooplusSimulator(seed=2)

# Draw one user at random and look up which persona drives their behaviour.
user_id = sim.get_user()
persona = sim.user_to_persona_map[user_id]
print(f"Simulating for User ID: {user_id}, who is a '{persona}'")

# Show the same user a handful of products and record each (stochastic) click.
for product_id in (1, 2, 10, 11):
    product_cat = sim.products.at[product_id, 'category']
    reward = sim.get_reward(user_id, product_id)
    print(f"  - Recommended Product {product_id} (Category: {product_cat})... Clicked: {'Yes' if reward == 1 else 'No'}")

Simulating for User ID: 125, who is a 'new_puppy_parent'
  - Recommended Product 1 (Category: Cat Food)... Clicked: No
  - Recommended Product 2 (Category: Dog Food)... Clicked: Yes
  - Recommended Product 10 (Category: Fish Supplies)... Clicked: No
  - Recommended Product 11 (Category: Cat Toy)... Clicked: No


### **Chapter 2: The Adaptive Recommender: Contextual Bandits in Action**

#### **2.1 Introduction: Escaping the Static World with the Explore-Exploit Dilemma**

In the previous chapter, we built a capable, yet fundamentally flawed, batched recommender. Its knowledge is frozen in time, learned from a static dataset. It is like a student who has memorized a textbook but cannot apply that knowledge to new problems or learn from their mistakes. To build a truly intelligent system, we need to move from this passive, offline learning to an active, online paradigm.

This brings us face-to-face with one of the most fundamental trade-offs in decision-making and machine learning: the **explore-exploit dilemma**.

Imagine you are at a new food court with five stalls.
*   **Exploitation** is the safe bet. You try the pizza stall, and it's pretty good. The "exploit" strategy would be to eat pizza every single day. You are guaranteed a decent meal, maximizing your immediate reward based on your current knowledge.
*   **Exploration** is the risky, but potentially more rewarding, path. You could try the mysterious taco stall. It might be terrible (a loss of immediate reward), but it could also be the best food you've ever had, leading to much higher rewards in the long run.

Our batched recommender is a pure exploiter. Once trained, it will always recommend the items it *believes* have the highest CTR, based on its fixed knowledge. It never dares to try the "taco stall"—a new product or a niche item—because its predicted CTR is low or unknown.

To build a better system, we need an algorithm that can intelligently manage this trade-off. This is the domain of **Multi-Armed Bandits**, a class of reinforcement learning algorithms designed specifically for this problem. The name comes from the analogy of a gambler at a row of slot machines (or "one-armed bandits"), trying to figure out which machine to play to maximize their total winnings.

We will take this concept one step further by using a **Contextual Bandit**. A simple multi-armed bandit learns the best "arm" (or product) to pull on average, across all situations. A contextual bandit is far more powerful: it learns the best arm to pull *given the current context*. In our Zooplus scenario, the **context** is the user. The algorithm doesn't just learn "which product is best overall?"; it learns "which product is best for *this specific user* right now?".

This chapter will introduce a classic, elegant, and highly effective contextual bandit algorithm: the **Linear Upper Confidence Bound (LinUCB)** algorithm. We will implement it, pit it against our static batched model in our simulation, and witness firsthand the power of continuous, adaptive learning.

#### **2.2 A Taste of Advanced Techniques: The Linear Upper Confidence Bound (LinUCB) Algorithm**

The LinUCB algorithm, first introduced by Li et al. (2010) for news article recommendation, strikes a beautiful balance between performance, efficiency, and interpretability. It is a perfect entry point into the world of online, reinforcement learning-based recommenders.

**The Core Assumption: A Linear World**

LinUCB makes a simplifying assumption: the expected reward (the true, underlying CTR) of showing a product to a user is a **linear function** of a combined feature vector. (For nonlinear cases, we will investigate NeuralUCB and related methods in subsequent chapters.)

Let's say for a given user-product pair, we can construct a feature vector, `x`. This vector could include:
*   User features (e.g., one-hot encoding of their persona)
*   Product features (e.g., one-hot encoding of the product's category)
*   Interaction features (e.g., the product of user and item embeddings)

The algorithm assumes there exists an unknown coefficient vector, `θ`, such that the expected reward, `E[r]`, is simply their dot product:

`E[r] = x^T θ`

The entire goal of the LinUCB algorithm is to **learn the `θ` vector** for each product as efficiently as possible.

**How LinUCB Balances Exploration and Exploitation**

For each "arm" (i.e., each product in our catalog), LinUCB maintains two key pieces of information:

1.  **A `d x d` matrix `A`**: This matrix stores information about the feature vectors `x` it has seen so far for that arm. It is `I + X^T X`, where `X` is the matrix of all feature vectors observed for that arm and the identity `I` is the starting value (a ridge-regression prior that keeps `A` invertible before any data arrives). The inverse of `A` helps us measure our uncertainty about the arm's true reward.
2.  **A `d x 1` vector `b`**: This vector stores the sum of the feature vectors `x` weighted by the rewards `r` they produced. It's `X^T r`.

At each step, to make a recommendation, LinUCB calculates a score for every possible product using these two pieces of information. The score is composed of two parts:

`Score = (Predicted CTR) + (Uncertainty Bonus)`

1.  **Predicted CTR (Exploitation):** The algorithm first calculates its current best estimate of the coefficient vector, `θ_hat = A⁻¹ b`. The predicted CTR is then simply `x^T θ_hat`. This is the exploitation term—it favors products that have performed well in the past.

2.  **Uncertainty Bonus (Exploration):** The second term is `α * sqrt(x^T A⁻¹ x)`.
    *   `A⁻¹` represents the covariance of our estimate for `θ_hat`. A large value means we are very uncertain about our estimate.
    *   `x^T A⁻¹ x` gives us the variance of the prediction specifically for the feature vector `x`. If we have seen feature vectors similar to `x` many times before, this term will be small. If `x` represents a new, unseen combination of user and product features, this term will be large.
    *   `α` (alpha) is a hyperparameter that you control. It scales how much the algorithm values exploration. A higher `α` makes the algorithm more adventurous.

The algorithm then simply chooses the product with the **highest combined score**.

**The Intuition:**

*   If a product has a high predicted CTR and we are very certain about it (low uncertainty bonus), it gets a high score. **(Pure Exploitation)**
*   If a product has a mediocre predicted CTR but we are very uncertain about it (high uncertainty bonus), it can also get a high score. Choosing this product is an act of **exploration**. By trying it, we get a new data point, which reduces our uncertainty (updating `A` and `b`) and helps us learn its true value for the future.

This elegant combination allows LinUCB to learn efficiently. It focuses its exploration on the parts of the feature space where its knowledge is weakest, leading to rapid convergence.

#### **2.3 Implementing the LinUCB Agent for Zooplus**

Now, let's translate this theory into practice. We will create a Python class for a single LinUCB "arm" (representing one product) and a main agent class that manages all the arms.

For our feature vector `x`, we will do something simple and effective: we will **concatenate the user's embedding with the product's category features**. But where do we get a user embedding for a contextual bandit? We can't use the one from the batched model directly, as it was trained for a different task.

Instead, we will create a new set of user embeddings, one for each *persona*. This is a reasonable simplification for our simulation. In a real system, these could be embeddings learned from user demographics or other side information. The product features will be the one-hot encoded categories we already have in our simulator.

**Code Block 2.1: The LinUCB Implementation**

```python
import numpy as np

class LinUCBArm:
    """Represents a single arm in the LinUCB algorithm.

    Besides the sufficient statistics `A` and `b`, the arm caches `A_inv` and
    `theta_hat`. Both caches are refreshed inside `update`, the only place
    where `A` and `b` change, so scoring does not re-invert `A` on every call.
    Modify `A` / `b` only through `update`, otherwise the caches go stale.
    """
    def __init__(self, arm_index, d, alpha):
        """
        Args:
            arm_index (int): The index of the arm (e.g., product_id).
            d (int): The dimensionality of the feature vector.
            alpha (float): The exploration parameter.
        """
        self.arm_index = arm_index
        self.alpha = alpha
        self.d = d
        
        # Initialize A as a d x d identity matrix.
        # This corresponds to a standard Bayesian linear regression prior.
        self.A = np.identity(d)
        
        # Initialize b as a d x 1 zero vector.
        self.b = np.zeros([d, 1])

        # Cached quantities derived from A and b: inv(I) = I and I @ 0 = 0.
        self.A_inv = np.identity(d)
        self.theta_hat = np.zeros([d, 1])

    def calc_p(self, x):
        """
        Calculates the score for this arm given a feature vector x.
        
        Args:
            x (np.array): A d-dimensional feature vector.
        
        Returns:
            The UCB score for this arm, as a 1 x 1 array:
            `theta_hat^T x + alpha * sqrt(x^T A_inv x)`.
        """
        # Ensure x is a column vector
        x = np.asarray(x).reshape(-1, 1)
        
        # Exploitation term: predicted reward under the current estimate.
        predicted_reward = self.theta_hat.T.dot(x)

        # Exploration term: scaled standard deviation of that prediction.
        uncertainty_bonus = self.alpha * np.sqrt(x.T.dot(self.A_inv).dot(x))
        
        return predicted_reward + uncertainty_bonus

    def update(self, x, reward):
        """
        Updates the A and b matrices for this arm and refreshes the caches.
        
        Args:
            x (np.array): The feature vector for the interaction.
            reward (int): The observed reward (0 or 1).

        Raises:
            ValueError: If x does not contain exactly d values. Without this
                check a length-1 x would broadcast into every entry of A and b.
        """
        x = np.asarray(x, dtype=float).reshape(-1, 1)
        expected_dim = self.A.shape[0]
        if x.shape[0] != expected_dim:
            raise ValueError(
                f"Feature vector has {x.shape[0]} values, expected {expected_dim}."
            )

        self.A += x.dot(x.T)
        self.b += reward * x

        # A changed, so recompute the cached inverse and coefficient estimate.
        self.A_inv = np.linalg.inv(self.A)
        self.theta_hat = self.A_inv.dot(self.b)

class LinUCBAgent:
    """The main agent that manages all the LinUCB arms (one arm per product)."""
    def __init__(self, n_products, user_features, product_features, alpha=1.0, rng=None):
        """
        Args:
            n_products (int): The number of arms (products).
            user_features (dict): A dict mapping persona name to a feature vector.
            product_features (np.array): A matrix of one-hot encoded product categories.
            alpha (float): The exploration parameter.
            rng: Optional random source exposing a NumPy-style `choice` method
                (e.g. `np.random.default_rng(seed)`), used only to break ties
                between equally scored arms. Defaults to the global
                `np.random` module, which matches the historical behaviour.

        Raises:
            ValueError: If `user_features` is empty or `product_features` has
                fewer rows than `n_products`.
        """
        if not user_features:
            raise ValueError("user_features must contain at least one persona.")
        if product_features.shape[0] < n_products:
            raise ValueError(
                f"product_features has {product_features.shape[0]} rows, "
                f"but n_products is {n_products}."
            )

        self.user_features = user_features
        self.product_features = product_features
        self.n_products = n_products
        self._rng = np.random if rng is None else rng
        
        # The dimensionality of our combined feature vector
        user_dim = next(iter(user_features.values())).shape[0]
        d = user_dim + product_features.shape[1]
        
        # Create a list of arms
        self.arms = [LinUCBArm(i, d, alpha) for i in range(n_products)]

    def _create_feature_vector(self, persona, product_id):
        """Creates the concatenated feature vector x (user part first, then product part)."""
        user_feat = self.user_features[persona]
        product_feat = self.product_features[product_id]
        return np.concatenate([user_feat, product_feat])

    def choose_action(self, user_persona):
        """
        Chooses the best product to recommend for the given user persona.
        
        Returns:
            int: The product_id of the chosen action.

        Raises:
            ValueError: If the agent has no arms to choose from.
        """
        if self.n_products == 0:
            raise ValueError("Cannot choose an action: the agent has no arms.")

        # Score every arm. Each arm returns a single-element array; `.item()`
        # turns it into a plain float so `scores` is a flat 1-D vector.
        scores = np.array([
            np.asarray(
                self.arms[product_id].calc_p(
                    self._create_feature_vector(user_persona, product_id)
                )
            ).item()
            for product_id in range(self.n_products)
        ])
            
        # Choose the arm with the highest score (break ties randomly)
        best_arms = np.flatnonzero(scores == scores.max())
        return int(self._rng.choice(best_arms))

    def update(self, chosen_arm, user_persona, reward):
        """Updates the chosen arm with the observed reward for this persona."""
        x = self._create_feature_vector(user_persona, chosen_arm)
        self.arms[chosen_arm].update(x, reward)

# --- Setup for the LinUCB Agent ---

# Create simple, random embeddings for our user personas.
# A dedicated, seeded Generator makes the embeddings identical on every
# "Restart & Run All"; the global np.random state would give new values each run.
persona_embedding_dim = 8
EMBEDDING_SEED = 42
embedding_rng = np.random.default_rng(EMBEDDING_SEED)
user_features_for_bandit = {
    name: embedding_rng.random(persona_embedding_dim)
    for name in sim.personas.keys()
}

# The product features are the one-hot encoded categories from the simulator
product_features_for_bandit = sim.product_features

# Instantiate the agent
linucb_agent = LinUCBAgent(
    n_products=sim.n_products,
    user_features=user_features_for_bandit,
    product_features=product_features_for_bandit,
    alpha=1.5 # Let's be a bit adventurous
)

print("LinUCB Agent created successfully.")
# d = persona embedding width + number of one-hot category columns
d = persona_embedding_dim + product_features_for_bandit.shape[1]
print(f"Feature vector dimensionality (d): {d}")
```

With the agent class defined and instantiated, we are now ready for the main event: a head-to-head competition. We will create a simulation loop that puts our new, adaptive `LinUCBAgent` against the static, pre-trained `MLPRecommender` from Chapter 1. This will allow us to see, step-by-step, how an online learning agent behaves compared to its offline counterpart.

In [3]:
import numpy as np

class LinUCBArm:
    """Represents a single arm in the LinUCB algorithm.

    Besides the sufficient statistics `A` and `b`, the arm caches `A_inv` and
    `theta_hat`. Both caches are refreshed inside `update`, the only place
    where `A` and `b` change, so scoring does not re-invert `A` on every call.
    Modify `A` / `b` only through `update`, otherwise the caches go stale.
    """
    def __init__(self, arm_index, d, alpha):
        """
        Args:
            arm_index (int): The index of the arm (e.g., product_id).
            d (int): The dimensionality of the feature vector.
            alpha (float): The exploration parameter.
        """
        self.arm_index = arm_index
        self.alpha = alpha
        self.d = d
        
        # Initialize A as a d x d identity matrix.
        # This corresponds to a standard Bayesian linear regression prior.
        self.A = np.identity(d)
        
        # Initialize b as a d x 1 zero vector.
        self.b = np.zeros([d, 1])

        # Cached quantities derived from A and b: inv(I) = I and I @ 0 = 0.
        self.A_inv = np.identity(d)
        self.theta_hat = np.zeros([d, 1])

    def calc_p(self, x):
        """
        Calculates the score for this arm given a feature vector x.
        
        Args:
            x (np.array): A d-dimensional feature vector.
        
        Returns:
            The UCB score for this arm, as a 1 x 1 array:
            `theta_hat^T x + alpha * sqrt(x^T A_inv x)`.
        """
        # Ensure x is a column vector
        x = np.asarray(x).reshape(-1, 1)
        
        # Exploitation term: predicted reward under the current estimate.
        predicted_reward = self.theta_hat.T.dot(x)

        # Exploration term: scaled standard deviation of that prediction.
        uncertainty_bonus = self.alpha * np.sqrt(x.T.dot(self.A_inv).dot(x))
        
        return predicted_reward + uncertainty_bonus

    def update(self, x, reward):
        """
        Updates the A and b matrices for this arm and refreshes the caches.
        
        Args:
            x (np.array): The feature vector for the interaction.
            reward (int): The observed reward (0 or 1).

        Raises:
            ValueError: If x does not contain exactly d values. Without this
                check a length-1 x would broadcast into every entry of A and b.
        """
        x = np.asarray(x, dtype=float).reshape(-1, 1)
        expected_dim = self.A.shape[0]
        if x.shape[0] != expected_dim:
            raise ValueError(
                f"Feature vector has {x.shape[0]} values, expected {expected_dim}."
            )

        self.A += x.dot(x.T)
        self.b += reward * x

        # A changed, so recompute the cached inverse and coefficient estimate.
        self.A_inv = np.linalg.inv(self.A)
        self.theta_hat = self.A_inv.dot(self.b)

class LinUCBAgent:
    """The main agent that manages all the LinUCB arms (one arm per product)."""
    def __init__(self, n_products, user_features, product_features, alpha=1.0, rng=None):
        """
        Args:
            n_products (int): The number of arms (products).
            user_features (dict): A dict mapping persona name to a feature vector.
            product_features (np.array): A matrix of one-hot encoded product categories.
            alpha (float): The exploration parameter.
            rng: Optional random source exposing a NumPy-style `choice` method
                (e.g. `np.random.default_rng(seed)`), used only to break ties
                between equally scored arms. Defaults to the global
                `np.random` module, which matches the historical behaviour.

        Raises:
            ValueError: If `user_features` is empty or `product_features` has
                fewer rows than `n_products`.
        """
        if not user_features:
            raise ValueError("user_features must contain at least one persona.")
        if product_features.shape[0] < n_products:
            raise ValueError(
                f"product_features has {product_features.shape[0]} rows, "
                f"but n_products is {n_products}."
            )

        self.user_features = user_features
        self.product_features = product_features
        self.n_products = n_products
        self._rng = np.random if rng is None else rng
        
        # The dimensionality of our combined feature vector
        user_dim = next(iter(user_features.values())).shape[0]
        d = user_dim + product_features.shape[1]
        
        # Create a list of arms
        self.arms = [LinUCBArm(i, d, alpha) for i in range(n_products)]

    def _create_feature_vector(self, persona, product_id):
        """Creates the concatenated feature vector x (user part first, then product part)."""
        user_feat = self.user_features[persona]
        product_feat = self.product_features[product_id]
        return np.concatenate([user_feat, product_feat])

    def choose_action(self, user_persona):
        """
        Chooses the best product to recommend for the given user persona.
        
        Returns:
            int: The product_id of the chosen action.

        Raises:
            ValueError: If the agent has no arms to choose from.
        """
        if self.n_products == 0:
            raise ValueError("Cannot choose an action: the agent has no arms.")

        # Score every arm. Each arm returns a single-element array; `.item()`
        # turns it into a plain float so `scores` is a flat 1-D vector.
        scores = np.array([
            np.asarray(
                self.arms[product_id].calc_p(
                    self._create_feature_vector(user_persona, product_id)
                )
            ).item()
            for product_id in range(self.n_products)
        ])
            
        # Choose the arm with the highest score (break ties randomly)
        best_arms = np.flatnonzero(scores == scores.max())
        return int(self._rng.choice(best_arms))

    def update(self, chosen_arm, user_persona, reward):
        """Updates the chosen arm with the observed reward for this persona."""
        x = self._create_feature_vector(user_persona, chosen_arm)
        self.arms[chosen_arm].update(x, reward)

# --- Setup for the LinUCB Agent ---

# Create simple, random embeddings for our user personas.
# A dedicated, seeded Generator makes the embeddings identical on every
# "Restart & Run All"; the global np.random state would give new values each run.
persona_embedding_dim = 8
EMBEDDING_SEED = 42
embedding_rng = np.random.default_rng(EMBEDDING_SEED)
user_features_for_bandit = {
    name: embedding_rng.random(persona_embedding_dim)
    for name in sim.personas.keys()
}

# The product features are the one-hot encoded categories from the simulator
product_features_for_bandit = sim.product_features

# Instantiate the agent
linucb_agent = LinUCBAgent(
    n_products=sim.n_products,
    user_features=user_features_for_bandit,
    product_features=product_features_for_bandit,
    alpha=1.5 # Let's be a bit adventurous
)

print("LinUCB Agent created successfully.")
# d = persona embedding width + number of one-hot category columns
d = persona_embedding_dim + product_features_for_bandit.shape[1]
print(f"Feature vector dimensionality (d): {d}")

LinUCB Agent created successfully.
Feature vector dimensionality (d): 13


**A Critical Note on Scalability: From Disjoint to Global Models**

As you move from theory to practice, it is natural to ask: how does this scale to a catalog of 10,000 or 1,000,000 products? A naive implementation where each product is an independent "arm" with its own parameters to learn would fail for two reasons: it would be too slow to score every item, and it would require too much data to learn about every single item individually.

The industry-standard solution involves two key ideas:
1.  **Two-Stage Recommendation:** A fast *candidate generation* model first selects a few hundred relevant items from the vast catalog. Then, a sophisticated *ranking* model, like our bandit, scores only this small candidate set.
2.  **Parameter Sharing:** Instead of learning a separate model for each arm (a "disjoint" model), we learn one **single, global model** that is shared across all arms. This allows the model to generalize. Learning that a user likes one brand of puppy food immediately informs the model's predictions for *all* brands of puppy food, because they share common features (e.g., `category=Dog Food`).

We will now implement this more scalable "Global LinUCB" model. Notice that this is actually simpler: we no longer need a separate class for each arm. The agent itself will manage a single set of parameters.

#### **2.4 Implementing a Scalable LinUCB Agent**

Let's build our agent. It will maintain a single `A` matrix and `b` vector. When asked to choose an action, it will score a list of candidate products (for our simulation, this will be the *entire* catalog, but in a real system, it would be a smaller set) and pick the one with the highest UCB score.

**Code Block 2.2: The Scalable Global LinUCB Agent**

```python
import numpy as np

class GlobalLinUCBAgent:
    """A single, global LinUCB model that shares parameters across all arms.

    The agent keeps one `A` matrix and one `b` vector for every (user, product)
    interaction, plus the cached `A_inv` and `theta_hat` derived from them.
    The caches are refreshed after each `update`.
    """
    def __init__(self, d, alpha=1.0):
        """
        Args:
            d (int): The dimensionality of the feature vector.
            alpha (float): The exploration parameter.
        """
        self.d = d
        self.alpha = alpha
        
        # Initialize A and b for the single, global model
        self.A = np.identity(d)
        self.b = np.zeros([d, 1])
        self.theta_hat = np.zeros([d, 1])
        self.A_inv = np.identity(d)

    def _update_theta(self):
        """Internal method to recalculate A_inv and theta_hat after an update."""
        self.A_inv = np.linalg.inv(self.A)
        self.theta_hat = self.A_inv.dot(self.b)

    def choose_action(self, user_features, product_features_matrix):
        """
        Chooses the best product to recommend for the given user.
        
        Args:
            user_features (np.array): The feature vector for the current user.
            product_features_matrix (np.array): A matrix where each row is the 
                                                feature vector for a product.
        
        Returns:
            int: The row index (into `product_features_matrix`) of the chosen
            candidate. This equals the product_id when the matrix holds the
            whole catalog in product_id order. Ties go to the lowest index.

        Raises:
            ValueError: If there are no candidates, the candidate matrix is not
                2-D, or user + product feature widths do not add up to `d`.
        """
        user_features = np.asarray(user_features).ravel()
        product_features_matrix = np.asarray(product_features_matrix)
        if product_features_matrix.ndim != 2:
            raise ValueError("product_features_matrix must be 2-D (one row per candidate).")

        n_candidates = product_features_matrix.shape[0]
        if n_candidates == 0:
            raise ValueError("Cannot choose an action from an empty candidate set.")

        combined_dim = user_features.shape[0] + product_features_matrix.shape[1]
        if combined_dim != self.d:
            raise ValueError(
                f"Combined feature width is {combined_dim}, expected {self.d}."
            )
        
        # Create the full feature matrix for all candidates
        # We tile the user features and concatenate them with the product features
        user_features_tiled = np.tile(user_features, (n_candidates, 1))
        full_feature_matrix = np.concatenate([user_features_tiled, product_features_matrix], axis=1)
        
        # Calculate scores for all candidates in a vectorized way
        predicted_rewards = full_feature_matrix @ self.theta_hat
        
        # Calculate uncertainty bonus for all candidates: row-wise x^T A_inv x.
        # The quadratic form is non-negative in exact arithmetic; clip tiny
        # negative round-off so the square root can never produce NaN.
        quadratic_form = np.sum((full_feature_matrix @ self.A_inv) * full_feature_matrix, axis=1)
        uncertainty = np.sqrt(np.maximum(quadratic_form, 0.0))
        
        scores = predicted_rewards.flatten() + self.alpha * uncertainty
        
        # Choose the arm with the highest score
        return int(np.argmax(scores))

    def update(self, x, reward):
        """
        Updates the global A and b matrices.
        
        Args:
            x (np.array): The feature vector for the chosen user-item interaction.
            reward (int): The observed reward (0 or 1).

        Raises:
            ValueError: If x does not contain exactly `d` values. Without this
                check a length-1 x would broadcast into every entry of A and b.
        """
        x = np.asarray(x, dtype=float).reshape(-1, 1)
        if x.shape[0] != self.d:
            raise ValueError(
                f"Feature vector has {x.shape[0]} values, expected {self.d}."
            )

        self.A += x.dot(x.T)
        self.b += reward * x
        self._update_theta() # Recalculate theta after the update

# --- Helper function to create the feature vector ---
def create_feature_vector(user_persona, product_id, user_features_map, product_features_matrix):
    """
    Builds the feature vector for one (user, product) pair.

    The persona's entry in `user_features_map` comes first, followed by row
    `product_id` of `product_features_matrix`.
    """
    return np.concatenate(
        (user_features_map[user_persona], product_features_matrix[product_id])
    )

# --- Setup for the LinUCB Agent ---
# Persona embeddings come from a dedicated, seeded Generator so they are
# identical on every "Restart & Run All"; the global np.random state would
# give new values each run.
persona_embedding_dim = 8
EMBEDDING_SEED = 42
embedding_rng = np.random.default_rng(EMBEDDING_SEED)
user_features_for_bandit = {
    name: embedding_rng.random(persona_embedding_dim)
    for name in sim.personas.keys()
}
product_features_for_bandit = sim.product_features
# d = persona embedding width + number of one-hot category columns
d = persona_embedding_dim + product_features_for_bandit.shape[1]

# Instantiate the scalable agent
linucb_agent = GlobalLinUCBAgent(d=d, alpha=1.5)

print("Scalable Global LinUCB Agent created successfully.")
print(f"Feature vector dimensionality (d): {d}")
```
This new agent is now ready for the online arena. Notice how much cleaner it is—we manage one set of parameters, and the `choose_action` method is fully vectorized for efficiency. We are now prepared for the head-to-head comparison.