# Contextual Bandit Based on Upper Confidence Bound for E-commerce Platform

This is a contextual bandit algorithm designed for an e-commerce platform with four contexts or channels. There are five models that run in each context. Model selection is based on the Upper Confidence Bound (UCB) algorithm.

Author: Okwudili Ezeme
Date: 2021-10-15


In [None]:
import numpy as np

class ContextualBandit:
    """Simulated e-commerce environment with Bernoulli rewards.

    Each (channel, model) pair has a fixed success probability; pulling a
    model in a channel yields reward 1 with that probability and 0 otherwise.

    Attributes:
        n_channels (int): The number of channels or contexts (rows of
            ``rewards``).
        n_models (int): The number of models available in each channel
            (columns of ``rewards``).
        rewards (numpy.ndarray): 2D array of shape (n_channels, n_models).
            Entry ``[c, m]`` is the probability that ``get_reward(c, m)``
            returns 1.
    """

    def __init__(self, rewards=None):
        """Create the environment.

        Args:
            rewards (array-like, optional): 2D matrix of success
                probabilities in [0, 1], one row per channel and one column
                per model. Defaults to the built-in 4-channel, 5-model
                matrix.

        Raises:
            ValueError: If ``rewards`` is not two-dimensional or contains a
                value outside [0, 1] (NaN included).
        """
        if rewards is None:
            rewards = [
                [0.1, 0.2, 0.3, 0.4, 0.5],
                [0.5, 0.4, 0.3, 0.2, 0.1],
                [0.2, 0.3, 0.5, 0.1, 0.4],
                [0.4, 0.1, 0.2, 0.3, 0.5],
            ]
        rewards = np.asarray(rewards, dtype=float)
        if rewards.ndim != 2:
            raise ValueError(
                "rewards must be a 2D array of shape (n_channels, n_models)"
            )
        # Written as a positive test so that NaN entries are rejected too.
        if not np.all((rewards >= 0.0) & (rewards <= 1.0)):
            raise ValueError("rewards must contain probabilities in [0, 1]")
        self.rewards = rewards
        # Derive the dimensions from the matrix so they can never disagree.
        self.n_channels, self.n_models = rewards.shape

    def get_reward(self, channel, model):
        """Sample a Bernoulli reward for a given channel and model.

        Args:
            channel (int): The channel or context.
            model (int): The model to select.

        Returns:
            int: 1 with probability ``rewards[channel][model]``, otherwise 0.
        """
        # np.random.rand() is uniform on [0, 1), so the comparison succeeds
        # with exactly the configured probability.
        return int(np.random.rand() < self.rewards[channel][model])
    
class UCB:
    """Per-channel Upper Confidence Bound (UCB1) model selector.

    Statistics are kept separately for every channel so that the choice of
    model can depend on the context. Aggregate (channel-independent) totals
    are maintained alongside them.

    Attributes:
        n_models (int): The number of models to select from.
        n_channels (int): The number of channels statistics are kept for.
        model_counts (numpy.ndarray): Number of recorded rewards per model,
            summed over all channels.
        model_rewards (numpy.ndarray): Total recorded reward per model,
            summed over all channels.
        total_counts (int): The total number of recorded rewards.
        channel_model_counts (numpy.ndarray): Array of shape
            (n_channels, n_models) with the number of recorded rewards per
            channel and model.
        channel_model_rewards (numpy.ndarray): Array of shape
            (n_channels, n_models) with the total recorded reward per
            channel and model.
    """

    def __init__(self, n_models, n_channels=4):
        """Initialise empty statistics.

        Args:
            n_models (int): The number of models to select from.
            n_channels (int): The number of channels. Defaults to 4.
        """
        self.n_models = n_models
        self.n_channels = n_channels
        self.model_counts = np.zeros(n_models)
        self.model_rewards = np.zeros(n_models)
        self.total_counts = 0
        self.channel_model_counts = np.zeros((n_channels, n_models))
        self.channel_model_rewards = np.zeros((n_channels, n_models))

    def select_model(self, channel_rewards=None, channel=None):
        """Select the model with the highest UCB1 index.

        This method does not modify any statistics; report the observed
        reward with ``update`` afterwards.

        Args:
            channel_rewards: Ignored. Accepted only so that existing
                positional calls keep working. Reward information must reach
                the learner through ``update`` with an observed reward, not
                through the environment's reward table.
            channel (int, optional): The current channel. When given, the
                statistics of that channel are used; when None, the
                aggregate statistics across all channels are used.

        Returns:
            int: The index of the selected model.
        """
        if channel is None:
            counts = self.model_counts
            reward_sums = self.model_rewards
        else:
            counts = self.channel_model_counts[channel]
            reward_sums = self.channel_model_rewards[channel]

        # Every model must be tried once before its index is defined; the
        # lowest-numbered untried model goes first.
        untried = counts == 0
        if untried.any():
            return int(np.argmax(untried))

        # UCB1 index: empirical mean plus sqrt(2 * ln(t) / n_i), where t is
        # the number of rewards recorded in the statistics being used.
        rounds = counts.sum()
        ucb_values = reward_sums / counts + np.sqrt(2 * np.log(rounds) / counts)
        return int(np.argmax(ucb_values))

    def update(self, channel, model, reward):
        """Record the reward observed after playing ``model`` in ``channel``.

        Args:
            channel (int): The channel in which the model was played.
            model (int): The index of the model that was played.
            reward (float): The observed reward.
        """
        self.total_counts += 1
        self.model_counts[model] += 1
        self.model_rewards[model] += reward
        self.channel_model_counts[channel, model] += 1
        self.channel_model_rewards[channel, model] += reward


def train_bandit(bandit, ucb, epochs):
    """Train the contextual bandit using the UCB algorithm.

    Each epoch draws a channel uniformly at random, lets ``ucb`` choose a
    model for that channel, samples a reward from ``bandit`` and records it
    exactly once.

    Args:
        bandit (ContextualBandit): The contextual bandit to train.
        ucb (UCB): The UCB algorithm to use for model selection. It must
            have been created with at least ``bandit.n_channels`` channels.
        epochs (int): The number of training epochs.

    Returns:
        numpy.ndarray: An array of rewards for each epoch.
    """
    rewards = np.zeros(epochs)
    for i in range(epochs):
        channel = np.random.randint(bandit.n_channels)
        # Only the channel index is passed on: the learner must not see the
        # environment's reward table.
        model = ucb.select_model(channel=channel)
        reward = bandit.get_reward(channel, model)
        ucb.update(channel, model, reward)
        rewards[i] = reward
    return rewards

if __name__ == '__main__':
    # Script entry point: build the environment and a learner sized to its
    # number of models, run the training loop, then report the mean reward
    # over all epochs.
    bandit = ContextualBandit()
    ucb = UCB(bandit.n_models)
    rewards = train_bandit(bandit, ucb, epochs=100_000)
    print('Average reward:', rewards.mean())
