# Epsilon Greedy Contextual Bandit Agent

## Description:
The epsilon greedy algorithm is a popular exploration-exploitation strategy used in reinforcement learning. It is commonly used in the context of multi-armed bandit problems, where an agent needs to decide which action to take in order to maximize its cumulative reward.

## Principle:
The epsilon-greedy algorithm balances exploration and exploitation. At each step the agent chooses between two options: exploration, where it selects a random action to gather more information about the environment, and exploitation, where it selects the action with the highest estimated reward based on its current knowledge. With probability epsilon the agent explores, and with probability 1 - epsilon it exploits, so a larger epsilon means more exploration.


In [None]:
import numpy as np

class ContextualBandit:
    """Simulated bandit environment with 0/1 rewards per (channel, model).

    ``reward_probabilities`` is indexed as ``[channel][model]``; each entry
    is used as the probability that playing that model in that channel pays
    out a reward of 1.
    """

    def __init__(self, num_channels, num_models, reward_probabilities):
        self.reward_probabilities = reward_probabilities
        self.num_models = num_models
        self.num_channels = num_channels

    def get_reward(self, channel, model):
        """Sample a reward of 0 or 1 for playing ``model`` in ``channel``."""
        p_success = self.reward_probabilities[channel][model]
        outcomes = [0, 1]
        weights = [1 - p_success, p_success]
        return np.random.choice(outcomes, p=weights)

    def select_action(self, channel, epsilon):
        """Pick a model for ``channel`` using the true reward probabilities.

        With probability ``epsilon`` a uniformly random model index is
        returned; otherwise the index of the largest entry in this
        channel's row of ``reward_probabilities`` is returned. Note that
        this reads the environment's own probabilities, not learned
        estimates.
        """
        should_explore = np.random.rand() < epsilon
        if not should_explore:
            # Exploit: best model according to the configured probabilities.
            return np.argmax(self.reward_probabilities[channel])
        # Explore: any model, uniformly at random.
        return np.random.randint(self.num_models)

class EpsilonGreedyAgent:
    """Epsilon-greedy learner that keeps a reward estimate per (channel, model).

    Attributes:
        num_channels: Number of contexts (rows of the estimate table).
        num_models: Number of actions available in every channel.
        epsilon: Probability of exploring (choosing a random model) on each
            call to ``choose_action``. ``0`` is purely greedy; values of
            ``1`` or more always explore.
        reward_estimates: ``(num_channels, num_models)`` array holding the
            running mean reward observed for each pair.
        action_counts: ``(num_channels, num_models)`` array holding how many
            times each pair has been updated.
    """

    def __init__(self, num_channels, num_models, epsilon):
        self.num_channels = num_channels
        self.num_models = num_models
        self.epsilon = epsilon
        self.reward_estimates = np.zeros((num_channels, num_models))
        self.action_counts = np.zeros((num_channels, num_models))

    def update_reward_estimate(self, channel, model, reward):
        """Fold ``reward`` into the running mean for ``(channel, model)``."""
        self.action_counts[channel][model] += 1
        # A step size of 1/n makes the estimate the exact sample mean of
        # the rewards seen so far for this pair.
        alpha = 1 / self.action_counts[channel][model]
        error = reward - self.reward_estimates[channel][model]
        self.reward_estimates[channel][model] += alpha * error

    def choose_action(self, channel):
        """Return a model index for ``channel`` using epsilon-greedy selection.

        With probability ``self.epsilon`` a uniformly random model is chosen
        (exploration); otherwise the model with the highest current estimate
        is chosen (exploitation). Ties resolve to the lowest index.
        """
        if np.random.rand() < self.epsilon:
            # Explore: without this branch the agent would never try
            # models whose estimate is still at its initial value.
            return int(np.random.randint(self.num_models))
        # Exploit: best model according to what has been learned so far.
        return int(np.argmax(self.reward_estimates[channel]))

    def train(self, bandit, num_episodes):
        """Run ``num_episodes`` interaction steps against ``bandit``.

        Each step draws a channel uniformly from ``bandit.num_channels``,
        chooses a model, obtains a reward from ``bandit.get_reward`` and
        updates the corresponding estimate.
        """
        for _ in range(num_episodes):
            channel = np.random.randint(bandit.num_channels)
            action = self.choose_action(channel)
            reward = bandit.get_reward(channel, action)
            self.update_reward_estimate(channel, action, reward)
