## **Importing the libraries**

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## **Creating the plots**

In [None]:
def plot_history(history):
  """Plot an agent's reward curve next to a bar chart of its arm usage.

  Args:
    history: mapping with at least these keys
      "cum_rewards" -- sequence drawn as a line against its index (the
        agent in this notebook stores the running average reward here),
      "arms" -- sequence drawn as one bar per entry (per-arm pull counts
        in this notebook).
      Other keys (e.g. "rewards") are ignored.

  Returns:
    None. One figure with two side-by-side panels is created.
  """
  avg_rewards = history["cum_rewards"]
  arm_counts = history["arms"]

  # One row, two columns: reward curve on the left, arm usage on the right.
  _, (ax_reward, ax_arms) = plt.subplots(1, 2, figsize=(30, 8))

  ax_reward.plot(avg_rewards)
  ax_reward.set_title("Running Average Reward")
  ax_reward.set_xlabel("Iteration")
  ax_reward.set_ylabel("Average reward")

  # Bars are positioned at 0..k-1, i.e. at the arm indices.
  ax_arms.bar(range(len(arm_counts)), arm_counts)
  ax_arms.set_title("Chosen Actions")
  ax_arms.set_xlabel("Arm")
  ax_arms.set_ylabel("Times chosen")

## **Creating Environment**

In [None]:
class Env(object):
    """K-armed bandit environment.

    Pulling arm ``i`` returns ``rewards[i]`` with probability
    ``reward_probabilities[i]`` and ``0.0`` otherwise.
    """

    def __init__(self, reward_probabilities, rewards, rng=None):
        """Store the per-arm configuration.

        Args:
            reward_probabilities: per-arm probability of paying out.
            rewards: per-arm payout received when the arm pays out.
            rng: optional random source exposing ``random()`` (for example
                ``np.random.default_rng(seed)``). Defaults to the global
                ``np.random`` module state.

        Raises:
            ValueError: if the two sequences have different lengths.
        """
        # Fail fast: a mismatch would otherwise surface later as an
        # IndexError or as silently ignored probabilities.
        if len(reward_probabilities) != len(rewards):
            raise ValueError(
                "reward_probabilities and rewards must have the same length "
                f"(got {len(reward_probabilities)} and {len(rewards)})"
            )

        self.reward_probabilities = reward_probabilities    # probability of receiving reward from that arm
        self.rewards = rewards                              # acquired reward from that arm
        self.k_arms = len(rewards)                          # number of arms
        self._rng = np.random if rng is None else rng       # source of uniform [0, 1) draws

    def choose_arm(self, arm):
        """Pull ``arm`` once and return its payout, or ``0.0`` on a miss."""
        if self._rng.random() < self.reward_probabilities[arm]:
            return self.rewards[arm]
        return 0.0

## **Instantiating the environment**

In [None]:

# Six arms: arm i pays rewards[i] with probability reward_probabilities[i].
reward_probabilities = [0.01, 1.0, 0.75, 0.99, 0.65, 1.0]
rewards = [95.0, 0.0, 25.5, 10.05, 5.45, 2.50]
environment = Env(reward_probabilities, rewards)

# Echo the configuration that the environment actually stored.
print(f"K_arms \t\t\t: {environment.k_arms}")
print(f"Reward probabilities \t: {environment.reward_probabilities}")
print(f"Rewards \t\t: {environment.rewards}")

## **Pulling a single arm repeatedly**

In [None]:
# Pull arm 2 ten times; each draw is either that arm's reward or 0.0.
[environment.choose_arm(2) for _ in range(10)]

## **Balancing exploration and exploitation with the epsilon-greedy algorithm**

In [None]:
class EpsilonGreedyAgent(object):
    """Epsilon-greedy agent for a k-armed bandit environment.

    On every step the agent explores (picks a uniformly random arm) with
    probability ``epsilon`` and otherwise exploits the arm whose estimated
    value is currently highest.
    """

    def __init__(self, env, max_iterations=200, epsilon=0.1, rng=None):
        """Store the run configuration.

        Args:
            env: environment exposing ``k_arms`` and ``choose_arm(arm)``.
            max_iterations: number of pulls performed by ``act``.
            epsilon: probability of taking a random (exploratory) action.
            rng: optional random source exposing ``random()`` and
                ``choice(n)`` (for example ``np.random.default_rng(seed)``).
                Defaults to the global ``np.random`` module state.
        """
        self.env = env
        self.iterations = max_iterations
        self.epsilon = epsilon
        self._rng = np.random if rng is None else rng

    def act(self):
        """Run the agent for ``self.iterations`` pulls.

        Returns:
            dict with
              "arms": array holding the number of pulls of each arm,
              "rewards": list of the reward received on each step,
              "cum_rewards": list of the running average reward after
                each step (total reward so far / steps so far).
        """
        n_arms = self.env.k_arms
        q_values = np.zeros(n_arms)       # estimated payout of each arm, initially zero
        arm_rewards = np.zeros(n_arms)    # total reward collected from each arm
        arm_counts = np.zeros(n_arms)     # number of times each arm was pulled

        rewards = []          # reward received on every step
        cum_rewards = []      # running average of the rewards
        total_reward = 0.0    # running sum, so the average costs O(1) per step

        for step in range(1, self.iterations + 1):
            if self._rng.random() < self.epsilon:
                # exploration: uniformly random arm
                arm = self._rng.choice(n_arms)
            else:
                # exploitation: argmax breaks ties in favour of the lowest index
                arm = np.argmax(q_values)

            reward = self.env.choose_arm(arm)

            # update the sample-average value estimate of the pulled arm
            arm_rewards[arm] += reward
            arm_counts[arm] += 1
            q_values[arm] = arm_rewards[arm] / arm_counts[arm]

            rewards.append(reward)
            total_reward += reward
            cum_rewards.append(total_reward / step)

        return {"arms": arm_counts, "rewards": rewards, "cum_rewards": cum_rewards}

## **Running the epsilon-greedy agent**

In [None]:
# Build an agent that explores 10% of the time, then let it play 2000 pulls.
egreedy_agent = EpsilonGreedyAgent(environment, max_iterations=2000, epsilon=0.1)
eg_history = egreedy_agent.act()

# Report the sum of every reward collected during the run.
eg_total_reward = sum(eg_history["rewards"])
print(f"TOTAL REWARD : {eg_total_reward}")

## **Plotting the epsilon-greedy results**

In [None]:
# Draw the two-panel summary figure for the epsilon-greedy run.
plot_history(eg_history)