# In [None]:
import time
import numpy as np
import matplotlib.pyplot as plt


class Environment:
    """One randomly generated bandit problem played by every agent in ``bandits``.

    The class reads the module-level names ``mean``, ``stdDev``, ``numArms``
    and ``bandits``; they must be defined before an instance is created.
    """

    def __init__(self) -> None:
        """Draw a base reward for each arm and reset every bandit.

        ``rewardArray`` holds one normally distributed base reward per arm
        and ``optimalAction`` is the index of the largest one. It is kept on
        the instance so callers can compare a bandit's choice against it.
        """
        self.rewardArray   = np.random.normal(mean, stdDev, numArms)
        self.optimalAction = np.argmax(self.rewardArray)

        # Every bandit starts each new problem with fresh estimates.
        for bandit in bandits:
            bandit.reset()

    def play(self, play: int) -> None:
        """Let every bandit pick one arm and observe a noisy reward for it.

        Args:
            play: Index of the current play. It is accepted for the caller's
                convenience and is not used by the method.
        """
        for bandit in bandits:
            banditAction = bandit.choose()

            # The observed reward is the arm's base reward plus unit-variance
            # Gaussian noise.
            baseReward     = self.rewardArray[banditAction]
            gaussianReward = np.random.normal(baseReward, scale=1)

            bandit.observe(gaussianReward)



class Bandit:
    """Epsilon-greedy agent that estimates each arm's value by sample averaging.

    The number of arms is read from the module-level ``numArms`` whenever
    ``reset`` or ``choose`` runs, so ``numArms`` must be defined before a
    bandit is constructed.
    """

    def __init__(self, epsilon: float, defaultValue: float = 0.0) -> None:
        """Create a bandit and initialise its estimates.

        Args:
            epsilon: Probability of picking a uniformly random arm instead of
                the arm with the highest current estimate. ``0`` is greedy.
            defaultValue: Initial value estimate given to every arm by
                ``reset``.
        """
        self.epsilon      = epsilon
        # Must be assigned before reset(), which reads it to seed the estimates.
        self.defaultValue = defaultValue
        self.action       = None

        self.reset()

    def __str__(self) -> str:
        """Return a short label: "Greedy" when epsilon is falsy, else the epsilon."""
        if not self.epsilon: return "Greedy"
        return f"Epsilon = {self.epsilon}"

    def reset(self) -> None:
        """Discard everything learned and restore the default estimates."""
        self.rewardSum      = np.zeros(numArms)
        self.actionCount    = np.zeros(numArms)

        self.actionValueArr = np.full(numArms, self.defaultValue, dtype=np.float64)

        # No arm has been chosen yet in the new run.
        self.action         = None

    def choose(self) -> int:
        """Pick an arm, remember it in ``self.action`` and return its index.

        Returns:
            The chosen arm index as a plain ``int``.
        """
        # random() lies in [0, 1), so `>=` explores with probability exactly
        # epsilon and never explores when epsilon is 0.
        if np.random.random() >= self.epsilon:
            # np.argmax resolves ties in favour of the lowest index.
            self.action = int(np.argmax(self.actionValueArr))
        else:
            self.action = int(np.random.choice(numArms))

        return self.action

    def observe(self, reward: float) -> None:
        """Fold ``reward`` into the running average of the last chosen arm.

        Args:
            reward: Reward received for the arm returned by the most recent
                ``choose`` call.

        Raises:
            RuntimeError: If no arm has been chosen since construction or the
                last ``reset``.
        """
        # Indexing a NumPy array with None means np.newaxis, which would
        # silently update every arm instead of failing.
        if self.action is None:
            raise RuntimeError("observe() called before choose()")

        self.actionCount[self.action]   += 1
        self.rewardSum[self.action]     += reward

        rewardAvg = self.rewardSum[self.action] / self.actionCount[self.action]
        self.actionValueArr[self.action] = rewardAvg



if __name__ == "__main__":
    # Begin tracking the execution time. perf_counter() is a monotonic,
    # high-resolution clock, so the elapsed value cannot be skewed by system
    # clock adjustments the way time.time() can.
    startTime = time.perf_counter()

    # Initialize essential variables. These are module-level names that the
    # Environment and Bandit classes read directly, so they must be assigned
    # before any instance is created.
    numArms    = 10
    mean       = 0
    stdDev     = 1
    iterations = 500
    plays      = 1000

    bandits = [
        Bandit(0),                     # Greedy
        Bandit(0.1),                   # Cautious
        Bandit(0.01),                  # Paranoid
    ]

    # Begin main learning loop: each iteration builds a fresh Environment
    # (which also resets every bandit) and runs `plays` plays on it.
    for iteration in range(iterations):
        env = Environment()

        # Progress report every 100 iterations.
        if iteration % 100 == 0:
            print(f"Iteration {iteration}/{iterations}")

        for play in range(plays):
            env.play(play)

    ################################################################
    ## STATS TIME! ##

    print(f"Execution time: {time.perf_counter()-startTime} seconds")
