In [None]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

%run environment.ipynb

import numpy as np
import random
import copy
import time
import pandas
# Fix the seed of Python's `random` module so that the random draws made
# below (exploration test and random action choice) are reproducible.
random.seed(5)

        
class Agent:
    """Tabular Q-learning agent with an epsilon-greedy exploration policy.

    The Q-table maps every state reported by the environment to a NumPy
    array with one entry per action available in that state: entry ``k`` of
    ``qtable[state]`` belongs to ``env.get_possible_actions(state)[k]``.

    The environment object must provide ``get_states()``,
    ``get_possible_actions(state)``, ``get_current_state()``,
    ``do_action(action)`` (returning ``(reward, new_state, done)``) and
    ``reset()``, plus the attributes ``actions``, ``nrows`` and ``ncols``.
    States must be hashable because they are used as dictionary keys.
    """

    def __init__(self, env, gamma=0.9, alpha=0.1, epsilon=0.9, episodes=1):
        """Store the hyper-parameters and build a zero-initialised Q-table.

        Args:
            env: Environment the agent interacts with.
            gamma: Discount factor applied to the best next-state value.
            alpha: Learning rate used to blend the old and new Q-value.
            epsilon: Probability of picking a random action at each step.
            episodes: Number of episodes executed by ``run``.
        """
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.environment = env
        self.qtable = self.__initdic__()  # Q-values, keyed by state
        self.episodes = episodes

    def __initdic__(self):
        """Return a Q-table with a zero vector for every state.

        Each vector has one slot per action available in that state.
        """
        return {
            state: np.zeros(len(self.environment.get_possible_actions(state)))
            for state in self.environment.get_states()
        }

    def _actions_for(self, state=None):
        """Return the action list that Q-table indices refer to.

        With ``state=None`` the environment-wide ``actions`` list is used;
        otherwise the actions available in ``state``.
        """
        if state is None:
            return self.environment.actions
        return self.environment.get_possible_actions(state)

    def run(self):
        """Run ``self.episodes`` training episodes.

        Each step picks an action epsilon-greedily, applies it and updates
        the Q-value of the (state, action) pair. On a terminal transition
        the Q-value is set directly to the received reward.

        Returns:
            tuple: ``(stats_df, duration)`` where ``stats_df`` is a pandas
            DataFrame with one row per episode (episode number, number of
            steps, number of Q-value changes, execution time in ms) and
            ``duration`` is the total wall-clock time in seconds.
        """
        start = time.time()
        stats = []
        for counter in range(self.episodes):
            start_time = time.time()
            done = False
            iterations = 0
            values_updates = 0

            while not done:
                iterations += 1
                current_state = copy.deepcopy(self.environment.get_current_state())
                if random.uniform(0, 1) < self.epsilon:
                    action = self.random_action(current_state)
                else:
                    action = self.max_action(current_state)
                # Resolve the index against the actions of *this* state so it
                # always matches the layout of the Q-table row.
                action_index = self.action_index(action, current_state)
                next_state, reward, done = self.step(action)

                q_row = self.qtable[current_state]
                old_value = q_row[action_index]
                if done:
                    # Terminal transition: there is no next state to
                    # bootstrap from, so the reward is stored as-is.
                    new_value = reward
                else:
                    next_max = np.max(self.qtable[next_state])
                    new_value = (1 - self.alpha) * old_value + self.alpha * (
                        reward + self.gamma * next_max
                    )

                # Count how many times a Q-value actually changed; this is
                # useful for the convergence plots.
                if old_value != new_value:
                    values_updates += 1
                q_row[action_index] = new_value

            execution_time = round((time.time() - start_time) * 1000, 6)
            stats.append([counter, iterations, values_updates, execution_time])

            # Every 30 episodes (including episode 0) reduce epsilon by 10%
            # so the agent gradually explores less.
            if counter % 30 == 0:
                self.epsilon -= self.epsilon / 10
            self.environment.reset()

        stats_df = pandas.DataFrame(
            stats,
            columns=[
                'Episodio',
                'Iteraciones',
                'Cantidad de cambios en los q-valores',
                'Tiempo de ejecución (ms)',
            ],
        )
        duration = time.time() - start
        return stats_df, duration

    def random_action(self, current_state):
        """Return a uniformly random action among those available in ``current_state``."""
        return random.choice(self.environment.get_possible_actions(current_state))

    def step(self, action):
        """Apply ``action`` to the environment.

        Returns:
            tuple: ``(next_state, reward, done)``; ``next_state`` is a deep
            copy of the state returned by the environment.
        """
        reward, new_state, done = self.environment.do_action(action)
        return copy.deepcopy(new_state), reward, done

    def actions_values(self):
        """Extract the greedy policy and the state values from the Q-table.

        Note that this resets the environment before reading the table.
        States are unpacked as ``(row, column)`` pairs to fill the grid.

        Returns:
            tuple: ``(actions, values)`` where ``actions`` maps each state to
            its highest-valued action and ``values`` is an
            ``nrows`` x ``ncols`` nested list holding the maximum Q-value of
            each state (0 for cells that are not states).
        """
        self.environment.reset()
        env = self.environment
        values = [[0 for _ in range(env.ncols)] for _ in range(env.nrows)]
        actions = {}
        for state in env.get_states():
            q_row = self.qtable[state]
            actions[state] = env.get_possible_actions(state)[np.argmax(q_row)]
            row, col = state
            values[row][col] = np.max(q_row)
        return actions, values

    def max_action(self, current_state):
        """Return the action with the highest Q-value in ``current_state``.

        The index is resolved against the actions available in that state,
        which is the list the Q-table row was sized from.
        """
        best_index = np.argmax(self.qtable[current_state])
        return self._actions_for(current_state)[best_index]

    def action_name(self, action_index, state=None):
        """Return the action stored at ``action_index``.

        Args:
            action_index: Position of the action.
            state: When given, the index refers to the actions available in
                that state (the Q-table layout); when omitted, it refers to
                the environment-wide ``actions`` list.
        """
        return self._actions_for(state)[action_index]

    def action_index(self, action, state=None):
        """Return the position of ``action``, or ``-1`` if it is not found.

        Args:
            action: Action to look up.
            state: When given, search the actions available in that state
                (the Q-table layout); when omitted, search the
                environment-wide ``actions`` list.
        """
        for position, candidate in enumerate(self._actions_for(state)):
            if candidate == action:
                return position
        return -1