In [1]:
import pandas as pd
from graphviz import Digraph
import numpy as np
from mdp import run as run_mdp
import random

In [2]:
# Toy transition table: one row per (origin state, action) pair. The S0/S1/S2
# columns hold the probability of moving to that state, and every row sums to 1.
# The label "NA" is used for rows that carry no action (see how later cells
# treat Action == "NA").
data = {
    "Origin": ["S0", "S1", "S1", "S2"],
    "Action": ["NA", "a", "b", "NA"],
    "S0": [0.1, 0.2, 0.4, 0.3],
    "S1": [0.5, 0.3, 0.4, 0.3],
    "S2": [0.4, 0.5, 0.2, 0.4]
}

# Build the DataFrame
df = pd.DataFrame(data)

# Show the DataFrame
display(df)


Unnamed: 0,Origin,Action,S0,S1,S2
0,S0,,0.1,0.5,0.4
1,S1,a,0.2,0.3,0.5
2,S1,b,0.4,0.4,0.2
3,S2,,0.3,0.3,0.4


In [3]:
# Build a Graphviz directed graph for the transition table currently in `df`
dot = Digraph(comment='MDP')

# Canvas size and default node / edge styling
dot.attr(size='15,15')
dot.attr('node', shape='circle', width='1', height='1', fontsize='12')
dot.attr('edge', fontsize='10', style='solid')

# One node per origin state
for state in df['Origin'].unique():
    dot.node(state, state)

# Destination columns are read from the table instead of being hard-coded,
# so the same cell works for tables with a different set of states.
dest_columns = [col for col in df.columns if col not in ('Origin', 'Action')]

# One edge per non-zero transition. The label carries the probability and,
# when the row has a real action (anything but "NA"), the action name too.
# (`index` is kept as the loop name: `_` is IPython's last-output variable.)
for index, row in df.iterrows():
    origin = row['Origin']
    for dest in dest_columns:
        if row[dest] > 0:
            label = f"{row['Action']}, {row[dest]}" if row['Action'] != "NA" else f"{row[dest]}"
            dot.edge(origin, dest, label=label)

# Render to mdp_graph_updated.png (view=True also opens the system viewer)
dot.render('mdp_graph_updated', format='png', view=True)



'mdp_graph_updated.png'

In [4]:
def random_walk(df, start_state, steps, action='a'):
    """
    Simulate a random walk over the transition table `df`.

    At each step the row used is the one whose 'Origin' is the current state and
    whose 'Action' equals `action`; if the state has no such row, its "NA"
    (no-action) row is used instead. The next state is sampled from that row.

    Parameters:
    - df (pandas.DataFrame): transition table with 'Origin' and 'Action' columns.
      Only columns whose name starts with 'S' are treated as destination states.
    - start_state (str): state the walk starts from.
    - steps (int): maximum number of transitions to simulate.
    - action (str): action label to follow wherever a state offers it.

    Returns:
    - list: visited states, starting with `start_state`. It is shorter than
      `steps + 1` when a state with no usable row is reached.

    Raises:
    - ValueError: if the selected row has no positive probability mass.
    """
    # Loop-invariant: the destination-state columns never change during the walk.
    state_columns = [col for col in df.columns if col.startswith('S')]

    current_state = start_state
    path = [current_state]

    for _ in range(steps):
        from_current = df[df['Origin'] == current_state]

        # Prefer the requested action, fall back to the no-action row. Exactly one
        # row is used, so the probability vector always matches `state_columns`
        # (flattening several matching rows would make the lengths disagree).
        transitions = from_current[from_current['Action'] == action]
        if transitions.empty:
            transitions = from_current[from_current['Action'] == "NA"]

        # Dead end: nothing to follow from here.
        if transitions.empty:
            print("No valid transitions from state", current_state)
            break

        probabilities = transitions.iloc[0][state_columns].to_numpy(dtype=float)

        # Normalise so the weights sum to 1; an all-zero row cannot be sampled.
        total = probabilities.sum()
        if total <= 0:
            raise ValueError(
                f"Transition row for state {current_state!r} has no positive probability"
            )
        probabilities = probabilities / total

        next_state = np.random.choice(state_columns, p=probabilities)

        path.append(next_state)
        current_state = next_state

    return path

In [5]:

# Run a random walk from state S0 for 2 steps, following action 'a'
random_walk_path = random_walk(df, 'S0', 2, 'a')
random_walk_path

['S0', 'S2', 'S0']

In [6]:
# Available states and actions ("NA" is the no-action label and is never sampled here)
estados = ["S0", "S1", "S2", "S3", "S4", "S5"]
acoes = ["NA", "a", "b", "c"]

# Rows are collected in a list and the DataFrame is built once at the end:
# growing an empty frame with df.loc[len(df)] copies on every insert and can
# leave the probability columns with object dtype.
linhas = []
for estado_origem in estados:
    # Every origin state (S0 included) gets between 1 and 3 distinct real actions
    acoes_disponiveis = np.random.choice(acoes[1:], np.random.randint(1, 4), replace=False).tolist()

    for acao in acoes_disponiveis:
        # Random probability vector over the destination states (sums to 1)
        probabilidades = np.random.dirichlet(np.ones(len(estados)), size=1)[0]
        # Origin state, action, then one probability per destination state
        linha = [estado_origem, acao] + probabilidades.tolist()
        linhas.append(linha)

# Fresh 0..n-1 index, so no reset_index is needed
df = pd.DataFrame(linhas, columns=["Origin", "Action"] + estados)

display(df)

Unnamed: 0,Origin,Action,S0,S1,S2,S3,S4,S5
0,S0,a,0.022969,0.006938,0.599358,0.282348,0.050602,0.037786
1,S0,c,0.209586,0.003893,0.121361,0.291174,0.042065,0.331921
2,S0,b,0.069102,0.085824,0.413404,0.248009,0.017549,0.166112
3,S1,a,0.547057,0.17415,0.007234,0.216552,0.030689,0.024318
4,S1,c,0.041657,0.201209,0.031142,0.094539,0.023744,0.60771
5,S1,b,0.426181,0.060968,0.364982,0.079283,0.036492,0.032094
6,S2,c,0.068906,0.007684,0.143566,0.599829,0.10726,0.072755
7,S3,c,0.047855,0.116564,0.177795,0.020313,0.175611,0.461862
8,S4,b,0.414826,0.005214,0.333109,0.219005,0.000983,0.026863
9,S4,c,0.036069,0.091316,0.199735,0.540051,0.022372,0.110458


In [7]:
# Run the project's MDP loader on correct_ex.mdp and keep the object it returns;
# later cells read its transactions_prob / declared_states / declared_actions.
printer = run_mdp(path = "correct_ex.mdp", return_printer=True)

Initialy declared states: ['S0', 'S1', 'S2']
Initialy declared actions: ['a', 'b', 'c']
Transition from S0 with no action and targets ['S1', 'S2'] with weights [5, 5]
Transition from S1 with action b and targets ['S1', 'S0'] with weights [2, 8]
Transition from S1 with action a and targets ['S2', 'S0', 'S1', 'S3'] with weights [1, 3, 6, 2]
Transition from S2 with action c and targets ['S0', 'S1', 'S3'] with weights [5, 5, 10]
Transition from S2 with action d and targets ['S0', 'S3'] with weights [5, 7]
Transition from S3 with action e and targets ['S1', 'S2'] with weights [2, 2]

( 0 ) - Undeclared state S3 targeted in transition: S1 with action a, declared automaticaly
( 1 ) - Undeclared action in transition: S2 with action d, declared automaticaly
( 2 ) - Undeclared action in transition: S3 with action e, declared automaticaly


In [8]:
# Take the transition table from the printer and list what it declared
df = printer.transactions_prob
print(printer.declared_actions)
print(printer.declared_states)


# Check which container type holds the declared actions
print(type(printer.declared_actions))

display(df)

['a', 'b', 'c', 'd', 'e']
['S0', 'S1', 'S2', 'S3']
<class 'list'>


Unnamed: 0,Origin,Action,S0,S1,S2,S3
0,S0,,0.0,0.5,0.5,0.0
1,S1,b,0.8,0.2,0.0,0.0
2,S1,a,0.25,0.5,0.083333,0.166667
3,S2,c,0.25,0.25,0.0,0.5
4,S2,d,0.416667,0.0,0.0,0.583333
5,S3,e,0.0,0.5,0.5,0.0


Dia 16/02/2024

In [9]:
def gerar_preferencias_acoes(estados, acoes, modo="input"):
    """
    Build, for every state, a list of actions ordered by preference.

    Parameters:
    - estados (iterable): states that need a preference list.
    - acoes (iterable of str): actions that may be ranked.
    - modo (str): "input" asks the user for a comma-separated ranking per state
      (entries that are not in `acoes` are dropped); "random" shuffles a copy of
      `acoes` for each state.

    Returns:
    - dict: state -> list of actions, most preferred first. An empty dict is
      returned (after printing a warning) when `modo` is not recognised.
    """
    ranking_por_estado = {}

    for estado in estados:
        if modo == "random":
            embaralhadas = list(acoes)
            random.shuffle(embaralhadas)
            ranking_por_estado[estado] = embaralhadas
            continue

        if modo != "input":
            print("Modo desconhecido. Por favor, escolha 'input' ou 'random'.")
            return {}

        # Interactive mode: show the options, then read the ranking for this state
        print(f"\nEstado atual: {estado}")
        print("Ações disponíveis: " + ", ".join(acoes))
        resposta = input(f"Digite as ações em ordem de preferência para o estado {estado} (separadas por vírgula): ")
        candidatas = (trecho.strip() for trecho in resposta.split(","))
        ranking_por_estado[estado] = [acao for acao in candidatas if acao in acoes]

    return ranking_por_estado

In [None]:
def simular_random_walk(p, num_transitions):
    """
    Simulate and print a random walk driven by random per-state action preferences.

    Parameters:
    - p: printer object exposing `transactions_prob` (transition DataFrame whose
      first two columns are 'Origin' and 'Action' and whose remaining columns are
      destination states), `declared_states` and `declared_actions`.
    - num_transitions (int): maximum number of transitions to simulate.

    For each step the "NA" row is used when the state's first row has no action;
    otherwise the first preferred action that the state actually offers is used.
    The walk stops early, with a message, when the current state has no outgoing
    row or none of its rows matches a preferred action.

    Returns:
    - None. Progress is reported through print().
    """
    df = p.transactions_prob
    preferencias = gerar_preferencias_acoes(p.declared_states, p.declared_actions, modo="random")
    print("Preferências de ações (modo random):", preferencias, "\n")

    # TODO: change this to take the first state from REMI instead of a random one.
    estado_atual = random.choice(list(p.declared_states))

    caminho = estado_atual  # textual record of the path, starting at the initial state
    probabilidade_acumulada = 1

    print(f"Estado inicial: {estado_atual}")

    # Every column after 'Origin' and 'Action' is a destination state.
    estados_possiveis = df.columns[2:]

    for _ in range(num_transitions):
        df_estado_atual = df[df['Origin'] == estado_atual]
        if df_estado_atual.empty:
            print(f"No outgoing transitions from state {estado_atual}; stopping.")
            break

        # Pick exactly one row for this step; None means nothing applicable.
        linha_escolhida = None
        acao_selecionada = None

        if df_estado_atual.iloc[0]['Action'] == "NA":
            linha_escolhida = df_estado_atual.iloc[0]
            acao_selecionada = "NA"
        else:
            # States missing from the preference dict simply have no preferred action.
            for acao_preferida in preferencias.get(estado_atual, []):
                df_acao_preferida = df_estado_atual[df_estado_atual['Action'] == acao_preferida]
                if not df_acao_preferida.empty:
                    linha_escolhida = df_acao_preferida.iloc[0]
                    acao_selecionada = acao_preferida
                    break

        # Without this guard the previous step's probabilities would be reused
        # (or be undefined on the first step).
        if linha_escolhida is None:
            print(f"No preferred action available in state {estado_atual}; stopping.")
            break

        probabilidades = linha_escolhida.iloc[2:].astype(float).values
        probabilidades = probabilidades / np.sum(probabilidades)
        proximo_estado = np.random.choice(estados_possiveis, p=probabilidades)
        probabilidade_escolhida = probabilidades[np.where(estados_possiveis == proximo_estado)[0][0]]

        probabilidade_acumulada *= probabilidade_escolhida
        estado_passado = estado_atual
        estado_atual = proximo_estado
        caminho += f" -> {estado_atual}"  # extend the recorded path

        print(f"Origin: {estado_passado}, Action: {acao_selecionada}, Destiny: {estado_atual}, Prob of next step: {probabilidade_escolhida:.3f}, Prob path up to here: {probabilidade_acumulada:.5f}, Path: {caminho}," + "\n")

    print(f"Complete Path: {caminho}")

In [None]:
# Simulate (and print) a 5-transition walk over the model held by `printer`
simular_random_walk(printer, num_transitions = 5)

Preferências de ações (modo random): {'S2': ['c', 'd', 'a', 'b'], 'S1': ['d', 'b', 'a', 'c'], 'S0': ['c', 'd', 'b', 'a']} 

Estado inicial: S1
Origin: S1, Action: b, Destiny: S0, Prob of next step: 0.800, Prob path up to here: 0.80000, Path: S1 -> S0,

Origin: S0, Action: NA, Destiny: S2, Prob of next step: 0.500, Prob path up to here: 0.40000, Path: S1 -> S0 -> S2,

Origin: S2, Action: c, Destiny: S0, Prob of next step: 0.500, Prob path up to here: 0.20000, Path: S1 -> S0 -> S2 -> S0,

Origin: S0, Action: NA, Destiny: S2, Prob of next step: 0.500, Prob path up to here: 0.10000, Path: S1 -> S0 -> S2 -> S0 -> S2,

Origin: S2, Action: c, Destiny: S0, Prob of next step: 0.500, Prob path up to here: 0.05000, Path: S1 -> S0 -> S2 -> S0 -> S2 -> S0,

Complete Path: S1 -> S0 -> S2 -> S0 -> S2 -> S0


In [None]:
# Show the transition table the walk above was sampled from
display(printer.transactions_prob)

Unnamed: 0,Origin,Action,S2,S1,S0
0,S0,,0.5,0.5,0.0
1,S1,b,0.0,0.2,0.8
2,S1,a,0.1,0.6,0.3
3,S2,c,0.0,0.5,0.5
4,S2,d,0.0,0.0,1.0


## Modificações Rods 10/03/2024

In [4]:
# Load the simu-mc example; the two print_* flags are switched off here
# (presumably to shorten the loader's console output — confirm in mdp.run).
printer = run_mdp(path = "prof_examples//simu-mc.mdp", return_printer=True, print_transactions=False, print_states=False)

Initialy declared states: ['I', 'T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6']
Initialy declared actions: ['a']
Transition from I with no action and targets ['T1', 'T2'] with weights [1, 1]
Transition from T1 with no action and targets ['T3', 'T4'] with weights [1, 1]
Transition from T2 with no action and targets ['T5', 'T6'] with weights [1, 1]
Transition from T3 with no action and targets ['S1', 'T1'] with weights [1, 1]
Transition from T4 with no action and targets ['S2', 'S3'] with weights [1, 1]
Transition from T5 with no action and targets ['S4', 'S5'] with weights [1, 1]
Transition from T6 with no action and targets ['S6', 'T2'] with weights [1, 1]
Transition from S1 with no action and targets ['S1'] with weights [1]
Transition from S2 with no action and targets ['S2'] with weights [1]
Transition from S3 with no action and targets ['S3'] with weights [1]
Transition from S4 with no action and targets ['S4'] with weights [1]
Transition from S5 with no acti

In [5]:
# Work on the transition table of the model loaded above
df = printer.transactions_prob
display(df)

Unnamed: 0,Origin,Action,I,T1,T2,T3,T4,T5,T6,S1,S2,S3,S4,S5,S6
0,I,,0.0,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,T1,,0.0,0.0,0.0,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,T2,,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0
3,T3,,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0
4,T4,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.0,0.0,0.0
5,T5,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.0
6,T6,,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5
7,S1,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
8,S2,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
9,S3,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [14]:
# Direct check for S_sure: the probability the target state gives to staying in
# itself (a value of 1 means it is reached and kept with certainty)
target_state = "S1"
df.loc[df['Origin'] == target_state, target_state]


0     0.0
1     0.0
2     0.0
3     0.5
4     0.0
5     0.0
6     0.0
7     1.0
8     0.0
9     0.0
10    0.0
11    0.0
12    0.0
Name: S1, dtype: float64

In [34]:
def find_states(df, target_state):
    """
    Classify the states of a Markov chain by how they relate to a target state.

    The table is walked backwards from `target_state`:
    - S_sure: the target itself plus every state linked to it through a chain of
      probability-1 transitions.
    - S_may: every other state that has some path of positive probability to the
      target (directly or through other states).
    - S_never: states with no path to the target.

    Parameters:
    - df (pandas.DataFrame): transition table with an 'Origin' column and one
      column per destination state. One row per origin is expected; if an origin
      appears several times only its first row is considered.
    - target_state (str): the state of interest to trace back from.

    Returns:
    - tuple of lists: (S_sure, S_may, S_never), each ordered by first appearance
      of the state in df['Origin'].

    Raises:
    - KeyError: if `target_state` is not a column of `df`.
    """
    if target_state not in df.columns:
        raise KeyError(f"target state {target_state!r} is not a column of the transition table")

    # Unique origin states in table order, and one transition row per origin.
    states = list(dict.fromkeys(df['Origin'].tolist()))
    first_rows = df.drop_duplicates(subset='Origin').set_index('Origin')

    S_sure = {target_state}
    S_may = set()
    S_never = set(states)  # everything is "never" until a path is found
    S_never.discard(target_state)

    visited = set()
    to_visit = [target_state]
    while to_visit:
        current_state = to_visit.pop()
        if current_state in visited:
            continue
        visited.add(current_state)

        # A state that is never a destination has no predecessors to inspect.
        if current_state not in first_rows.columns:
            continue

        for origin in states:
            if origin == target_state:
                continue  # the target is "sure" by definition

            prob = float(first_rows.at[origin, current_state])
            if prob <= 0:
                continue

            S_never.discard(origin)
            # Certainty only propagates through states that are themselves sure;
            # a probability-1 step into a merely "may" state is still "may".
            if np.isclose(prob, 1.0) and current_state in S_sure:
                S_sure.add(origin)
            elif origin not in S_sure:
                S_may.add(origin)

            # Keep walking backwards from every predecessor, sure or not, so
            # that indirect paths are found as well.
            if origin not in visited:
                to_visit.append(origin)

    ordering = states if target_state in states else [target_state] + states
    return (
        [state for state in ordering if state in S_sure],
        [state for state in ordering if state in S_may],
        [state for state in ordering if state in S_never],
    )

In [35]:
# Executar a função com o estado alvo 'S1'
S_sure, S_may, S_never = find_states(df, 'S1')
S_sure, sorted(S_may), sorted(S_never)

(['S1'],
 ['I', 'T1', 'T3'],
 ['S2', 'S3', 'S4', 'S5', 'S6', 'T2', 'T4', 'T5', 'T6'])

In [36]:
# Keep only the rows whose origin is an S_may state: A shows their transitions
# among the S_may states, b their transitions into the S_sure states.
may_rows = df['Origin'].isin(S_may)
A = df.loc[may_rows, ['Origin'] + S_may]
b = df.loc[may_rows, ['Origin'] + S_sure]

display(A, b)

Unnamed: 0,Origin,I,T3,T1
0,I,0.0,0.0,0.5
1,T1,0.0,0.5,0.0
3,T3,0.0,0.0,0.5


Unnamed: 0,Origin,S1
0,I,0.0
1,T1,0.0
3,T3,0.5


In [38]:
def solve_system(df, S_may, S_sure):
    """
    Solves the system y = Ay + b for a given set of states.

    A holds the transition probabilities among the S_may states and b the total
    probability of jumping from each S_may state straight into S_sure. Rows and
    columns of A, and the entries of b and y, all follow the order of `S_may`,
    so the result is correct whatever order the caller lists the states in.

    Parameters:
    - df (pandas.DataFrame): transition table with an 'Origin' column and one
      column per destination state; each S_may state must appear exactly once
      in 'Origin'.
    - S_may (list): states that may eventually lead to the target.
    - S_sure (list): states from which the target is reached with certainty.

    Returns:
    - A (numpy.ndarray): |S_may| x |S_may| transition matrix.
    - b (numpy.ndarray): probability of moving from each S_may state into S_sure.
    - y (numpy.ndarray): solution of (I - A) y = b, one entry per S_may state.

    Raises:
    - KeyError: if a state of S_may is not present in df['Origin'].
    - ValueError: if a state of S_may has more than one row in df.
    - np.linalg.LinAlgError: If the system of equations cannot be solved due to matrix singularity.
    """
    # Ensure S_may and S_sure are lists
    if not isinstance(S_may, list):
        S_may = list(S_may)
    if not isinstance(S_sure, list):
        S_sure = list(S_sure)

    # Select rows by label, in S_may order. A boolean isin() filter would keep
    # the table's row order and silently disagree with the column order.
    may_rows = df.set_index('Origin').loc[S_may]
    if len(may_rows) != len(S_may):
        raise ValueError("each state in S_may must have exactly one row in df")

    # Matrix A: transitions inside S_may
    A = may_rows[S_may].to_numpy(dtype=float)

    # Vector b: one-step probability of entering S_sure
    b = may_rows[S_sure].to_numpy(dtype=float).sum(axis=1)

    # Solve the system y = Ay + b, i.e. (I - A) y = b
    identity = np.eye(len(S_may))
    try:
        y = np.linalg.solve(identity - A, b)
    except np.linalg.LinAlgError as e:
        raise np.linalg.LinAlgError(f"Error solving the system: {e}") from e

    return A, b, y


In [39]:
S_may = ['I', 'T1', 'T3']  # Example: states that may reach the target
S_sure = ['S1']  # Example: the target state

# Build and solve y = Ay + b for these states
A, b, y = solve_system(df, S_may, S_sure)

print("Matriz A:\n", A)
print("Vetor b:\n", b)
print("Solução y:\n", y)

Matriz A:
 [[0.  0.5 0. ]
 [0.  0.  0.5]
 [0.  0.5 0. ]]
Vetor b:
 [0.  0.  0.5]
Solução y:
 [0.16666667 0.33333333 0.66666667]


In [32]:
# Sanity check against the teacher's worked example: solve (I - A) y = b

A = np.array([
    [0, 2/3, 0],
    [0, 0, 1],
    [0, 0, 0],
])
b = np.array([0, 0, 0.5])
y = np.linalg.solve(np.identity(A.shape[0]) - A, b)
print(y)

[0.33333333 0.5        0.5       ]


In [67]:
def prob_with_max_transitions(df, target_state, max_transitions, S_may=None):
    """
    Calculates the probability of reaching the target state from every state within a maximum number of transitions.

    The target is made absorbing before the transition matrix is raised to the
    power `max_transitions`. Without that step the matrix power gives the
    probability of being at the target after exactly that many steps, which
    under-counts walks that reach the target early and then leave it.

    Parameters:
    - df (pandas.DataFrame): transition table with an 'Origin' column (one row per state) and one column per state.
    - target_state (str): The target state for which the probabilities are calculated.
    - max_transitions (int): The maximum number of transitions considered (>= 0).
    - S_may (list, optional): if given, only these states are kept in the result, in this order.

    Returns:
    - pd.DataFrame: indexed by state, with a single column "prob" holding the probability of reaching
      the target state within `max_transitions` transitions.

    Raises:
    - ValueError: if `max_transitions` is negative.
    - IndexError: if `target_state` is not one of the origin states.
    """
    if max_transitions < 0:
        raise ValueError("max_transitions must be non-negative")

    # Square transition matrix over the origin states (copied: it is edited below
    # and the caller's DataFrame must stay untouched).
    states = df['Origin'].unique()
    transition_matrix = df.set_index('Origin')[states].to_numpy(dtype=float, copy=True)

    matches = np.flatnonzero(states == target_state)
    if matches.size == 0:
        raise IndexError(f"target state {target_state!r} is not an origin state of df")
    target_index = matches[0]

    # Make the target absorbing so that "reached at some earlier step" is kept.
    transition_matrix[target_index, :] = 0.0
    transition_matrix[target_index, target_index] = 1.0

    transition_matrix_power = np.linalg.matrix_power(transition_matrix, max_transitions)

    # Column of the target: probability mass sitting in the target after n steps
    transition_probabilities = transition_matrix_power[:, target_index]

    probabilities = pd.DataFrame(transition_probabilities, index=states)
    if S_may is not None:
        probabilities = probabilities.loc[S_may]

    return probabilities.rename(columns={0: "prob"})


In [68]:
# Usage example
# df is your DataFrame
# "S1" is the target state
# 3 is the maximum number of transitions
# S_may restricts the displayed rows to those states
# (despite its name, the result is a one-column DataFrame, not a Series)

probabilities_series = prob_with_max_transitions(df, "S1", 3, S_may=S_may)
display(probabilities_series)

Unnamed: 0,prob
I,0.125
T1,0.25
T3,0.625


In [85]:
from scipy.optimize import linprog

# Objective (to be minimised): one coefficient per decision variable, the
# probability of reaching W attached to that action (S0->W and S2->W)
c = [0.5, 0, 1.0, 0]

# Inequality constraints in A_ub @ x <= b_ub form. Each row encodes
# "x_i + x_j >= 1" (at least one action chosen for the state) multiplied by -1.
A = [
    [-1, -1, 0, 0],  # at least one action selected for S0
    [0, 0, -1, -1],  # at least one action selected for S2
]
b = [-1, -1]

# Every decision variable is bounded to the interval [0, 1]
x_bounds = [(0, 1)] * len(c)

# Solve the linear program with the HiGHS backend
result = linprog(c, A_ub=A, b_ub=b, bounds=x_bounds, method='highs')

print("Resultado da Otimização:", result)


Resultado da Otimização:         message: Optimization terminated successfully. (HiGHS Status 7: Optimal)
        success: True
         status: 0
            fun: 0.0
              x: [ 0.000e+00  1.000e+00  0.000e+00  1.000e+00]
            nit: 0
          lower:  residual: [ 0.000e+00  1.000e+00  0.000e+00  1.000e+00]
                 marginals: [ 5.000e-01  0.000e+00  1.000e+00  0.000e+00]
          upper:  residual: [ 1.000e+00  0.000e+00  1.000e+00  0.000e+00]
                 marginals: [ 0.000e+00  0.000e+00  0.000e+00  0.000e+00]
          eqlin:  residual: []
                 marginals: []
        ineqlin:  residual: [ 0.000e+00  0.000e+00]
                 marginals: [-0.000e+00 -0.000e+00]
 mip_node_count: 0
 mip_dual_bound: 0.0
        mip_gap: 0.0


### Notas de curso do 13/03/24

Criamos um pequeno intervalo de indiferença: se $\gamma$ está entre $\gamma_1$ e $\gamma_0$ (nessa ordem), tudo bem. Se não, verificamos se $\gamma$ está abaixo ou acima do intervalo onde ocorre a indiferença.


In [8]:
# Load the course example and take its transition table
printer = run_mdp(path = "prof_examples//ex_cours.mdp", return_printer=True, print_transactions=False, print_states=False)
df = printer.transactions_prob

# Add a 'Reward' column with example values, one per table row
# (this writes into the same object the printer holds)
df["Reward"] = [0, 0, 5, 100, 500, 3]

display(df)

Initialy declared states: ['S0', 'S1', 'S2', 'S3', 'S4']
Initialy declared actions: ['a', 'b']
Transition from S0 with action a and targets ['S1', 'S2'] with weights [5, 5]
Transition from S0 with action b and targets ['S3', 'S4'] with weights [1, 9]
Transition from S1 with no action and targets ['S0'] with weights [1]
Transition from S2 with no action and targets ['S0'] with weights [1]
Transition from S3 with no action and targets ['S0'] with weights [1]
Transition from S4 with no action and targets ['S0'] with weights [1]


Unnamed: 0,Origin,Action,S0,S1,S2,S3,S4,Reward
0,S0,a,0.0,0.5,0.5,0.0,0.0,0
1,S0,b,0.0,0.0,0.0,0.1,0.9,0
2,S1,,1.0,0.0,0.0,0.0,0.0,5
3,S2,,1.0,0.0,0.0,0.0,0.0,100
4,S3,,1.0,0.0,0.0,0.0,0.0,500
5,S4,,1.0,0.0,0.0,0.0,0.0,3


In [10]:
# Load the course example. The table is copied so that adding the reward column
# below does not modify the frame the printer itself holds.
printer = run_mdp(path = "prof_examples//ex_cours.mdp", return_printer=True, print_transactions=False, print_states=False)
df = printer.transactions_prob.copy()

# Add a 'Reward' column with example values, one per table row
df["Reward"] = [0, 0, 5, 100, 500, 3]

display(df)

# Number of states and of real actions, read from the table instead of being
# hard-coded ("NA" rows carry no action and are not counted)
n_states = df["Origin"].nunique()
n_actions = df.loc[df["Action"] != "NA", "Action"].nunique()

# Q-table initialisation: one row per state, one column per action
Q = np.zeros((n_states, n_actions))
print(Q)

# Q-Learning parameters
alpha = 0.1  # learning rate
gamma = 0.99  # discount factor
epsilon = 0.1  # exploration rate of the ε-greedy strategy

# Para a simulação, suponha que tenhamos funções para obter o próximo estado e a recompensa:
def take_action(state, action, transitions=None):
    """
    Sample one environment step from the transition table.

    Parameters:
    - state (str): current state (a value of the 'Origin' column).
    - action (str): action label to apply. If the state has no row for it, the
      state's "NA" (no-action) row is used instead.
    - transitions (pandas.DataFrame, optional): table with 'Origin', 'Action',
      'Reward' and one column per destination state. Defaults to the notebook's
      global `df`.

    Returns:
    - tuple: (next_state, reward), where reward is the 'Reward' value of the
      first row whose origin is `next_state`.

    Raises:
    - ValueError: if `state` has neither a row for `action` nor an "NA" row.
    - IndexError: if the sampled next state has no row in the table.
    """
    table = df if transitions is None else transitions

    # Destination states are every column that is not metadata; this replaces
    # the positional slice 2:7, which only fits a five-state table.
    state_columns = [col for col in table.columns if col not in ("Origin", "Action", "Reward")]

    # The probabilities live in the row of (state, action), not in a column
    # named after the action.
    df_state = table[table['Origin'] == state]
    chosen = df_state[df_state['Action'] == action]
    if chosen.empty:
        chosen = df_state[df_state['Action'] == "NA"]
    if chosen.empty:
        raise ValueError(f"No transition from state {state!r} for action {action!r}")

    probabilities = chosen.iloc[0][state_columns].to_numpy(dtype=float)
    next_state = np.random.choice(state_columns, p=probabilities)

    # The reward must be looked up in the whole table: the rows of the current
    # state do not contain the next state's entry.
    reward = table.loc[table['Origin'] == next_state, 'Reward'].values[0]

    return next_state, reward


[[0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]]
