In [None]:
import math
import numpy as np
# import gym
import gymnasium as gym


class QLearner:
    """Tabular Q-learning agent for the gymnasium ``CartPole-v1`` environment.

    Continuous observations are mapped onto a small grid of integer buckets
    (see ``discretise``); the value of every (state, action) pair is kept in
    ``knowledge_base``. Pairs that have never been updated are treated as
    having value 1, which makes unexplored actions look attractive.

    Args:
        buckets: Number of buckets per observation component, in the same
            order as ``lower_bounds`` / ``upper_bounds``. A count of 1 makes
            the agent ignore that component.
        learning_rate: Weight of the new estimate in every update (``lr``).
        discount_factor: Weight of the future value in every update (``df``).
        exploration_rate: Probability of taking a random action instead of
            the best known one (``epsilon``).

    Raises:
        ValueError: If ``buckets`` does not hold exactly four counts >= 1.
    """

    def __init__(self, buckets=(1, 1, 6, 12), learning_rate=0.1,
                 discount_factor=1.0, exploration_rate=0.1):
        buckets = tuple(int(count) for count in buckets)
        if len(buckets) != 4 or min(buckets) < 1:
            raise ValueError("buckets must hold four counts, each >= 1")

        self.environment = gym.make('CartPole-v1', render_mode="human")
        self.attempt_no = 1
        # Components 0 and 2 take their limits from the environment;
        # components 1 and 3 use hand-picked limits instead (presumably
        # because the environment reports those as unbounded -- confirm
        # against the gymnasium CartPole documentation).
        self.upper_bounds = [
            self.environment.observation_space.high[0],
            0.5,
            self.environment.observation_space.high[2],
            math.radians(50)
        ]
        self.lower_bounds = [
            self.environment.observation_space.low[0],
            -0.5,
            self.environment.observation_space.low[2],
            -math.radians(50)
        ]
        self.buckets = buckets
        # Assumes a discrete action space exposing ``n`` (true for CartPole).
        self.actions = tuple(range(int(self.environment.action_space.n)))
        self.knowledge_base: dict[tuple[tuple[int, ...], int], float] = {}
        self.lr = learning_rate
        self.df = discount_factor
        self.epsilon = exploration_rate

    def learn(self, max_attempts):
        """Run ``max_attempts`` episodes, printing each episode's reward sum."""
        for _ in range(max_attempts):
            reward_sum = self.attempt()
            print(reward_sum)

    def attempt(self):
        """Play one episode, learning after every step.

        Returns:
            The sum of the rewards collected during the episode.
        """
        observation = self.discretise(self.environment.reset()[0])
        terminated, truncated = False, False
        reward_sum = 0.0
        while not (terminated or truncated):
            action = self.pick_action(observation)
            print("action", action)
            new_observation, reward, terminated, truncated, _info = (
                self.environment.step(action)
            )
            new_observation = self.discretise(new_observation)
            # Only ``terminated`` is forwarded: a truncated episode was cut
            # short, so its last state still has a future worth estimating.
            self.update_knowledge(
                action, observation, new_observation, reward, terminated
            )
            observation = new_observation
            reward_sum += reward
        self.attempt_no += 1
        return reward_sum

    def discretise(self, observation):
        """Map a continuous observation onto a tuple of bucket indices.

        Each component is scaled to [0, 1] using ``lower_bounds`` and
        ``upper_bounds`` (values outside the bounds are clamped), then split
        into ``buckets[i]`` equal-width buckets.

        Returns:
            A tuple of ints, one per component, each in
            ``range(buckets[i])``; usable as a dictionary key.
        """
        state = []
        for value, low, high, count in zip(
            observation, self.lower_bounds, self.upper_bounds, self.buckets
        ):
            low, high = float(low), float(high)
            ratio = (float(value) - low) / (high - low)
            # Clamp before scaling so out-of-range (even infinite) values
            # land in the outermost bucket instead of overflowing.
            ratio = min(1.0, max(0.0, ratio))
            # ratio == 1.0 would give index ``count``; fold it into the last.
            state.append(min(count - 1, int(ratio * count)))
        return tuple(state)

    def pick_action(self, observation):
        """Choose an action epsilon-greedily for a discretised observation.

        With probability ``epsilon`` a random action is sampled from the
        environment; otherwise the action with the highest known value is
        returned (the lowest-numbered action wins ties).
        """
        if np.random.random() < self.epsilon:
            return int(self.environment.action_space.sample())
        return max(
            self.actions,
            key=lambda action: self.knowledge_base.get((observation, action), 1),
        )

    def update_knowledge(self, action, observation, new_observation, reward,
                         terminated=False):
        """Apply one Q-learning update for the transition just taken.

        Args:
            action: Action that was executed.
            observation: Discretised state the action was taken in.
            new_observation: Discretised state that followed.
            reward: Reward received for the transition.
            terminated: True when ``new_observation`` ended the episode; the
                future value is then taken as 0 instead of being looked up.
        """
        key = (observation, int(action))
        future = 0.0 if terminated else self.get_best_knowledge(new_observation)
        previous = self.knowledge_base.get(key, 1)
        target = reward + self.df * future
        self.knowledge_base[key] = (1.0 - self.lr) * previous + self.lr * target

    def get_best_knowledge(self, observation):
        """Return the highest known value over all actions in ``observation``.

        Actions without an entry count as 1.
        """
        return max(
            self.knowledge_base.get((observation, action), 1)
            for action in self.actions
        )


def main():
    """Build a QLearner and let it learn for 10000 attempts."""
    QLearner().learn(10000)


# Start training only when executed directly, not when imported.
if __name__ == "__main__":
    main()


2025-03-20 12:36:14.818 python[32573:2654431] +[IMKClient subclass]: chose IMKClient_Modern
2025-03-20 12:36:14.818 python[32573:2654431] +[IMKInputSession subclass]: chose IMKInputSession_Modern


action 1
action 0
action 0
action 0
action 1
action 1
action 1
action 0
action 0
action 0
action 0
action 0
action 1
action 0
action 0
action 1
action 0
17.0
action 1
action 0
action 1
action 0
action 1
action 0
action 1
action 0
action 1
action 0
action 0
action 0
action 1
action 1
action 1
action 1
action 1
action 0
action 1
action 0
action 1
action 1
action 0
23.0
action 1
action 0
action 0
action 0
action 1
action 0
action 1
action 0
action 0
action 1
action 1
action 0
action 1
action 0
action 1
action 0
action 1
action 1
action 1
action 1
action 1
action 1
action 1
action 0
action 1
action 1
action 1
action 0
action 0
action 0
action 0
action 1
action 0
action 1
action 1
action 0
action 0
action 0
action 0
action 1
action 0
action 0
action 1
action 1
action 0
action 1
action 1
action 0
action 0
action 0
action 0
action 0
action 1
action 1
action 1
action 1
action 1
action 0
action 0
action 0
action 0
action 1
action 0
action 1
action 1
action 0
action 1
action 1
action 0
action 0


KeyboardInterrupt: 

: 