<a href="https://colab.research.google.com/github/SergeyLimanskiy/Study/blob/main/%D0%BF%D1%80%D0%BE%D1%86%D0%B5%D1%81%D1%81_%D0%B8%D0%B3%D1%80%D1%8B_%D0%B2_%D0%BA%D0%BE%D1%81%D1%82%D0%B8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
import random
from collections import defaultdict


class State:
    """Snapshot of the dice game: accumulated points and rolls remaining.

    Instances compare and hash by value, so two states with the same
    ``points`` and ``rolls_left`` act as the same dictionary key. With the
    default identity-based hashing every freshly created state would be a
    distinct key, which breaks per-state tables such as the ``Q`` and ``N``
    dictionaries indexed by state in ``monte_carlo_learning``.

    Args:
        points: Total points accumulated so far.
        rolls_left: Number of rolls the player may still make.
    """

    def __init__(self, points, rolls_left):
        self.points = points
        self.rolls_left = rolls_left

    def _key(self):
        """Return the tuple that defines this state's identity."""
        return (self.points, self.rolls_left)

    def __eq__(self, other):
        if not isinstance(other, State):
            return NotImplemented
        return self._key() == other._key()

    def __hash__(self):
        # Attributes remain assignable for backward compatibility; do not
        # mutate a state while it is stored in a set or used as a dict key.
        return hash(self._key())

    def __repr__(self):
        return f'State(points={self.points}, rolls_left={self.rolls_left})'

class Action:
    """Namespace holding the two moves available to the player."""

    # Throw the dice again and add the result to the running total.
    ROLL = "ROLL"
    # End the episode immediately without rolling.
    STOP = "STOP"

class Policy:
    """Lookup table mapping ``(points, rolls_left)`` pairs to actions."""

    def __init__(self, policy_dict=None):
        """Wrap ``policy_dict``, or start from an empty table when omitted."""
        self.policy_dict = {} if policy_dict is None else policy_dict

    def get_action(self, state):
        """Return the action stored for ``state``; ``Action.ROLL`` if unseen."""
        lookup_key = (state.points, state.rolls_left)
        return self.policy_dict.get(lookup_key, Action.ROLL)

    def update_policy(self, state, action):
        """Record ``action`` as the choice for ``state``, replacing any old one."""
        lookup_key = (state.points, state.rolls_left)
        self.policy_dict[lookup_key] = action

class Environment:
    """Dice game in which points are accumulated over a limited number of rolls.

    Each ROLL adds the sum of two six-sided dice to the score. An episode ends
    when the score reaches ``reward_threshold``, when the rolls run out, or
    when the player chooses STOP. A reward of 1 is paid only on the roll that
    reaches the threshold; every other transition pays 0.

    Args:
        max_rolls: Number of rolls available at the start of an episode.
        reward_threshold: Score that must be reached to earn the reward.
    """

    def __init__(self, max_rolls, reward_threshold):
        self.max_rolls = max_rolls
        self.reward_threshold = reward_threshold
        # Set by reset(); step() works on the state passed to it and does not
        # update this attribute.
        self.state = None

    def reset(self):
        """Start a new episode and return its initial state (0 points)."""
        self.state = State(0, self.max_rolls)
        return self.state

    def step(self, state, action):
        """Apply ``action`` to ``state``.

        Args:
            state: Current state, exposing ``points`` and ``rolls_left``.
            action: ``Action.STOP`` ends the episode; anything else rolls.

        Returns:
            A ``(next_state, reward, done)`` tuple. When the episode is ended
            without rolling, ``next_state`` is the unchanged input state.
        """
        # A state with no rolls left is terminal: rolling from it would hand
        # out an extra throw and drive rolls_left negative.
        if action == Action.STOP or state.rolls_left <= 0:
            return state, 0, True

        roll_total = sum(random.randint(1, 6) for _ in range(2))
        new_points = state.points + roll_total
        reached_threshold = new_points >= self.reward_threshold
        # The episode also ends when this was the last available roll.
        done = reached_threshold or state.rolls_left <= 1
        next_state = State(new_points, state.rolls_left - 1)
        reward = 1 if reached_threshold else 0
        return next_state, reward, done

def monte_carlo_learning(env, episodes=10000, gamma=0.95, epsilon=0.1):
    """Learn a policy for ``env`` with on-policy Monte Carlo control.

    Episodes are generated with an epsilon-greedy version of the current
    policy. After each episode the action-value estimates are updated with
    the incremental mean of the observed discounted returns, and the policy
    is made greedy with respect to them.

    Args:
        env: Environment providing ``reset()`` and ``step(state, action)``,
            the latter returning ``(next_state, reward, done)``.
        episodes: Number of episodes to simulate.
        gamma: Discount factor applied to future rewards.
        epsilon: Probability of picking a uniformly random action instead of
            the policy's action while generating an episode. Without
            exploration the ROLL default is never challenged and STOP is
            never evaluated; pass ``0`` for purely greedy rollouts.

    Returns:
        The learned ``Policy``; its table holds the greedy action for every
        ``(points, rolls_left)`` pair visited during training.

    Raises:
        ValueError: If ``epsilon`` is outside ``[0, 1]``.
    """
    if not 0 <= epsilon <= 1:
        raise ValueError(f"epsilon must be in [0, 1], got {epsilon!r}")

    # Tables are keyed by the (points, rolls_left) tuple rather than by the
    # state object, so visits to the same situation in different episodes
    # update the same entry regardless of how the state type hashes.
    Q = defaultdict(lambda: {Action.ROLL: 0, Action.STOP: 0})
    N = defaultdict(lambda: {Action.ROLL: 0, Action.STOP: 0})
    policy = Policy()
    actions = (Action.ROLL, Action.STOP)

    for _ in range(episodes):
        episode = []
        state = env.reset()
        done = False
        while not done:
            # The epsilon > 0 check keeps greedy runs from consuming random
            # numbers, so epsilon=0 reproduces plain greedy rollouts.
            if epsilon > 0 and random.random() < epsilon:
                action = random.choice(actions)
            else:
                action = policy.get_action(state)
            next_state, reward, done = env.step(state, action)
            episode.append((state, action, reward))
            state = next_state

        # Walk the episode backwards so G is the discounted return from each
        # step onward.
        G = 0
        for state, action, reward in reversed(episode):
            G = gamma * G + reward
            key = (state.points, state.rolls_left)
            N[key][action] += 1
            Q[key][action] += (G - Q[key][action]) / N[key][action]
            # Ties resolve to the first action in the table, i.e. ROLL.
            policy.update_policy(state, max(Q[key], key=Q[key].get))

    return policy

def main():
    """Train a policy on a 20-roll game with a threshold of 30 and print it."""
    game = Environment(max_rolls=20, reward_threshold=30)
    trained = monte_carlo_learning(game)

    print("Learned Policy:")
    # One line per learned table entry, in the table's own iteration order.
    for key, chosen in trained.policy_dict.items():
        points, rolls_left = key
        print(f"Points: {points}, Rolls Left: {rolls_left} => Action: {chosen}")

# Run the training demo only when this code is executed directly, not when it
# is imported as a module.
if __name__ == "__main__":
    main()

Learned Policy:
Points: 26, Rolls Left: 17 => Action: ROLL
Points: 15, Rolls Left: 18 => Action: ROLL
Points: 4, Rolls Left: 19 => Action: ROLL
Points: 0, Rolls Left: 20 => Action: ROLL
Points: 28, Rolls Left: 16 => Action: ROLL
Points: 21, Rolls Left: 17 => Action: ROLL
Points: 12, Rolls Left: 18 => Action: ROLL
Points: 5, Rolls Left: 19 => Action: ROLL
Points: 26, Rolls Left: 16 => Action: ROLL
Points: 19, Rolls Left: 17 => Action: ROLL
Points: 14, Rolls Left: 18 => Action: ROLL
Points: 6, Rolls Left: 19 => Action: ROLL
Points: 29, Rolls Left: 16 => Action: ROLL
Points: 23, Rolls Left: 17 => Action: ROLL
Points: 13, Rolls Left: 18 => Action: ROLL
Points: 26, Rolls Left: 15 => Action: ROLL
Points: 18, Rolls Left: 16 => Action: ROLL
Points: 15, Rolls Left: 17 => Action: ROLL
Points: 9, Rolls Left: 18 => Action: ROLL
Points: 18, Rolls Left: 17 => Action: ROLL
Points: 11, Rolls Left: 18 => Action: ROLL
Points: 8, Rolls Left: 19 => Action: ROLL
Points: 25, Rolls Left: 16 => Action: ROLL
P