# Projekt
Uruchomimy nasze środkowisko z losowymi ruchami by zobaczyć czy nasze środkowisku poprawnie się tworzy

In [5]:
import gym
import game2048

env = gym.make('envs/game2048-v0', render_mode='human')

env.action_space.seed(42)

observation = env.reset(seed=42)

max_random_reward = 0

for _ in range(100):
    observation, reward, terminated, info = env.step(env.action_space.sample())
    max_random_reward = max(max_random_reward, reward)
    # print(f"Reward: {info.get('reward')}")
    # print(f"Board: \n{info.get('board')}")
    if terminated:
        print(info.get('board'))
        print(info.get('board'))
    if terminated:
        observation = env.reset()

env.close()

print(f"Max random reward: {max_random_reward}")

[[ 2  4  2  2]
 [ 4  8  4  4]
 [ 8  4 64 16]
 [ 4 16 32  4]]
[[ 2  4  2  2]
 [ 4  8  4  4]
 [ 8  4 64 16]
 [ 4 16 32  4]]
Max random reward: 0.04345703125


## Przykładowy widok planszy w trybie wyświetlania "human"
![view](project/example.png)

# Stable Baselines
Użyjemy gotowych modeli pobranych z paczki Stable Baselines

Nasza nagroda zdefiniowana jest jako suma wszystkich bloków na planszy podzielona przez 4096 by zachować ją w przedziale [0-1]

In [6]:
import numpy as np
import gym
import game2048

from stable_baselines3 import PPO
from stable_baselines3.ppo.policies import MlpPolicy
from stable_baselines3.common.env_checker import check_env

env = gym.make('envs/game2048-v0')

model = PPO(MlpPolicy, env, verbose=0, device='cuda')

In [7]:
check_env(env, warn=True)



In [36]:
def evaluate(model, num_episodes=100):
    env = model.get_env()
    all_episode_rewards = []
    all_episode_score = []
    for i in range(num_episodes):
        episode_rewards = []
        done = False
        obs = env.reset()
        while not done:
            action, _states = model.predict(obs)
            prev_obs = obs

            obs, reward, done, info = env.step(action)
            episode_rewards.append(reward)
            if done:
                all_episode_score.append(np.max(prev_obs * (2 ** 11)))

        all_episode_rewards.append(sum(episode_rewards))

    mean_episode_reward = np.mean(all_episode_rewards)
    mean_episode_score = np.mean(all_episode_score)
    print("Mean reward:", mean_episode_reward, " Mean score:", mean_episode_score, "Num episodes:", num_episodes)

    return mean_episode_reward, mean_episode_score

In [37]:
# Random Agent, before training
mean_reward_before_train, mean_score_before_train = evaluate(model, num_episodes=100)

Mean reward: 2.4911425  Mean score: 71.68 Num episodes: 100


In [38]:
from stable_baselines3.common.evaluation import evaluate_policy

In [39]:
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100)

print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")



mean_reward:0.92 +/- 0.00


In [None]:
# Train the agent for 10000 steps
model.learn(total_timesteps=100_000)

In [None]:
# Evaluate the trained agent
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100)

print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

In [None]:
from gym.wrappers import RecordVideo

env = gym.wrappers.Monitor(env, 'video/1.mp4')

In [None]:
vec_env = model.get_env()
obs = vec_env.reset()
for i in range(1000):
    action, _states = model.predict(obs, deterministic=True)
    obs, rewards, dones, info = vec_env.step(action)
    print(rewards)
    print(obs * (2 ** 11))
    vec_env.render(mode='human')
while True:
    vec_env.render(mode='human')