In [1]:
# Experiment counter: each run of this cell claims the next experiment number.
# The counter is persisted in exp_num.txt. A missing (or unparsable) file starts
# the numbering at 0; otherwise the stored number is increased by 1.
import os

exp_file_path = "./utils/exp_num.txt"
try:
    with open(exp_file_path, "r") as f:
        exp_num = int(f.read()) + 1
except (FileNotFoundError, ValueError):
    # First run (no counter file yet) or empty/corrupt content: start at 0.
    # Any other error (e.g. permissions) is left to surface, because silently
    # resetting the counter would make a later run reuse an older exp directory.
    exp_num = 0

# Persist the counter. Create the folder first so a fresh checkout works too.
os.makedirs(os.path.dirname(exp_file_path), exist_ok=True)
with open(exp_file_path, "w") as f:
    f.write(str(exp_num))

# Manually for resume training (then comment it out again):
# exp_num = 44
print(f"exp_num: {exp_num}")

exp_num: 53


In [3]:
from typing import List
import crafter
import stable_baselines3
from stable_baselines3.common.callbacks import (
    CheckpointCallback,
    EvalCallback,
    CallbackList,
)
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.results_plotter import plot_results
import os
import numpy as np
import matplotlib.pyplot as plt
import numpy as np
import sys
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy

from crafter_module.crafter_gymnasium import CrafterGymnasium
from utils import util_funcs
from utils.llm_skills_args import Skill
from stable_baselines3.common.vec_env import SubprocVecEnv
from nlp_module.option_policy import (
    OptionPolicy,
    OptionPolicyResponse,
    DeepseekClassifier,
)
from utils import constants
from utils.llm_skills_args import LLM_Skills_Args
import pickle


# ---- Run configuration ----
verbose = 1
# Toggle between training with LLM skills (True) and the primitive agent (False).
# use_skills = True
use_skills = False

# Results layout: <env_results_dir>/<agent kind>/exp<exp_num>/<artifact folders>
env_results_dir = "./results/crafter/"
agent_kind = "with_skills" if use_skills else "primitive"
exp_dir = os.path.join(env_results_dir, agent_kind, f"exp{exp_num}")

checkpoints_dir = os.path.join(exp_dir, "checkpoints")
logs_dir = os.path.join(exp_dir, "logs")
eval_dir = os.path.join(exp_dir, "eval")
monitor_dir = os.path.join(exp_dir, "monitor")
figures_dir = os.path.join(exp_dir, "figures")

# Create every output folder up front; existing folders are left untouched.
for _out_dir in (
    env_results_dir,
    exp_dir,
    checkpoints_dir,
    logs_dir,
    eval_dir,
    monitor_dir,
    figures_dir,
):
    os.makedirs(_out_dir, exist_ok=True)

0: Skill(name=Resource Gathering, description=Option skill 'Resource Gathering'. This policy directs the agent to prioritize collecting essential ...)
1: Skill(name=Basic Survival Strategy, description=Option skill 'Basic Survival Strategy'. This policy directs the agent to prioritize essential surviv...)
2: Skill(name=Combat Strategy and Enemy Management, description=Option skill 'Combat Strategy and Enemy Management'. This policy directs the agent to handle hostile...)
3: Skill(name=Efficient Crafting and Tool Progression, description=Option skill 'Efficient Crafting and Tool Progression'. This policy directs the agent to prioritize ...)


In [None]:
# Text logs: one file for the training env, one for the evaluation env.
print_file_path = os.path.join(logs_dir, "prints.txt")
eval_print_file_path = os.path.join(logs_dir, "eval_prints.txt")

if not use_skills:
    # Primitive training without skills: raw env kwargs plus logging options only.
    env_kwargs = dict(
        raw_env_kwargs=constants.DEFAULT_RAW_ENV_KWARGS,
        verbose=verbose,
        print_file_path=print_file_path,
    )
else:
    # Skills proposed by the LLM from trajectories of a trained (1M steps) primitive agent:
    skills_descriptions_dir = "./skills_gen/skills_created_sessions/deepseekV3/from_1M_trained_primitive/option_policies_descriptions"
    # Alternative: skills proposed from trajectories of a non-trained agent:
    # skills_descriptions_dir = "./skills_gen/skills_created_sessions/deepseekV3/from_non_trained_agent/option_policies_descriptions"

    # skills_descriptions_dir = "./skills_gen/option_policies_descriptions"
    skills: List[Skill] = util_funcs.load_skills(skills_descriptions_dir)
    for i, skill in enumerate(skills):
        print(f"{i}: {skill}")

    # The option policy is the LLM classifier handed to LLM_Skills_Args below.
    # option_policy = GeminiClassifier(verbose=verbose)
    option_policy = DeepseekClassifier(
        model="deepseek-chat", verbose=verbose, print_file_path=print_file_path
    )

    num_steps_pass_llm = 0
    default_action_index = 5
    mask_actions_indices = [0]  # noop
    llm_skills_args = LLM_Skills_Args(
        skills=skills,
        option_policy=option_policy,
        num_steps_pass_llm=num_steps_pass_llm,
        default_action_index=default_action_index,
    )

    # Raw env kwargs together with the skill-specific ones.
    env_kwargs = dict(
        raw_env_kwargs=constants.DEFAULT_RAW_ENV_KWARGS,
        llm_skills_args=llm_skills_args,
        verbose=verbose,
        print_file_path=print_file_path,
        mask_actions_indices=mask_actions_indices,
    )

# The evaluation env gets the same kwargs, but prints to its own file.
eval_env_kwargs = dict(env_kwargs, print_file_path=eval_print_file_path)

env = CrafterGymnasium(**env_kwargs)
eval_env = CrafterGymnasium(**eval_env_kwargs)

# Wrap both envs in Monitor so episode statistics are written to CSV files.
train_monitor_filename = os.path.join(monitor_dir, "train_monitor.csv")
eval_monitor_filename = os.path.join(monitor_dir, "eval_monitor.csv")
env = Monitor(env, filename=train_monitor_filename)
eval_env = Monitor(eval_env, filename=eval_monitor_filename)

# Kwargs for silent evaluation envs (no console output, no print file).
eval_env_parallel_kwargs = dict(eval_env_kwargs, verbose=0, print_file_path=None)

# Future TODOs:

- Maybe vectorize the env to run N instances in parallel and make training faster.


## Resume Training:

If you want to resume training of a model in general:

- Manually choose the `exp_num` so that the experiment directory you want to resume is loaded.
- Change the `model = ...` line so that it loads your model.
- Check `model.num_timesteps` and set `steps` to the number of steps that remain.

If the training cell raises an exception and you want to continue training:

- Uncomment the cell below and execute it.
- Comment out the lines that redefine the model (`model = stable_baselines3.PPO(...)`), because re-creating the model resets its timesteps and parameters.
- Read from the cell below the total number of steps the model has already trained, and set the `steps` variable to the number of steps that remain. Keep all other variables (such as `save_freq`) the same; the model will keep accumulating its timestep count because training runs with `reset_num_timesteps=False`.
- Execute the remaining cells below to continue training.
- Comment out again what you uncommented.


In [None]:
# Resume helper, intentionally kept commented out: uncomment and run it after an
# interrupted training cell to see how many steps `model` has already trained.
# NOTE(review): resetting `console.live` presumably clears a live display left
# behind by the interrupted run (e.g. the progress bar) — confirm.
# from rich.console import Console

# print(f"Model trained totally: {model.num_timesteps} steps.")
# console = Console()
# console.live = None  # Reset any existing live display

## Choose new or existing model:


In [6]:
# Option A - continue from a saved checkpoint (uncomment to use):
# model = stable_baselines3.PPO.load(
#     "./results/crafter/with_skills/exp44/checkpoints/checkpoint_4000_steps.zip",
#     env=env,
# )
# print(f"Model trained totally: {model.num_timesteps} steps.")

# Option B - fresh model. Comment this statement out when resuming an
# interrupted run; re-creating the model restarts its timesteps and parameters.
model = stable_baselines3.PPO(
    policy="CnnPolicy",
    env=env,
    tensorboard_log=logs_dir,  # TensorBoard logs are written here
    verbose=1,
)
# To save a non-trained agent uncomment:
# model.save(os.path.join(exp_dir, "final_model"))

Using cuda device
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


## Train cell


In [None]:
# Throughput note (with skills): ~3500 steps took 2 hours, i.e. around 1750 steps per hour.

# ---- Training budget: keep exactly one `steps` value uncommented ----
# The number of steps actually trained is rounded up to whole PPO rollouts
# (presumably n_steps=2048, the SB3 default, since the model is created
# without overriding it — confirm), hence the "around ..." values below.
# steps = 1_000_000
# steps = 100_000
# steps = 10_000  # will train around 10240 steps
# steps = 8_000  # will train around 8192 steps
# steps = 5_000
steps = 4_000
# steps = 2048  # minimum: a smaller value still trains one full rollout (~2048 steps)
# steps = 2000

# ---- Checkpoint frequency ----
# save_freq = 50  # save every 50 steps because the LLM API might cause trouble.
save_freq = 2_000
# save_freq = 50_000

# ---- Evaluation frequency ----
# eval_freq = save_freq * 10  # evaluate less often because evaluation is expensive.
# eval_freq = 200
eval_freq = save_freq


checkpoint_callback = CheckpointCallback(
    save_freq=save_freq,
    save_path=checkpoints_dir,
    name_prefix="checkpoint",
    verbose=2,
)
# 5 is the default value; each episode takes ~170 steps for a beginner agent,
# so take the evaluation cost into account.
n_eval_episodes = 5

# Optional: the saved checkpoints could also be evaluated separately afterwards.
eval_callback = EvalCallback(
    eval_env,
    n_eval_episodes=n_eval_episodes,
    best_model_save_path=os.path.join(exp_dir, "best_model"),
    log_path=eval_dir,
    eval_freq=eval_freq,  # evaluate every `eval_freq` steps (same cadence as checkpoints)
    deterministic=True,
    render=False,
)


print(f"Model trained before current training for {model.num_timesteps} timesteps")


callback = CallbackList([checkpoint_callback, eval_callback])

# Save a pickle of the training args in exp_dir/run_args.pkl.
# Only plain, picklable values are stored; the entries commented out below
# raised errors when pickled.
save_args = {
    # "steps": steps,  # not saved: model.num_timesteps aggregates steps from all trainings.
    "save_freq": save_freq,
    "eval_freq": eval_freq,
    "n_eval_episodes": n_eval_episodes,
    "use_skills": use_skills,
    # "env_kwargs": env_kwargs,  # causes errors
    # "eval_env_kwargs": eval_env_kwargs,
    "print_file_path": print_file_path,
    "eval_print_file_path": eval_print_file_path,
    "verbose": verbose,
    "exp_dir": exp_dir,
    "checkpoints_dir": checkpoints_dir,
    "logs_dir": logs_dir,
    "eval_dir": eval_dir,
    "monitor_dir": monitor_dir,
    "figures_dir": figures_dir,
    "train_monitor_filename": train_monitor_filename,
    "eval_monitor_filename": eval_monitor_filename,
    # "callbacks": callback,  # causes errors
}
if use_skills:
    # save_args["llm_skills_args"] = llm_skills_args  # errors
    save_args["mask_actions_indices"] = mask_actions_indices
    save_args["skills"] = skills
    # save_args["option_policy"] = option_policy  # errors
    save_args["option_policyClassname"] = option_policy.__class__.__name__
    save_args["num_steps_pass_llm"] = num_steps_pass_llm
    save_args["default_action_index"] = default_action_index

# Use a context manager so the file is flushed and closed even if pickling fails.
with open(os.path.join(exp_dir, "run_args.pkl"), "wb") as run_args_file:
    pickle.dump(save_args, run_args_file)

# Training
model.learn(
    total_timesteps=steps,
    log_interval=1,  # log training performance (not an independent evaluation) at every interval
    reset_num_timesteps=False,  # keep accumulating timesteps across runs (logging, schedules, plots)
    progress_bar=True,
    callback=callback,
)
model.save(os.path.join(exp_dir, "final_model"))

In [None]:
# Cumulative number of timesteps over every training run of this model.
print(f"Model trained totally include all trainings for {model.num_timesteps} timesteps")

## Plots:

- The following cells generate figures, which can be executed manually using the cell below without requiring the training process to finish. They only need the evaluation callback to run at least once and the monitor logs to be written.
- Use the cell below to manually specify the output directory for saving figures and the input files:


In [None]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt


# For manual plotting define exp_dir:
# exp_dir = "./results/crafter/with_skills/exp44"
# monitor_dir = os.path.join(exp_dir, "monitor")
# figures_dir = os.path.join(exp_dir, "figures")
# eval_dir = os.path.join(exp_dir, "eval")

In [None]:
# Load the evaluation log stored at <eval_dir>/evaluations.npz.
evals = np.load(os.path.join(eval_dir, "evaluations.npz"))

print(f"Eval timesteps: {evals['timesteps']}")

# ep_lengths: one row per evaluation, one column per evaluation episode.
# Values grow over time if the agent learns to survive longer.
print(f"Eval ep_lengths: {evals['ep_lengths']}")

# results: one row per evaluation; column i holds the cumulative return of the
# i-th episode of that evaluation.
results = evals["results"]
print(f"Eval results: {results}")

In [None]:
# seaborn:
import seaborn as sns
import pandas as pd

# Both arrays are indexed by evaluation; ep_lengths is typically 2D:
# (num_eval_steps, num_episodes_per_eval).
ep_lengths = evals["ep_lengths"]  # Episode lengths
timesteps = evals["timesteps"]

# Aggregate inside each evaluation: axis=1 runs over the episodes (columns) of
# one evaluation (row), giving one mean / std value per evaluation.
mean_rewards = results.mean(axis=1)
std_rewards = results.std(axis=1)

# Mean evaluation reward over timesteps, with a +/- one std band around it.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(timesteps, mean_rewards, label="Mean Reward", color="blue")
ax.fill_between(
    timesteps,
    mean_rewards - std_rewards,
    mean_rewards + std_rewards,
    alpha=0.3,
    color="blue",
)
ax.set_xlabel("Timesteps")
ax.set_ylabel("Eval Mean Reward")
ax.set_title("Evaluation Mean Reward Over Time")
ax.legend()
ax.grid()
fig.savefig(
    os.path.join(figures_dir, "eval_mean_reward.png"), dpi=300, bbox_inches="tight"
)
plt.show()

# The same evaluation data as a table, one row per evaluation.
df = pd.DataFrame(
    {
        "Timesteps": timesteps,
        "Mean Reward": mean_rewards,
        "Std Reward": std_rewards,
        "Mean Episode Length": ep_lengths.mean(axis=1),
    }
)
print(df)  # show the table

In [None]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt

from matplotlib.ticker import MaxNLocator

# Monitor logs of the training env and the evaluation env.
train_monitor_file = os.path.join(monitor_dir, "train_monitor.csv")
eval_monitor_file = os.path.join(monitor_dir, "eval_monitor.csv")

# comment="#" makes pandas ignore "#"-prefixed lines (presumably the metadata
# line that Monitor writes at the top of each file — confirm).
train_df = pd.read_csv(train_monitor_file, comment="#")
eval_df = pd.read_csv(eval_monitor_file, comment="#")


def _plot_monitor_column(
    train_log, eval_log, column, train_label, eval_label, ylabel, title, out_path
):
    """Plot one monitor column for training and evaluation episodes and save it.

    :param train_log: DataFrame read from the training monitor file.
    :param eval_log: DataFrame read from the evaluation monitor file.
    :param column: Name of the monitor column to plot (e.g. "r" or "l").
    :param train_label: Legend label of the training curve.
    :param eval_label: Legend label of the evaluation curve.
    :param ylabel: Label of the y-axis.
    :param title: Figure title.
    :param out_path: Path the figure is saved to.
    """
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.plot(train_log[column], label=train_label, color="blue", alpha=0.7)
    ax.plot(eval_log[column], label=eval_label, color="red", alpha=0.7)
    ax.set_xlabel("Episode")
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.legend()
    ax.grid()
    # Integer-only x ticks. The locator caps the number of ticks, so the axis
    # stays readable (and fast to draw) even with thousands of episodes, unlike
    # placing one tick per episode.
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))
    fig.savefig(out_path)
    plt.show()


# Episode rewards over time
_plot_monitor_column(
    train_df,
    eval_df,
    column="r",
    train_label="Training Reward",
    eval_label="Evaluation Reward",
    ylabel="Reward",
    title="Episode Rewards Over Time",
    out_path=os.path.join(figures_dir, "monitor_reward_plot.png"),
)

# Episode lengths over time
_plot_monitor_column(
    train_df,
    eval_df,
    column="l",
    train_label="Training Episode Length",
    eval_label="Evaluation Episode Length",
    ylabel="Episode Length",
    title="Episode Lengths Over Time",
    out_path=os.path.join(figures_dir, "monitor_episode_length_plot.png"),
)

In [None]:
# Show the evaluation monitor log loaded from eval_monitor.csv
# (presumably one row per evaluation episode — confirm).
eval_df

## Tensorboard Instructions:

- To see one line per experiment, set the smoothing value in the settings to 0; this removes the smoothed curve.


In [None]:
# Root directory for TensorBoard: every experiment located anywhere underneath
# it in the hierarchy is shown. Stored as an absolute path.
tensorboard_dir = os.path.abspath(env_results_dir)
print(f"tensorboard_dir: {tensorboard_dir}")

In [None]:
# Start the TensorBoard server on the logs; open the URL it prints in a browser.
# The cell keeps running until it is interrupted.
# The path is quoted so that directories containing spaces still work.
!tensorboard --logdir "{tensorboard_dir}"

In [None]:
# Alternative: load the TensorBoard notebook extension so a logdir can be chosen
# manually afterwards, e.g. with `%tensorboard --logdir <experiment logs dir>`.
%load_ext tensorboard

In [None]:
# def eval_episodes(model, env, num_episodes=5):
#     rewards = []
#     for ep_idx in range(num_episodes):
#         print("Evaluating episode:", ep_idx)
#         obs, info = env.reset()
#         done = False
#         episode_reward = 0
#         step_idx = 0
#         while not done:
#             print(f"Step: {step_idx} of episode {ep_idx}")
#             action, _ = model.predict(obs, deterministic=True)
#             obs, reward, done, truncated, info = env.step(action)
#             episode_reward += reward
#             step_idx += 1
#         rewards.append(episode_reward)
#     return rewards


# NOTE(review): this commented-out draft is not runnable as written; fix before uncommenting:
#   - the loop accumulates `rewards`, but `envs.step(...)` assigns `reward`;
#   - `dones` is never updated from `done`, so `while not np.all(dones)` never ends;
#   - presumably a SubprocVecEnv `reset()` returns only `obs` and `step()` returns
#     four values — confirm against the SB3 VecEnv API.
# def eval_episodes_parallel(
#     model, env_ctor, num_episodes_eval_in_parallel=5, eval_env_parallel_kwargs=None
# ):
#     """
#     Evaluates the model on multiple episodes in parallel.

#     :param model: Trained RL model
#     :param env_ctor: Constructor for the environment (e.g., Dog, not an instance)
#     :param num_episodes_eval_in_parallel: Number of evaluation episodes
#     :param eval_env_parallel_kwargs: Additional kwargs for creating evaluation environments
#     :return: List of episode rewards
#     """
#     if eval_env_parallel_kwargs is None:
#         eval_env_parallel_kwargs = {}

#     # Create parallel environments
#     def make_env():
#         return env_ctor(**eval_env_parallel_kwargs)

#     envs = SubprocVecEnv([make_env for _ in range(num_episodes_eval_in_parallel)])

#     obs, info = envs.reset()
#     dones = np.array([False] * num_episodes_eval_in_parallel)
#     episode_rewards = np.zeros(num_episodes_eval_in_parallel)

#     while not np.all(dones):
#         actions, _ = model.predict(obs, deterministic=True)
#         obs, reward, done, truncated, info = envs.step(actions)
#         episode_rewards += rewards * (~dones)  # Only add reward for ongoing episodes

#     envs.close()
#     return episode_rewards.tolist()


# # model = stable_baselines3.PPO.load(os.path.join(exp_dir, "final_model"))

# # num_episodes_eval_in_parallel = 5
# # episode_rewards = eval_episodes_parallel(
# #     model, CrafterGymnasium, num_episodes_eval_in_parallel, eval_env_parallel_kwargs
# # )