In [1]:
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Looking in indexes: https://download.pytorch.org/whl/cu118
Note: you may need to restart the kernel to use updated packages.


In [2]:
%pip install numpy pygame tqdm matplotlib ipywidgets pandas torch Pillow ipympl
%pip install minedojo --user

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


#Setting up PyTorch for GPU

In [3]:
import torch
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical

torch.cuda.is_available()

True

In [4]:
import numpy as np
import itertools
from tqdm.notebook import trange, tqdm
import pandas as pd
import sys
from abc import ABC
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
import matplotlib.pyplot as plt
from PIL import Image
import minedojo
%matplotlib widget

[INFO:minedojo.tasks] Loaded 1572 Programmatic tasks, 1558 Creative tasks, and 1 special task: "Playthrough". Totally 3131 tasks loaded.


In [5]:
minedojo.tasks.ALL_PROGRAMMATIC_TASK_IDS

['combat_shulker_end_leather_armors_wooden_sword_shield',
 'combat_shulker_end_iron_armors_wooden_sword_shield',
 'combat_shulker_end_diamond_armors_wooden_sword_shield',
 'combat_endermite_end_leather_armors_wooden_sword_shield',
 'combat_endermite_end_iron_armors_wooden_sword_shield',
 'combat_endermite_end_diamond_armors_wooden_sword_shield',
 'combat_shulker_end_leather_armors_iron_sword_shield',
 'combat_shulker_end_iron_armors_iron_sword_shield',
 'combat_shulker_end_diamond_armors_iron_sword_shield',
 'combat_endermite_end_leather_armors_iron_sword_shield',
 'combat_endermite_end_iron_armors_iron_sword_shield',
 'combat_endermite_end_diamond_armors_iron_sword_shield',
 'combat_shulker_end_leather_armors_diamond_sword_shield',
 'combat_shulker_end_iron_armors_diamond_sword_shield',
 'combat_shulker_end_diamond_armors_diamond_sword_shield',
 'combat_endermite_end_leather_armors_diamond_sword_shield',
 'combat_endermite_end_iron_armors_diamond_sword_shield',
 'combat_endermite_end_

In [6]:
IMAGE_SIZE = (160,256)
env = minedojo.make("harvest_wool_with_shears_and_sheep", image_size=(160, 256) )
env.task_prompt



In [7]:
#https://docs.minedojo.org/sections/core_api/action_space.html
act_space = env.action_space

In [8]:
obs_space = env.observation_space

In [9]:
import csv

def load_item_mapping_from_csv(file_path):
    item_mapping = {}
    with open(file_path, 'r') as f:
        reader = csv.reader(f)
        next(reader)  # Skip the header row if it exists
        for row in reader:
            item_name, _, item_id, item_id_2 = row  # Change this line to read the first and third columns
            if item_id is None or item_id == '':
                item_id = 10000 + len(item_mapping)
            item_id = int(item_id)
            if item_id_2 is not None:
                item_id = item_id + 10000 * int(item_id_2)

            item_mapping[item_name] = item_id
    return item_mapping


def minimizeIds(item_mapping):
    # minimizing IDs by replacing each items id with its index in the list
    # this is done to reduce the size of the observation space
    minimized = {}
    for key, value in item_mapping.items():
        if key not in minimized:
            minimized[key] = len(minimized)
    return minimized

# ensure all numeric ids are unique
def check_item_mapping(item_mapping):
    numeric_ids = [item_id for item_id in item_mapping.values() if isinstance(item_id, int)]
    assert len(numeric_ids) == len(set(numeric_ids))

string_to_index_mapping = load_item_mapping_from_csv("minecraft_items.csv")

string_to_index_mapping = minimizeIds(string_to_index_mapping)

check_item_mapping(string_to_index_mapping)

string_to_index_mapping

{'acacia door': 0,
 'acacia fence': 1,
 'acacia fence gate': 2,
 'acacia stairs': 3,
 'activator rails': 4,
 'air': 5,
 'anvil': 6,
 'slightly damaged anvil': 7,
 'very damaged anvil': 8,
 'apple': 9,
 'armor stand': 10,
 'arrow': 11,
 'baked potato': 12,
 'black banner': 13,
 'red banner': 14,
 'green banner': 15,
 'brown banner': 16,
 'blue banner': 17,
 'purple banner': 18,
 'cyan banner': 19,
 'light gray banner': 20,
 'gray banner': 21,
 'pink banner': 22,
 'lime banner': 23,
 'yellow banner': 24,
 'light blue banner': 25,
 'magenta banner': 26,
 'orange banner': 27,
 'white banner': 28,
 'barrier': 29,
 'beacon': 30,
 'red bed': 31,
 'bedrock': 32,
 'raw beef': 33,
 'beetroot': 34,
 'beetroot seeds': 35,
 'beetroot soup': 36,
 'birch boat': 37,
 'birch door': 38,
 'birch fence': 39,
 'birch fence gate': 40,
 'birch stairs': 41,
 'black shulker box': 42,
 'blaze powder': 43,
 'blaze rod': 44,
 'blue shulker box': 45,
 'oak boat': 46,
 'bone': 47,
 'bone block': 48,
 'book': 49,
 '

In [10]:
import gymnasium as gym

def preprocess_observation(obs_dict, space_dict = obs_space, string_to_index_mapping=string_to_index_mapping):
    obs_list = []
    for key, space in space_dict.items():
        if isinstance(space, gym.spaces.Dict):
            obs_list.extend(preprocess_observation(obs_dict[key], space.spaces, string_to_index_mapping))
        elif isinstance(space, (gym.spaces.Box, gym.spaces.MultiDiscrete)):
            obs_list.extend(obs_dict[key].flatten())
        elif isinstance(space, gym.spaces.Discrete):
            obs_list.append(obs_dict[key])
        elif isinstance(space, (int, float, bool, np.number)):
            obs_list.append(space)
        elif isinstance(space, str):
            if key == 'equipment':
                equipment_items = space.split(',')
                equipment_ids = [string_to_index_mapping.get(item, -1) for item in equipment_items]
                obs_list.extend(equipment_ids)
            else:
                obs_list.append(string_to_index_mapping.get(space, -1))  # Use -1 for unknown strings
    return np.array(obs_list, dtype=np.float32)



In [11]:

def mask_multidiscrete_probs(masks, multidiscrete_tensor):
    # detach all tensors from the graph
    multidiscrete_tensor = [t.detach().cpu() for t in multidiscrete_tensor]
    # Create a list of zero-filled tensors with the same shapes as multidiscrete_tensor
    result = [torch.zeros_like(t) for t in multidiscrete_tensor]

    for i, key in enumerate(masks.keys()):
        mask = masks[key]
        for j, is_enabled in enumerate(mask) :
            if is_enabled and i < len(multidiscrete_tensor) and j < len(multidiscrete_tensor[i]):
                result[i][j] = multidiscrete_tensor[i][j]

    return result



In [12]:
class ActorCritic(nn.Module):
    def __init__(self, state_dim, action_dims):
            super(ActorCritic, self).__init__()
            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            self.actor_branches = nn.ModuleList().to(self.device)
            for action_dim in action_dims:
                self.actor_branches.append(nn.Sequential(
                    nn.Linear(state_dim, 128),
                    nn.ReLU(),
                    nn.Linear(128, action_dim),
                    nn.Softmax(dim=-1)
                ).to(self.device))

            self.critic = nn.Sequential(
                nn.Linear(state_dim, 128),
                nn.ReLU(),
                nn.Linear(128, 1)
            ).to(self.device)

            self.optimizer = optim.Adam(self.parameters(), lr=0.001)


    def forward(self, state):
            action_probs = [branch(state) for branch in self.actor_branches]
            value = self.critic(state)
            return action_probs, value

    def choose_action(self, state, masks):
        state = torch.tensor([state], dtype=torch.float).to(self.device)
        dist, _ = self.forward(state)
        action_probs = dist

        # Call mask_multidiscrete_probs to mask the action_probs
        masked_action_probs = mask_multidiscrete_probs(masks, action_probs)

        # Choose actions based on the masked action probabilities
        actions = [np.argmax(masked_action_probs[i]) for i in range(len(masked_action_probs))]

        return actions



    def train(self, transitions, gamma=0.99, critic_coeff=0.5, entropy_coeff=0.01):
        states, actions_list, rewards, next_states, masks, dones = zip(*transitions)
        actions = [torch.tensor(action, dtype=torch.long).unsqueeze(1).to(self.device) for action in actions_list]  # Add an extra dimension

        # Create tensors for states and next_states
        states = [torch.tensor(state, dtype=torch.float).to(self.device) for state in states if state.size > 0]
        next_states = [torch.tensor(next_state, dtype=torch.float).to(self.device) for next_state in next_states if next_state.size > 0]

        rewards = torch.tensor(rewards, dtype=torch.float).to(self.device)
        dones = torch.tensor(dones, dtype=torch.float).to(self.device)

        if len(states) == 0 or len(next_states) == 0:
            return

        _, next_values = self.forward(torch.stack(next_states))
        next_values = next_values.squeeze().detach()
        target_values = rewards + gamma * next_values * (1 - dones)

        action_probs, values = self.forward(torch.stack(states))
        action_log_probs = [torch.log(action_probs[i].gather(1, actions[i])).squeeze() for i in range(len(action_probs))]
        entropy = sum([(-action_probs[i] * torch.log(action_probs[i] + 1e-9)).sum(dim=1).mean() for i in range(len(action_probs))])

        action_losses = [-torch.sum(action_log_probs[i] * (target_values - values.squeeze().detach())) for i in range(len(action_log_probs))]
        value_losses = 0.5 * (target_values - values.squeeze()).pow(2)
        total_loss = sum(action_losses) + critic_coeff * value_losses.mean() - entropy_coeff * entropy

        self.optimizer.zero_grad()
        total_loss.backward()
        self.optimizer.step()








In [13]:
input_shape = preprocess_observation(env.observation_space).shape[0]
num_actions = env.action_space.nvec.tolist()
agent = ActorCritic(input_shape, num_actions)




In [None]:
# increase batch size when we have a better model
batch_size = 10
num_epochs = 10
max_episode_steps = 1000

with tqdm(total=num_epochs*batch_size*max_episode_steps) as pbar:
    for epoch in range(num_epochs):
        episode_rewards = []

        transitions = []

        for episode in range(batch_size):
            obs = env.reset()
            state = preprocess_observation(obs)
            done = False
            episode_reward = 0

            for step in range(max_episode_steps):
                masks = obs["masks"]
                action = agent.choose_action(state, masks)
                obs, reward, done, _ = env.step(action)
                base_reward = reward
                # entities, distances = obs["rays"]["entity_name"], obs["rays"]["entity_distance"]
                # sheep_idx = np.where(entities == "sheep")[0]
                #
                # if len(sheep_idx) > 0:
                #     sheep_distance = np.min(distances[sheep_idx])
                #     #encourage being closer to sheep
                #     if sheep_distance < 8:
                #         base_reward += 1 / (sheep_distance + 1)

                next_state = preprocess_observation(obs)

                episode_reward += base_reward
                transitions.append((state, action, reward, next_state, masks, done))

                if done:
                    pbar.update(max_episode_steps - step)
                    break

                state = next_state
                pbar.update(1)

            episode_rewards.append(episode_reward)

        # Train the agent using collected transitions
        agent.train(transitions)

        mean_episode_reward = np.mean(episode_rewards)
        print(f"Epoch: {epoch+1}/{num_epochs}, Mean Reward: {mean_episode_reward:.2f}")

# save the model
torch.save(agent.state_dict(), 'model.pt')



  0%|          | 0/100000 [00:00<?, ?it/s]

[INFO:minedojo.tasks] Loaded 1572 Programmatic tasks, 1558 Creative tasks, and 1 special task: "Playthrough". Totally 3131 tasks loaded.
  return F.linear(input, self.weight, self.bias)
