In [None]:
!pip install gym stable-baselines3 sb3-contrib transformers huggingface_hub openai

Collecting stable-baselines3
  Downloading stable_baselines3-2.6.0-py3-none-any.whl.metadata (4.8 kB)
Collecting sb3-contrib
  Downloading sb3_contrib-2.6.0-py3-none-any.whl.metadata (4.1 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3.0,>=2.3->stable-baselines3)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3.0,>=2.3->stable-baselines3)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3.0,>=2.3->stable-baselines3)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3.0,>=2.3->stable-baselines3)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3.0,>=2.3->stable-baseline

In [None]:
!pip install shimmy

Collecting shimmy
  Downloading Shimmy-2.0.0-py3-none-any.whl.metadata (3.5 kB)
Downloading Shimmy-2.0.0-py3-none-any.whl (30 kB)
Installing collected packages: shimmy
Successfully installed shimmy-2.0.0


In [None]:
import re
import gym
import numpy as np
from gym import spaces
from transformers import AutoTokenizer, BertForSequenceClassification
from openai import OpenAI
from huggingface_hub import hf_hub_download
from stable_baselines3.common.vec_env import DummyVecEnv, VecMonitor
from sb3_contrib import RecurrentPPO

# Config & Helpers

# Score thresholds used by the interactive loop: a classifier score at or above
# HIGH_CONF is auto-labelled ethical, at or below LOW_CONF auto-labelled
# unethical, and anything in between is handed to the human for labelling.
LOW_CONF = 0.1
HIGH_CONF = 0.8

# Fixed natural-language description attached to each discrete action index.
ACTION_DESCS = dict(enumerate((
    "The agent helped others.",
    "The agent explored the environment.",
    "The agent maximized efficiency but caused harm.",
)))

# Ethics reward model: tokenizer and classifier are loaded from the same Hub repo.
ETHICS_MODEL_REPO = "Kelllll/ethics-reward-model"
tokenizer = AutoTokenizer.from_pretrained(ETHICS_MODEL_REPO)
ethics_model = BertForSequenceClassification.from_pretrained(ETHICS_MODEL_REPO)
ethics_model.eval()  # put the classifier in evaluation mode; it is only used for scoring

def is_action_ethical_dataset(desc: str):
    """Score an action description with the ethics classifier.

    Args:
        desc: Natural-language description of the action to judge.

    Returns:
        A ``(ethical, score)`` tuple. With ``p1`` the softmax probability of
        class index 1 for ``desc``, ``score`` is ``1 - p1`` and ``ethical`` is
        ``True`` exactly when ``p1 < 0.5``.

        NOTE(review): this treats class index 1 as the "unethical" label, so
        ``score`` behaves like P(ethical) rather than a generic confidence -
        confirm against the model card.
    """
    import torch  # local import: torch is not imported at the top of the file

    toks = tokenizer(desc, return_tensors="pt", truncation=True, padding=True)
    # Pure inference: skip autograd bookkeeping instead of building a graph
    # on every call and detaching it afterwards.
    with torch.no_grad():
        logits = ethics_model(**toks).logits
    probs = logits.softmax(dim=-1).cpu().numpy()[0]  # first (only) row of the batch
    # bool(...) so callers get a plain Python bool rather than numpy.bool_.
    return bool(probs[1] < 0.5), float(1 - probs[1])

def call_chat_model(history):
    """Send a chat history to the OpenAI chat-completions API and return the reply text.

    Args:
        history: List of message dicts (``{"role": ..., "content": ...}``),
            passed unchanged as ``messages``.

    Returns:
        The message content of the first returned choice.

    Raises:
        RuntimeError: If the ``OPENAI_API_KEY`` environment variable is unset or empty.
    """
    import os  # local import keeps this block self-contained

    # SECURITY: credentials must never live in the notebook source. The key is
    # read from the environment instead. Any key that was previously embedded
    # in this cell must be treated as leaked and revoked in the OpenAI dashboard.
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "OPENAI_API_KEY is not set; export it before running this notebook."
        )

    client = OpenAI(api_key=api_key)
    resp = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=history,
        temperature=0.7,
        max_tokens=200,
    )
    return resp.choices[0].message.content

def parse_proposals(text: str) -> list[str]:
    """Extract numbered proposals ("1. ...", "2. ...") from an LLM reply.

    Each line that starts with ``<digits>.`` contributes one proposal. The
    text after the marker is stripped, any trailing periods are removed, and
    exactly one period is appended.

    Lines are skipped when they:
      * do not start with a list marker,
      * start with a decimal number such as "2.5 million ..." (the dot is
        followed by a digit, so it is not a list marker), or
      * carry no text after the marker (e.g. a bare "2."), which would
        otherwise yield a meaningless "." proposal.

    Args:
        text: Raw reply text.

    Returns:
        The proposals in order of appearance; empty if none were found.
    """
    proposals = []
    for line in text.strip().split("\n"):
        # (?!\d) rejects decimals like "2.5" while still accepting "2.Foo".
        m = re.match(r'^\s*\d+\.(?!\d)\s*(.*)', line)
        if not m:
            continue
        desc = m.group(1).strip().rstrip('.')
        if not desc:
            continue
        proposals.append(desc + ".")
    return proposals

# Environment & Wrapper

class EthicalEnv(gym.Env):
    """Three-action toy environment whose reward is shaped by the ethics classifier.

    Observations are vectors of 3 float32 values drawn from ``np.random.rand``
    and redrawn on construction, on every reset and on every step. An episode
    is reported as done once 10 steps have been taken.

    The class uses the legacy gym API: ``reset`` returns only the observation
    and ``step`` returns the 4-tuple ``(obs, reward, done, info)``.
    """

    # Base reward per action index; any action other than 0 or 1 is treated as index 2.
    _BASE_REWARDS = (1.0, 0.5, 2.0)

    def __init__(self):
        super().__init__()
        self.action_space = spaces.Discrete(3)
        self.observation_space = spaces.Box(0, 1, shape=(3,), dtype=np.float32)
        self.state = self._draw_state()
        self.step_count = 0

    @staticmethod
    def _draw_state():
        """Draw a fresh random observation of 3 float32 values."""
        return np.random.rand(3).astype(np.float32)

    def reset(self, **kwargs):
        """Start a new episode and return the first observation (``kwargs`` are ignored)."""
        self.state = self._draw_state()
        self.step_count = 0
        return self.state

    def step(self, action):
        """Apply ``action`` and return ``(obs, reward, done, info)``.

        The reward is the action's base reward plus a penalty of
        ``-score * 3`` when the classifier judges the action's description
        unethical (no penalty otherwise).

        NOTE(review): ``score`` is ``1 - p1``, so the penalty shrinks as the
        classifier becomes more certain the action is unethical - confirm
        this is the intended shaping.
        """
        idx = 0 if action == 0 else (1 if action == 1 else 2)
        r_base = self._BASE_REWARDS[idx]
        desc = ACTION_DESCS[idx]

        ethical, conf = is_action_ethical_dataset(desc)
        penalty = -conf * 3 if not ethical else 0
        reward = r_base + penalty

        self.state = self._draw_state()
        self.step_count += 1
        done = self.step_count >= 10
        info = {"ethical": ethical, "confidence": conf}

        # Legacy 4-tuple: obs, reward, done, info.
        return self.state, float(reward), done, info

class CustomRewardEnv(gym.Env):
    """Wrapper that adds an ethics penalty on top of the wrapped env's reward.

    Action and observation spaces are taken from the wrapped environment.
    Like the wrapped env, it uses the legacy gym API (``reset`` returns the
    observation only, ``step`` returns a 4-tuple).

    NOTE(review): when the wrapped env is ``EthicalEnv``, that env has already
    applied the same penalty for the same description, so the penalty is
    counted twice - confirm this is intended.
    """

    def __init__(self, base_env):
        super().__init__()
        self.env = base_env
        self.action_space = base_env.action_space
        self.observation_space = base_env.observation_space

    def reset(self, **kwargs):
        """Reset the wrapped env and return only its observation."""
        return self.env.reset(**kwargs)

    def step(self, action):
        """Step the wrapped env, then subtract ``score * 3`` if the action is judged unethical.

        Returns ``(obs, reward, done, info)``; ``info`` is the wrapped env's
        dict with the ``"ethical"`` and ``"confidence"`` keys (re)written.
        """
        # The wrapped env returns the legacy 4-tuple.
        obs, base_rew, done, info = self.env.step(action)

        ethical, conf = is_action_ethical_dataset(ACTION_DESCS[int(action)])
        penalty = -conf * 3 if not ethical else 0
        reward = base_rew + penalty

        info["ethical"] = ethical
        info["confidence"] = conf
        return obs, reward, done, info

# Instantiate & Load PPO

# Environment stack: EthicalEnv -> CustomRewardEnv -> DummyVecEnv -> VecMonitor.
base_env = EthicalEnv()
wrapped = CustomRewardEnv(base_env)
vec_env = VecMonitor(DummyVecEnv([lambda: wrapped]))

# Fetch the saved agent archive from the Hugging Face Hub and load it against vec_env.
zip_path = hf_hub_download(
    filename="ppo_ethics_agent_v2.zip",
    repo_id="Kelllll/ppo-ethics-agent",
)
agent = RecurrentPPO.load(zip_path, env=vec_env)

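Main interactive loop: the user describes a dilemma, the LLM proposes actions, the ethics classifier labels them, and the PPO agent gives its own suggestion.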

In [None]:
# Interactive Loop
def _triage_label(conf):
    """Map a classifier score to True (>= HIGH_CONF), False (<= LOW_CONF) or None (in between)."""
    if conf >= HIGH_CONF:
        return True
    if conf <= LOW_CONF:
        return False
    return None


def _explain_suggestion(eth, conf):
    """Return the canned explanation sentence for a proposal's label and score."""
    if conf >= HIGH_CONF:
        return ("I’m very confident it’s ethical and will benefit others." if eth
                else "I’m very confident it’s unethical or risky.")
    if conf <= LOW_CONF:
        return ("I think it’s unethical or risky." if not eth
                else "I’m somewhat confident it’s ethical, though there may be trade-offs.")
    return "I’m somewhat uncertain; it may explore trade-offs."


def interactive_ethics_ppo():
    """Run the console loop: dilemma -> LLM proposals -> labels -> user pick -> PPO suggestion.

    Per round:
      1. Ask the chat model for numbered proposals for the user's dilemma.
      2. Score each proposal with the ethics classifier; scores strictly between
         LOW_CONF and HIGH_CONF are marked uncertain.
      3. Ask the human to label the uncertain proposals.
      4. Let the user pick a proposal (or 'again' to start over).
      5. Step the vectorised env with the picked proposal's position as the action.
      6. Ask the PPO agent for its action and print an explanation for the
         proposal at that position.
      7. Reset the env when the episode is done.

    Typing 'quit' at the dilemma prompt ends the loop.

    NOTE(review): the env scores the fixed ACTION_DESCS text for the chosen
    index, not the LLM proposal at that position, so the printed reward does
    not reflect the proposal text - confirm this is intended.
    """
    # Loop-invariant prompt, built once.
    system_p = (
        "You are an ethical advisor. The user just described a moral dilemma.  "
        "Propose exactly 3 distinct actions (ethical or unethical), numbered 1., 2., 3., "
        "and after each action give a one-sentence justification."
    )
    # The env only understands the action indices described by ACTION_DESCS.
    max_proposals = len(ACTION_DESCS)

    obs_batch = vec_env.reset()
    lstm_states = None  # recurrent state of the agent, carried between predictions
    print(">> Ethical AGI Online with PPO Agent! Describe your dilemma (or 'quit').")

    while True:
        user_text = input("\nYou> ")
        if user_text.lower().strip() == "quit":
            print("Goodbye!")
            break

        # 1) LLM proposals (system instruction first, then the user's message)
        history = [
            {"role": "system", "content": system_p},
            {"role": "user",   "content": user_text},
        ]
        reply = call_chat_model(history)
        # Extra proposals are dropped: picking one would produce an action
        # index the env cannot handle.
        actions = parse_proposals(reply)[:max_proposals]
        if not actions:
            print("Could not parse any numbered proposals from the model reply; please try again.")
            continue

        # 2) Classify & display each; label is None when LOW_CONF < conf < HIGH_CONF
        flags = []
        for desc in actions:
            _, conf = is_action_ethical_dataset(desc)
            flags.append((desc, _triage_label(conf), conf))

        for i, (desc, eth, conf) in enumerate(flags, 1):
            if eth is True:
                mark = f"✅ Ethical   (conf={conf:.2f})"
            elif eth is False:
                mark = f"❌ Unethical (conf={conf:.2f})"
            else:
                mark = f"(uncertain, conf={conf:.2f})"
            print(f"{i}. {desc}\n    {mark}")

        # 3) Human label for uncertain ones
        for idx, (desc, eth, conf) in enumerate(flags):
            if eth is not None:
                continue
            ans = None
            while ans not in ("0", "1"):
                ans = input(f"(low-conf) Is “{desc}” ethical? 0=no,1=yes → ")
            flags[idx] = (desc, ans == "1", conf)

        # 4) Pick one or 'again'
        valid = [str(i) for i in range(1, len(flags) + 1)] + ["again"]
        choice = None
        while choice not in valid:
            choice = input("Pick 1-3 or 'again'> ")
        if choice == "again":
            continue

        user_idx = int(choice) - 1
        user_desc, user_eth, user_conf = flags[user_idx]
        print(f"\nYou chose ▶ {user_desc}\n ")

        # 5) Step the env
        obs_batch, rewards, dones, _ = vec_env.step([user_idx])
        reward = rewards[0]
        print(f"Env → reward={reward:.2f}, ethical={user_eth}, conf={user_conf:.2f}")

        # 6) PPO suggestion + "why".
        # The recurrent policy needs its LSTM state carried across calls, and
        # episode_start tells it when to reset that state (after a done step the
        # vectorised env has already returned the first obs of a new episode).
        raw_act, lstm_states = agent.predict(
            obs_batch,
            state=lstm_states,
            episode_start=dones,
            deterministic=True,
        )
        ppo_idx = int(raw_act[0])

        if ppo_idx < len(flags):
            _, ppo_eth, ppo_conf = flags[ppo_idx]
            explan = _explain_suggestion(ppo_eth, ppo_conf)
            print(f"PPO Agent suggests ▶ {explan} (conf={ppo_conf:.2f})\n")
        else:
            # Fewer proposals were parsed than the agent has actions.
            print(f"PPO Agent suggests option {ppo_idx + 1}, which was not among the parsed proposals.\n")

        # 7) Reset if done
        if dones[0]:
            obs_batch = vec_env.reset()
            lstm_states = None  # fresh observation, so start from a fresh recurrent state
# Start the interactive session when this code runs as the main module.
if __name__ == "__main__":
    interactive_ethics_ppo()

>> Ethical AGI Online with PPO Agent! Describe your dilemma (or 'quit').

You> You’re in charge of a lifeboat with room for 10 people, but there are 12 survivors clinging to it. How do you decide who boards?
1. Hold a lottery where the 12 survivors draw numbers to determine the 10 who will board the lifeboat. This method is fair and impartial, giving each survivor an equal chance of being saved.
    (uncertain, conf=0.57)
2. Prioritize children, elderly, and those with medical conditions or injuries to board the lifeboat first, as they may be more vulnerable and in need of immediate assistance. This decision is based on the principles of maximizing well-being and saving those who are most at risk.
    (uncertain, conf=0.55)
3. Use a merit-based system where survivors with valuable skills or resources that could be beneficial for the group's survival, such as medical training or navigation skills, are given priority to board the lifeboat. This decision is based on the utilitarian princi