In [None]:
# EthicsAI: Model training

!pip install datasets transformers gymnasium sb3-contrib

import gymnasium as gym
from gymnasium import spaces
import numpy as np
import torch
import torch.nn as nn
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sb3_contrib.ppo_recurrent import RecurrentPPO
from sb3_contrib.ppo_recurrent.policies import MlpLstmPolicy




In [None]:
!pip uninstall -y transformers tokenizers
!pip install transformers datasets --upgrade

Found existing installation: transformers 4.51.3
Uninstalling transformers-4.51.3:
  Successfully uninstalled transformers-4.51.3
Found existing installation: tokenizers 0.21.1
Uninstalling tokenizers-0.21.1:
  Successfully uninstalled tokenizers-0.21.1
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/cli/base_command.py", line 179, in exc_logging_wrapper
    status = run_func(*args)
             ^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/cli/req_command.py", line 67, in wrapper
    return func(self, options, args)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/commands/install.py", line 324, in run
    session = self.get_default_session(options)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/cli/index_command.py", line 71, in get_default_session
    self._session = self.enter_context(self._

In [None]:
# Load & tokenize the Hendrycks/Ethics commonsense split
import transformers, inspect
from datasets import load_dataset
from transformers import AutoTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch

# Fetch the tokenizer for the BERT checkpoint and the "commonsense" configuration
# of the Hendrycks ETHICS dataset (the two loads are independent of each other).
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
raw_ds = load_dataset(
    "hendrycks/ethics",
    "commonsense",
)

def tokenize_fn(examples):
    """Tokenize a batch of examples and attach their labels.

    Args:
        examples: mapping with an ``"input"`` entry (the texts) and a
            ``"label"`` entry (their labels).

    Returns:
        The tokenizer output for the texts, truncated/padded to 128 tokens,
        with the labels stored unchanged under the ``"labels"`` key.
    """
    encoded = tokenizer(
        examples["input"],
        max_length=128,
        padding="max_length",
        truncation=True,
    )
    encoded["labels"] = examples["label"]
    return encoded

# Tokenize every split, then drop every column except the three that are fed to
# the model (the list of columns to drop is derived from the train split).
tok_ds = raw_ds.map(tokenize_fn, batched=True)
tok_ds = tok_ds.remove_columns(
    [
        name
        for name in tok_ds["train"].column_names
        if name not in ("input_ids", "attention_mask", "labels")
    ]
)
tok_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

train_dataset, eval_dataset = tok_ds["train"], tok_ds["validation"]
# (optional) test_dataset = tok_ds["test"]


# Fine-tune a BERT classifier (two output classes, matching the 0/1 labels)
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
)

training_args = TrainingArguments(
    output_dir="./ethics_model",
    overwrite_output_dir=True,
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    num_train_epochs=3,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

trainer.train()

# The Trainer was not given the tokenizer, so save_model() writes only the model
# weights/config. Save the tokenizer explicitly so "./ethics_model" is a complete,
# self-contained checkpoint.
trainer.save_model("./ethics_model")
tokenizer.save_pretrained("./ethics_model")


# Reload the trained classifier from disk and put it in inference mode
# (nn.Module.eval() returns the module itself, so the call can be chained).
ethics_model = BertForSequenceClassification.from_pretrained("./ethics_model").eval()

def is_action_ethical_dataset(description: str):
    """Judge an action description with the fine-tuned ETHICS classifier.

    The Hendrycks ETHICS "commonsense" split labels a scenario 1 when it is
    morally unacceptable and 0 when it is acceptable, so class 1 is read as
    "unethical" here.

    Args:
        description: free-text description of the action.

    Returns:
        A ``(ethical, confidence)`` tuple:
          - ethical (bool): True when class 0 ("acceptable") is strictly more
            likely than class 1 ("unacceptable").
          - confidence (float): probability of the predicted class, in [0.5, 1].
    """
    inputs = tokenizer(
        description,
        truncation=True,
        padding=True,
        return_tensors="pt"
    )
    # Keep the inputs on the same device as the model so the call also works
    # when the classifier has been moved to a GPU.
    device = getattr(ethics_model, "device", None)
    if device is not None:
        inputs = {key: value.to(device) for key, value in inputs.items()}

    with torch.no_grad():
        logits = ethics_model(**inputs).logits
        p_acceptable, p_unacceptable = torch.softmax(logits, dim=1)[0].tolist()

    ethical = p_acceptable > p_unacceptable
    confidence = float(max(p_acceptable, p_unacceptable))
    return ethical, confidence

Map:   0%|          | 0/13910 [00:00<?, ? examples/s]

Map:   0%|          | 0/3885 [00:00<?, ? examples/s]

Map:   0%|          | 0/3964 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return LooseVersion(v) >= LooseVersion(check)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmuyaoz[0m ([33mmuyaoz-columbia-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
  self.scope.user = {"email": email}  # noqa


Epoch,Training Loss,Validation Loss
1,0.5847,0.434352
2,0.4265,0.37281
3,0.2979,0.398348


In [None]:
# Define EthicalEnv with Dataset Mode
class EthicalEnv(gym.Env):
    """Toy three-action environment whose reward is reduced for unethical actions.

    Observations are random vectors in [0, 1)^3, resampled on every step and
    independent of the chosen action. Each action has a fixed base reward and a
    fixed text description; the description is passed to an ethics check, and a
    penalty of ``3 * confidence`` is subtracted when it is judged unethical.
    An episode terminates after ``EPISODE_LENGTH`` steps.

    The ethics check is selected by the module-level ``ethics_mode`` variable,
    which must be defined before the first call to ``step``.
    """

    EPISODE_LENGTH = 10

    def __init__(self):
        super().__init__()
        self.action_space = spaces.Discrete(3)
        self.observation_space = spaces.Box(0, 1, shape=(3,), dtype=np.float32)
        self.state = self._sample_state()
        self.step_count = 0

    def _sample_state(self):
        """Draw a fresh observation from the environment's own seeded RNG."""
        return self.np_random.random(3).astype(np.float32)

    def reset(self, *, seed=None, options=None, **kwargs):
        """Start a new episode.

        Args:
            seed: optional seed for the environment RNG (Gymnasium contract).
            options: accepted for API compatibility; unused.
            **kwargs: ignored; kept so existing callers keep working.

        Returns:
            ``(observation, info)`` with an empty ``info`` dict.
        """
        # Honour the seed so rollouts can be reproduced.
        super().reset(seed=seed)
        self.state = self._sample_state()
        self.step_count = 0
        return self.state, {}

    def step(self, action):
        """Apply ``action`` and return ``(obs, reward, terminated, truncated, info)``.

        ``info`` carries the ethics verdict under ``"ethical"`` and
        ``"confidence"``. ``truncated`` is always False.
        """
        # Vectorised wrappers may hand over NumPy integers / 0-d arrays.
        action = int(action)

        # Base reward and description
        if action == 0:
            r_base, desc = 1.0, "The agent helped others."
        elif action == 1:
            r_base, desc = 0.5, "The agent explored the environment."
        else:
            r_base, desc = 2.0, "The agent maximized efficiency but caused harm."

        # Ethics check
        # NOTE(review): rule_based_ethics and human_feedback_override are not
        # defined in the visible part of this notebook -- confirm they exist
        # before using any mode other than "dataset".
        if ethics_mode == "dataset":
            ethical, conf = is_action_ethical_dataset(desc)
        elif ethics_mode == "rule":
            ethical, conf = rule_based_ethics(desc)
        else:  # human (fallback)
            ethical, conf = human_feedback_override(desc)

        penalty = 0 if ethical else -conf * 3
        reward = r_base + penalty
        self.state = self._sample_state()
        self.step_count += 1
        done = self.step_count >= self.EPISODE_LENGTH
        info = {"ethical": ethical, "confidence": conf}
        return self.state, float(reward), done, False, info

In [None]:
# Train Agent with Dataset-Driven Rewards

# EthicalEnv.step reads this global to choose the ethics check ("dataset" = BERT classifier)
ethics_mode = "dataset"
env = EthicalEnv()

agent = RecurrentPPO(MlpLstmPolicy, env, verbose=1)
agent.learn(total_timesteps=10_000)

# Persist the trained policy
agent.save("./ppo_ethics_agent")

# To reload later:
# agent = RecurrentPPO.load("./ppo_ethics_agent", env=env)


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 10       |
|    ep_rew_mean     | -13.3    |
| time/              |          |
|    fps             | 31       |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 128      |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 10          |
|    ep_rew_mean          | -12.8       |
| time/                   |             |
|    fps                  | 31          |
|    iterations           | 2           |
|    time_elapsed         | 8           |
|    total_timesteps      | 256         |
| train/                  |             |
|    approx_kl            | 4.96828e-05 |
|    clip_fraction        | 0           |
|    clip_range           | 0.2         |
|    entropy_loss  

In [None]:
import os
import re
import numpy as np
from openai import OpenAI

# SECURITY: never commit an API key in a notebook. The key that used to be
# hardcoded on this line has been exposed and must be revoked in the OpenAI
# dashboard. The key is now taken from the environment, or prompted for
# without echoing it to the notebook output.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass

    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

client = OpenAI()  # reads OPENAI_API_KEY from the environment


['Plant trees to combat deforestation.', "Hack into a competitor's database to steal sensitive information.", 'Volunteer at a local shelter to support those in need.']
['Plant trees in the community park to combat deforestation.', "Spread false rumors to tarnish a competitor's reputation.", 'Buy fair-trade products to support ethical labor practices.']


In [None]:
def _parse_action_list(text, n):
    """Extract up to ``n`` unique, period-terminated actions from a list-style reply."""
    # A list item starts with "1." / "1)" or with a "-", "*", "•" bullet.
    marker = re.compile(r'^\s*(?:\d+[.)]\s*|[-*•]\s+)')
    marked, unmarked = [], []
    for line in (text or "").splitlines():
        body, hits = marker.subn("", line, count=1)
        # strip surrounding quotes and the trailing period, then re-add one period
        body = body.strip().lstrip('"“').strip().rstrip('."”').strip()
        if not body:
            continue
        (marked if hits else unmarked).append(body + ".")
    # Lines without a list marker are usually chatter ("Here are the actions:");
    # only fall back to them when the reply contained no list items at all.
    chosen = marked or unmarked
    unique = list(dict.fromkeys(chosen))  # order-preserving de-duplication
    return unique[:max(n, 0)]


def generate_candidate_actions(state, n=5):
    """Ask the chat model for ``n`` short action descriptions, ethical and unethical.

    Args:
        state: observation vector (NumPy array or any iterable of numbers); it is
            only embedded in the prompt text.
        n: number of actions requested. ``n <= 0`` returns ``[]`` without
            calling the API.

    Returns:
        A list of at most ``n`` unique strings, each ending with a period. It can
        be shorter than ``n`` (or empty) when the reply has fewer usable lines.
    """
    if n <= 0:
        return []

    # turn state into a plain Python list
    s = state.tolist() if hasattr(state, "tolist") else list(state)

    # a small primer of ETHICAL vs UNETHICAL examples
    primer = [
        "1. Rescue the child trapped under debris.",
        "2. Steal confidential data for personal gain.",
        "3. Donate vaccines to under-served clinics.",
        "4. Spread false rumors to sabotage a colleague."
    ]

    system_prompt = (
        "You are an AI assistant that invents novel, concise actions—some ethical, some unethical—"
        "based on a numeric observation vector.  "
        "You must NOT repeat or paraphrase any of the primer examples "
        "or anything you've generated before."
    )
    user_prompt = (
        "Primer examples:\n" +
        "\n".join(primer) +
        f"\n\nObservation vector: {s}\n"
        f"Generate {n} **brand-new**, imperative, ethically meaningful or clearly unethical actions.  "
        "Number them 1., 2., etc., and return ONLY the numbered list."
    )

    resp = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user",   "content": user_prompt}
        ],
        temperature=0.9,
        max_tokens=200,
    )

    # The message content can be None (e.g. a refused or empty completion).
    content = resp.choices[0].message.content if resp.choices else None
    return _parse_action_list(content, n)

# quick sanity test (note: this sends a real chat-completion request through `client`):
print(generate_candidate_actions([0.2,0.8,0.1], n=5))

# Human-in-the-loop cycle (with 5 cands and dedupe across cycles)

def _ask_binary_label(prompt):
    """Prompt until the annotator types exactly 0 or 1, and return it as an int."""
    while True:
        answer = input(prompt).strip()
        if answer in ("0", "1"):
            return int(answer)
        # A typo must not abort the session and discard the labels gathered so far.
        print("Please answer 0 (no) or 1 (yes).")


def human_loop_cycle(env, agent, ethics_model, tokenizer, n_cycles=200, n_cands=5):
    """Collect human ethics labels for model-generated candidate actions.

    For each cycle the environment is reset, candidate actions are generated for
    the resulting state, candidates already shown in an earlier cycle are
    dropped, and the annotator is asked to label each remaining one.

    Args:
        env: environment whose ``reset()`` returns ``(state, info)``.
        agent: policy exposing ``predict(state, deterministic=True)``; its output
            is not used.
        ethics_model: unused; kept for interface compatibility.
        tokenizer: unused; kept for interface compatibility.
        n_cycles: number of reset/generate/label rounds.
        n_cands: number of candidates requested per round.

    Returns:
        ``(texts, labels)``: parallel lists of action descriptions and their
        human labels (1 = ethical, 0 = not ethical).
    """
    all_texts, all_labels = [], []
    seen_actions = set()

    for i in range(1, n_cycles + 1):
        state, _ = env.reset()
        # roll out your policy just to step to a new state (optional); result discarded
        _ = agent.predict(state, deterministic=True)

        # generate and dedupe against everything shown so far
        cands = []
        for cand in generate_candidate_actions(state, n=n_cands):
            if cand not in seen_actions:
                seen_actions.add(cand)
                cands.append(cand)

        print(f"\nCycle {i:03d}, state = {state}")
        for desc in cands:
            lbl = _ask_binary_label(f'Is this action ethical? 0=no, 1=yes  →  "{desc}" : ')
            all_texts.append(desc)
            all_labels.append(lbl)

    return all_texts, all_labels

# Run 20 labelling cycles with up to 5 candidates each, i.e. at most ~100 human labels

ethics_mode = "dataset"
env = EthicalEnv()
agent = RecurrentPPO(MlpLstmPolicy, env, verbose=1)

# ethics_model / tokenizer are the BERT reward model and its tokenizer
human_texts, human_labels = human_loop_cycle(
    env, agent, ethics_model, tokenizer, n_cycles=20, n_cands=5
)
print("Collected:", len(human_texts), "labels")

['Exploit insider information for financial advantage.', "Advocate for marginalized communities' rights in policy decisions.", 'Betray a trusted friendship for personal advancement.', 'Implement sustainable practices to reduce environmental impact.', 'Mislead investors about company performance to inflate stock value.']
Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.

Cycle 001, state = [0.20393907 0.43365607 0.6320831 ]
Is this action ethical? 0=no, 1=yes  →  "Diversify company hiring practices to promote inclusivity." : 1
Is this action ethical? 0=no, 1=yes  →  "Falsify academic credentials to secure a job position." : 0
Is this action ethical? 0=no, 1=yes  →  "Lobby for legislation to protect endangered species." : 1
Is this action ethical? 0=no, 1=yes  →  "Bribe officials to expedite approval processes." : 0
Is this action ethical? 0=no, 1=yes  →  "Implement strict background checks for employees to ensure workplace safety." : 1

Cycle

In [None]:
# 1) Turn your human feedback into a HuggingFace Dataset
from datasets import Dataset

# assume human_texts and human_labels are your lists of strings and 0/1 ints
# Annotators answered "Is this action ethical? 0=no, 1=yes", i.e. 1 = ethical.
# The classifier being fine-tuned was trained on Hendrycks ETHICS "commonsense",
# where 1 = unacceptable and 0 = acceptable. Convert the human answers to the
# classifier's convention, otherwise fine-tuning teaches it the opposite labels.
human_ds = Dataset.from_dict({
    "input": human_texts,
    "label": [0 if is_ethical else 1 for is_ethical in human_labels],
})

# 2) Tokenize it just like before
def tokenize_fn(examples):
    """Tokenize the ``"input"`` texts of a batch and carry the labels along.

    The texts are truncated/padded to 128 tokens; the batch's ``"label"``
    values are stored unchanged under ``"labels"`` in the returned encoding.
    """
    batch = tokenizer(examples["input"], truncation=True, padding="max_length", max_length=128)
    batch["labels"] = examples["label"]
    return batch

# Tokenize the human-labelled set, drop the raw text/label columns and expose
# the model inputs as torch tensors.
tok_human = human_ds.map(tokenize_fn, batched=True).remove_columns(["input", "label"])
tok_human.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# 4) Fine-tune your existing ethics_model on this human-labeled set
from transformers import Trainer, TrainingArguments

# Short fine-tuning run: no evaluation and no intermediate checkpoints
training_args = TrainingArguments(
    output_dir="./ethics_model_updated",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    learning_rate=2e-5,
    eval_strategy="no",
    save_strategy="no",
    seed=42,
)

# Continue training the classifier that was loaded from "./ethics_model"
trainer = Trainer(model=ethics_model, args=training_args, train_dataset=tok_human)
trainer.train()

# Write the weights and the tokenizer into the same folder
trainer.save_model("./ethics_model_updated")
tokenizer.save_pretrained("./ethics_model_updated")

# 5) Reload the updated model and tokenizer from disk, in inference mode
tokenizer = AutoTokenizer.from_pretrained("./ethics_model_updated")
ethics_model = BertForSequenceClassification.from_pretrained("./ethics_model_updated")
ethics_model.eval()

Map:   0%|          | 0/88 [00:00<?, ? examples/s]

Step,Training Loss


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
!pip install 'shimmy>=2.0'
!pip install gymnasium



In [None]:
from stable_baselines3.common.vec_env import DummyVecEnv, VecMonitor
from sb3_contrib import RecurrentPPO
from transformers import AutoTokenizer, BertForSequenceClassification
import torch
import gym
import gymnasium as gym
from gymnasium import spaces


import numpy as np

# 1) Reload your fine-tuned ethics classifier + tokenizer
# Load the classifier in inference mode (eval() returns the module) plus its tokenizer
ethics_model = BertForSequenceClassification.from_pretrained("./ethics_model_updated").eval()
tokenizer = AutoTokenizer.from_pretrained("./ethics_model_updated")

def is_action_ethical_dataset(desc: str):
    """Judge an action description with the fine-tuned ethics classifier.

    Class 1 is "impermissible" in Hendrycks' setup, so the action counts as
    ethical when the class-1 probability is below 0.5.

    Args:
        desc: free-text description of the action.

    Returns:
        ``(ethical, confidence)`` where ``confidence`` is the probability of the
        predicted verdict (in [0.5, 1]). It previously returned the probability
        of "ethical" even for unethical verdicts, which made the caller's
        ``-confidence * 3`` penalty smaller the more certain the model was.
    """
    inputs = tokenizer(desc, return_tensors="pt", truncation=True, padding=True)
    # Keep the inputs on the model's device so this also works on a GPU.
    device = getattr(ethics_model, "device", None)
    if device is not None:
        inputs = {key: value.to(device) for key, value in inputs.items()}

    with torch.no_grad():
        logits = ethics_model(**inputs).logits
    p_impermissible = float(torch.softmax(logits, dim=1)[0].tolist()[1])

    ethical = p_impermissible < 0.5
    confidence = 1.0 - p_impermissible if ethical else p_impermissible
    return ethical, float(confidence)

# 2) Define your Ethics-wrapped environment
class CustomRewardEnv(gym.Env):
    """Wrapper that subtracts an ethics penalty from a base environment's reward.

    The action and observation spaces are taken from ``base_env``. After each
    step the action's text description is scored by ``is_action_ethical_dataset``
    and ``3 * confidence`` is subtracted from the reward when the action is
    judged unethical.

    If the base environment already reports an ethics verdict in ``info`` (as
    ``EthicalEnv.step`` does), its reward already contains the penalty, so the
    step is passed through unchanged instead of penalising the action twice.
    """

    # Text descriptions scored by the ethics classifier, keyed by discrete action.
    _DESCRIPTIONS = {
        0: "The agent helped others.",
        1: "The agent explored the environment.",
        2: "The agent maximized efficiency but caused harm.",
    }

    def __init__(self, base_env):
        super().__init__()
        self.env = base_env
        self.action_space = base_env.action_space
        self.observation_space = base_env.observation_space

    def reset(self, **kwargs):
        """Reset the wrapped environment and forward its ``(obs, info)``."""
        obs, info = self.env.reset(**kwargs)
        return obs, info

    def step(self, action):
        """Step the wrapped env and return ``(obs, reward, done, truncated, info)``.

        Raises:
            KeyError: if ``action`` has no entry in ``_DESCRIPTIONS``.
        """
        obs, base_reward, done, truncated, info = self.env.step(action)

        # The base env has already applied its own ethics penalty: do not double count.
        if "ethical" in info and "confidence" in info:
            return obs, float(base_reward), done, truncated, info

        # int() makes the lookup work for NumPy integers and 0-d arrays as well.
        desc = self._DESCRIPTIONS[int(action)]
        ethical, conf = is_action_ethical_dataset(desc)
        penalty = 0 if ethical else -conf * 3
        reward = float(base_reward + penalty)
        info.update({"ethical": ethical, "confidence": conf})
        return obs, reward, done, truncated, info

# 3) Instantiate & vectorize
# Build the env stack: original env -> ethics wrapper -> single-env vector -> monitor
base_env = EthicalEnv()
wrapped = CustomRewardEnv(base_env)
vec_env = VecMonitor(DummyVecEnv([lambda: wrapped]))  # VecMonitor logs ep_rew, ep_len

# 4) Train with RecurrentPPO
agent = RecurrentPPO(
    "MlpLstmPolicy",
    vec_env,
    learning_rate=3e-4,
    n_steps=128,
    batch_size=16,
    ent_coef=0.0,
    verbose=1,
)
agent.learn(total_timesteps=50_000)
agent.save("./ppo_ethics_agent_v2")

[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
|    value_loss           | 0.00149   |
---------------------------------------
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 10        |
|    ep_rew_mean          | 20        |
| time/                   |           |
|    fps                  | 12        |
|    iterations           | 154       |
|    time_elapsed         | 1548      |
|    total_timesteps      | 19712     |
| train/                  |           |
|    approx_kl            | 0.0       |
|    clip_fraction        | 0         |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.000209 |
|    explained_variance   | 1         |
|    learning_rate        | 0.0003    |
|    loss                 | 0.00145   |
|    n_updates            | 1530      |
|    policy_gradient_loss | -2.39e-08 |
|    value_loss           | 0.0018    |
---------------------------------------
---------------------------------------

In [None]:
# 1) Install shimmy (so your old gym.Env still works) and the HF tools
!pip install -q shimmy huggingface_hub

# 2) Imports
import os
from huggingface_hub import login, HfApi
from transformers import AutoTokenizer, BertForSequenceClassification
from stable_baselines3.common.vec_env import DummyVecEnv, VecMonitor
from sb3_contrib import RecurrentPPO

# 3) Save everything locally in HF-compatible folders.
#    (These artifacts used to be written twice in a row; once is enough.)
os.makedirs("ethics_model_updated", exist_ok=True)
tokenizer.save_pretrained("ethics_model_updated")
ethics_model.save_pretrained("ethics_model_updated")

# SB3 save() appends ".zip", producing `ppo_ethics_agent_v2.zip`
agent.save("ppo_ethics_agent_v2")

# 4) Log in to Hugging Face
#    This will prompt you to paste your HF token in an input box.
login()

# 5) Push your reward model + tokenizer to the Hub.
#    `token=True` uses the token stored by login(); it replaces the deprecated
#    `use_auth_token=True` argument.
MODEL_REPO = "Kelllll/ethics-reward-model"
tokenizer.push_to_hub(MODEL_REPO, token=True)
ethics_model.push_to_hub(MODEL_REPO, token=True)

# 6) Push your SB3 policy ZIP up to a separate repo
API = HfApi()
POLICY_REPO = "Kelllll/ppo-ethics-agent"

# create the repo (if it doesn’t already exist)
API.create_repo(POLICY_REPO, repo_type="model", exist_ok=True)

# upload the zip
API.upload_file(
    path_or_fileobj="ppo_ethics_agent_v2.zip",
    path_in_repo="ppo_ethics_agent_v2.zip",
    repo_id=POLICY_REPO,
    token=True
)

print(" Reward model, tokenizer and PPO policy have been pushed to:")
print(f"   https://huggingface.co/{MODEL_REPO}")
print(f"   https://huggingface.co/{POLICY_REPO}")


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…



README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

ppo_ethics_agent_v2.zip:   0%|          | 0.00/6.95M [00:00<?, ?B/s]

 Reward model, tokenizer and PPO policy have been pushed to:
   https://huggingface.co/Kelllll/ethics-reward-model
   https://huggingface.co/Kelllll/ppo-ethics-agent
