In [2]:
import gymnasium as gym
import numpy as np

class CustomCartPoleEnv(gym.Wrapper):
    """Environment wrapper that replaces the step reward with a pluggable function.

    The reward function is called as ``reward_function(observation, action)``
    after every ``step``. It can be installed directly with
    ``set_reward_function`` or built from Python source text with
    ``LLMRewardFunction``. Until one is installed, ``step`` passes through the
    reward produced by the wrapped environment.
    """

    def __init__(self, env):
        """Wrap ``env`` with no custom reward function installed yet."""
        super().__init__(env)
        self.env = env
        # None means "use the wrapped environment's own reward". The original
        # default pointed at LLMRewardFunction, which takes a source string, so
        # calling step() before installing a reward raised TypeError.
        self.reward_function = None

    def step(self, action):
        """Step the wrapped environment and substitute the custom reward.

        Args:
            action: Action forwarded unchanged to the wrapped environment.

        Returns:
            ``(observation, reward, terminated, truncated, info)`` where
            ``reward`` comes from the installed reward function, or from the
            wrapped environment when none is installed.
        """
        observation, env_reward, terminated, truncated, info = self.env.step(action)
        if self.reward_function is None:
            reward = env_reward
        else:
            reward = self.reward_function(observation, action)
        return observation, reward, terminated, truncated, info

    # Example hand-written reward functions, kept for reference:
    #
    #  def angle_based_reward(self, observation, action):
    #      _, _, angle, _ = observation
    #      return np.cos(angle)  # Higher reward when angle is closer to 0 (vertical)
    #
    #  def combined_reward(self, observation, action):
    #      x, _, angle, _ = observation
    #      return np.cos(angle) + (1 - abs(x) / 2.4)  # Combine angle and position rewards

    @staticmethod
    def _strip_code_fences(text):
        """Return ``text`` without a surrounding Markdown code fence, if any.

        Text that does not start with a fence is returned unchanged.
        """
        stripped = text.strip()
        if not stripped.startswith("```"):
            return text
        lines = stripped.splitlines()[1:]  # drop the opening ```/```python line
        if lines and lines[-1].strip() == "```":
            lines.pop()  # drop the closing fence
        return "\n".join(lines) + "\n"

    def LLMRewardFunction(self, function_string):
        """Compile ``function_string`` and install the function it defines.

        Args:
            function_string: Python source expected to define a function
                taking ``(observation, action)``. A surrounding Markdown code
                fence is tolerated.

        Raises:
            ValueError: If executing the source defines nothing callable.

        SECURITY: this runs arbitrary code via ``exec``. Only pass text from a
        source you trust, and review generated code before running it.
        """
        import types  # local import: only needed to recognise plain functions

        source = self._strip_code_fences(function_string)

        # Execute in ONE namespace (a copy of the module globals). With separate
        # globals/locals dicts, exec behaves like a class body and the defined
        # function cannot see imports or helpers declared in the same string.
        module_globals = globals()
        namespace = dict(module_globals)
        exec(source, namespace)

        # Prefer functions actually defined by the executed source: their
        # __globals__ is the namespace dict, unlike imported callables.
        defined_here = [
            item
            for item in namespace.values()
            if isinstance(item, types.FunctionType) and item.__globals__ is namespace
        ]
        if not defined_here:
            # Fall back to any other new or rebound callable (class, partial, ...).
            defined_here = [
                item
                for name, item in namespace.items()
                if callable(item)
                and (name not in module_globals or module_globals[name] is not item)
            ]
        if not defined_here:
            raise ValueError("No function found in the provided string")

        # Set the first such function as the reward function
        self.set_reward_function(defined_here[0])

    def set_reward_function(self, reward_function):
        """Install ``reward_function``; pass ``None`` to use the env reward again.

        Raises:
            TypeError: If ``reward_function`` is neither callable nor ``None``.
        """
        if reward_function is not None and not callable(reward_function):
            raise TypeError("reward_function must be callable or None")
        self.reward_function = reward_function

def simple_angle_policy(observation):
    """Choose an action from the sign of the pole angle.

    Args:
        observation: Four-element sequence; only the third element (the
            angle) is used.

    Returns:
        ``1`` when the angle is strictly positive, otherwise ``0``.
    """
    _first, _second, pole_angle, _fourth = observation
    if pole_angle > 0:
        return 1
    return 0

# def position_and_angle_policy(observation):
#     x, _, angle, _ = observation
#     if abs(x) > 1.0:  # If cart is far from center
#         return 0 if x > 0 else 1  # Move towards center
#     else:
#         return 1 if angle > 0 else 0  # Otherwise, balance the pole


In [17]:
import anthropic

# Create and wrap the environment
env = gym.make("CartPole-v1", render_mode="human")
env = CustomCartPoleEnv(env)

# API query: ask the model to write the reward function.
# SECURITY: never hardcode the API key in the notebook. With no api_key
# argument the client defaults to os.environ.get("ANTHROPIC_API_KEY").
# Any key that was ever saved in this notebook must be treated as leaked
# and revoked.
client = anthropic.Anthropic()
generatedRewardFunction = client.messages.create(
    model="claude-3-5-sonnet-20240620",
    max_tokens=1024,
    messages=[
        {"role": "user",
         "content": """You are a python code outputter. I want your output to only be python code and just be one function. No other text with it the output. 
         This function will be a reward function, named LLMOutput(), for a RL environment that follows the description below. 
         The inputs are observation and action in that order, the input observation can be broken down as follows: x, _, angle,_ = observation :
         
         This environment is a pole balanced on a cart that can move from left to right, 
         the idea is to keep the pole as upright as possible by moving the cart either left or right, 
         the information you have available to you is the position(x) and the angle(angle)."""}
    ]
)
# Show the generated source so it can be reviewed before it is executed.
print(generatedRewardFunction.content[0].text)


# generatedRewardFunction = """def LLMOutput(observation, action):
#    x, _, angle, _ = observation
#    reward = 1.0 - abs(angle) - 0.5 * abs(x)
#    if abs(angle) > 0.2 or abs(x) > 2.4:
#        reward -= 10
#    return reward
# """#API call to Claude

# Install the generated reward function, pick the policy, and run the episode
# loop. Everything sits inside try/finally so the environment (and its render
# window) is closed even if the generated reward code fails to compile or
# raises during a step.
try:
    # Set initial policy and reward function
    env.LLMRewardFunction(generatedRewardFunction.content[0].text)
    current_policy = simple_angle_policy  # Set the policy

    # Main loop
    observation, info = env.reset(seed=42)
    for _ in range(1000):
        action = current_policy(observation)
        observation, reward, terminated, truncated, info = env.step(action)

        # Start a new episode as soon as the current one ends.
        if terminated or truncated:
            observation, info = env.reset()
finally:
    env.close()

def LLMOutput(observation, action):
    """Score one step from the cart position and pole angle.

    Args:
        observation: Four-element sequence; the first element is used as the
            position ``x`` and the third as the pole ``angle``.
        action: Action taken this step; only compared against ``0``.

    Returns:
        The sum of the terms below, accumulated in this order:
        an uprightness term ``max(0, 1 - |angle| / 0.2)``,
        a position term ``max(-1, -|x| / 2.4)``,
        ``+0.1`` when ``action != 0``,
        ``+0.5`` when ``|angle| < 0.05``,
        ``-1`` when ``|angle| > 0.3``.
    """
    cart_x, _, pole_angle, _ = observation
    tilt = abs(pole_angle)
    offset = abs(cart_x)

    total = 0

    # Uprightness: 1 when vertical, falling linearly and floored at 0.
    total += max(0, 1 - tilt / 0.2)

    # Distance from centre: 0 at the centre, falling linearly and floored at -1.
    total += max(-1, -offset / 2.4)

    # Small bonus whenever the action is non-zero.
    # NOTE(review): if the environment only offers two push actions (0 and 1),
    # this favors action 1 rather than rewarding "taking action" -- confirm intent.
    if action != 0:
        total += 0.1

    # Extra bonus for being very close to vertical.
    if tilt < 0.05:
        total += 0.5

    # Extra penalty for extreme angles.
    if tilt > 0.3:
        total -= 1

    return total
