<a href="https://colab.research.google.com/github/RizanSM/zero_shot_llms_in_HIL_RL/blob/main/02_reacher_env/03_BIASED_HF_D_AGG/01_Policy_Training_BIASED_HF_D_AGG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gymnasium[mujoco] mujoco stable-baselines3 -q

In [None]:
# Import the necessary libraries
import gymnasium as gym
import numpy as np
import pandas as pd
import pickle
import os
import matplotlib.pyplot as plt

In [None]:
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from google.colab import data_table
from google.colab import drive

# Mount Google Drive at /content/drive (Colab only); the dataset, log and
# model paths used in the cells below all live under this mount point.
drive.mount('/content/drive')

In [None]:
# Load the dataframe back from the pickle file
# NOTE(review): unpickling can execute arbitrary code - only load files you created or trust.
trajectory_df = pd.read_pickle('/content/drive/MyDrive/data3_rp1/2_trajectories/0_initial_training/0_initial_trajectory_reacher_df.pkl')      # Update directory location 1

In [None]:
# Display the data frame as an interactive Colab table
# (the DataTable is the last expression, so it becomes the cell output).
data_table.enable_dataframe_formatter()
data_table.DataTable(trajectory_df)

In [None]:
# Check the Python type stored in each column, using the row with index label 0
for column_name in ('Episode', 'Timestep', 'State', 'Action', 'Reward', 'Next State'):
    print(type(trajectory_df[column_name][0]))

In [None]:
# Show the pandas dtype of every column.
trajectory_df.dtypes

HUMAN FEEDBACK IMPLEMENTATION (BIASED SCENARIO: AGGRESSIVE)



In [None]:
def calculate_different_rewards(trajectory_df):
    """Add distance, smoothness and effort reward columns to a copy of the data.

    The input frame is not modified.

    Args:
        trajectory_df: DataFrame with a 'State' and an 'Action' column whose
            cells are sequences accepted by ``np.array``.

    Returns:
        A copy of ``trajectory_df`` with three extra columns, each a negated
        ``np.linalg.norm`` of part of the row:
            'Distance Reward'   - norm of State[8:10]
            'Smoothness Reward' - norm of State[6:8]
            'Effort Reward'     - norm of the full Action vector
    """
    states = trajectory_df['State']
    actions = trajectory_df['Action']

    rewards_df = trajectory_df.copy()

    # State[8:10]: presumably the fingertip-to-target offset of the Reacher
    # observation - TODO confirm against the environment documentation.
    rewards_df['Distance Reward'] = states.apply(
        lambda state: -np.linalg.norm(np.array(state)[8:10], axis=0)
    )

    # State[6:8]: presumably the joint angular velocities - TODO confirm.
    rewards_df['Smoothness Reward'] = states.apply(
        lambda state: -np.abs(np.linalg.norm(np.array(state)[6:8]))
    )

    rewards_df['Effort Reward'] = actions.apply(
        lambda action: -np.linalg.norm(np.array(action), axis=0)
    )
    return rewards_df

In [None]:
# Add the distance / smoothness / effort reward columns to a copy of the trajectories.
different_rewards_df = calculate_different_rewards(trajectory_df)

In [None]:
# Inspect the per-component reward columns as an interactive Colab table.
data_table.enable_dataframe_formatter()
data_table.DataTable(different_rewards_df)

In [None]:
def compute_weighted_rewards(df, smoothness_bias=0, effort_bias=2):
    """Build the smoothness- and effort-weighted reward columns.

    Each component gets a score equal to the inverse of its value spread
    (``1 / |max - min|``). The two scores are turned into weights with a
    softmax, and each weight is then multiplied by its bias factor.

    The softmax is computed in a numerically stable way (scores are shifted by
    their maximum before exponentiating), so very small spreads no longer
    overflow ``np.exp`` and produce NaN weights. A constant column (zero
    spread) has an infinite score; it receives the full softmax weight, shared
    equally if both columns are constant.

    Args:
        df: DataFrame with 'Distance Reward', 'Smoothness Reward' and
            'Effort Reward' columns. It is not modified.
        smoothness_bias: Multiplier applied to the smoothness weight. The
            default of 0 removes the smoothness term (the "aggressive" bias).
        effort_bias: Multiplier applied to the effort weight. The default of 2
            doubles it (the "aggressive" bias).

    Returns:
        A copy of ``df`` with two extra columns:
            'Smoothness Weighted Reward' = -Distance - lambda * Smoothness
            'Effort Weighted Reward'     = -Distance - epsilon * Effort
    """
    # Step 1: Work on a copy so the caller's frame is left untouched
    df = df.copy()

    def _inverse_range(column):
        """Return 1 / |max - min| of a column, or +inf when the spread is zero."""
        span = abs(column.max() - column.min())
        # Explicit branch instead of relying on a division-by-zero warning.
        return np.inf if span == 0 else 1 / span

    def _softmax_pair(first, second):
        """Return the softmax of two scores without overflowing np.exp."""
        scores = np.array([first, second], dtype=float)
        unbounded = np.isinf(scores)
        if unbounded.any():
            # Limit of the softmax as a score goes to +inf: the infinite
            # entries share all of the weight.
            weights = unbounded / unbounded.sum()
        else:
            # Shifting by the maximum leaves the result unchanged but keeps
            # every exponent <= 0, so np.exp cannot overflow.
            shifted = np.exp(scores - scores.max())
            weights = shifted / shifted.sum()
        return weights[0], weights[1]

    # Steps 2-3: Inverse-spread score of each component
    xsmooth = _inverse_range(df["Smoothness Reward"])
    xeffort = _inverse_range(df["Effort Reward"])

    print(f"Smoothness Score Range: {xsmooth}")
    print(f"Effort Score Range: {xeffort}")

    # Step 4: Smoothness weight (lambda) and effort weight (epsilon) via softmax
    lambda_smooth, epsilon_effort = _softmax_pair(xsmooth, xeffort)

    # Adjust the weights by the bias factors (defaults: aggressive bias)
    lambda_smooth = lambda_smooth * smoothness_bias                             # bias value 1
    epsilon_effort = epsilon_effort * effort_bias                               # bias value 2

    print(f"Smoothness Weight (λ): {lambda_smooth}")
    print(f"Effort Weight (ε): {epsilon_effort}")

    # Step 5: Smoothness weighted reward (R1) and effort weighted reward (R2)
    df["Smoothness Weighted Reward"] = -df["Distance Reward"] - lambda_smooth * df["Smoothness Reward"]
    df["Effort Weighted Reward"] = -df["Distance Reward"] - epsilon_effort * df["Effort Reward"]

    return df

In [None]:
# Apply the weighting; the function also prints the scores and weights it derives.
reward_df = compute_weighted_rewards(different_rewards_df)

In [None]:
# Inspect the weighted reward columns as an interactive Colab table.
data_table.enable_dataframe_formatter()
data_table.DataTable(reward_df)

In [None]:
# Function to recalibrate the rewards
def recalibrate_rewards(df, alpha=-1):
    """Combine the environment reward with the two weighted feedback terms.

    Computes, per row:
        Recalibrated Reward = Reward
            + alpha * (Smoothness Weighted Reward + Effort Weighted Reward)

    Args:
        df: DataFrame with 'Reward', 'Smoothness Weighted Reward' and
            'Effort Weighted Reward' columns. It is not modified.
        alpha: Scale applied to the summed feedback terms. The default of -1
            subtracts them from the environment reward.

    Returns:
        A tuple ``(recalibrated_df, recalibrated_rewards_list)``: a copy of
        ``df`` with a 'Recalibrated Reward' column, and that column as a
        plain Python list in row order.
    """
    recalibrated = df.copy()

    feedback_term = recalibrated['Smoothness Weighted Reward'] + recalibrated['Effort Weighted Reward']
    recalibrated['Recalibrated Reward'] = recalibrated['Reward'] + alpha * feedback_term

    return recalibrated, recalibrated['Recalibrated Reward'].tolist()

In [None]:
# Apply the function to recalibrate rewards.
# recalibrated_rewards_list is the flat list that custom_reward() indexes by step.
recalibrated_df, recalibrated_rewards_list = recalibrate_rewards(reward_df)

In [None]:
# Inspect the recalibrated rewards as an interactive Colab table.
data_table.enable_dataframe_formatter()
data_table.DataTable(recalibrated_df)

In [None]:
# Persist the recalibrated dataframe to Google Drive as a pickle file.
recalibrated_df.to_pickle('/content/drive/MyDrive/data3_rp1/2_trajectories/1_human_feedback/2_Hf_D_Aggressive_Reacher_df_5.pkl')         # Update directory location 2

In [None]:
# Print every recalibrated reward together with its step index.
for i, human_recalibrated_reward_for_step in enumerate(recalibrated_rewards_list):
    print(f"Recalibrated reward for step {i}: {human_recalibrated_reward_for_step}")

In [None]:
# Step A.5.1: CUSTOM REWARD FUNCTION
def custom_reward(self, env, state, action, next_state, reward, done):
    """Return the pre-computed recalibrated reward for the current step.

    The value is read from the module-level ``recalibrated_rewards_list`` at
    the position held in the global ``step_counter``, which is then advanced
    by one. The counter is created with value 0 if it does not exist yet.

    All parameters are accepted for interface compatibility only; none of
    them, including the incoming ``reward``, influences the returned value.

    Raises:
        IndexError: if ``step_counter`` is beyond the end of
            ``recalibrated_rewards_list``.
    """
    global step_counter
    # Create the counter lazily on first use, otherwise keep its value.
    step_counter = globals().get('step_counter', 0)

    # Look the reward up first, so a failed lookup does not advance the counter.
    reward = recalibrated_rewards_list[step_counter]
    step_counter += 1
    return reward

# Create a new environment class that wraps your original environment and overrides the default reward function with your custom function
class CustomRewardWrapper(gym.Wrapper):
    """Gymnasium wrapper that replaces the environment reward with ``custom_reward``.

    ``step`` forwards the action to the wrapped environment and swaps the
    returned reward for the value produced by ``custom_reward``; observation,
    termination flags and info are passed through unchanged. ``reset`` resets
    the wrapped environment and restarts the global ``step_counter`` that
    ``custom_reward`` uses as its index.
    """

    def __init__(self, env):
        super().__init__(env)
        # Observation seen before the current step; set by reset() and step().
        self.last_obs = None

    def step(self, action):
        """Step the wrapped env and return its transition with the custom reward."""
        next_state, reward, terminated, truncated, info = self.env.step(action)
        done = terminated or truncated
        # custom_reward is a module-level function and must be defined before use.
        reward = custom_reward(self, self.env, self.last_obs, action, next_state, reward, done)
        self.last_obs = next_state
        return next_state, reward, terminated, truncated, info

    def reset(self, **kwargs):
        """Reset the wrapped env and return its ``(observation, info)`` pair."""
        global step_counter
        # NOTE(review): restarting the counter on every reset means each episode
        # reads recalibrated_rewards_list from index 0 again - confirm this is intended.
        step_counter = 0
        # Pass the wrapped env's info dict through instead of replacing it with {}.
        self.last_obs, info = self.env.reset(**kwargs)
        return self.last_obs, info
# Example: create the environment and wrap it with the custom reward wrapper
# env_human = CustomRewardWrapper(gym.make('Reacher-v5'))

PPO training and training logs

In [None]:
# Directory for this run's Monitor and TensorBoard logs; it is also the default
# log_dir of train_ppo_with_custom_rewards and where monitor.csv is read from later.
drive_log_dir = "/content/drive/MyDrive/data3_rp1/0_log_dir/2_ppo_reacher_hf_direct_aggressive_8"              # Update directory location 3

In [None]:
# Train PPO with Custom Rewards
def train_ppo_with_custom_rewards(
    log_dir=drive_log_dir,
    total_timesteps=200000,
    model_path='/content/drive/MyDrive/data3_rp1/1_trained_models/2_ppo_reacher_hf_direct_aggressive_8',       # Update directory location 4
    seed=None,
):
    """Train a PPO agent on Reacher-v5 using the custom reward wrapper.

    Args:
        log_dir: Directory passed to ``Monitor`` and used as PPO's
            ``tensorboard_log``. It is created if missing.
        total_timesteps: Number of environment steps to train for.
        model_path: Where the trained model is saved. Defaults to the path
            that was previously hard-coded in this function.
        seed: Optional seed forwarded to ``PPO``. ``None`` (the default)
            leaves training unseeded, as before.

    Returns:
        A tuple ``(model, log_dir)`` with the trained PPO model and the log
        directory that was used.
    """
    os.makedirs(log_dir, exist_ok=True)

    # Reward replacement happens inside the wrapper; Monitor wraps it so the
    # logged episode rewards are the custom ones.
    env = CustomRewardWrapper(gym.make("Reacher-v5"))
    env = Monitor(env, log_dir)

    model = PPO("MlpPolicy", env, verbose=1, tensorboard_log=log_dir, seed=seed)
    model.learn(total_timesteps=total_timesteps)
    model.save(model_path)
    # The environment is intentionally left open: it stays attached to the returned model.
    return model, log_dir

In [None]:
# Execute training with the default log directory; returns the trained model
# and the log directory that monitor.csv is read from in the next cell.
model, log_dir = train_ppo_with_custom_rewards(total_timesteps=200000)

In [None]:
log_path = os.path.join(drive_log_dir, "monitor.csv")

# skiprows=1 drops the first line of the file (presumably Monitor's metadata header - confirm).
# reset_index() turns the row number into a column, which is then labelled as the episode.
# NOTE(review): SB3's Monitor documents `t` as elapsed wall-clock seconds, so the
# "time_step" label may be misleading - confirm before relying on it.
df = (
    pd.read_csv(log_path, skiprows=1)
    .reset_index()
    .rename(columns={"index": "episode", "r": "reward", "l": "length", "t": "time_step"})
)

In [None]:
# Show the first rows of the per-episode training log as an interactive Colab table.
data_table.enable_dataframe_formatter()
data_table.DataTable(df.head())