<a href="https://colab.research.google.com/github/RizanSM/zero_shot_llms_in_HIL_RL/blob/main/02_reacher_env/06_LLM_HF_BF/01_Policy_Training_LLM_HF_BF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the required libraries in your Google Colab environment
!pip install gymnasium[mujoco] mujoco stable-baselines3 ollama -q

In [None]:
# Import the necessary libraries
import gymnasium as gym
import numpy as np
import pandas as pd
import pickle
import os
import matplotlib.pyplot as plt
import ollama
import matplotlib.animation as animation
import re

In [None]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from scipy.interpolate import interp1d
from tqdm import tqdm
from sklearn.metrics import silhouette_score

from stable_baselines3.common.monitor import Monitor
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from google.colab import data_table
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

In [None]:
# Load the dataframe back from the pickle file
trajectory_df = pd.read_pickle('/content/drive/MyDrive/data3_rp1/2_trajectories/1_human_feedback/2_Hf_D_Aggressive_Reacher_df_5.pkl')      # Update directory location 1

In [None]:
# Display the data frame
data_table.enable_dataframe_formatter()
data_table.DataTable(trajectory_df.head())

In [None]:
# Report the Python type held in the row labelled 0 of each trajectory column
for column_name in ('Episode', 'Timestep', 'State', 'Action', 'Reward', 'Next State'):
    print(type(trajectory_df[column_name][0]))

In [None]:
def apply_pca(trajectory_df):
    """
    Project per-timestep trajectory features onto three principal components.

    The feature matrix is assembled from the unpacked "State" and "Action"
    vectors (one column per component) together with the scalar reward columns.
    The input frame itself is not modified.

    Args:
        trajectory_df: DataFrame with "State", "Action" and the reward columns
            listed in ``reward_columns`` below.

    Returns:
        tuple: a copy of ``trajectory_df`` with "PC1", "PC2" and "PC3" columns
        added, and the fitted ``PCA`` object.
    """
    reward_columns = ["Reward", "Distance Reward", "Smoothness Reward", "Effort Reward", "Smoothness Weighted Reward", "Effort Weighted Reward", "Recalibrated Reward"]

    # Column labels are sized from the first row's vectors
    state_labels = [f"state_{i}" for i in range(len(trajectory_df["State"].iloc[0]))]
    action_labels = [f"action_{i}" for i in range(len(trajectory_df["Action"].iloc[0]))]

    # Stack the per-row vectors into 2-D arrays and wrap them as frames
    state_frame = pd.DataFrame(np.vstack(trajectory_df["State"].values), columns=state_labels)
    action_frame = pd.DataFrame(np.vstack(trajectory_df["Action"].values), columns=action_labels)

    # Index is reset on the reward slice so the three pieces line up positionally
    reward_frame = trajectory_df[reward_columns].reset_index(drop=True)
    combined_features = pd.concat([state_frame, action_frame, reward_frame], axis=1)

    pca = PCA(n_components=3)
    components = pca.fit_transform(combined_features)

    # Attach the components to a copy so the caller's frame stays untouched
    trajectory_df_pca = trajectory_df.copy()
    for position, label in enumerate(("PC1", "PC2", "PC3")):
        trajectory_df_pca[label] = components[:, position]

    return trajectory_df_pca, pca

In [None]:
def find_optimal_clusters(data, max_clusters=10):
    """
    Pick the KMeans cluster count with the highest Silhouette Score.

    Every k from 2 to ``max_clusters`` (inclusive) is fitted with a fixed
    random seed; the first k reaching the highest score is returned. If no
    candidate scores above -1 (or the range is empty), 2 is returned.
    """
    scores_by_k = {}
    for candidate_k in range(2, max_clusters + 1):
        labels = KMeans(n_clusters=candidate_k, random_state=42, n_init=10).fit_predict(data)
        scores_by_k[candidate_k] = silhouette_score(data, labels)

    winner, top_score = 2, -1
    for candidate_k, candidate_score in scores_by_k.items():
        # Strict comparison keeps the earliest k on ties
        if candidate_score > top_score:
            winner, top_score = candidate_k, candidate_score
    return winner

In [None]:
def apply_clustering(trajectory_df_pca, n_clusters=10):
    """
    Apply KMeans clustering to select representative timesteps for LLM evaluation.

    Args:
        trajectory_df_pca: DataFrame containing "PC1", "PC2" and "PC3" columns.
            A "Cluster" column is written into this frame in place.
        n_clusters: Number of KMeans clusters. Defaults to 10 (the value this
            notebook previously hard-coded). Pass ``None`` to choose the count
            with ``find_optimal_clusters`` on the three PCA columns.

    Returns:
        tuple: (``trajectory_df_pca`` with the "Cluster" column,
        one representative row per cluster taken with
        ``groupby("Cluster").first()``, the fitted ``KMeans`` object).
    """
    pc_columns = ["PC1", "PC2", "PC3"]

    if n_clusters is None:
        # Silhouette-based search; considerably slower than a fixed count
        n_clusters = find_optimal_clusters(trajectory_df_pca[pc_columns])

    optimal_clusters = n_clusters
    print(f"Optimal number of clusters: {optimal_clusters}")
    kmeans = KMeans(n_clusters=optimal_clusters, random_state=42, n_init=10)
    trajectory_df_pca["Cluster"] = kmeans.fit_predict(trajectory_df_pca[pc_columns])

    # Select one representative per cluster
    cluster_representatives = trajectory_df_pca.groupby("Cluster").first().reset_index()

    return trajectory_df_pca, cluster_representatives, kmeans

In [None]:
def visualize_clusters(trajectory_df_pca):
    """
    Draw a 3D scatter of the PC1/PC2/PC3 columns, coloured by the "Cluster" column.
    """
    fig = plt.figure(figsize=(10, 7))
    ax = fig.add_subplot(111, projection='3d')

    pc1, pc2, pc3 = (trajectory_df_pca[name] for name in ("PC1", "PC2", "PC3"))
    points = ax.scatter(pc1, pc2, pc3, c=trajectory_df_pca["Cluster"], cmap='viridis', alpha=0.6)

    fig.colorbar(points, label="Cluster ID")
    ax.set_xlabel("PC1")
    ax.set_ylabel("PC2")
    ax.set_zlabel("PC3")
    ax.set_title("3D PCA-Clustering Visualization")
    plt.show()


In [None]:
trajectory_df_pca_1, pca = apply_pca(trajectory_df)

In [None]:
# Display the data frame
data_table.enable_dataframe_formatter()
data_table.DataTable(trajectory_df_pca_1.head())

In [None]:
trajectory_df_pca, cluster_representatives, kmeans = apply_clustering(trajectory_df_pca_1)

In [None]:
# Display the data frame
data_table.enable_dataframe_formatter()
data_table.DataTable(trajectory_df_pca)

In [None]:
print(cluster_representatives)
print(kmeans)

In [None]:
visualize_clusters(trajectory_df_pca)

B: LLM FEEDBACK IMPLEMENTATION <br>
SECTION B.0: LOADING THE LLM
*   Step B.0.1: Install Required Libraries
*   Step B.0.2: Setting the environment variable  
*   Step B.0.3: Setup and Load the Pretrained LLM

In [None]:
!pip install colab-xterm

In [None]:
%load_ext colabxterm

In [None]:
%xterm
# curl -fsSL https://ollama.com/install.sh | sh
# ollama serve & ollama pull mistral
# ollama list
# ollama show mistral llama3.2

In [None]:
def visualize_feedback_progress(progress):
    """
    Animate a horizontal progress bar for LLM feedback collection.

    Args:
        progress: Passed to ``FuncAnimation`` as ``frames``; each frame value
            is drawn as the bar length on a 0-100 x-axis.
    """
    fig, ax = plt.subplots()
    ax.set_xlim(0, 100)
    ax.set_ylim(0, 1)
    ax.set_xlabel("Progress (%)")
    ax.set_ylabel("Completion")

    def draw_frame(percent_done):
        # The axes are wiped on every frame, so limits and title are re-applied
        ax.clear()
        ax.barh(["LLM Feedback"], [percent_done], color='blue')
        ax.set_xlim(0, 100)
        ax.set_title("Live LLM Feedback Collection Progress")

    # The animation object is bound to a local name while the figure is shown
    ani = animation.FuncAnimation(fig, draw_frame, frames=progress, repeat=False)
    plt.show()

In [None]:
def get_llm_feedback(state, action, reward, next_state, episode_num, time_step, distance_reward, smoothness_reward, effort_reward, smoothness_weighted_reward, effort_weighted_reward, recalibrated_reward, pc1, pc2, pc3):
    """
    Ask the "mistral" model (via ollama) to judge one timestep's recalibrated reward.

    All arguments are interpolated into the prompt text as-is. The prompt asks
    the model to reply either "Correct reward allotted" or "Biased reward
    allotted" together with an ``llm_reward = ...`` line.

    Returns:
        str: The model's response text with surrounding whitespace stripped.
    """
    prompt = f"""
    You are an expert in reinforcement learning and robotic control environments.
    You will analyze agent trajectories from the Reacher-v5 environment and assess whether the recalibrated reward
    accurately reflects the agent's performance in reaching and stabilizing the target.

    Data details:
      Episode: {episode_num}
      Time Step: {time_step}
      State: {state}
      Action: {action}
      Reward: {reward}
      Next State: {next_state}
      Distance Reward: {distance_reward}
      Smoothness Reward: {smoothness_reward}
      Effort Reward: {effort_reward}
      Smoothness Weighted Reward: {smoothness_weighted_reward}
      Effort Weighted Reward: {effort_weighted_reward}
      Recalibrated Reward: {recalibrated_reward}
      PC1: {pc1} (Captures trajectory efficiency and distance control)
      PC2: {pc2} (Emphasizes smoothness and effort optimization)
      PC3: {pc3} (Represents overall stability and control)

    Your task:
     - If the recalibrated reward correctly reflects the agent's performance, respond with:
          "Correct reward allotted"
          llm_reward = {recalibrated_reward}

     - If the recalibrated reward is biased, respond with:
         "Biased reward allotted"
         llm_reward = Suggested appropriate reward based on reinforcement learning principles.
    """

    # Fixed seed and temperature are passed through to the generation options
    generation_options = {"temperature": 0.6, "seed": 4}
    llm_reply = ollama.generate(model="mistral", prompt=prompt, options=generation_options)
    return llm_reply['response'].strip()

In [None]:
def collect_llm_feedback_cluster(cluster_representatives):
    """
    Query the LLM once per cluster representative.

    Args:
        cluster_representatives: DataFrame with one row per cluster, holding a
            "Cluster" column plus every column passed to ``get_llm_feedback``.

    Returns:
        dict: Maps each row's "Cluster" value to the LLM's feedback text.
    """
    # Column order mirrors the positional parameters of get_llm_feedback
    argument_columns = (
        "State", "Action", "Reward", "Next State",
        "Episode", "Timestep", "Distance Reward", "Smoothness Reward", "Effort Reward",
        "Smoothness Weighted Reward", "Effort Weighted Reward", "Recalibrated Reward",
        "PC1", "PC2", "PC3",
    )

    feedback_by_cluster = {}
    progress_rows = tqdm(cluster_representatives.iterrows(), total=len(cluster_representatives))
    for _, representative in progress_rows:
        llm_arguments = [representative[column] for column in argument_columns]
        feedback_by_cluster[representative["Cluster"]] = get_llm_feedback(*llm_arguments)
    return feedback_by_cluster

In [None]:
def interpolate_llm_scores(trajectory_df_pca, llm_feedback_data):
    """
    Broadcast per-cluster LLM feedback to every timestep of that cluster.

    Each row's "Cluster" value is looked up in ``llm_feedback_data`` and the
    result is stored in a new "LLM_Recalibrated_Reward" column, written into
    ``trajectory_df_pca`` in place.

    Gaps are filled by interpolation only when the mapped values are numeric.
    Text feedback (what ``get_llm_feedback`` returns) cannot be interpolated,
    and calling ``interpolate`` on an object-dtype column is deprecated in
    recent pandas releases, so text is stored exactly as mapped.

    Args:
        trajectory_df_pca: DataFrame with a "Cluster" column.
        llm_feedback_data: Mapping from cluster id to feedback (text or number).

    Returns:
        The same ``trajectory_df_pca`` object, with the new column added.
    """
    mapped_feedback = trajectory_df_pca["Cluster"].map(llm_feedback_data)

    if pd.api.types.is_numeric_dtype(mapped_feedback):
        # Numeric scores: fill clusters missing from the mapping
        mapped_feedback = mapped_feedback.interpolate()

    trajectory_df_pca["LLM_Recalibrated_Reward"] = mapped_feedback
    return trajectory_df_pca

In [None]:
llm_feedback_data = collect_llm_feedback_cluster(cluster_representatives)


In [None]:
trajectory_df_pca = interpolate_llm_scores(trajectory_df_pca, llm_feedback_data)

In [None]:
data_table.enable_dataframe_formatter()
data_table.DataTable(trajectory_df_pca)

In [None]:
# Group the rows by their 'Episode' value so each episode can be reported separately
episode_data = trajectory_df_pca.groupby('Episode')
# Print, for every episode, the sequence of cluster ids assigned to its rows
# NOTE(review): `Episode` (capitalised) stays bound at module level after this loop;
# a later cell prints `Episode`, so renaming it here alone would break that cell.
for Episode, data in episode_data:

    # Cluster id of every row in this episode, in row order
    cluster_list = data['Cluster'].tolist()

    # Count the total number of time steps in the episode
    total_timesteps = data['Timestep'].max() + 1  # Assuming time_step starts from 0

    print(f"Episode {Episode}:Total timesteps {total_timesteps}: {cluster_list}")

Section B.2: REWARD MODELLING (LLM FEEDBACK)

*   Step B.2.1: Recalibrate Reward Based on LLM Feedback
*   Step B.2.2: Displaying the recalibrated rewards based on LLM feedback
*   Step B.2.3: Access the reward for a specific step

In [None]:
def extract_llm_scores(trajectory_df: pd.DataFrame, trajectory_df_pca: pd.DataFrame) -> pd.DataFrame:
    """
    Parse the numeric ``llm_reward = <number>`` value out of each LLM feedback text.

    The feedback is read from the 'LLM_Recalibrated_Reward' column of
    ``trajectory_df_pca`` and two columns are added to a copy of ``trajectory_df``:

    * 'LLM_feedback_reward': the string ``"llm_reward=<value>"``, or None when
      no number could be parsed from the feedback.
    * 'LLM_reward': the parsed value as a float, with 0 where nothing was parsed.

    Values are aligned to ``trajectory_df`` by index label.

    Args:
        trajectory_df (pd.DataFrame): The original trajectory data (not modified).
        trajectory_df_pca (pd.DataFrame): Data containing the
            'LLM_Recalibrated_Reward' column with the LLM feedback text.

    Returns:
        pd.DataFrame: Copy of ``trajectory_df`` with the two columns added.
    """
    # The optional exponent keeps values such as "-1.5e-05" from being
    # truncated to "-1.5" (small rewards are often printed in scientific notation).
    reward_pattern = re.compile(
        r'llm_reward\s*=\s*([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?)'
    )

    def extract_llm_score(summary):
        """Return the parsed reward as a float, or None when no number follows 'llm_reward ='."""
        match = reward_pattern.search(str(summary))
        return float(match.group(1)) if match else None

    feedback_column = trajectory_df_pca["LLM_Recalibrated_Reward"]
    # Parse every feedback text exactly once and reuse the result for both columns
    parsed_scores = [extract_llm_score(text) for text in feedback_column]

    updated_df = trajectory_df.copy()
    updated_df["LLM_feedback_reward"] = pd.Series(
        [f"llm_reward={score}" if score is not None else None for score in parsed_scores],
        index=feedback_column.index,
        dtype=object,
    )
    # Assign the filled Series directly: a chained `.fillna(..., inplace=True)` on
    # a column selection is not guaranteed to write back into the frame.
    updated_df["LLM_reward"] = pd.Series(
        parsed_scores, index=feedback_column.index, dtype=float
    ).fillna(0)

    return updated_df

In [None]:
llm_feedback_df = extract_llm_scores(trajectory_df, trajectory_df_pca)

In [None]:
data_table.enable_dataframe_formatter()
data_table.DataTable(llm_feedback_df)

In [None]:
# Group the rows by their 'Episode' value so each episode can be reported separately
episode_data = llm_feedback_df.groupby('Episode')
# Print, for every episode, the LLM reward attached to each of its rows
for episode, data in episode_data:
    # LLM reward of every row in this episode, in row order
    llm_score_list = data['LLM_reward'].tolist()

    # Count the total number of time steps in the episode
    total_timesteps = data['Timestep'].max() + 1  # Assuming time_step starts from 0

    # Use this loop's own `episode`; the capitalised `Episode` is a leftover
    # from an earlier cell and would print the same stale id on every line.
    print(f"Episode {episode}:Total timesteps {total_timesteps}: {llm_score_list}")

In [None]:
# Step B.2.3: Recalibrate Reward Based on LLM Feedback
# Function to recalibrate the rewards
def recalibrate_rewards(df, alpha=-1):
    """
    Combine the environment reward with the LLM reward into a new reward column.

    Computes ``Reward + alpha * LLM_reward`` for every row and stores it in a
    'New_Recalibrated_rewards' column on a copy of ``df``.

    Args:
        df: DataFrame with 'Reward' and 'LLM_reward' columns (not modified).
        alpha: Weight applied to 'LLM_reward'. Defaults to -1, the value this
            notebook previously hard-coded.

    Returns:
        tuple: (copy of ``df`` with the new column, the new column as a list).
    """
    # Work on a copy so the caller's frame is left untouched
    df_copy = df.copy()

    # Create the 'New_Recalibrated_rewards' column
    df_copy['New_Recalibrated_rewards'] = df_copy['Reward'] + alpha * (df_copy['LLM_reward'])

    # Get the list of recalibrated rewards
    new_recalibrated_rewards_list = df_copy['New_Recalibrated_rewards'].tolist()

    return df_copy, new_recalibrated_rewards_list

In [None]:
# Apply the function to recalibrate rewards
recalibrated_df, recalibrated_rewards_list = recalibrate_rewards(llm_feedback_df)

In [None]:
recalibrated_df.to_pickle('/content/drive/MyDrive/data3_rp1/2_trajectories/3_llm_hf_bf/1_llm_hd_bf_df_reacher_5.pkl')      # Update directory location 2

In [None]:
# To access the reward for a specific step:
# enumerate already yields each reward, so there is no need to index the list again
for i, human_recalibrated_reward_for_step in enumerate(recalibrated_rewards_list):
    print(f"Recalibrated reward for step {i}: {human_recalibrated_reward_for_step}")

SECTION A.5: MODEL TRAINING (HUMAN FEEDBACK DIRECT - IDEAL CASE SCENARIO)
*   Step A.5.1: CUSTOM REWARD FUNCTION
*   Step A.5.2: LOAD THE SAVED INITIALLY TRAINED PPO MODEL FROM GOOGLE DRIVE
*   Step A.5.3: TRAIN/UPDATE PPO MODEL WITH RECALIBRATED REWARD
*   Step A.5.4: SAVE THE TRAINED MODEL (HF_IDEAL) FOR TESTING

In [None]:
# Step A.5.1: CUSTOM REWARD FUNCTION
def custom_reward(self, env, state, action, next_state, reward, done):
    """
    Return the pre-computed recalibrated reward for the current step index.

    Every argument, including the environment's own ``reward``, is ignored.
    The value is read from the module-level ``recalibrated_rewards_list`` at
    position ``step_counter``; the module-level counter is then advanced by one.
    """
    global step_counter
    # Create the counter on first use if no cell has defined it yet
    if "step_counter" not in globals():
        step_counter = 0

    # Look up first so a failed lookup leaves the counter unchanged
    replayed_reward = recalibrated_rewards_list[step_counter]
    step_counter = step_counter + 1
    return replayed_reward

# Create a new environment class that wraps your original environment and overrides the default reward function with your custom function
class CustomRewardWrapper(gym.Wrapper):
    """
    Gymnasium wrapper that replaces the environment reward with ``custom_reward``.

    ``step`` forwards the action to the wrapped environment and then overrides
    the returned reward; ``reset`` zeroes the module-level ``step_counter`` that
    ``custom_reward`` uses as its lookup index.
    """

    def __init__(self, env):
        super().__init__(env)
        # Most recent observation, passed to custom_reward as the "state"
        self.last_obs = None

    def step(self, action):
        """Step the wrapped env and return its transition with the reward replaced."""
        next_state, reward, terminated, truncated, info = self.env.step(action)
        done = terminated or truncated
        # custom_reward should be defined and accessible to your class
        reward = custom_reward(self, self.env, self.last_obs, action, next_state, reward, done)
        self.last_obs = next_state
        return next_state, reward, terminated, truncated, info

    def reset(self, **kwargs):
        """Reset the wrapped env and the step counter; return (obs, info)."""
        global step_counter
        step_counter = 0
        # Pass the wrapped env's info through instead of replacing it with {}
        self.last_obs, info = self.env.reset(**kwargs)
        return self.last_obs, info
# Create and wrap the environment with your custom reward wrapper, e.g.:
# env = CustomRewardWrapper(gym.make("Reacher-v5"))

PPO training and Training logs

In [None]:
drive_log_dir = "/content/drive/MyDrive/data3_rp1/0_log_dir/5_ppo_reacher_llm_hf_bf_8"          # Update directory location 3

In [None]:
# Train PPO with Custom Rewards
def train_ppo_with_custom_rewards(
    log_dir=drive_log_dir,
    total_timesteps=200000,
    model_path='/content/drive/MyDrive/data3_rp1/1_trained_models/5_ppo_reacher_llm_hf_bf_8',   # Update directory location 4
):
    """
    Train a PPO agent on Reacher-v5 using the custom reward wrapper, then save it.

    Args:
        log_dir: Directory used for the Monitor output and the tensorboard
            logs; created if it does not exist.
        total_timesteps: Number of environment steps to train for.
        model_path: Where the trained model is saved. Defaults to the path this
            notebook previously hard-coded.

    Returns:
        tuple: (the trained PPO model, ``log_dir``).
    """
    os.makedirs(log_dir, exist_ok=True)

    # Reward override first, then Monitor, so the logged rewards are the custom ones
    env = CustomRewardWrapper(gym.make("Reacher-v5"))
    env = Monitor(env, log_dir)

    model = PPO("MlpPolicy", env, verbose=1, tensorboard_log=log_dir)
    model.learn(total_timesteps=total_timesteps)
    model.save(model_path)
    return model, log_dir

In [None]:
# Execute Training and Convergence Tracking
model, log_dir = train_ppo_with_custom_rewards(total_timesteps=200000)

In [None]:
log_path = os.path.join(drive_log_dir, "monitor.csv")
# Monitor's short column names mapped to the descriptive names used below;
# "index" is the column created by reset_index()
monitor_columns = {"index": "episode", "r": "reward", "l": "length", "t": "time_step"}
# skiprows=1 drops the first line of the file before the header row is read
df = pd.read_csv(log_path, skiprows=1).reset_index().rename(columns=monitor_columns)

In [None]:
data_table.enable_dataframe_formatter()
data_table.DataTable(df.head())