[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/RizanSM/zero_shot_llms_in_HIL_RL/blob/main/01_Code/02_Reacher_Env/02_Default_Environment/05_LLM_HF_BF_REACHER/03_Model_testing_LLM_HF_BF_Reacher.ipynb)

In [None]:
# Install the required libraries in your Google Colab environment
!pip install gymnasium mujoco stable-baselines3 -q

In [None]:
# Import the necessary libraries
import gymnasium as gym
import numpy as np
import pandas as pd
import pickle
import os
import matplotlib.pyplot as plt

In [None]:
from stable_baselines3 import PPO
from google.colab import drive
from google.colab import data_table

# Mount Google Drive at /content/drive; the trajectory pickles read later in
# this notebook are addressed by paths under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


Average Episodic Reward

In [None]:
def average_episodic_reward(test_trajectory_df):
    """
    Summarise a test trajectory by its per-episode cumulative reward.

    Parameters:
        test_trajectory_df (pd.DataFrame): Step-level records containing an
            'Episode' column (grouping key) and a 'Reward' column.

    Returns:
        tuple: A tuple containing four values:
            - cumulative_rewards (pd.DataFrame): One row per episode with the
              columns 'episode' and 'cumulative_reward' (sum of 'Reward').
            - variance: ``.var()`` of the per-episode cumulative rewards.
            - std_dev: ``.std()`` of the per-episode cumulative rewards.
            - final_reward: ``.mean()`` of the per-episode cumulative rewards.
    """
    # Sum the step rewards inside each episode, then expose the episode id as
    # an ordinary column under the lower-case names used downstream.
    episode_returns = test_trajectory_df.groupby('Episode')['Reward'].sum()
    cumulative_rewards = (
        episode_returns
        .rename_axis('episode')
        .rename('cumulative_reward')
        .reset_index()
    )

    # Spread and centre of the per-episode returns.
    returns = cumulative_rewards['cumulative_reward']
    return cumulative_rewards, returns.var(), returns.std(), returns.mean()

COMBINING ALL THE DATA FRAMES

In [None]:
def create_cummulative_reward(df1, df2, df3, df4, df5):
    """
    Combine five per-run cumulative-reward tables into one data frame.

    The result contains:
      - 'episode' column (the merge key shared by all inputs)
      - 'cumulative_reward_1' to 'cumulative_reward_5' columns, taken from the
        'cumulative_reward' column of df1 to df5 respectively
      - 'mean_cumulative_reward' column containing the row-wise mean of the
        five cumulative rewards.

    The frames are combined with pandas' default (inner) merge on 'episode',
    so an episode that is missing from any input is absent from the result.
    The input frames are not modified.

    Parameters:
      df1, df2, df3, df4, df5 (pd.DataFrame): Data frames with columns
        'episode' and 'cumulative_reward'.

    Returns:
      pd.DataFrame: The merged and aggregated data frame.

    Raises:
      KeyError: If an input lacks the 'episode' or 'cumulative_reward' column.
    """
    merged_df = None
    reward_columns = []

    for run_number, run_df in enumerate((df1, df2, df3, df4, df5), start=1):
        # Give each run's reward column a unique, numbered name so the merged
        # frame can hold all five side by side.
        reward_column = f'cumulative_reward_{run_number}'
        reward_columns.append(reward_column)
        run_rewards = run_df.rename(
            columns={'cumulative_reward': reward_column}
        )[['episode', reward_column]]

        if merged_df is None:
            # Copy so the column added below never touches the caller's frame.
            merged_df = run_rewards.copy()
        else:
            merged_df = merged_df.merge(run_rewards, on='episode')

    # Compute the episode-wise mean of the cumulative rewards.
    merged_df['mean_cumulative_reward'] = merged_df[reward_columns].mean(axis=1)

    return merged_df

VARIANCE, STANDARD DEVIATION & FINAL REWARD

In [None]:
def calculate_performance(variance_values, std_dev_values, final_reward_values):
    """
    Average each of the three per-run metric lists independently.

    Parameters:
        variance_values (list of float): Variance of each run.
        std_dev_values (list of float): Standard deviation of each run.
        final_reward_values (list of float): Final (mean episodic) reward of each run.

    Returns:
        tuple: (mean_variance, mean_std_dev, mean_final_reward), the arithmetic
            mean of the corresponding input list, in that order.

    Raises:
        ZeroDivisionError: If any of the lists is empty.
    """
    def _arithmetic_mean(values):
        # Plain sum / count; the inputs must support both sum() and len().
        return sum(values) / len(values)

    return tuple(
        _arithmetic_mean(metric_values)
        for metric_values in (variance_values, std_dev_values, final_reward_values)
    )


0. MODEL TESTING (LLM_HF_BF)

In [None]:
# Directory on the mounted Drive that holds the five LLM-HF-BF Reacher test
# trajectories. Update this single location if the data lives elsewhere.
TRAJECTORY_DIR = '/content/drive/MyDrive/05_zero_shot_llm_3/03_reacher_data/03_test_trajectories/5_llm_hf_bf'

# The files are named '<run>_llm_hf_bf_reacher_df_1.pkl' for runs 1 to 5.
# NOTE: unpickling can execute code embedded in the file, so only load pickles
# that you produced yourself or otherwise trust.
(
    trajectory_df_1,
    trajectory_df_2,
    trajectory_df_3,
    trajectory_df_4,
    trajectory_df_5,
) = [
    pd.read_pickle(f'{TRAJECTORY_DIR}/{run}_llm_hf_bf_reacher_df_1.pkl')
    for run in range(1, 6)
]

TESTING THE MODEL BY ANALYZING THE DATA FRAME <br>



1. LLM HF BF Reacher data frame 1<br>
trajectory_df_1 ---> 1_llm_hf_bf_reacher_df_1.pkl

In [None]:
# Enable google.colab's DataFrame formatter, then display the run-1 test
# trajectory (trajectory_df_1) as a DataTable for inspection.
data_table.enable_dataframe_formatter()
data_table.DataTable(trajectory_df_1)

Output hidden; open in https://colab.research.google.com to view.

A. Average Episodic Reward (trajectory_df_1)

In [None]:
# Run 1: per-episode cumulative rewards plus their variance, standard
# deviation and mean. The scalars are reused by the final aggregation cell.
learning_curve_result_df_1, variance_1, std_dev_1, average_episodic_reward_1 = average_episodic_reward(trajectory_df_1)
print("Variance:", variance_1)
print("Standard Deviation:", std_dev_1)
print("Average Episodic Reward:", average_episodic_reward_1)

Variance: 4.589736175831342e-28
Standard Deviation: 2.1423669563899043e-14
Average Episodic Reward: -9.90672172165043


In [None]:
# Display run 1's per-episode cumulative rewards ('episode',
# 'cumulative_reward') as a Colab DataTable.
data_table.enable_dataframe_formatter()
data_table.DataTable(learning_curve_result_df_1)

Unnamed: 0,episode,cumulative_reward
0,1,-9.906722
1,2,-9.906722
2,3,-9.906722
3,4,-9.906722
4,5,-9.906722
...,...,...
95,96,-9.906722
96,97,-9.906722
97,98,-9.906722
98,99,-9.906722


2. LLM HF BF Reacher data frame 2<br>
trajectory_df_2 ---> 2_llm_hf_bf_reacher_df_1.pkl

In [None]:
# Enable google.colab's DataFrame formatter, then display the run-2 test
# trajectory (trajectory_df_2) as a DataTable for inspection.
data_table.enable_dataframe_formatter()
data_table.DataTable(trajectory_df_2)

Output hidden; open in https://colab.research.google.com to view.

A. Average Episodic Reward (trajectory_df_2)

In [None]:
# Run 2: per-episode cumulative rewards plus their variance, standard
# deviation and mean. The scalars are reused by the final aggregation cell.
learning_curve_result_df_2, variance_2, std_dev_2, average_episodic_reward_2 = average_episodic_reward(trajectory_df_2)
print("Variance:", variance_2)
print("Standard Deviation:", std_dev_2)
print("Average Episodic Reward:", average_episodic_reward_2)

Variance: 3.187316788771765e-30
Standard Deviation: 1.785305796991587e-15
Average Episodic Reward: -13.151865677796875


In [None]:
# Display run 2's per-episode cumulative rewards ('episode',
# 'cumulative_reward') as a Colab DataTable.
data_table.enable_dataframe_formatter()
data_table.DataTable(learning_curve_result_df_2)

Unnamed: 0,episode,cumulative_reward
0,1,-13.151866
1,2,-13.151866
2,3,-13.151866
3,4,-13.151866
4,5,-13.151866
...,...,...
95,96,-13.151866
96,97,-13.151866
97,98,-13.151866
98,99,-13.151866


3. LLM HF BF Reacher data frame 3<br>
trajectory_df_3 ---> 3_llm_hf_bf_reacher_df_1.pkl

In [None]:
# Enable google.colab's DataFrame formatter, then display the run-3 test
# trajectory (trajectory_df_3) as a DataTable for inspection.
data_table.enable_dataframe_formatter()
data_table.DataTable(trajectory_df_3)

Output hidden; open in https://colab.research.google.com to view.

A. Average Episodic Reward (trajectory_df_3)

In [None]:
# Run 3: per-episode cumulative rewards plus their variance, standard
# deviation and mean. The scalars are reused by the final aggregation cell.
learning_curve_result_df_3, variance_3, std_dev_3, average_episodic_reward_3 = average_episodic_reward(trajectory_df_3)
print("Variance:", variance_3)
print("Standard Deviation:", std_dev_3)
print("Average Episodic Reward:", average_episodic_reward_3)

Variance: 5.099706862034824e-29
Standard Deviation: 7.141223187966347e-15
Average Episodic Reward: -7.744439032232626


In [None]:
# Display run 3's per-episode cumulative rewards ('episode',
# 'cumulative_reward') as a Colab DataTable.
data_table.enable_dataframe_formatter()
data_table.DataTable(learning_curve_result_df_3)

Unnamed: 0,episode,cumulative_reward
0,1,-7.744439
1,2,-7.744439
2,3,-7.744439
3,4,-7.744439
4,5,-7.744439
...,...,...
95,96,-7.744439
96,97,-7.744439
97,98,-7.744439
98,99,-7.744439


4. LLM HF BF Reacher data frame 4<br>
trajectory_df_4 ---> 4_llm_hf_bf_reacher_df_1.pkl


In [None]:
# Enable google.colab's DataFrame formatter, then display the run-4 test
# trajectory (trajectory_df_4) as a DataTable for inspection.
data_table.enable_dataframe_formatter()
data_table.DataTable(trajectory_df_4)

Output hidden; open in https://colab.research.google.com to view.

A. Average Episodic Reward (trajectory_df_4)

In [None]:
# Run 4: per-episode cumulative rewards plus their variance, standard
# deviation and mean. The scalars are reused by the final aggregation cell.
learning_curve_result_df_4, variance_4, std_dev_4, average_episodic_reward_4 = average_episodic_reward(trajectory_df_4)
print("Variance:", variance_4)
print("Standard Deviation:", std_dev_4)
print("Average Episodic Reward:", average_episodic_reward_4)

Variance: 5.099706862034824e-29
Standard Deviation: 7.141223187966347e-15
Average Episodic Reward: -14.735069357096556


In [None]:
# Display run 4's per-episode cumulative rewards ('episode',
# 'cumulative_reward') as a Colab DataTable.
data_table.enable_dataframe_formatter()
data_table.DataTable(learning_curve_result_df_4)

Unnamed: 0,episode,cumulative_reward
0,1,-14.735069
1,2,-14.735069
2,3,-14.735069
3,4,-14.735069
4,5,-14.735069
...,...,...
95,96,-14.735069
96,97,-14.735069
97,98,-14.735069
98,99,-14.735069


5. LLM HF BF Reacher data frame 5<br>
trajectory_df_5 ---> 5_llm_hf_bf_reacher_df_1.pkl


In [None]:
# Enable google.colab's DataFrame formatter, then display the run-5 test
# trajectory (trajectory_df_5) as a DataTable for inspection.
data_table.enable_dataframe_formatter()
data_table.DataTable(trajectory_df_5)

Output hidden; open in https://colab.research.google.com to view.

A. Average Episodic Reward (trajectory_df_5)

In [None]:
# Run 5: per-episode cumulative rewards plus their variance, standard
# deviation and mean. The scalars are reused by the final aggregation cell.
learning_curve_result_df_5, variance_5, std_dev_5, average_episodic_reward_5 = average_episodic_reward(trajectory_df_5)
print("Variance:", variance_5)
print("Standard Deviation:", std_dev_5)
print("Average Episodic Reward:", average_episodic_reward_5)

Variance: 1.274926715508706e-27
Standard Deviation: 3.5706115939831735e-14
Average Episodic Reward: -15.278771485605029


In [None]:
# Display run 5's per-episode cumulative rewards ('episode',
# 'cumulative_reward') as a Colab DataTable.
data_table.enable_dataframe_formatter()
data_table.DataTable(learning_curve_result_df_5)

Unnamed: 0,episode,cumulative_reward
0,1,-15.278771
1,2,-15.278771
2,3,-15.278771
3,4,-15.278771
4,5,-15.278771
...,...,...
95,96,-15.278771
96,97,-15.278771
97,98,-15.278771
98,99,-15.278771


COMBINED DATA FRAMES - LLM-HF-BF

In [None]:
# Merge the five per-run cumulative-reward tables on 'episode' and add the
# row-wise mean across runs ('mean_cumulative_reward').
cummulative_reward = create_cummulative_reward(
        learning_curve_result_df_1,
        learning_curve_result_df_2,
        learning_curve_result_df_3,
        learning_curve_result_df_4,
        learning_curve_result_df_5
    )

# Display the combined table as a Colab DataTable.
data_table.enable_dataframe_formatter()
data_table.DataTable(cummulative_reward)

Unnamed: 0,episode,cumulative_reward_1,cumulative_reward_2,cumulative_reward_3,cumulative_reward_4,cumulative_reward_5,mean_cumulative_reward
0,1,-9.906722,-13.151866,-7.744439,-14.735069,-15.278771,-12.163373
1,2,-9.906722,-13.151866,-7.744439,-14.735069,-15.278771,-12.163373
2,3,-9.906722,-13.151866,-7.744439,-14.735069,-15.278771,-12.163373
3,4,-9.906722,-13.151866,-7.744439,-14.735069,-15.278771,-12.163373
4,5,-9.906722,-13.151866,-7.744439,-14.735069,-15.278771,-12.163373
...,...,...,...,...,...,...,...
95,96,-9.906722,-13.151866,-7.744439,-14.735069,-15.278771,-12.163373
96,97,-9.906722,-13.151866,-7.744439,-14.735069,-15.278771,-12.163373
97,98,-9.906722,-13.151866,-7.744439,-14.735069,-15.278771,-12.163373
98,99,-9.906722,-13.151866,-7.744439,-14.735069,-15.278771,-12.163373


VARIANCE, STANDARD DEVIATION & FINAL REWARD : LLM-HF-BF

In [None]:
 # Organize the values into lists
# Per-run statistics from the five average_episodic_reward() calls, in run
# order 1 to 5.
variance_list = [variance_1, variance_2, variance_3, variance_4, variance_5]
std_dev_list = [std_dev_1, std_dev_2, std_dev_3, std_dev_4, std_dev_5]
final_reward_list = [average_episodic_reward_1, average_episodic_reward_2, average_episodic_reward_3, average_episodic_reward_4, average_episodic_reward_5]

# Average each statistic across the five runs. Note that these are means of
# the per-run within-run statistics, not the spread between runs.
mean_variance, mean_std_dev, mean_final_reward = calculate_performance(variance_list, std_dev_list, final_reward_list)

# Print the across-run means.
print("LLM-HF-BF Mean Variance:", mean_variance)
print("LLM-HF-BF  Mean Standard Deviation:", mean_std_dev)
print("LLM-HF-BF  Mean Average Episodic Reward:", mean_final_reward)


LLM-HF-BF Mean Variance: 3.678163574242617e-28
LLM-HF-BF  Mean Standard Deviation: 1.463950753533101e-14
LLM-HF-BF  Mean Average Episodic Reward: -12.163373454876304
