In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import pickle
from tqdm import tqdm


# Train data analysis

In [2]:
# Environments and dataset variants to load; one pickle file is expected per
# (environment, dataset) pair under ./data_plus_plus/.
envs = ['halfcheetah', 'hopper', 'walker2d']
datasets = ['expert', 'medium_replay', 'medium']
data_features = ['observations', 'next_observations', 'actions', 'rewards', 'terminals']

# Maps '<env>-<dataset>' to the object stored in the matching pickle file.
# NOTE: unpickling can execute arbitrary code, so only load trusted files.
train_data = {}
for e, d in ((env_name, variant) for env_name in envs for variant in datasets):
    with open(f'./data_plus_plus/{e}-{d}-v2.pkl', 'rb') as f:
        train_data[f'{e}-{d}'] = pickle.load(f)

In [3]:
# for each dataset, combine all trajectories into a single long one
def concatenate_dataset(dataset):
    """Merge a sequence of trajectory dicts into one dict of joined arrays.

    Parameters
    ----------
    dataset : sequence of dict
        Trajectories. Each maps a feature name to an array-like whose first
        axis indexes the steps of that trajectory. The set of keys is taken
        from the first trajectory; every other trajectory must provide them.

    Returns
    -------
    dict
        For every key, the values of all trajectories joined along axis 0,
        in the order the trajectories appear in ``dataset``.

    Raises
    ------
    IndexError
        If ``dataset`` is empty (there is no first trajectory to read keys from).
    KeyError
        If a later trajectory lacks one of the first trajectory's keys.
    """
    keys = list(dataset[0].keys())

    # Join all pieces of a key in a single call. Appending trajectory by
    # trajectory re-copies the ever-growing array on each step, which makes
    # the total work quadratic in the number of trajectories.
    return {
        key: np.concatenate([traj[key] for traj in dataset], axis=0)
        for key in keys
    }

# Flatten every dataset in train_data into a single dict of arrays, keyed like
# train_data; tqdm shows one progress tick per dataset.
concatenated_train_data = {k: concatenate_dataset(v) for k, v in tqdm(train_data.items())}


100%|██████████| 9/9 [02:18<00:00, 15.40s/it]


In [4]:
# Row labels of the summary table: one '<env>-<dataset>' entry per dataset.
idxs = [f'{e}-{d}' for e in envs for d in datasets]

# One value per trajectory, grouped by dataset: the summed 'terminals' flags
# and the summed 'rewards'.
_terminal_sums = {idx: [sum(traj['terminals']) for traj in train_data[idx]] for idx in idxs}
_reward_sums = {idx: [sum(traj['rewards']) for traj in train_data[idx]] for idx in idxs}

df_train_data = pd.DataFrame(index=idxs)
df_train_data['mean_terminals'] = [np.mean(_terminal_sums[idx]) for idx in idxs]
df_train_data['mean_terminal_reward'] = [np.mean(_reward_sums[idx]) for idx in idxs]
df_train_data['std_terminal_reward'] = [np.std(_reward_sums[idx]) for idx in idxs]
df_train_data['n_traj'] = [len(train_data[idx]) for idx in idxs]

df_train_data

Unnamed: 0,mean_terminals,mean_terminal_reward,std_terminal_reward,n_traj
halfcheetah-expert,0.0,10656.42646,441.682728,1000
halfcheetah-medium_replay,0.0,3093.285581,1680.693937,202
halfcheetah-medium,0.0,4770.334765,355.750394,1000
hopper-expert,0.08666,3511.357707,328.585955,1027
hopper-medium_replay,0.803038,467.302044,511.025583,2041
hopper-medium,0.999543,1422.05618,378.953696,2186
walker2d-expert,0.001,4920.507113,136.394925,1000
walker2d-medium_replay,0.723696,682.701247,895.955582,1093
walker2d-medium,0.431092,2852.088416,1095.443313,1190


In [5]:
from sklearn.linear_model import LinearRegression


def get_forward_reward(positions):
    """Return the change between consecutive entries of ``positions``.

    Entry ``i`` of the result is ``positions[i + 1] - positions[i]`` along the
    first axis, so the result is one entry shorter than the input.
    """
    return np.diff(positions, axis=0)


def get_control_cost(actions):
    """Return, for each row of ``actions``, the sum of its squared entries."""
    squared = np.square(actions)
    return squared.sum(axis=1)


# calculate the weights of the individual parts of the reward using linear regression
def get_coef(data_temp, use_healthy_reward=False, use_intercept=False):
    """Fit a linear model that explains the per-step reward from its parts.

    The regressors, one row per step with the last step dropped, are:

    * the change between consecutive values of ``data_temp['infos/qpos'][:, 0]``,
    * the sum of squared action components of the step,
    * optionally an indicator that is 1 where ``data_temp['terminals']`` is
      false and 0 where it is true.

    Parameters
    ----------
    data_temp : mapping
        Must provide ``'infos/qpos'`` (2-D), ``'actions'`` (2-D), ``'rewards'``
        and, when ``use_healthy_reward`` is set, ``'terminals'``.
    use_healthy_reward : bool, default False
        Add the non-terminal indicator as a third regressor.
    use_intercept : bool, default False
        Let the regression fit an intercept term.

    Returns
    -------
    tuple
        ``(reg, X, target_reward)``: the fitted ``LinearRegression``, the
        design matrix with one column per regressor, and the reward vector
        the model was fitted to.
    """
    forward_reward = get_forward_reward(data_temp['infos/qpos'][:, 0])
    ctrl_cost = get_control_cost(data_temp['actions'])

    # The position difference has one entry fewer than there are steps, so the
    # last step is dropped from every other quantity to keep rows aligned.
    target_reward = data_temp['rewards'][:-1]

    columns = [forward_reward, ctrl_cost[:-1]]
    if use_healthy_reward:
        # logical_not (rather than ~) also gives a 0/1 indicator when the
        # terminal flags are stored as integers instead of booleans.
        healthy_reward = np.logical_not(data_temp['terminals']).astype(int)
        columns.append(healthy_reward[:-1])
    X = np.stack(columns, axis=0).T

    # Honour the caller's choice; the flag used to be accepted but ignored.
    reg = LinearRegression(fit_intercept=use_intercept).fit(X, target_reward)

    return reg, X, target_reward

# r_score = reg.score(X, target_reward)
# coef = reg.coef_ # forward_reward_weight, ctrl_cost_weight, (healthy_reward)
# predictions = reg.predict(X)

In [29]:
all_r_scores, all_coefs, all_traj_lens = [], [], []

for idx in idxs:
    trajs = train_data[idx]

    # NOTE(review): the healthy-reward column is enabled only for halfcheetah;
    # confirm this is the intended set of environments.
    use_healthy_reward = idx.split('-')[0] == 'halfcheetah'

    r_scores, coefs, total_traj_len = [], [], 0
    for traj in trajs:
        # A trajectory is a dict of per-step arrays, so len(traj) would count
        # its keys; the number of steps is the length of one of its arrays.
        traj_len = len(traj['rewards'])

        # Skip very short trajectories. The threshold of 7 is kept from the
        # earlier version of this cell but now applies to the step count.
        # TODO: confirm the minimum length that gives a meaningful fit.
        if traj_len <= 7:
            continue

        reg, X, target_reward = get_coef(traj, use_healthy_reward=use_healthy_reward)

        total_traj_len += traj_len
        r_scores.append(reg.score(X, target_reward))
        coefs.append(reg.coef_)

    all_traj_lens.append(total_traj_len)
    all_r_scores.append(r_scores)
    all_coefs.append(coefs)

# The number of fitted trajectories (and of coefficients) differs between
# datasets, so the per-dataset results are ragged. Store one array per table
# cell in an object column instead of asking numpy for a rectangular array.
r_scores_column = np.empty(len(idxs), dtype=object)
coefs_column = np.empty(len(idxs), dtype=object)
for row, (scores, weights) in enumerate(zip(all_r_scores, all_coefs)):
    r_scores_column[row] = np.asarray(scores)
    coefs_column[row] = np.asarray(weights)

df_train_data['r_scores'] = r_scores_column
df_train_data['coefs'] = coefs_column



In [None]:
# half cheetah

# choose dataset
data_temp = train_data['halfcheetah-expert'][1]

reg, X, target_reward = get_coef(data_temp, use_healthy_reward=True)

print(reg.coef_, reg.score(X, target_reward))

predictions = reg.predict(X)
residuals = predictions - target_reward

# Residuals against each regressor in turn, then against the prediction itself;
# the dashed line marks zero error.
# (To inspect a single feature's contribution instead, compare `predictions`
# with reg.predict on a copy of X whose corresponding column is set to 0.)
for x_values in [*X.T, predictions]:
    plt.scatter(x_values, residuals, c='C0')
    plt.hlines(0, min(x_values), max(x_values), linestyles='--', colors='C1')
    plt.show()

In [None]:
# the two others (presumably the environments other than halfcheetah)

# choose dataset
data_temp = train_data['walker2d-expert'][1]

# regressors and target, with the last step dropped so all rows line up
forward_reward = get_forward_reward(data_temp['infos/qpos'][:,0])
ctrl_cost = get_control_cost(data_temp['actions'])
# healthy_reward = np.array(~data_temp['terminals'], dtype=int)
target_reward = data_temp['rewards'][:-1]

# design matrix: one column per regressor
X = np.stack((forward_reward, ctrl_cost[:-1]), axis=0).T

# first pass: fit on every step
reg = LinearRegression().fit(X, target_reward)
predictions = reg.predict(X)
r_score = reg.score(X, target_reward)

print(reg.coef_, r_score) # (forward_reward_weight, ctrl_cost_weight), r_score

residuals = predictions - target_reward
plt.scatter(predictions, residuals, c='C0')
plt.hlines(0, min(predictions), max(predictions), linestyles='--', colors='C1')
plt.show()


# second pass: keep only the points the first fit already explains well
mask = abs(residuals) < 0.002

X_masked = X[mask]
target_reward_masked = target_reward[mask]

reg = LinearRegression().fit(X_masked, target_reward_masked)
predictions = reg.predict(X_masked)
r_score = reg.score(X_masked, target_reward_masked)

print(reg.coef_, r_score) # (forward_reward_weight, ctrl_cost_weight), r_score

plt.scatter(predictions, predictions - target_reward_masked, c='C0')
plt.hlines(0, min(predictions), max(predictions), linestyles='--', colors='C1')
plt.show()

In [None]:
forward_reward_weights, ctrl_cost_weights, healthy_rewards, r_scores = [], [], [], []

# One regression per dataset on the concatenated trajectories.
# NOTE(review): the forward-reward regressor differences consecutive position
# rows, so in concatenated data the rows at trajectory boundaries mix two
# trajectories; check whether those rows should be excluded.
for idx in idxs:
    total_dict = concatenated_train_data[idx]

    # get_coef returns (fitted model, design matrix, target). Three weights
    # only exist when the healthy-reward column is part of the design matrix.
    reg, X, target_reward = get_coef(total_dict, use_healthy_reward=True)
    forward_reward_weight, ctrl_cost_weight, healthy_reward = reg.coef_
    r_score = reg.score(X, target_reward)

    forward_reward_weights.append(forward_reward_weight)
    ctrl_cost_weights.append(ctrl_cost_weight)
    healthy_rewards.append(healthy_reward)
    r_scores.append(r_score)

df_train_data['forward_reward_weight'] = forward_reward_weights
df_train_data['ctrl_cost_weight'] = ctrl_cost_weights
df_train_data['healthy_reward'] = healthy_rewards
df_train_data['r_score'] = r_scores

In [None]:
# Display the per-dataset summary table.
df_train_data

In [None]:
# Exploratory fit on the hopper-medium data with all trajectories joined end to end.
data_temp = concatenated_train_data['hopper-medium']


# NOTE(review): observation column 5 is used as the forward-reward regressor,
# presumably a forward velocity; confirm against the environment's observation layout.
forward_reward = data_temp['observations'][:,5]
# per-step sum of squared action components
ctrl_cost = (data_temp['actions'] ** 2).sum(axis=1)
# 1 where the step is not terminal, 0 where it is (assuming 'terminals' is a boolean array)
healthy_reward = np.array(~data_temp['terminals'], dtype=int)

target_reward = data_temp['rewards']


# NOTE(review): the observation column and the target use steps [:-1] while the
# control cost and healthy flag use steps [1:], i.e. they are offset by one
# step relative to each other; confirm this shift is intended.
X = np.stack((forward_reward[:-1], ctrl_cost[1:], healthy_reward[1:]), axis=0).T
# X = forward_reward[:-1].reshape(-1, 1)
y = target_reward[:-1]

# LinearRegression with default arguments, so an intercept is fitted in
# addition to the three weights.
reg = LinearRegression().fit(X, y)

r_score = reg.score(X, y)

# cell output: (weights, score of the fit on its own training data)
reg.coef_, r_score

# Plots

In [None]:
# One histogram of trajectory lengths per dataset, laid out on a 3x3 grid
# (datasets beyond the ninth would be silently dropped by zip).
fig, axs = plt.subplots(3, 3, figsize=(12, 12)) # note that halfcheetah does not terminate early, has no "survive" feature
fig.suptitle('Distribution of trajectory lengths', size=30)

for ax, (key, data) in zip(axs.flatten(), train_data.items()):
    env, dataset = key.split('-')
    # `data` is the list of trajectories of this dataset; the length of a
    # trajectory is the number of rewards it recorded.
    lengths = [len(traj['rewards']) for traj in data]
    ax.set_title(f"{env.capitalize()} {dataset}")
    n, bins, _ = ax.hist(lengths, bins=20)

    # Dashed markers spanning the full height of the tallest bar.
    median_val = np.median(lengths)
    ax.vlines(median_val, 0, max(n), linestyles='--', colors='C3', label=f'Median ({int(median_val)})')

    mean_val = np.mean(lengths)
    ax.vlines(mean_val, 0, max(n), linestyles='--', colors='C1', label=f'Mean ({int(mean_val)})')

    ax.set_xlabel('Trajectory length')
    ax.set_ylabel('Number of trajectories')
    ax.legend()

plt.show()

In [None]:
# One histogram of per-trajectory total reward per dataset, laid out on a 3x3
# grid (datasets beyond the ninth would be silently dropped by zip).
fig, axs = plt.subplots(3, 3, figsize=(12, 12))
fig.suptitle('Distribution of trajectory total reward', size=30)

for ax, (key, data) in zip(axs.flatten(), train_data.items()):
    env, dataset = key.split('-')
    # `data` is the list of trajectories of this dataset; sum each one's rewards.
    terminal_rewards = [sum(traj['rewards']) for traj in data]
    ax.set_title(f"{env.capitalize()} {dataset}")
    n, bins, _ = ax.hist(terminal_rewards, bins=20)

    # Dashed markers spanning the full height of the tallest bar.
    median_val = np.median(terminal_rewards)
    ax.vlines(median_val, 0, max(n), linestyles='--', colors='C3', label=f'Median ({int(median_val)})')

    mean_val = np.mean(terminal_rewards)
    ax.vlines(mean_val, 0, max(n), linestyles='--', colors='C1', label=f'Mean ({int(mean_val)})')

    ax.set_xlabel('Total reward')
    ax.set_ylabel('Number of trajectories')
    ax.legend()

plt.show()

# Evaluation analysis

In [None]:
# Load every pickled evaluation result found under
# ./evaluation_data/<env>/<dataset>/<file>, keyed by file name.
# NOTE: unpickling can execute arbitrary code, so only load trusted files.
eval_path = './evaluation_data/'
raw_eval_dict = dict()

# sorted() makes the loading (and therefore plotting) order reproducible.
for env in sorted(os.listdir(eval_path)):
    env_path = os.path.join(eval_path, env)

    # Skip the excluded run and any stray files next to the env folders.
    if env == 'first_run_expert' or not os.path.isdir(env_path):
        continue

    for dataset in sorted(os.listdir(env_path)):
        # Build each dataset path from the env path. Appending to a shared
        # string would nest the second dataset inside the first one.
        dataset_path = os.path.join(env_path, dataset)
        if not os.path.isdir(dataset_path):
            continue

        for filename in sorted(os.listdir(dataset_path)):
            with open(os.path.join(dataset_path, filename), 'rb') as f:
                raw_eval_dict[filename] = pickle.load(f)

In [None]:
def read_data_to_DataFrama(raw_dict):
    """Collect the '<target>_returns' entries of ``raw_dict`` into a DataFrame.

    Every key containing '_returns' is split at its underscore; the part in
    front is parsed as a float and becomes the column label, and the stored
    value becomes that column's data. All other keys are ignored.
    """
    returns_by_target = {}
    for key, values in raw_dict.items():
        if '_returns' not in key:
            continue
        # The key must consist of exactly two underscore-separated parts.
        target_part, _ = key.split('_')
        returns_by_target[float(target_part)] = values

    return pd.DataFrame(returns_by_target)

# One DataFrame per loaded evaluation file, keyed like raw_eval_dict; columns
# are target returns.
eval_dict = {k: read_data_to_DataFrama(v) for k, v in raw_eval_dict.items()}

In [None]:
# Only evaluation runs whose last dash-separated name field equals this seed are plotted.
seed = '42'
# Lower and upper quantile bounding the shaded band in the evaluation plots.
conf_int = (0.2, 0.8)


def get_plot_statistics(df, conf_int=(0.05, 0.95)):
    """Summarise each column of ``df`` for plotting.

    Returns a 5-tuple of arrays with one entry per column: the column labels,
    the column means, the column medians, and the lower and upper quantiles
    given by ``conf_int``.
    """
    values = df.to_numpy()
    lower, upper = np.quantile(values, conf_int, axis=0)

    return (
        df.columns.to_numpy(),
        values.mean(axis=0),
        np.quantile(values, 0.5, axis=0),
        lower,
        upper,
    )


# Plot statistics for every evaluation run of the selected seed.
plot_dict = {k: get_plot_statistics(v, conf_int=conf_int) for k, v in eval_dict.items() if k.split('-')[-1] == seed}

# Smallest square grid that fits all runs (at least 1x1 so subplots accepts it).
side_len = max(1, int(np.ceil(np.sqrt(len(plot_dict)))))
# squeeze=False keeps `axs` a 2-D array even for a 1x1 grid, so flatten() works.
fig, axs = plt.subplots(side_len, side_len, figsize=(side_len * 16 / 3, side_len * 16 / 3), squeeze=False)

# round() instead of int(): floating-point error can leave the product just
# below the intended whole number (e.g. (0.95 - 0.05) * 100 -> 89.99...),
# which int() would truncate.
band_percent = round((conf_int[1] - conf_int[0]) * 100)

for ax, (exp_name, values) in zip(axs.flatten(), plot_dict.items()):
    # Sort by target so the lines and the band are drawn left to right even
    # if the columns were stored in another order.
    order = np.argsort(values[0])
    x, y_mean, y_median, q_low, q_high = (stat[order] for stat in values)

    # The run name is expected to have six dash-separated fields; the fourth
    # and fifth identify the training dataset.
    iter_name, _, _, env_name, data_name, _ = exp_name.split('-')

    x_linear = np.linspace(min(x), max(x))
    ax.plot(x_linear, x_linear, '--', c='C2', label='Oracle')
    ax.plot(x, y_mean, c='C0', label='Mean DT')
    ax.plot(x, y_median, c='C3', label='Median DT')
    ax.fill_between(x, q_low, q_high, color='b', alpha=.1, label=f'conf int {band_percent}%')

    # Vertical marker at the highest total reward of any training trajectory.
    train_data_name = f'{env_name}-{data_name}'
    terminal_rewards = [sum(data['rewards']) for data in train_data[train_data_name]]
    ax.vlines(max(terminal_rewards), min(min(y_mean), min(x_linear)),
        max(max(y_mean), max(x_linear)), linestyles='--', colors='C1', label='Best Trajectory in Dataset')

    ax.set_title(f"{env_name.capitalize()} {data_name}, {iter_name[4:]} iterations" )
    ax.legend(loc="upper left")
    ax.set_xlabel('Target reward')
    ax.set_ylabel('Actual reward')

# Hide grid cells that have no run to show.
for unused_ax in axs.flatten()[len(plot_dict):]:
    unused_ax.set_visible(False)

plt.show()