In [None]:
import pandas as pd
from utils.misc import add_column
import plotly.express as px

In [None]:
# Load the two result tables stored under result/dqn_2_5/.
# df_loss is plotted below using its 'episode', 'step', 'loss_final_state' and 'loss_other' columns.
df_loss = pd.read_csv('result/dqn_2_5/losses.csv')
# df_reward is used below through its 'example_id', 'episode' and 'reward' columns.
df_reward = pd.read_csv('result/dqn_2_5/reward_history.csv')

In [None]:
# Plot both loss series against the episode number; the step is shown on hover.
loss_series = ['loss_final_state', 'loss_other']
px.line(df_loss, x='episode', y=loss_series, hover_data=['step'])

In [None]:
# Until validation data is set up and evaluated every n epochs, keep the plots
# meaningful by restricting to example IDs that were evaluated both early
# (some episode <= 100) and late (some episode >= 700) in training.
episode_span = df_reward.groupby('example_id')['episode'].agg(['min', 'max'])
seen_early_and_late = (episode_span['min'] <= 100) & (episode_span['max'] >= 700)
example_id_satisfying_condition = set(episode_span.index[seen_early_and_late])
df_reward = df_reward[df_reward['example_id'].isin(example_id_satisfying_condition)]


In [None]:
# # rewards of different examples have different dynamic ranges; to make the plot more readable,
# # we normalize everything to be within [0, 1]
# df_r_min = df_reward[['example_id', 'reward']].groupby('example_id', as_index=False).agg('min')
# df_r_max = df_reward[['example_id', 'reward']].groupby('example_id', as_index=False).agg('max')
# df_tmp = pd.merge(df_reward, df_r_min.rename(columns={'reward': 'r_min'}), 
#                   on='example_id', how='left')
# df_tmp = pd.merge(df_tmp, df_r_max.rename(columns={'reward': 'r_max'}), 
#                   on='example_id', how='left')
# # linear transformation so that r_min -> 0, r_max -> 1
# # y = a * x + b, where a = 1/(r_max - r_min), and b = r_min/(r_min - r_max)
# # for the special case where r_min == r_max, return 0.5

# def reward_transform(reward, r_min, r_max):
#     # special case:
#     if r_min == r_max:
#         return 0.5
#     a = 1/(r_max - r_min)
#     b = r_min/(r_min - r_max)
#     return reward * a + b

# df_tmp = add_column(df_tmp, 'reward_normalized', ['reward', 'r_min', 'r_max'],
#                    reward_transform)

In [None]:
# df_tmp['reward_normalized'].describe()

In [None]:
# Summarize, for every example, how much the reward changed between its first
# and its last evaluated episode.
# The column list is passed explicitly so that an empty df_reward still yields
# a frame with the expected columns (the sort/plot cells below rely on them).
reward_diff_columns = [
    'example_id', 'episode_min', 'episode_max',
    'reward_at_ep_min', 'reward_at_ep_max', 'reward_increase',
]
reward_diff_records = []

# Use a fresh positional index so that each idxmin/idxmax label identifies
# exactly one row, regardless of how df_reward was filtered earlier.
df_reward_indexed = df_reward.reset_index(drop=True)

# groupby partitions the frame once instead of re-masking it for every example;
# sort=False keeps the examples in order of first appearance.
for example_id, df_example in df_reward_indexed.groupby('example_id', sort=False):
    # idxmin/idxmax return the first row attaining the extreme, so if an
    # episode was logged more than once the first logged reward is used.
    idx_ep_min = df_example['episode'].idxmin()
    idx_ep_max = df_example['episode'].idxmax()
    reward_at_ep_min = df_example.at[idx_ep_min, 'reward']
    reward_at_ep_max = df_example.at[idx_ep_max, 'reward']
    reward_diff_records.append({
        'example_id': example_id,
        'episode_min': df_example.at[idx_ep_min, 'episode'],
        'episode_max': df_example.at[idx_ep_max, 'episode'],
        'reward_at_ep_min': reward_at_ep_min,
        'reward_at_ep_max': reward_at_ep_max,
        'reward_increase': reward_at_ep_max - reward_at_ep_min,
    })
df_reward_diff = pd.DataFrame(reward_diff_records, columns=reward_diff_columns)

In [None]:
# Reward over training episodes, drawn as one colored line per example.
reward_curve_spec = dict(x='episode', y='reward', color='example_id')
px.line(df_reward, **reward_curve_spec)

In [None]:
# Display the per-example summary table (one row per example_id).
df_reward_diff

In [None]:
# Bar chart of the per-example reward change, ordered from the smallest to the
# largest increase; the remaining summary fields are available on hover.
hover_columns = ['example_id', 'episode_min', 'episode_max',
                 'reward_at_ep_min', 'reward_at_ep_max']
df_reward_diff_sorted = df_reward_diff.sort_values(by=['reward_increase']).reset_index(drop=True)
px.bar(df_reward_diff_sorted, y='reward_increase', hover_data=hover_columns)