In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pickle
from collections import defaultdict
# Figure typography: 'cm' selects matplotlib's Computer Modern math font set,
# and regular text uses the STIXGeneral family.
plt.rcParams['mathtext.fontset'] = 'cm'
plt.rcParams['font.family'] = 'STIXGeneral'
# IPython line magic (not plain Python): emit inline figures as SVG.
%config InlineBackend.figure_formats = ['svg']

# Prefix prepended to the data paths built later in this notebook. It is a
# relative path, so it resolves against the kernel's working directory.
project_dir = '../'

In [3]:
# Size of the tabular MDP: number of non-terminal states and of actions.
nS, nA = 750, 25

# Two terminal states are indexed directly after the non-terminal ones.
nS_term = 2
S_survival, S_death = 750, 751

# Total number of state indices, terminal states included.
nS_total = nS + nS_term

In [4]:
# Three reward tensors indexed as R[state, action, next_state]. The reward
# depends only on the state being entered: every entry is zero except in the
# two terminal next-state columns.

# "plus": +1 for reaching survival, nothing for death.
R_plus = np.zeros((nS_total, nA, nS_total))
R_plus[..., S_survival] = 1

# "minus": -1 for reaching death, nothing for survival.
R_minus = np.zeros((nS_total, nA, nS_total))
R_minus[..., S_death] = -1

# "mixed": +1 for survival and -1 for death. The two terminal columns are
# distinct, so this is exactly the element-wise sum of the two variants above.
R_mixed = R_plus + R_minus

In [None]:
def make_transition_matrix(df_data):
    """
    Create the empirical transition matrix from the dataset.

    Parameters
    ----------
    df_data : pandas.DataFrame
        One row per observed transition. Must contain the columns
        's:state', 'a:action' and 's:next_state', holding integer-valued
        indices (plain int, pandas nullable Int64 or float dtypes all work).

    Returns
    -------
    numpy.ndarray
        Float array ``P`` of shape ``(nS_total, nA, nS_total)``.
        For a (state, action) pair that occurs in the data, ``P[s, a, s']`` is
        the fraction of its occurrences that led to ``s'``; next states never
        observed for that pair stay NaN. Pairs that never occur are NaN for
        every next state. The rows of the two terminal states are overwritten
        with deterministic self-loops.

    Notes
    -----
    Reads the module-level constants ``nS_total``, ``nA``, ``S_survival`` and
    ``S_death``.
    """
    # count occurrences of each transition
    SAS_count = df_data.groupby(['s:state', 'a:action', 's:next_state']).size().reset_index(name='count')

    # Extract the three index columns as plain int64 arrays. Going through
    # explicit integer arrays (instead of row-wise scalars) keeps the indexing
    # valid even when a column was parsed as float or as a nullable integer.
    s_idx = SAS_count['s:state'].to_numpy(dtype=np.int64)
    a_idx = SAS_count['a:action'].to_numpy(dtype=np.int64)
    s_next_idx = SAS_count['s:next_state'].to_numpy(dtype=np.int64)

    # Create the transition matrix; NaN marks "never observed".
    # Each (s, a, s') triple appears once in SAS_count, so a single fancy-index
    # assignment writes every count without collisions.
    P = np.full((nS_total, nA, nS_total), np.nan)
    P[s_idx, a_idx, s_next_idx] = SAS_count['count'].to_numpy()

    # Normalize the transition matrix: nansum ignores the NaN placeholders, so
    # observed counts for each (s, a) are scaled to sum to 1 while unobserved
    # entries remain NaN.
    P = P / np.nansum(P, axis=2, keepdims=True)

    # Set the transition probabilities for terminal states (absorbing:
    # every action leads back to the same terminal state).
    P[S_survival, :, :] = 0
    P[S_survival, :, S_survival] = 1
    P[S_death, :, :] = 0
    P[S_death, :, S_death] = 1

    return P

def make_gymP(P, R):
    """
    Convert the transition and reward matrices to the gym format.

    Parameters
    ----------
    P : numpy.ndarray
        Transition array indexed as ``P[s, a, s']``; NaN entries are treated
        as "no such transition" and skipped.
    R : numpy.ndarray
        Reward array indexed the same way as ``P``.

    Returns
    -------
    collections.defaultdict
        Nested mapping ``gymP[s][a] -> list of (prob, next_state, reward, done)``
        covering only the first ``nS`` states (terminal states get no entry).
        ``done`` is 1 when the next state is ``S_survival`` or ``S_death`` and
        0 otherwise. Only (s, a) pairs with at least one non-NaN transition
        are populated; each list is ordered by increasing next state.

    Notes
    -----
    Reads the module-level constants ``nS``, ``nA``, ``nS_total``,
    ``S_survival`` and ``S_death``.
    """
    gymP = defaultdict(lambda: defaultdict(list))
    terminal_states = (S_survival, S_death)

    # Locate the non-NaN entries in one vectorized pass instead of testing
    # every (s, a, s') cell in Python. argwhere returns indices in row-major
    # order, i.e. the same order a nested s / a / s' loop would visit them.
    observed = np.argwhere(~np.isnan(P[:nS, :nA, :nS_total]))

    # tolist() yields plain Python ints, so keys and next-state values have
    # the same types as when produced by range().
    for s, a, s_ in observed.tolist():
        prob = P[s, a, s_]
        reward = R[s, a, s_]
        done = int(s_ in terminal_states)
        gymP[s][a].append((prob, s_, reward, done))
    return gymP

In [6]:
# Load the training trajectories. The action columns are read with pandas'
# nullable integer dtype so that missing actions do not force a float column.
df_train_shifted = pd.read_csv(
    project_dir + 'data/traj_shifted_train.csv',
    dtype={"a:action": "Int64", "a:next_action": "Int64"}
)

# Empirical transition model estimated from the training data.
P_shifted = make_transition_matrix(df_train_shifted)

# One environment model is written per reward definition.
variants = {
    "mixed": R_mixed,
    "plus": R_plus,
    "minus": R_minus
}

# Make sure the output directory exists, so the cell also runs on a fresh
# checkout where data/env_model/ has not been created yet.
env_model_dir = project_dir + "data/env_model"
os.makedirs(env_model_dir, exist_ok=True)

for name, R in variants.items():
    gymP = make_gymP(P_shifted, R)

    # save gymP; the outer defaultdict is converted to a plain dict because
    # its lambda default factory cannot be pickled
    with open(project_dir + f"data/env_model/gymP_shifted_{name}.pkl", "wb") as f:
        pickle.dump(dict(gymP), f)

    # save reward matrix too (optional, but nice to keep)
    np.save(project_dir + f"data/env_model/R_shifted_{name}.npy", R)

    print(f"Saved gymP and R for {name}")



Saved gymP and R for mixed
Saved gymP and R for plus
Saved gymP and R for minus
