# TD learning; the agent that can see

# Remember to check the number of samples for alpha and beta

now I'm gonna add numbers to the model. 

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import scipy.stats as stats
import seaborn as sns
from sklearn.metrics import confusion_matrix
# np.random.seed(42)
from joblib import Parallel, delayed
import matplotlib.tri as tri
import matplotlib.colors as mcolors
from scipy.interpolate import griddata
from scipy.interpolate import RBFInterpolator
import matplotlib.ticker as mticker
import itertools
from sklearn.metrics import r2_score
import optuna

# important directories

In [2]:
folder_path = 'data_risk_added_epileptic'


output_dir_model_evaluation = "13_RL_agent_TDlearn_output"
os.makedirs(output_dir_model_evaluation, exist_ok=True)

output_dir_plots = os.path.join(output_dir_model_evaluation, "plots")
os.makedirs(output_dir_plots, exist_ok=True)

output_dir_model_behavior = os.path.join(output_dir_model_evaluation, "model_behavior")
os.makedirs(output_dir_model_behavior, exist_ok=True)

In [3]:

dataframes = [pd.read_csv(os.path.join(folder_path, file)) for file in os.listdir(folder_path) if file.endswith('.csv')]


n_participant = len(dataframes)
print(f"there are {n_participant} participants.")


dataframes[2]

there are 8 participants.


Unnamed: 0,arrowRT,distribution,interTrialInterval,outcome,myCard,yourCard,spaceRT,totalReward,trialIndex,trialType,choice,block,timeoutRepeat,is_within_IQR,risk
0,2135,uniform,835,win,4,7,14079,10.5,0,response,arrowdown,1,0,0,0.375
1,1203,uniform,926,win,9,4,1804,11,1,response,arrowup,1,0,1,0.000
2,1035,uniform,934,win,1,9,766,11.5,2,response,arrowdown,1,0,1,0.000
3,827,uniform,991,lose,2,1,1189,11,3,response,arrowdown,1,0,1,0.125
4,1306,uniform,970,win,9,3,1887,11.5,4,response,arrowup,1,0,1,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
268,939,uniform,773,win,4,6,802,87,24,response,arrowdown,4,0,1,0.375
269,1210,high,811,win,6,8,1345,87.5,122,response,arrowdown,4,0,1,0.385
270,275,high,838,win,3,5,254,88,123,response,arrowdown,4,0,1,0.071
271,907,uniform,992,win,2,4,774,88.5,14,response,arrowdown,4,0,1,0.125


### I want to make participant file name for the model_evaluation.csv and that is I'm gonna take each data name task_data_07_11_2024_17_23_43.xlsx and extract "07_11_2024_17_23_43" and this should be the participant name in the dataset. 

In [4]:
participants = [os.path.splitext(file)[0].replace("task_data_", "")
    for file in os.listdir(folder_path) if file.endswith('.csv')]

# policy initilization for the model
now I need to find the prior policy amounts. for that I am going to put the percentage of downarrow and up arrow for each distribution.

In [5]:
df_combined = pd.concat(dataframes, ignore_index=True)

df_combined = df_combined[df_combined['outcome'].str.lower() != 'na'].reset_index(drop=True)
 

desired_order = ["uniform", "low", "high"]  


cards_sorted = sorted(df_combined["myCard"].unique())
dist_sorted = [d for d in desired_order if d in df_combined["distribution"].unique()]
choice_sorted = sorted(df_combined["choice"].unique())


card_idx = {card: i for i, card in enumerate(cards_sorted)}
dist_idx = {dist: i for i, dist in enumerate(dist_sorted)}
choice_idx = {choice: i for i, choice in enumerate(choice_sorted)}


matrix_3d = np.zeros((len(cards_sorted), len(dist_sorted), len(choice_sorted)))


for _, row in df_combined.iterrows():
    i = card_idx[row["myCard"]]-1
    j = dist_idx[row["distribution"]]
    k = choice_idx[row["choice"]]
    matrix_3d[i, j, k] += 1  


total_per_card_dist = matrix_3d.sum(axis=2, keepdims=True)

# compute percentages, avoiding division by zero
with np.errstate(divide='ignore', invalid='ignore'):
    percentage_matrix = np.divide(matrix_3d, total_per_card_dist, where=total_per_card_dist != 0)

# convert to a DataFrame for easy visualization
percentage_list = []
for i, card in enumerate(cards_sorted):
    for j, dist in enumerate(dist_sorted):
        for k, choice in enumerate(choice_sorted):
            percentage_list.append({
                "myCard": card,
                "distribution": dist,  # Now follows "uniform", "low", "high" order
                "choice": choice,
                "percentage": percentage_matrix[i, j, k]
            })

df_percentages = pd.DataFrame(percentage_list)
df_percentages

Unnamed: 0,myCard,distribution,choice,percentage
0,1,uniform,arrowdown,0.938462
1,1,uniform,arrowup,0.061538
2,1,low,arrowdown,0.857143
3,1,low,arrowup,0.142857
4,1,high,arrowdown,0.896552
5,1,high,arrowup,0.103448
6,2,uniform,arrowdown,0.915663
7,2,uniform,arrowup,0.084337
8,2,low,arrowdown,0.780488
9,2,low,arrowup,0.219512


In [6]:
np.shape(percentage_matrix)

(9, 3, 2)

In [7]:
actions = { "arrowdown": 0, "arrowup": 1}
distributions_map = { "uniform": 0, "low": 1,  "high": 2}
card_numbers = list(range(1, 10))

policy_table = percentage_matrix 

Q_table_init = np.random.normal(0, 0.01, (len(card_numbers), len(distributions_map), len(actions)))
# having a q-table based on the policies
# Q_table_init = policy_table * np.mean(Q_table_init) 
Q_table = Q_table_init.copy()

#############################################################################################
# having a q-table that starts with 0! this was not a good initilization so i changed it.
# Q_table = np.zeros((len(distributions_map), len(actions)))  # 3 distributions × 2 actions
#############################################################################################

# print("policy: \n",np.shape(policy_table))
print("\n Q_table: \n",np.shape(Q_table))




 Q_table: 
 (9, 3, 2)


In [8]:
def epsilon_greedy(Q_values, epsilon):    
    probs = np.full_like(Q_values, epsilon / Q_values.shape[-1], dtype=float)  # initialize with exploration probability
    best_actions = np.argmax(Q_values, axis=-1)  # find the best action for each state
    np.put_along_axis(probs, np.expand_dims(best_actions, axis=-1), 1 - epsilon + (epsilon / Q_values.shape[-1]), axis=-1)
    return probs



def train_rescorla_wagner(df, alpha, beta, Q_init=None):
    if Q_init is None:
        Q_init = Q_table.copy()
    Q_values = Q_init.copy()
    q_value_pairs = []
    choices = []
    predicted_probs = []
    distributions = []
    card_numbers = []
    
    for _, row in df.iterrows():
        action = actions[row["choice"]] 
        distribution = distributions_map[row["distribution"]] 
        card_number = row["myCard"]-1 # since I'm using this as an index! I need to do -1 to make the 1 to 9 cards come to 0 to 8
        reward = 0.5 if row["outcome"] == "win" else -0.5


        probs = epsilon_greedy(Q_values, beta)
        predicted_probs.append(probs[card_number][distribution][action])
        
        prediction_error = reward - Q_values[card_number][distribution][action]
        Q_values[card_number][distribution][action] += alpha * prediction_error
        
        q_value_pairs.append(Q_values.copy())
        choices.append(action)
        distributions.append(distribution)
        card_numbers.append(card_number)
        

    return np.array(q_value_pairs), np.array(choices), np.array(predicted_probs), np.array(distributions), np.array(card_numbers)


# this is for the sake of parallel computing
def compute_log_likelihood(alpha, beta, df_all, Q_table):
    Q_init_participant = Q_table.copy()
    q_values, choices, predicted_probs, distributions, card_numbers = train_rescorla_wagner(df_all, alpha, beta, Q_init=Q_init_participant.copy())
    
    predicted_probs = np.clip(predicted_probs, 1e-6, 1)  # prevent log(0)
    log_likelihood = np.sum(np.log(predicted_probs))
    
    return (alpha, beta, log_likelihood)


In [9]:
num_of_samples = 100
# num_of_samples = 1000
alpha_min = 0.01
alpha_max = 1
beta_min = 0.01
beta_max  = 1
alpha_samples = np.random.uniform(alpha_min, alpha_max + np.finfo(float).eps, num_of_samples)
beta_samples = np.random.uniform(beta_min, beta_max + np.finfo(float).eps, num_of_samples)

In [10]:
BIC_models = []
AIC_models = []
best_alpha_models = []
best_beta_models = []
accuracy_models = []
precision_models = []
sensitivity_recall_models = []
specificity_models = []
f1_score_models = []
mcFadden_r2_models = []
r2_models = []

for idx, df_all in enumerate(dataframes):
    Q_init_participant = Q_table.copy()
    
    df_all = df_all[df_all['outcome'].str.lower() != 'na'].reset_index(drop=True)


    def objective(trial):
        alpha = trial.suggest_float("alpha", alpha_min, alpha_max)
        beta  = trial.suggest_float("beta", beta_min, beta_max)

        # negative log-likelihood (Optuna minimises)
        _, _, ll = compute_log_likelihood(alpha, beta,
                                        df_all,
                                        Q_init_participant.copy())
        return -ll

    study = optuna.create_study(direction="minimize",
                                sampler=optuna.samplers.TPESampler(seed=42))
    study.optimize(objective, n_trials=400, n_jobs=-1)




    best_alpha = study.best_params["alpha"]
    best_beta  = study.best_params["beta"]
    best_log_likelihood = -study.best_value

    # keep this for plotting later
    results_df = study.trials_dataframe()
    results_df["alpha"] = results_df["params_alpha"]
    results_df["beta"]  = results_df["params_beta"]
    results_df["log_likelihood"] = -results_df["value"]

    # model prediction 
    
    q_values, choices, predicted_probs, distributions, card_numbers = train_rescorla_wagner(df_all, best_alpha, best_beta, Q_init=Q_init_participant.copy())
    
    
    predicted_choices = []
    for trial in range(len(card_numbers)):
        test_action_probs = epsilon_greedy(q_values[trial], best_beta)
        p_arrowup = test_action_probs[card_numbers[trial]][distributions[trial]][actions["arrowup"]]
        p_arrow_down = test_action_probs[card_numbers[trial]][distributions[trial]][actions["arrowdown"]]
        # choosing 1 or 0 based on the softmax probabilities:
        predicted_choices.append(np.random.choice([1, 0], p=[p_arrowup, p_arrow_down]))

    # finding out model total reward based on the model's predicted choices
    total_reward = [] 
    for i in range(len(predicted_choices)):
        if len(total_reward)> 0:
            last_reward = total_reward[-1]  #  the last reward value
        else:
            last_reward = 10 # initial reward is $10
        
        if ((df_all.loc[i, 'myCard'] > df_all.loc[i, 'yourCard'] and predicted_choices[i] == 1) or
            (df_all.loc[i, 'myCard'] < df_all.loc[i, 'yourCard'] and predicted_choices[i] == 0)):
            total_reward.append(last_reward + 0.5)
        else:
            total_reward.append(last_reward - 0.5)

    
   
       # confusion matrix:
    conf_matrix = confusion_matrix(choices, predicted_choices)
    TN, FP, FN, TP = conf_matrix.ravel()  # unpacking the confusion matrix
    # acc
    accuracy = (TP + TN) / (TP + TN + FP + FN)
    # precision: From the ones that we’ve announced them as up/down, which ones are really up/down?
    precision = TP / (TP + FP) if (TP + FP) != 0 else 0
    # recall or sensitivity : true positive rate
    sensitivity_recall = TP / (TP + FN) if (TP + FN) != 0 else 0
    # specificity : true negative rate
    specificity = TN / (TN + FP) if (TN + FP) != 0 else 0
    # f1 Score
    f1_score = 2 * (precision * sensitivity_recall) / (precision + sensitivity_recall) if (precision + sensitivity_recall) != 0 else 0

    
    # bayes information criterion:
    n_trials = len(df_all)
    k = 2  # number of free parameters: alpha and beta
    BIC = k * np.log(n_trials) - 2 * best_log_likelihood # this is BIC formula based on the log lkelihode I found before

    

    # Akaike  information criterion(AIC):
    AIC = 2 * k - 2 * best_log_likelihood 


    # mcFadden r-squared:
    p_null = np.mean(choices)  # probability of choosing "1" in the dataset
    log_likelihood_null = np.sum(choices * np.log(p_null) + (1 - choices) * np.log(1 - p_null))
    mcFadden_r2 = 1 - (best_log_likelihood / log_likelihood_null)

    # r-squared
    r2 = r2_score(choices, predicted_choices)
    
    # saving models evaluation variables:
    best_alpha_models.append(best_alpha)
    best_beta_models.append(best_beta)
    BIC_models.append(BIC)
    AIC_models.append(AIC)
    accuracy_models.append(accuracy)
    precision_models.append(precision)
    sensitivity_recall_models.append(sensitivity_recall)
    specificity_models.append(specificity)
    f1_score_models.append(f1_score)
    mcFadden_r2_models.append(mcFadden_r2)
    r2_models.append(r2)


    ###########################################################################################
    ## visulization
    ###########################################################################################

    fig, axes = plt.subplots(1, 3, figsize=(19, 6))

    plots_smooth_level = 20


#############################################
    # Density Plot (KDE)
    sns.kdeplot(
        x=results_df["alpha"], 
        y=results_df["beta"], 
        fill=True, 
        cmap="viridis", 
        ax=axes[0], 
        bw_adjust=1.8,  # Increase for smoother density
        levels=plots_smooth_level,  # More contour levels
        thresh=0  # Ensure density is plotted across all values
    )
    mappable = axes[0].collections[0]
    cbar = fig.colorbar(mappable, ax=axes[0], label="density", fraction=0.046, pad=0.04)  
    cbar.ax.yaxis.set_major_formatter(mticker.FormatStrFormatter('%.2f'))  # 2 decimal places
    cbar.ax.set_ylabel("density", fontsize=12, fontweight='bold')
    cbar.ax.tick_params(labelsize=12)

    axes[0].set_xlim(alpha_min, alpha_max)
    axes[0].set_ylim(beta_min, beta_max)
    axes[0].set_xlabel("learning rate (α)", fontsize=14, fontweight='bold')
    axes[0].set_ylabel("epsilon (ε)", fontsize=14, fontweight='bold')
    axes[0].set_title("density of α and ε joint probability", fontsize=16, fontweight='bold')
    axes[0].tick_params(axis='both', labelsize=14)
#############################################


#############################################
    # Log Likelihood 

    
    alpha_step = 0.1
    beta_step = 0.1
    alpha_bins = np.arange(alpha_min, alpha_max+ alpha_step, alpha_step)  # bins from 0 to 1 with step 0.1
    beta_bins = np.arange(beta_min, beta_max + beta_step, beta_step)       # bins from 0 to 1 with step 0.1

    results_df["alpha_binned"] = pd.cut(results_df["alpha"], bins=alpha_bins, labels=alpha_bins[:-1], include_lowest=True)
    results_df["beta_binned"] = pd.cut(results_df["beta"], bins=beta_bins, labels=beta_bins[:-1], include_lowest=True)

    heatmap_data = results_df.groupby(
    ["beta_binned", "alpha_binned"], observed=False)["log_likelihood"].mean().unstack()

    heatmap_data.index = heatmap_data.index.astype(float)
    heatmap_data.columns = heatmap_data.columns.astype(float)

    sns.heatmap(
        heatmap_data, 
        cmap="Blues", 
        cbar=True,
        ax=axes[1]
    )
    axes[1].set_xticks(np.arange(len(heatmap_data.columns)))  
    axes[1].set_xticklabels([f"{x:.1f}" for x in heatmap_data.columns], rotation=45)

    axes[1].set_yticks(np.arange(len(heatmap_data.index))) 
    axes[1].set_yticklabels([f"{x:.1f}" for x in heatmap_data.index]) 


    axes[1].set_xlabel("learning rate (α)", fontsize=14, fontweight='bold')
    axes[1].set_ylabel("epsilon (ε)", fontsize=14, fontweight='bold')
    axes[1].set_title("log likelihood for combinations of α and ε", fontsize=16, fontweight='bold')
    axes[1].tick_params(axis='both', labelsize=14)
    axes[1].invert_yaxis()


    

#############################################


#############################################
    # Confusion Matrix
    heatmap_cmap_color = mcolors.LinearSegmentedColormap.from_list("warm_red", ["#fff5e6", "#ff5733"])
    sns.heatmap(
        conf_matrix, annot=True, fmt="d", cmap=heatmap_cmap_color,
        xticklabels=["arrowdown", "arrowup"], 
        yticklabels=["arrowdown", "arrowup"], 
        ax=axes[2], 
        cbar=False
    )

    axes[2].set_xlabel("prediction", fontsize=14, fontweight='bold')
    axes[2].set_ylabel("true label", fontsize=14, fontweight='bold')
    axes[2].set_title(f"confusion matrix (α={best_alpha:.2f}, ε={best_beta:.2f})", fontsize=16, fontweight='bold')
    axes[2].tick_params(axis='both', labelsize=14)


#############################################
    # saving figures
    plt.tight_layout(rect=[0, 0, 1, 0.9]) 
    fig.suptitle(f'participant {idx}', fontsize=18, fontweight='bold', y=0.95)

    filename = os.path.join(output_dir_plots, f"plot_{participants[idx]}.pdf")
    plt.savefig(filename, format='pdf')
    plt.close(fig)

    print(f"saved: {filename}")


#############################################

    # saving model behavior
    q_values_reshaped = [q_values[i].tolist() for i in range(n_trials)]  # convert each (9,3,2) array into a list format

    df_model_behavior = pd.DataFrame({
        "model_choices": predicted_choices,
        "participant_choices": choices,
        "model_total_reward": total_reward,
        "participant_total_reward": df_all["totalReward"],
        "q_val": q_values_reshaped  
    })

    file_path = os.path.join(output_dir_model_behavior, f"model_behavior_{participants[idx]}.csv")
    df_model_behavior.to_csv(file_path, index=False)



[I 2025-06-24 12:54:44,325] A new study created in memory with name: no-name-80bb841e-772c-4a47-9bc6-026c3f7cc51b
[I 2025-06-24 12:54:45,516] Trial 25 finished with value: 156.5763583477468 and parameters: {'alpha': 0.7229321342047036, 'beta': 0.6903651464936165}. Best is trial 25 with value: 156.5763583477468.
[I 2025-06-24 12:54:45,519] Trial 11 finished with value: 180.8748091122764 and parameters: {'alpha': 0.7229431437370901, 'beta': 0.9523411086550897}. Best is trial 25 with value: 156.5763583477468.
[I 2025-06-24 12:54:45,536] Trial 26 finished with value: 170.3402512080379 and parameters: {'alpha': 0.04232135917740773, 'beta': 0.8720766548662316}. Best is trial 25 with value: 156.5763583477468.
[I 2025-06-24 12:54:46,022] Trial 12 finished with value: 152.21071031923668 and parameters: {'alpha': 0.8071909366203908, 'beta': 0.5996303167403625}. Best is trial 12 with value: 152.21071031923668.
[I 2025-06-24 12:54:46,030] Trial 10 finished with value: 161.3313032984568 and paramet

saved: 13_RL_agent_TDlearn_output\plots\plot_02_01_2025_13_21_03.pdf


[I 2025-06-24 12:55:17,488] Trial 1 finished with value: 175.8511842566822 and parameters: {'alpha': 0.3407803654263189, 'beta': 0.885860341463248}. Best is trial 1 with value: 175.8511842566822.
[I 2025-06-24 12:55:17,594] Trial 2 finished with value: 177.44303332670904 and parameters: {'alpha': 0.20763237769568646, 'beta': 0.30000949399331694}. Best is trial 1 with value: 175.8511842566822.
[I 2025-06-24 12:55:17,809] Trial 0 finished with value: 161.40634685502226 and parameters: {'alpha': 0.8372668201354165, 'beta': 0.5756752232252773}. Best is trial 0 with value: 161.40634685502226.
[I 2025-06-24 12:55:17,928] Trial 3 finished with value: 182.17317301194709 and parameters: {'alpha': 0.7852042711432592, 'beta': 0.9547434314018411}. Best is trial 0 with value: 161.40634685502226.
[I 2025-06-24 12:55:18,212] Trial 4 finished with value: 182.02684105237392 and parameters: {'alpha': 0.7579565884569102, 'beta': 0.2708902818372171}. Best is trial 0 with value: 161.40634685502226.
[I 2025

saved: 13_RL_agent_TDlearn_output\plots\plot_03_02_2025_15_29_50.pdf


[I 2025-06-24 12:55:48,532] Trial 2 finished with value: 186.20745299266562 and parameters: {'alpha': 0.07541759369574812, 'beta': 0.0501651654260041}. Best is trial 2 with value: 186.20745299266562.
[I 2025-06-24 12:55:48,704] Trial 0 finished with value: 165.90329971499784 and parameters: {'alpha': 0.7954381765138321, 'beta': 0.813677641219029}. Best is trial 0 with value: 165.90329971499784.
[I 2025-06-24 12:55:48,739] Trial 3 finished with value: 143.3264019259867 and parameters: {'alpha': 0.27251752482481706, 'beta': 0.6796232037123524}. Best is trial 3 with value: 143.3264019259867.
[I 2025-06-24 12:55:48,936] Trial 4 finished with value: 138.25968149524803 and parameters: {'alpha': 0.1371722591550527, 'beta': 0.615983364558011}. Best is trial 4 with value: 138.25968149524803.
[I 2025-06-24 12:55:48,956] Trial 5 finished with value: 127.65616101205228 and parameters: {'alpha': 0.19296221407950245, 'beta': 0.28491402290676415}. Best is trial 5 with value: 127.65616101205228.
[I 20

saved: 13_RL_agent_TDlearn_output\plots\plot_03_04_2025_13_57_44.pdf


[I 2025-06-24 12:56:19,781] Trial 3 finished with value: 203.0060830253562 and parameters: {'alpha': 0.613831842364085, 'beta': 0.1341887955238856}. Best is trial 0 with value: 179.59531916384154.
[I 2025-06-24 12:56:19,891] Trial 4 finished with value: 170.5006090616492 and parameters: {'alpha': 0.46619902670222974, 'beta': 0.8524408364199517}. Best is trial 4 with value: 170.5006090616492.
[I 2025-06-24 12:56:19,925] Trial 6 finished with value: 262.66086229716797 and parameters: {'alpha': 0.09493371866429645, 'beta': 0.03977685786804704}. Best is trial 4 with value: 170.5006090616492.
[I 2025-06-24 12:56:20,144] Trial 5 finished with value: 170.31726551404483 and parameters: {'alpha': 0.9778325764933787, 'beta': 0.8471888402039796}. Best is trial 5 with value: 170.31726551404483.
[I 2025-06-24 12:56:20,955] Trial 1 finished with value: 236.19233407063177 and parameters: {'alpha': 0.3680944723273248, 'beta': 0.07264655648862713}. Best is trial 5 with value: 170.31726551404483.
[I 202

saved: 13_RL_agent_TDlearn_output\plots\plot_07_12_2024_13_02_50.pdf


[I 2025-06-24 12:56:51,648] Trial 4 finished with value: 155.02857007845583 and parameters: {'alpha': 0.6250334045290633, 'beta': 0.5737088043046741}. Best is trial 4 with value: 155.02857007845583.
[I 2025-06-24 12:56:52,047] Trial 6 finished with value: 154.7953927849236 and parameters: {'alpha': 0.3849685978377687, 'beta': 0.5590523690588098}. Best is trial 6 with value: 154.7953927849236.
[I 2025-06-24 12:56:52,590] Trial 9 finished with value: 161.99414343231075 and parameters: {'alpha': 0.9252230907226153, 'beta': 0.7398345520920672}. Best is trial 6 with value: 154.7953927849236.
[I 2025-06-24 12:56:52,713] Trial 8 finished with value: 154.55012835794759 and parameters: {'alpha': 0.7635952698245467, 'beta': 0.5325779211339432}. Best is trial 8 with value: 154.55012835794759.
[I 2025-06-24 12:56:52,947] Trial 7 finished with value: 154.8122622739579 and parameters: {'alpha': 0.49454284019744194, 'beta': 0.47816947547408517}. Best is trial 8 with value: 154.55012835794759.
[I 2025

saved: 13_RL_agent_TDlearn_output\plots\plot_14_03_2025_16_05_47.pdf


[I 2025-06-24 12:57:19,744] Trial 3 finished with value: 172.452393887965 and parameters: {'alpha': 0.6573224170957224, 'beta': 0.2854778267226484}. Best is trial 3 with value: 172.452393887965.
[I 2025-06-24 12:57:19,824] Trial 15 finished with value: 204.23379886497142 and parameters: {'alpha': 0.30866571037445195, 'beta': 0.12510586954303365}. Best is trial 3 with value: 172.452393887965.
[I 2025-06-24 12:57:19,858] Trial 10 finished with value: 164.40800460849067 and parameters: {'alpha': 0.9783721949732439, 'beta': 0.7535695557032683}. Best is trial 10 with value: 164.40800460849067.
[I 2025-06-24 12:57:19,881] Trial 1 finished with value: 177.60193902840797 and parameters: {'alpha': 0.8395845209944267, 'beta': 0.9153808076362945}. Best is trial 10 with value: 164.40800460849067.
[I 2025-06-24 12:57:19,900] Trial 0 finished with value: 164.43753490842215 and parameters: {'alpha': 0.10386373510105552, 'beta': 0.3967421836201592}. Best is trial 10 with value: 164.40800460849067.
[I 

saved: 13_RL_agent_TDlearn_output\plots\plot_20_05_2025_13_31_58.pdf


[I 2025-06-24 12:57:46,310] Trial 6 finished with value: 181.03100273445307 and parameters: {'alpha': 0.6299002134475357, 'beta': 0.9467552827985343}. Best is trial 6 with value: 181.03100273445307.
[I 2025-06-24 12:57:46,338] Trial 0 finished with value: 175.74281356290356 and parameters: {'alpha': 0.8455688316268922, 'beta': 0.27548482940951735}. Best is trial 0 with value: 175.74281356290356.
[I 2025-06-24 12:57:46,354] Trial 4 finished with value: 173.36613314268095 and parameters: {'alpha': 0.12727592643417962, 'beta': 0.2669291607872057}. Best is trial 4 with value: 173.36613314268095.
[I 2025-06-24 12:57:46,362] Trial 9 finished with value: 158.97934897502347 and parameters: {'alpha': 0.6796972268870457, 'beta': 0.499547643968891}. Best is trial 9 with value: 158.97934897502347.
[I 2025-06-24 12:57:46,364] Trial 3 finished with value: 186.44554372305478 and parameters: {'alpha': 0.9846038778699858, 'beta': 0.21882664373658645}. Best is trial 9 with value: 158.97934897502347.
[I 

saved: 13_RL_agent_TDlearn_output\plots\plot_21_06_2025_11_21_39.pdf


[I 2025-06-24 12:58:12,682] Trial 21 finished with value: 242.76219431033098 and parameters: {'alpha': 0.04824203415209573, 'beta': 0.15385870205781438}. Best is trial 21 with value: 242.76219431033098.
[I 2025-06-24 12:58:12,743] Trial 10 finished with value: 167.73747946080942 and parameters: {'alpha': 0.18121825406599412, 'beta': 0.5005938005298929}. Best is trial 10 with value: 167.73747946080942.
[I 2025-06-24 12:58:12,750] Trial 2 finished with value: 312.06321338977784 and parameters: {'alpha': 0.2747497327917153, 'beta': 0.04926934071425857}. Best is trial 10 with value: 167.73747946080942.
[I 2025-06-24 12:58:12,751] Trial 15 finished with value: 350.6397125611068 and parameters: {'alpha': 0.5673407840338631, 'beta': 0.02433769654411076}. Best is trial 10 with value: 167.73747946080942.
[I 2025-06-24 12:58:12,755] Trial 1 finished with value: 163.2210777324201 and parameters: {'alpha': 0.8861308827932083, 'beta': 0.5747872462585895}. Best is trial 1 with value: 163.22107773242

saved: 13_RL_agent_TDlearn_output\plots\plot_28_02_2025_12_02_47.pdf


# now saving the model evaluation values

In [11]:
df_models_evaluation = pd.DataFrame({
    "participants": participants,
    "best_alpha": best_alpha_models,
    "best_beta": best_beta_models,
    "BIC": BIC_models,
    "AIC": AIC_models,
    "accuracy": accuracy_models,
    "precision": precision_models,
    "sensitivity_recall": sensitivity_recall_models,
    "specificity": specificity_models,
    "f1_score": f1_score_models,
    "mcFadden_r2": mcFadden_r2_models,
    "r2": r2_models
})

file_path = os.path.join(output_dir_model_evaluation, "models_evaluation.csv")
df_models_evaluation.to_csv(file_path, index=False)