# TD learning; the agent that can see

# Remember to check the number of samples for alpha and beta

now I'm gonna add numbers to the model. 

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import scipy.stats as stats
import seaborn as sns
from sklearn.metrics import confusion_matrix
# np.random.seed(42)
from joblib import Parallel, delayed
import matplotlib.tri as tri
import matplotlib.colors as mcolors
from scipy.interpolate import griddata
from scipy.interpolate import RBFInterpolator
import matplotlib.ticker as mticker
import itertools
from sklearn.metrics import r2_score
import optuna

# important directories

In [2]:
folder_path = 'data_risk_added'


output_dir_model_evaluation = "13_RL_agent_TDlearn_output"
os.makedirs(output_dir_model_evaluation, exist_ok=True)

output_dir_plots = os.path.join(output_dir_model_evaluation, "plots")
os.makedirs(output_dir_plots, exist_ok=True)

output_dir_model_behavior = os.path.join(output_dir_model_evaluation, "model_behavior")
os.makedirs(output_dir_model_behavior, exist_ok=True)

In [3]:

dataframes = [pd.read_excel(os.path.join(folder_path, file)) for file in os.listdir(folder_path) if file.endswith('.xlsx')]


n_participant = len(dataframes)
print(f"there are {n_participant} participants.")


dataframes[2]

there are 38 participants.


Unnamed: 0,arrowRT,distribution,interTrialInterval,outcome,myCard,yourCard,spaceRT,totalReward,trialIndex,trialType,choice,block,timeoutRepeat,is_within_IQR,risk
0,2609,uniform,789,lose,4,2,335,9.5,0,response,arrowdown,1,0,1,0.375
1,597,uniform,853,win,9,4,407,10,1,response,arrowup,1,0,1,0.000
2,188,uniform,904,win,4,7,504,10.5,2,response,arrowdown,1,0,1,0.375
3,423,uniform,916,win,2,4,434,11,3,response,arrowdown,1,0,1,0.125
4,549,uniform,806,win,5,7,287,11.5,4,response,arrowdown,1,0,1,0.500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
272,761,high,913,win,7,8,382,80,125,response,arrowdown,4,0,1,0.447
273,596,low,921,win,4,3,318,80.5,83,response,arrowup,4,0,1,0.385
274,414,low,950,win,2,7,335,81,77,response,arrowdown,4,0,1,0.243
275,1371,uniform,842,win,6,4,615,81.5,35,response,arrowup,4,0,1,0.375


### I want to make participant file name for the model_evaluation.csv and that is I'm gonna take each data name task_data_07_11_2024_17_23_43.xlsx and extract "07_11_2024_17_23_43" and this should be the participant name in the dataset. 

In [4]:
participants = [os.path.splitext(file)[0].replace("task_data_", "")
    for file in os.listdir(folder_path) if file.endswith('.xlsx')]

# policy initilization for the model
now I need to find the prior policy amounts. for that I am going to put the percentage of downarrow and up arrow for each distribution.

In [5]:
df_combined = pd.concat(dataframes, ignore_index=True)

df_combined = df_combined[df_combined['outcome'].str.lower() != 'na'].reset_index(drop=True)
 

desired_order = ["uniform", "low", "high"]  


cards_sorted = sorted(df_combined["myCard"].unique())
dist_sorted = [d for d in desired_order if d in df_combined["distribution"].unique()]
choice_sorted = sorted(df_combined["choice"].unique())


card_idx = {card: i for i, card in enumerate(cards_sorted)}
dist_idx = {dist: i for i, dist in enumerate(dist_sorted)}
choice_idx = {choice: i for i, choice in enumerate(choice_sorted)}


matrix_3d = np.zeros((len(cards_sorted), len(dist_sorted), len(choice_sorted)))


for _, row in df_combined.iterrows():
    i = card_idx[row["myCard"]]-1
    j = dist_idx[row["distribution"]]
    k = choice_idx[row["choice"]]
    matrix_3d[i, j, k] += 1  


total_per_card_dist = matrix_3d.sum(axis=2, keepdims=True)

# compute percentages, avoiding division by zero
with np.errstate(divide='ignore', invalid='ignore'):
    percentage_matrix = np.divide(matrix_3d, total_per_card_dist, where=total_per_card_dist != 0)

# convert to a DataFrame for easy visualization
percentage_list = []
for i, card in enumerate(cards_sorted):
    for j, dist in enumerate(dist_sorted):
        for k, choice in enumerate(choice_sorted):
            percentage_list.append({
                "myCard": card,
                "distribution": dist,  # Now follows "uniform", "low", "high" order
                "choice": choice,
                "percentage": percentage_matrix[i, j, k]
            })

df_percentages = pd.DataFrame(percentage_list)
df_percentages

Unnamed: 0,myCard,distribution,choice,percentage
0,1,uniform,arrowdown,0.968338
1,1,uniform,arrowup,0.031662
2,1,low,arrowdown,0.922456
3,1,low,arrowup,0.077544
4,1,high,arrowdown,0.962025
5,1,high,arrowup,0.037975
6,2,uniform,arrowdown,0.968
7,2,uniform,arrowup,0.032
8,2,low,arrowdown,0.742115
9,2,low,arrowup,0.257885


In [6]:
np.shape(percentage_matrix)

(9, 3, 2)

In [7]:
actions = { "arrowdown": 0, "arrowup": 1}
distributions_map = { "uniform": 0, "low": 1,  "high": 2}
card_numbers = list(range(1, 10))

policy_table = percentage_matrix 

Q_table_init = np.random.normal(0, 0.01, (len(card_numbers), len(distributions_map), len(actions)))
# having a q-table based on the policies
# Q_table_init = policy_table * np.mean(Q_table_init) 
Q_table = Q_table_init.copy()

#############################################################################################
# having a q-table that starts with 0! this was not a good initilization so i changed it.
# Q_table = np.zeros((len(distributions_map), len(actions)))  # 3 distributions × 2 actions
#############################################################################################

# print("policy: \n",np.shape(policy_table))
print("\n Q_table: \n",np.shape(Q_table))




 Q_table: 
 (9, 3, 2)


In [8]:
def epsilon_greedy(Q_values, epsilon):    
    probs = np.full_like(Q_values, epsilon / Q_values.shape[-1], dtype=float)  # initialize with exploration probability
    best_actions = np.argmax(Q_values, axis=-1)  # find the best action for each state
    np.put_along_axis(probs, np.expand_dims(best_actions, axis=-1), 1 - epsilon + (epsilon / Q_values.shape[-1]), axis=-1)
    return probs



def train_rescorla_wagner(df, alpha, beta, Q_init=None):
    if Q_init is None:
        Q_init = Q_table.copy()
    Q_values = Q_init.copy()
    q_value_pairs = []
    choices = []
    predicted_probs = []
    distributions = []
    card_numbers = []
    
    for _, row in df.iterrows():
        action = actions[row["choice"]] 
        distribution = distributions_map[row["distribution"]] 
        card_number = row["myCard"]-1 # since I'm using this as an index! I need to do -1 to make the 1 to 9 cards come to 0 to 8
        reward = 0.5 if row["outcome"] == "win" else -0.5


        probs = epsilon_greedy(Q_values, beta)
        predicted_probs.append(probs[card_number][distribution][action])
        
        prediction_error = reward - Q_values[card_number][distribution][action]
        Q_values[card_number][distribution][action] += alpha * prediction_error
        
        q_value_pairs.append(Q_values.copy())
        choices.append(action)
        distributions.append(distribution)
        card_numbers.append(card_number)
        

    return np.array(q_value_pairs), np.array(choices), np.array(predicted_probs), np.array(distributions), np.array(card_numbers)


# this is for the sake of parallel computing
def compute_log_likelihood(alpha, beta, df_all, Q_table):
    Q_init_participant = Q_table.copy()
    q_values, choices, predicted_probs, distributions, card_numbers = train_rescorla_wagner(df_all, alpha, beta, Q_init=Q_init_participant.copy())
    
    predicted_probs = np.clip(predicted_probs, 1e-6, 1)  # prevent log(0)
    log_likelihood = np.sum(np.log(predicted_probs))
    
    return (alpha, beta, log_likelihood)


In [9]:
num_of_samples = 100
# num_of_samples = 1000
alpha_min = 0.01
alpha_max = 1
beta_min = 0.01
beta_max  = 1
alpha_samples = np.random.uniform(alpha_min, alpha_max + np.finfo(float).eps, num_of_samples)
beta_samples = np.random.uniform(beta_min, beta_max + np.finfo(float).eps, num_of_samples)

In [10]:
BIC_models = []
AIC_models = []
best_alpha_models = []
best_beta_models = []
accuracy_models = []
precision_models = []
sensitivity_recall_models = []
specificity_models = []
f1_score_models = []
mcFadden_r2_models = []
r2_models = []

for idx, df_all in enumerate(dataframes):
    Q_init_participant = Q_table.copy()
    
    df_all = df_all[df_all['outcome'].str.lower() != 'na'].reset_index(drop=True)


    def objective(trial):
        alpha = trial.suggest_float("alpha", alpha_min, alpha_max)
        beta  = trial.suggest_float("beta", beta_min, beta_max)

        # negative log-likelihood (Optuna minimises)
        _, _, ll = compute_log_likelihood(alpha, beta,
                                        df_all,
                                        Q_init_participant.copy())
        return -ll

    study = optuna.create_study(direction="minimize",
                                sampler=optuna.samplers.TPESampler(seed=42))
    study.optimize(objective, n_trials=400, n_jobs=-1)




    best_alpha = study.best_params["alpha"]
    best_beta  = study.best_params["beta"]
    best_log_likelihood = -study.best_value

    # keep this for plotting later
    results_df = study.trials_dataframe()
    results_df["alpha"] = results_df["params_alpha"]
    results_df["beta"]  = results_df["params_beta"]
    results_df["log_likelihood"] = -results_df["value"]

    # model prediction 
    
    q_values, choices, predicted_probs, distributions, card_numbers = train_rescorla_wagner(df_all, best_alpha, best_beta, Q_init=Q_init_participant.copy())
    
    
    predicted_choices = []
    for trial in range(len(card_numbers)):
        test_action_probs = epsilon_greedy(q_values[trial], best_beta)
        p_arrowup = test_action_probs[card_numbers[trial]][distributions[trial]][actions["arrowup"]]
        p_arrow_down = test_action_probs[card_numbers[trial]][distributions[trial]][actions["arrowdown"]]
        # choosing 1 or 0 based on the softmax probabilities:
        predicted_choices.append(np.random.choice([1, 0], p=[p_arrowup, p_arrow_down]))

    # finding out model total reward based on the model's predicted choices
    total_reward = [] 
    for i in range(len(predicted_choices)):
        if len(total_reward)> 0:
            last_reward = total_reward[-1]  #  the last reward value
        else:
            last_reward = 10 # initial reward is $10
        
        if ((df_all.loc[i, 'myCard'] > df_all.loc[i, 'yourCard'] and predicted_choices[i] == 1) or
            (df_all.loc[i, 'myCard'] < df_all.loc[i, 'yourCard'] and predicted_choices[i] == 0)):
            total_reward.append(last_reward + 0.5)
        else:
            total_reward.append(last_reward - 0.5)

    
   
       # confusion matrix:
    conf_matrix = confusion_matrix(choices, predicted_choices)
    TN, FP, FN, TP = conf_matrix.ravel()  # unpacking the confusion matrix
    # acc
    accuracy = (TP + TN) / (TP + TN + FP + FN)
    # precision: From the ones that we’ve announced them as up/down, which ones are really up/down?
    precision = TP / (TP + FP) if (TP + FP) != 0 else 0
    # recall or sensitivity : true positive rate
    sensitivity_recall = TP / (TP + FN) if (TP + FN) != 0 else 0
    # specificity : true negative rate
    specificity = TN / (TN + FP) if (TN + FP) != 0 else 0
    # f1 Score
    f1_score = 2 * (precision * sensitivity_recall) / (precision + sensitivity_recall) if (precision + sensitivity_recall) != 0 else 0

    
    # bayes information criterion:
    n_trials = len(df_all)
    k = 2  # number of free parameters: alpha and beta
    BIC = k * np.log(n_trials) - 2 * best_log_likelihood # this is BIC formula based on the log lkelihode I found before

    

    # Akaike  information criterion(AIC):
    AIC = 2 * k - 2 * best_log_likelihood 


    # mcFadden r-squared:
    p_null = np.mean(choices)  # probability of choosing "1" in the dataset
    log_likelihood_null = np.sum(choices * np.log(p_null) + (1 - choices) * np.log(1 - p_null))
    mcFadden_r2 = 1 - (best_log_likelihood / log_likelihood_null)

    # r-squared
    r2 = r2_score(choices, predicted_choices)
    
    # saving models evaluation variables:
    best_alpha_models.append(best_alpha)
    best_beta_models.append(best_beta)
    BIC_models.append(BIC)
    AIC_models.append(AIC)
    accuracy_models.append(accuracy)
    precision_models.append(precision)
    sensitivity_recall_models.append(sensitivity_recall)
    specificity_models.append(specificity)
    f1_score_models.append(f1_score)
    mcFadden_r2_models.append(mcFadden_r2)
    r2_models.append(r2)


    ###########################################################################################
    ## visulization
    ###########################################################################################

    fig, axes = plt.subplots(1, 3, figsize=(19, 6))

    plots_smooth_level = 20


#############################################
    # Density Plot (KDE)
    sns.kdeplot(
        x=results_df["alpha"], 
        y=results_df["beta"], 
        fill=True, 
        cmap="viridis", 
        ax=axes[0], 
        bw_adjust=1.8,  # Increase for smoother density
        levels=plots_smooth_level,  # More contour levels
        thresh=0  # Ensure density is plotted across all values
    )
    mappable = axes[0].collections[0]
    cbar = fig.colorbar(mappable, ax=axes[0], label="density", fraction=0.046, pad=0.04)  
    cbar.ax.yaxis.set_major_formatter(mticker.FormatStrFormatter('%.2f'))  # 2 decimal places
    cbar.ax.set_ylabel("density", fontsize=12, fontweight='bold')
    cbar.ax.tick_params(labelsize=12)

    axes[0].set_xlim(alpha_min, alpha_max)
    axes[0].set_ylim(beta_min, beta_max)
    axes[0].set_xlabel("learning rate (α)", fontsize=14, fontweight='bold')
    axes[0].set_ylabel("epsilon (ε)", fontsize=14, fontweight='bold')
    axes[0].set_title("density of α and ε joint probability", fontsize=16, fontweight='bold')
    axes[0].tick_params(axis='both', labelsize=14)
#############################################


#############################################
    # Log Likelihood 

    
    alpha_step = 0.1
    beta_step = 0.1
    alpha_bins = np.arange(alpha_min, alpha_max+ alpha_step, alpha_step)  # bins from 0 to 1 with step 0.1
    beta_bins = np.arange(beta_min, beta_max + beta_step, beta_step)       # bins from 0 to 1 with step 0.1

    results_df["alpha_binned"] = pd.cut(results_df["alpha"], bins=alpha_bins, labels=alpha_bins[:-1], include_lowest=True)
    results_df["beta_binned"] = pd.cut(results_df["beta"], bins=beta_bins, labels=beta_bins[:-1], include_lowest=True)

    heatmap_data = results_df.groupby(
    ["beta_binned", "alpha_binned"], observed=False)["log_likelihood"].mean().unstack()

    heatmap_data.index = heatmap_data.index.astype(float)
    heatmap_data.columns = heatmap_data.columns.astype(float)

    sns.heatmap(
        heatmap_data, 
        cmap="Blues", 
        cbar=True,
        ax=axes[1]
    )
    axes[1].set_xticks(np.arange(len(heatmap_data.columns)))  
    axes[1].set_xticklabels([f"{x:.1f}" for x in heatmap_data.columns], rotation=45)

    axes[1].set_yticks(np.arange(len(heatmap_data.index))) 
    axes[1].set_yticklabels([f"{x:.1f}" for x in heatmap_data.index]) 


    axes[1].set_xlabel("learning rate (α)", fontsize=14, fontweight='bold')
    axes[1].set_ylabel("epsilon (ε)", fontsize=14, fontweight='bold')
    axes[1].set_title("log likelihood for combinations of α and ε", fontsize=16, fontweight='bold')
    axes[1].tick_params(axis='both', labelsize=14)
    axes[1].invert_yaxis()


    

#############################################


#############################################
    # Confusion Matrix
    heatmap_cmap_color = mcolors.LinearSegmentedColormap.from_list("warm_red", ["#fff5e6", "#ff5733"])
    sns.heatmap(
        conf_matrix, annot=True, fmt="d", cmap=heatmap_cmap_color,
        xticklabels=["arrowdown", "arrowup"], 
        yticklabels=["arrowdown", "arrowup"], 
        ax=axes[2], 
        cbar=False
    )

    axes[2].set_xlabel("prediction", fontsize=14, fontweight='bold')
    axes[2].set_ylabel("true label", fontsize=14, fontweight='bold')
    axes[2].set_title(f"confusion matrix (α={best_alpha:.2f}, ε={best_beta:.2f})", fontsize=16, fontweight='bold')
    axes[2].tick_params(axis='both', labelsize=14)


#############################################
    # saving figures
    plt.tight_layout(rect=[0, 0, 1, 0.9]) 
    fig.suptitle(f'participant {idx}', fontsize=18, fontweight='bold', y=0.95)

    filename = os.path.join(output_dir_plots, f"plot_{participants[idx]}.pdf")
    plt.savefig(filename, format='pdf')
    plt.close(fig)

    print(f"saved: {filename}")


#############################################

    # saving model behavior
    q_values_reshaped = [q_values[i].tolist() for i in range(n_trials)]  # convert each (9,3,2) array into a list format

    df_model_behavior = pd.DataFrame({
        "model_choices": predicted_choices,
        "participant_choices": choices,
        "model_total_reward": total_reward,
        "participant_total_reward": df_all["totalReward"],
        "q_val": q_values_reshaped  
    })

    file_path = os.path.join(output_dir_model_behavior, f"model_behavior_{participants[idx]}.csv")
    df_model_behavior.to_csv(file_path, index=False)



[I 2025-06-13 12:46:26,030] A new study created in memory with name: no-name-b52ed6be-cf21-4972-ae51-8b16a5f9f45f
[I 2025-06-13 12:46:27,391] Trial 5 finished with value: 164.08692236022662 and parameters: {'alpha': 0.608232593540759, 'beta': 0.6003523217679522}. Best is trial 5 with value: 164.08692236022662.
[I 2025-06-13 12:46:27,395] Trial 3 finished with value: 156.9022305888555 and parameters: {'alpha': 0.17792790791659094, 'beta': 0.5774631461047962}. Best is trial 3 with value: 156.9022305888555.
[I 2025-06-13 12:46:27,407] Trial 0 finished with value: 166.69442407468864 and parameters: {'alpha': 0.7036332363622122, 'beta': 0.724424035264312}. Best is trial 3 with value: 156.9022305888555.
[I 2025-06-13 12:46:27,415] Trial 1 finished with value: 177.33547736933107 and parameters: {'alpha': 0.09762476098829072, 'beta': 0.23275971014652488}. Best is trial 3 with value: 156.9022305888555.
[I 2025-06-13 12:46:27,419] Trial 6 finished with value: 380.0335885404398 and parameters: {'

saved: 13_RL_agent_TDlearn_output\plots\plot_06_06_2025_16_43_26.pdf


[I 2025-06-13 12:46:51,966] Trial 5 finished with value: 146.85868999617128 and parameters: {'alpha': 0.36316618120670485, 'beta': 0.3433048631506553}. Best is trial 5 with value: 146.85868999617128.
[I 2025-06-13 12:46:51,979] Trial 1 finished with value: 145.80280001327742 and parameters: {'alpha': 0.35762996303978434, 'beta': 0.36687992150810644}. Best is trial 1 with value: 145.80280001327742.
[I 2025-06-13 12:46:51,994] Trial 8 finished with value: 185.96615197464806 and parameters: {'alpha': 0.7627553359563275, 'beta': 0.9913503064776495}. Best is trial 1 with value: 145.80280001327742.
[I 2025-06-13 12:46:51,998] Trial 9 finished with value: 219.63564380167793 and parameters: {'alpha': 0.49222400759953944, 'beta': 0.07716848472287129}. Best is trial 1 with value: 145.80280001327742.
[I 2025-06-13 12:46:52,003] Trial 2 finished with value: 153.21335042039473 and parameters: {'alpha': 0.7152076816293412, 'beta': 0.36789492870275453}. Best is trial 1 with value: 145.80280001327742.

saved: 13_RL_agent_TDlearn_output\plots\plot_07_04_2025_22_51_04.pdf


[I 2025-06-13 12:47:15,026] Trial 2 finished with value: 162.37700257557486 and parameters: {'alpha': 0.34006191873090547, 'beta': 0.8012598752562351}. Best is trial 2 with value: 162.37700257557486.
[I 2025-06-13 12:47:15,047] Trial 3 finished with value: 172.57941077959435 and parameters: {'alpha': 0.4587860197805585, 'beta': 0.8911039937600885}. Best is trial 2 with value: 162.37700257557486.
[I 2025-06-13 12:47:15,066] Trial 5 finished with value: 188.77146606025363 and parameters: {'alpha': 0.018571806056087284, 'beta': 0.09753222852976898}. Best is trial 2 with value: 162.37700257557486.
[I 2025-06-13 12:47:15,081] Trial 0 finished with value: 151.64402631672436 and parameters: {'alpha': 0.5015566241092272, 'beta': 0.3422878940581447}. Best is trial 0 with value: 151.64402631672436.
[I 2025-06-13 12:47:15,087] Trial 15 finished with value: 180.9614353031157 and parameters: {'alpha': 0.32762848468634587, 'beta': 0.9571141651277886}. Best is trial 0 with value: 151.64402631672436.


saved: 13_RL_agent_TDlearn_output\plots\plot_07_11_2024_17_23_43.pdf


[I 2025-06-13 12:47:40,259] Trial 0 finished with value: 179.42037598517092 and parameters: {'alpha': 0.7297668817820707, 'beta': 0.22232532888170037}. Best is trial 0 with value: 179.42037598517092.
[I 2025-06-13 12:47:40,276] Trial 23 finished with value: 157.91775311609763 and parameters: {'alpha': 0.5043740904752991, 'beta': 0.6649213284405572}. Best is trial 23 with value: 157.91775311609763.
[I 2025-06-13 12:47:40,306] Trial 6 finished with value: 196.9739743500895 and parameters: {'alpha': 0.5846326132521503, 'beta': 0.15689952398164494}. Best is trial 23 with value: 157.91775311609763.
[I 2025-06-13 12:47:40,310] Trial 16 finished with value: 143.30366064612596 and parameters: {'alpha': 0.2286096243563051, 'beta': 0.665188218721531}. Best is trial 16 with value: 143.30366064612596.
[I 2025-06-13 12:47:40,429] Trial 9 finished with value: 163.0580667942054 and parameters: {'alpha': 0.9202284426728066, 'beta': 0.7482657981763003}. Best is trial 16 with value: 143.30366064612596.


saved: 13_RL_agent_TDlearn_output\plots\plot_08_11_2024_13_03_29.pdf


[I 2025-06-13 12:48:07,858] Trial 10 finished with value: 171.17281062401662 and parameters: {'alpha': 0.640041525167011, 'beta': 0.5219389683069255}. Best is trial 10 with value: 171.17281062401662.
[I 2025-06-13 12:48:07,876] Trial 12 finished with value: 300.4061454310454 and parameters: {'alpha': 0.7652616079189349, 'beta': 0.06528855447306448}. Best is trial 10 with value: 171.17281062401662.
[I 2025-06-13 12:48:07,912] Trial 0 finished with value: 173.5925477329816 and parameters: {'alpha': 0.8870091010242642, 'beta': 0.473880557759331}. Best is trial 10 with value: 171.17281062401662.
[I 2025-06-13 12:48:07,915] Trial 3 finished with value: 443.5793418367137 and parameters: {'alpha': 0.6766271074539341, 'beta': 0.011652314814928278}. Best is trial 10 with value: 171.17281062401662.
[I 2025-06-13 12:48:07,915] Trial 5 finished with value: 232.1517084321062 and parameters: {'alpha': 0.6489369461083369, 'beta': 0.16092485829519426}. Best is trial 10 with value: 171.17281062401662.


saved: 13_RL_agent_TDlearn_output\plots\plot_10_06_2025_16_06_19.pdf


[I 2025-06-13 12:48:34,900] Trial 3 finished with value: 151.97620523703574 and parameters: {'alpha': 0.8952069744206042, 'beta': 0.33712401088709915}. Best is trial 3 with value: 151.97620523703574.
[I 2025-06-13 12:48:34,912] Trial 0 finished with value: 161.6839292308579 and parameters: {'alpha': 0.44313797519335485, 'beta': 0.13146223418744574}. Best is trial 3 with value: 151.97620523703574.
[I 2025-06-13 12:48:34,914] Trial 7 finished with value: 176.50705835131816 and parameters: {'alpha': 0.6575357990910545, 'beta': 0.16834023455361183}. Best is trial 3 with value: 151.97620523703574.
[I 2025-06-13 12:48:34,916] Trial 13 finished with value: 176.9609605424633 and parameters: {'alpha': 0.8510600587923418, 'beta': 0.16665712748526637}. Best is trial 3 with value: 151.97620523703574.
[I 2025-06-13 12:48:34,919] Trial 25 finished with value: 139.98017448587012 and parameters: {'alpha': 0.3556554978293945, 'beta': 0.6064469500031808}. Best is trial 25 with value: 139.98017448587012.

saved: 13_RL_agent_TDlearn_output\plots\plot_11_11_2024_16_46_44.pdf


[I 2025-06-13 12:49:02,547] Trial 4 finished with value: 175.87027724227391 and parameters: {'alpha': 0.22735392071719843, 'beta': 0.1990634287888198}. Best is trial 4 with value: 175.87027724227391.
[I 2025-06-13 12:49:02,786] Trial 11 finished with value: 160.7146161133864 and parameters: {'alpha': 0.12687576477245202, 'beta': 0.26343836502175205}. Best is trial 11 with value: 160.7146161133864.
[I 2025-06-13 12:49:02,964] Trial 3 finished with value: 244.56036401472974 and parameters: {'alpha': 0.6173646540614182, 'beta': 0.08600245559726453}. Best is trial 11 with value: 160.7146161133864.
[I 2025-06-13 12:49:02,985] Trial 1 finished with value: 169.29381490226334 and parameters: {'alpha': 0.6032125956763568, 'beta': 0.8136371047717124}. Best is trial 11 with value: 160.7146161133864.
[I 2025-06-13 12:49:03,008] Trial 10 finished with value: 283.68112862244243 and parameters: {'alpha': 0.8078556322213516, 'beta': 0.04853893995059719}. Best is trial 11 with value: 160.7146161133864.

saved: 13_RL_agent_TDlearn_output\plots\plot_12_11_2024_00_15_17.pdf


[I 2025-06-13 12:49:32,155] Trial 31 finished with value: 149.03854251976165 and parameters: {'alpha': 0.686912290424677, 'beta': 0.6044452870729459}. Best is trial 31 with value: 149.03854251976165.
[I 2025-06-13 12:49:32,174] Trial 7 finished with value: 148.58089635901516 and parameters: {'alpha': 0.46839937425070505, 'beta': 0.6268644246998817}. Best is trial 7 with value: 148.58089635901516.
[I 2025-06-13 12:49:32,213] Trial 25 finished with value: 205.35809040654186 and parameters: {'alpha': 0.9074983431166055, 'beta': 0.08417810081482181}. Best is trial 7 with value: 148.58089635901516.
[I 2025-06-13 12:49:32,217] Trial 1 finished with value: 224.56124825120463 and parameters: {'alpha': 0.050207821790426335, 'beta': 0.03910874417811334}. Best is trial 7 with value: 148.58089635901516.
[I 2025-06-13 12:49:32,220] Trial 17 finished with value: 147.52873355135682 and parameters: {'alpha': 0.6075990576562595, 'beta': 0.5679081483899164}. Best is trial 17 with value: 147.528733551356

saved: 13_RL_agent_TDlearn_output\plots\plot_13_11_2024_10_46_21.pdf


[I 2025-06-13 12:50:01,158] Trial 22 finished with value: 172.57217886967516 and parameters: {'alpha': 0.755298187167205, 'beta': 0.7576427859926228}. Best is trial 22 with value: 172.57217886967516.
[I 2025-06-13 12:50:01,177] Trial 30 finished with value: 168.56213911532902 and parameters: {'alpha': 0.33017049653002445, 'beta': 0.5811125538685363}. Best is trial 30 with value: 168.56213911532902.
[I 2025-06-13 12:50:01,186] Trial 20 finished with value: 182.46560006601726 and parameters: {'alpha': 0.551658522001207, 'beta': 0.9446376726752832}. Best is trial 30 with value: 168.56213911532902.
[I 2025-06-13 12:50:01,187] Trial 0 finished with value: 170.31852515251515 and parameters: {'alpha': 0.31338702509297645, 'beta': 0.5171100203465847}. Best is trial 30 with value: 168.56213911532902.
[I 2025-06-13 12:50:01,208] Trial 28 finished with value: 171.1765881199052 and parameters: {'alpha': 0.695020193781371, 'beta': 0.6480628686819019}. Best is trial 30 with value: 168.56213911532902

saved: 13_RL_agent_TDlearn_output\plots\plot_13_11_2024_14_45_52.pdf


[I 2025-06-13 12:50:28,309] Trial 6 finished with value: 252.44849013574918 and parameters: {'alpha': 0.47901310176769785, 'beta': 0.05916254563132128}. Best is trial 6 with value: 252.44849013574918.
[I 2025-06-13 12:50:28,631] Trial 28 finished with value: 179.24983920973494 and parameters: {'alpha': 0.8170183754477901, 'beta': 0.932514349044386}. Best is trial 28 with value: 179.24983920973494.
[I 2025-06-13 12:50:28,926] Trial 2 finished with value: 167.24597111612763 and parameters: {'alpha': 0.3304495897298673, 'beta': 0.8444449328405307}. Best is trial 2 with value: 167.24597111612763.
[I 2025-06-13 12:50:28,935] Trial 0 finished with value: 168.20549005808726 and parameters: {'alpha': 0.6041955382056365, 'beta': 0.8141860074308008}. Best is trial 2 with value: 167.24597111612763.
[I 2025-06-13 12:50:28,986] Trial 7 finished with value: 169.72834110663243 and parameters: {'alpha': 0.04983560040691169, 'beta': 0.25372957871592317}. Best is trial 2 with value: 167.24597111612763.


saved: 13_RL_agent_TDlearn_output\plots\plot_14_11_2024_21_46_47.pdf


[I 2025-06-13 12:50:56,582] Trial 8 finished with value: 205.986140581406 and parameters: {'alpha': 0.5092559785885612, 'beta': 0.1728262274732496}. Best is trial 8 with value: 205.986140581406.
[I 2025-06-13 12:50:56,633] Trial 22 finished with value: 169.6301519038359 and parameters: {'alpha': 0.6513843168173983, 'beta': 0.8074232399651826}. Best is trial 22 with value: 169.6301519038359.
[I 2025-06-13 12:50:56,659] Trial 29 finished with value: 156.58963880835572 and parameters: {'alpha': 0.09393803522095967, 'beta': 0.28289313007324285}. Best is trial 29 with value: 156.58963880835572.
[I 2025-06-13 12:50:56,768] Trial 14 finished with value: 334.2092727506788 and parameters: {'alpha': 0.4569021827452382, 'beta': 0.022523485556073065}. Best is trial 29 with value: 156.58963880835572.
[I 2025-06-13 12:50:56,769] Trial 3 finished with value: 164.59715370640222 and parameters: {'alpha': 0.5241510730672841, 'beta': 0.43890817072945465}. Best is trial 29 with value: 156.58963880835572.


saved: 13_RL_agent_TDlearn_output\plots\plot_15_11_2024_11_43_48.pdf


[I 2025-06-13 12:51:24,002] Trial 20 finished with value: 160.17464170706492 and parameters: {'alpha': 0.8904130048124291, 'beta': 0.4545065423858834}. Best is trial 20 with value: 160.17464170706492.
[I 2025-06-13 12:51:24,245] Trial 16 finished with value: 161.78179620627688 and parameters: {'alpha': 0.9902645906215025, 'beta': 0.6923115323423475}. Best is trial 20 with value: 160.17464170706492.
[I 2025-06-13 12:51:24,445] Trial 30 finished with value: 155.23562627263456 and parameters: {'alpha': 0.2783377285617124, 'beta': 0.6152559397734718}. Best is trial 30 with value: 155.23562627263456.
[I 2025-06-13 12:51:24,513] Trial 29 finished with value: 204.94924980519573 and parameters: {'alpha': 0.5101724002841558, 'beta': 0.15531017657127602}. Best is trial 30 with value: 155.23562627263456.
[I 2025-06-13 12:51:24,514] Trial 1 finished with value: 151.35701943643352 and parameters: {'alpha': 0.3878634899135498, 'beta': 0.4756874151669807}. Best is trial 1 with value: 151.357019436433

saved: 13_RL_agent_TDlearn_output\plots\plot_17_11_2024_15_25_39.pdf


[I 2025-06-13 12:51:51,780] Trial 11 finished with value: 284.9880534393016 and parameters: {'alpha': 0.1966608592679761, 'beta': 0.0610820111246216}. Best is trial 11 with value: 284.9880534393016.
[I 2025-06-13 12:51:51,998] Trial 3 finished with value: 170.06899480862626 and parameters: {'alpha': 0.1427222011319972, 'beta': 0.8018848782068422}. Best is trial 3 with value: 170.06899480862626.
[I 2025-06-13 12:51:52,090] Trial 13 finished with value: 168.68167213608305 and parameters: {'alpha': 0.943264408805157, 'beta': 0.7147556123775842}. Best is trial 13 with value: 168.68167213608305.
[I 2025-06-13 12:51:52,149] Trial 16 finished with value: 165.1137358334797 and parameters: {'alpha': 0.16585767024030473, 'beta': 0.7135747053037197}. Best is trial 16 with value: 165.1137358334797.
[I 2025-06-13 12:51:52,183] Trial 31 finished with value: 168.19676098695157 and parameters: {'alpha': 0.74845915424837, 'beta': 0.6949048313496593}. Best is trial 16 with value: 165.1137358334797.
[I 2

saved: 13_RL_agent_TDlearn_output\plots\plot_17_11_2024_23_57_47.pdf


[I 2025-06-13 12:52:20,834] Trial 14 finished with value: 150.48956233553838 and parameters: {'alpha': 0.6968368585134624, 'beta': 0.6777253037294231}. Best is trial 14 with value: 150.48956233553838.
[I 2025-06-13 12:52:20,877] Trial 24 finished with value: 173.4573016083424 and parameters: {'alpha': 0.9998972677482174, 'beta': 0.12799683381695298}. Best is trial 14 with value: 150.48956233553838.
[I 2025-06-13 12:52:20,880] Trial 2 finished with value: 159.1476740130329 and parameters: {'alpha': 0.034513319028637586, 'beta': 0.7298073049092201}. Best is trial 14 with value: 150.48956233553838.
[I 2025-06-13 12:52:20,883] Trial 6 finished with value: 166.52273789961203 and parameters: {'alpha': 0.14132793011089456, 'beta': 0.8377369804655911}. Best is trial 14 with value: 150.48956233553838.
[I 2025-06-13 12:52:20,886] Trial 5 finished with value: 156.5781966373727 and parameters: {'alpha': 0.19969214433156413, 'beta': 0.22215452462625082}. Best is trial 14 with value: 150.48956233553

saved: 13_RL_agent_TDlearn_output\plots\plot_18_03_2025_13_12_31.pdf


[I 2025-06-13 12:52:49,358] Trial 8 finished with value: 174.38572272597662 and parameters: {'alpha': 0.6107976212487048, 'beta': 0.8645550393177112}. Best is trial 8 with value: 174.38572272597662.
[I 2025-06-13 12:52:49,361] Trial 2 finished with value: 158.22703685869863 and parameters: {'alpha': 0.07352561025415469, 'beta': 0.6852541161903222}. Best is trial 2 with value: 158.22703685869863.
[I 2025-06-13 12:52:49,362] Trial 7 finished with value: 165.37330216014723 and parameters: {'alpha': 0.477898330325086, 'beta': 0.4482970067787679}. Best is trial 2 with value: 158.22703685869863.
[I 2025-06-13 12:52:49,385] Trial 23 finished with value: 164.59464060521321 and parameters: {'alpha': 0.4279173208791252, 'beta': 0.4389566934166686}. Best is trial 2 with value: 158.22703685869863.
[I 2025-06-13 12:52:49,391] Trial 0 finished with value: 168.8449608830469 and parameters: {'alpha': 0.8471812086010364, 'beta': 0.7808538109167965}. Best is trial 2 with value: 158.22703685869863.
[I 20

saved: 13_RL_agent_TDlearn_output\plots\plot_18_03_2025_20_59_56.pdf


[I 2025-06-13 12:53:16,747] Trial 1 finished with value: 185.27577070866806 and parameters: {'alpha': 0.4114341630483652, 'beta': 0.21473015468647938}. Best is trial 1 with value: 185.27577070866806.
[I 2025-06-13 12:53:16,773] Trial 4 finished with value: 158.54277550347157 and parameters: {'alpha': 0.37312874048190187, 'beta': 0.46799346436977945}. Best is trial 4 with value: 158.54277550347157.
[I 2025-06-13 12:53:16,794] Trial 31 finished with value: 233.62536236877526 and parameters: {'alpha': 0.48687855065094177, 'beta': 0.11105161449458342}. Best is trial 4 with value: 158.54277550347157.
[I 2025-06-13 12:53:16,927] Trial 10 finished with value: 160.33397136477345 and parameters: {'alpha': 0.08563453385475449, 'beta': 0.6885628474183457}. Best is trial 4 with value: 158.54277550347157.
[I 2025-06-13 12:53:16,966] Trial 0 finished with value: 175.71913024710773 and parameters: {'alpha': 0.5357542297142457, 'beta': 0.8869849828198879}. Best is trial 4 with value: 158.5427755034715

saved: 13_RL_agent_TDlearn_output\plots\plot_18_11_2024_13_31_43.pdf


[I 2025-06-13 12:53:44,023] Trial 31 finished with value: 173.39562466717197 and parameters: {'alpha': 0.7403453931495094, 'beta': 0.8688118132868542}. Best is trial 31 with value: 173.39562466717197.
[I 2025-06-13 12:53:44,035] Trial 13 finished with value: 183.1394328484198 and parameters: {'alpha': 0.9051438890685694, 'beta': 0.965850734456434}. Best is trial 31 with value: 173.39562466717197.
[I 2025-06-13 12:53:44,101] Trial 0 finished with value: 155.03756885281007 and parameters: {'alpha': 0.2819642044811079, 'beta': 0.4653144076600203}. Best is trial 0 with value: 155.03756885281007.
[I 2025-06-13 12:53:44,208] Trial 10 finished with value: 166.95319831722077 and parameters: {'alpha': 0.910992398564123, 'beta': 0.34713103307408383}. Best is trial 0 with value: 155.03756885281007.
[I 2025-06-13 12:53:44,271] Trial 20 finished with value: 173.01435499739532 and parameters: {'alpha': 0.6034381529546778, 'beta': 0.8644568582976561}. Best is trial 0 with value: 155.03756885281007.
[

saved: 13_RL_agent_TDlearn_output\plots\plot_18_11_2024_15_43_17.pdf


[I 2025-06-13 12:54:12,277] Trial 0 finished with value: 167.62360616447552 and parameters: {'alpha': 0.1045303556959125, 'beta': 0.12122123280073994}. Best is trial 0 with value: 167.62360616447552.
[I 2025-06-13 12:54:12,739] Trial 11 finished with value: 163.66647687263702 and parameters: {'alpha': 0.9665782435978387, 'beta': 0.731419787483023}. Best is trial 11 with value: 163.66647687263702.
[I 2025-06-13 12:54:12,822] Trial 3 finished with value: 160.44447600366016 and parameters: {'alpha': 0.1642495258873471, 'beta': 0.7815277726959994}. Best is trial 3 with value: 160.44447600366016.
[I 2025-06-13 12:54:12,831] Trial 13 finished with value: 165.58408677114966 and parameters: {'alpha': 0.8678407549972034, 'beta': 0.362432238965621}. Best is trial 3 with value: 160.44447600366016.
[I 2025-06-13 12:54:12,853] Trial 1 finished with value: 173.56006321184358 and parameters: {'alpha': 0.8776681535849438, 'beta': 0.2902825636974068}. Best is trial 3 with value: 160.44447600366016.
[I 

saved: 13_RL_agent_TDlearn_output\plots\plot_19_11_2024_14_28_20.pdf


[I 2025-06-13 12:54:42,304] Trial 2 finished with value: 152.80236318057496 and parameters: {'alpha': 0.8322593645702342, 'beta': 0.6152667690797955}. Best is trial 2 with value: 152.80236318057496.
[I 2025-06-13 12:54:42,689] Trial 25 finished with value: 151.8008933723684 and parameters: {'alpha': 0.7149271275527286, 'beta': 0.39850256629467107}. Best is trial 25 with value: 151.8008933723684.
[I 2025-06-13 12:54:42,716] Trial 19 finished with value: 177.4829945289053 and parameters: {'alpha': 0.3105776331380987, 'beta': 0.9334346048730575}. Best is trial 25 with value: 151.8008933723684.
[I 2025-06-13 12:54:42,790] Trial 26 finished with value: 153.43549520715578 and parameters: {'alpha': 0.7797432849388993, 'beta': 0.6302898634021183}. Best is trial 25 with value: 151.8008933723684.
[I 2025-06-13 12:54:42,835] Trial 6 finished with value: 150.21198250350716 and parameters: {'alpha': 0.9710155301154613, 'beta': 0.4721932514238163}. Best is trial 6 with value: 150.21198250350716.
[I 

saved: 13_RL_agent_TDlearn_output\plots\plot_19_11_2024_17_03_01.pdf


[I 2025-06-13 12:55:11,516] Trial 24 finished with value: 148.00248662030796 and parameters: {'alpha': 0.44452829707200775, 'beta': 0.3514799741907805}. Best is trial 24 with value: 148.00248662030796.
[I 2025-06-13 12:55:11,536] Trial 19 finished with value: 129.39659971834772 and parameters: {'alpha': 0.2610569639176861, 'beta': 0.3605478761763876}. Best is trial 19 with value: 129.39659971834772.
[I 2025-06-13 12:55:11,567] Trial 26 finished with value: 151.1738077091009 and parameters: {'alpha': 0.8985458397523072, 'beta': 0.3793628200020936}. Best is trial 19 with value: 129.39659971834772.
[I 2025-06-13 12:55:11,567] Trial 25 finished with value: 156.77163729674336 and parameters: {'alpha': 0.10248003030570878, 'beta': 0.8045746057299775}. Best is trial 19 with value: 129.39659971834772.
[I 2025-06-13 12:55:11,594] Trial 20 finished with value: 160.15402806375675 and parameters: {'alpha': 0.7603709193752506, 'beta': 0.2678704444857281}. Best is trial 19 with value: 129.3965997183

saved: 13_RL_agent_TDlearn_output\plots\plot_19_11_2024_19_42_32.pdf


[I 2025-06-13 12:55:40,470] Trial 12 finished with value: 175.75145866600573 and parameters: {'alpha': 0.8832652416922568, 'beta': 0.8898992136730232}. Best is trial 12 with value: 175.75145866600573.
[I 2025-06-13 12:55:40,497] Trial 7 finished with value: 204.9625169340141 and parameters: {'alpha': 0.18844011315353995, 'beta': 0.12347923123003535}. Best is trial 12 with value: 175.75145866600573.
[I 2025-06-13 12:55:40,679] Trial 0 finished with value: 233.3630663542189 and parameters: {'alpha': 0.09450621652541326, 'beta': 0.10206202589619344}. Best is trial 12 with value: 175.75145866600573.
[I 2025-06-13 12:55:40,751] Trial 2 finished with value: 182.17208547906807 and parameters: {'alpha': 0.4759966100383237, 'beta': 0.2389012173234747}. Best is trial 12 with value: 175.75145866600573.
[I 2025-06-13 12:55:40,864] Trial 4 finished with value: 167.4157715098209 and parameters: {'alpha': 0.0921385913968296, 'beta': 0.7863169728054257}. Best is trial 4 with value: 167.4157715098209.


saved: 13_RL_agent_TDlearn_output\plots\plot_20_11_2024_09_23_29.pdf


[I 2025-06-13 12:56:10,009] Trial 9 finished with value: 128.50922034728592 and parameters: {'alpha': 0.1816249616532401, 'beta': 0.5051479950130213}. Best is trial 9 with value: 128.50922034728592.
[I 2025-06-13 12:56:10,272] Trial 7 finished with value: 126.0009665188341 and parameters: {'alpha': 0.07450003044135968, 'beta': 0.2426214306681837}. Best is trial 7 with value: 126.0009665188341.
[I 2025-06-13 12:56:10,388] Trial 3 finished with value: 185.0152111434046 and parameters: {'alpha': 0.6903224612911515, 'beta': 0.19770880069464875}. Best is trial 7 with value: 126.0009665188341.
[I 2025-06-13 12:56:10,547] Trial 5 finished with value: 239.90993269403123 and parameters: {'alpha': 0.26267890170226327, 'beta': 0.013945268120598108}. Best is trial 7 with value: 126.0009665188341.
[I 2025-06-13 12:56:10,554] Trial 18 finished with value: 155.62636655760735 and parameters: {'alpha': 0.8523163506593807, 'beta': 0.5059906196681636}. Best is trial 7 with value: 126.0009665188341.
[I 20

saved: 13_RL_agent_TDlearn_output\plots\plot_20_11_2024_14_51_17.pdf


[I 2025-06-13 12:56:38,734] Trial 2 finished with value: 173.3239519670441 and parameters: {'alpha': 0.09721278203440151, 'beta': 0.8789039666408115}. Best is trial 2 with value: 173.3239519670441.
[I 2025-06-13 12:56:39,181] Trial 3 finished with value: 181.52334804338437 and parameters: {'alpha': 0.29737320904317777, 'beta': 0.9560964724740977}. Best is trial 2 with value: 173.3239519670441.
[I 2025-06-13 12:56:39,185] Trial 5 finished with value: 334.14283791351835 and parameters: {'alpha': 0.1337127246043191, 'beta': 0.016149187776782}. Best is trial 2 with value: 173.3239519670441.
[I 2025-06-13 12:56:39,189] Trial 27 finished with value: 197.7863029206706 and parameters: {'alpha': 0.3924472563456813, 'beta': 0.1340768645302308}. Best is trial 2 with value: 173.3239519670441.
[I 2025-06-13 12:56:39,197] Trial 12 finished with value: 156.01062311959996 and parameters: {'alpha': 0.818055787113884, 'beta': 0.6140673180113936}. Best is trial 12 with value: 156.01062311959996.
[I 2025-

saved: 13_RL_agent_TDlearn_output\plots\plot_20_11_2024_15_14_56.pdf


[I 2025-06-13 12:57:08,028] Trial 9 finished with value: 158.1172376619834 and parameters: {'alpha': 0.693303491041044, 'beta': 0.6311011458296694}. Best is trial 9 with value: 158.1172376619834.
[I 2025-06-13 12:57:08,069] Trial 12 finished with value: 186.8684794785753 and parameters: {'alpha': 0.018364245316495754, 'beta': 0.997864587547571}. Best is trial 9 with value: 158.1172376619834.
[I 2025-06-13 12:57:08,084] Trial 16 finished with value: 152.79725107798788 and parameters: {'alpha': 0.16347704641359592, 'beta': 0.30418692252214397}. Best is trial 16 with value: 152.79725107798788.
[I 2025-06-13 12:57:08,359] Trial 4 finished with value: 321.98693399511995 and parameters: {'alpha': 0.590724296863869, 'beta': 0.02360629571813224}. Best is trial 16 with value: 152.79725107798788.
[I 2025-06-13 12:57:08,499] Trial 3 finished with value: 155.32088119477203 and parameters: {'alpha': 0.08565426761753976, 'beta': 0.24553384577163775}. Best is trial 16 with value: 152.79725107798788.


saved: 13_RL_agent_TDlearn_output\plots\plot_20_11_2024_15_41_35.pdf


[I 2025-06-13 12:57:36,934] Trial 21 finished with value: 213.81108828102657 and parameters: {'alpha': 0.244183333527593, 'beta': 0.25482070682563507}. Best is trial 21 with value: 213.81108828102657.
[I 2025-06-13 12:57:37,274] Trial 10 finished with value: 179.11236527354137 and parameters: {'alpha': 0.7447830255734978, 'beta': 0.4783302501051562}. Best is trial 10 with value: 179.11236527354137.
[I 2025-06-13 12:57:37,352] Trial 0 finished with value: 181.06076165965555 and parameters: {'alpha': 0.6030834011195041, 'beta': 0.4535607286378225}. Best is trial 10 with value: 179.11236527354137.
[I 2025-06-13 12:57:37,414] Trial 7 finished with value: 173.12463966314246 and parameters: {'alpha': 0.6137035753273374, 'beta': 0.7369644158282999}. Best is trial 7 with value: 173.12463966314246.
[I 2025-06-13 12:57:37,424] Trial 6 finished with value: 192.99573029605256 and parameters: {'alpha': 0.9431276390905706, 'beta': 0.3503239785629846}. Best is trial 7 with value: 173.12463966314246.


saved: 13_RL_agent_TDlearn_output\plots\plot_20_11_2024_16_58_23.pdf


[I 2025-06-13 12:58:04,916] Trial 22 finished with value: 210.05647610794574 and parameters: {'alpha': 0.9335176274787922, 'beta': 0.17467183643588371}. Best is trial 22 with value: 210.05647610794574.
[I 2025-06-13 12:58:05,154] Trial 23 finished with value: 166.5263023060229 and parameters: {'alpha': 0.2589587596020568, 'beta': 0.8242924316632652}. Best is trial 23 with value: 166.5263023060229.
[I 2025-06-13 12:58:05,180] Trial 13 finished with value: 185.96089518608787 and parameters: {'alpha': 0.4966683678392494, 'beta': 0.2595506023881947}. Best is trial 23 with value: 166.5263023060229.
[I 2025-06-13 12:58:05,200] Trial 3 finished with value: 153.61817369279342 and parameters: {'alpha': 0.4035531993249629, 'beta': 0.481301805979447}. Best is trial 3 with value: 153.61817369279342.
[I 2025-06-13 12:58:05,203] Trial 6 finished with value: 163.80207843596668 and parameters: {'alpha': 0.7093154956807065, 'beta': 0.5258735573955202}. Best is trial 3 with value: 153.61817369279342.
[I

saved: 13_RL_agent_TDlearn_output\plots\plot_22_03_2025_00_10_37.pdf


[I 2025-06-13 12:58:32,453] Trial 0 finished with value: 125.01431606976521 and parameters: {'alpha': 0.38113537317160584, 'beta': 0.37771548447142295}. Best is trial 0 with value: 125.01431606976521.
[I 2025-06-13 12:58:32,468] Trial 7 finished with value: 132.32357901937706 and parameters: {'alpha': 0.42482176040768715, 'beta': 0.5337388220941658}. Best is trial 0 with value: 125.01431606976521.
[I 2025-06-13 12:58:32,903] Trial 27 finished with value: 141.84452967615073 and parameters: {'alpha': 0.6928481700354967, 'beta': 0.22366919829877427}. Best is trial 0 with value: 125.01431606976521.
[I 2025-06-13 12:58:32,969] Trial 1 finished with value: 186.82352880556394 and parameters: {'alpha': 0.23768103347262054, 'beta': 0.9982439467034916}. Best is trial 0 with value: 125.01431606976521.
[I 2025-06-13 12:58:33,025] Trial 11 finished with value: 143.9626673692736 and parameters: {'alpha': 0.6606092745653488, 'beta': 0.20670291998958068}. Best is trial 0 with value: 125.01431606976521

saved: 13_RL_agent_TDlearn_output\plots\plot_22_11_2024_12_34_30.pdf


[I 2025-06-13 12:59:01,075] Trial 15 finished with value: 164.82606340001877 and parameters: {'alpha': 0.8890701377900894, 'beta': 0.7603082635820007}. Best is trial 15 with value: 164.82606340001877.
[I 2025-06-13 12:59:01,085] Trial 3 finished with value: 154.9126020006105 and parameters: {'alpha': 0.41374315335891704, 'beta': 0.4719735446603905}. Best is trial 3 with value: 154.9126020006105.
[I 2025-06-13 12:59:01,107] Trial 5 finished with value: 172.58493307625847 and parameters: {'alpha': 0.028470152556177596, 'beta': 0.8445370837022885}. Best is trial 3 with value: 154.9126020006105.
[I 2025-06-13 12:59:01,272] Trial 7 finished with value: 171.28139611419627 and parameters: {'alpha': 0.8094335357975688, 'beta': 0.29404005474146705}. Best is trial 3 with value: 154.9126020006105.
[I 2025-06-13 12:59:01,502] Trial 26 finished with value: 154.68403791213174 and parameters: {'alpha': 0.34462125032757734, 'beta': 0.43103826641896115}. Best is trial 26 with value: 154.68403791213174.

saved: 13_RL_agent_TDlearn_output\plots\plot_22_11_2024_14_36_42.pdf


[I 2025-06-13 12:59:30,535] Trial 9 finished with value: 174.68780063470223 and parameters: {'alpha': 0.21112508090911547, 'beta': 0.17534260108524422}. Best is trial 9 with value: 174.68780063470223.
[I 2025-06-13 12:59:30,552] Trial 14 finished with value: 148.36955095658746 and parameters: {'alpha': 0.2410320333419147, 'beta': 0.42332516814739723}. Best is trial 14 with value: 148.36955095658746.
[I 2025-06-13 12:59:30,918] Trial 27 finished with value: 186.60135350250215 and parameters: {'alpha': 0.9204868631856135, 'beta': 0.13587099337894354}. Best is trial 14 with value: 148.36955095658746.
[I 2025-06-13 12:59:30,969] Trial 20 finished with value: 185.68760303256641 and parameters: {'alpha': 0.805509047456733, 'beta': 0.989600827103161}. Best is trial 14 with value: 148.36955095658746.
[I 2025-06-13 12:59:30,994] Trial 29 finished with value: 152.805626695507 and parameters: {'alpha': 0.07166091000479374, 'beta': 0.6482807685715662}. Best is trial 14 with value: 148.369550956587

saved: 13_RL_agent_TDlearn_output\plots\plot_22_11_2024_15_19_47.pdf


[I 2025-06-13 12:59:59,212] Trial 19 finished with value: 151.18473033358765 and parameters: {'alpha': 0.5652844292606315, 'beta': 0.5948645595958703}. Best is trial 19 with value: 151.18473033358765.
[I 2025-06-13 12:59:59,572] Trial 5 finished with value: 148.51744459924157 and parameters: {'alpha': 0.446967668515541, 'beta': 0.3731610819645256}. Best is trial 5 with value: 148.51744459924157.
[I 2025-06-13 12:59:59,651] Trial 4 finished with value: 148.68943269423806 and parameters: {'alpha': 0.4774370732456279, 'beta': 0.5967563578399494}. Best is trial 5 with value: 148.51744459924157.
[I 2025-06-13 12:59:59,654] Trial 7 finished with value: 153.15797051178842 and parameters: {'alpha': 0.7128162948371387, 'beta': 0.6406887878009219}. Best is trial 5 with value: 148.51744459924157.
[I 2025-06-13 12:59:59,723] Trial 0 finished with value: 172.4458126343547 and parameters: {'alpha': 0.43458498435425846, 'beta': 0.18458721698802674}. Best is trial 5 with value: 148.51744459924157.
[I 

saved: 13_RL_agent_TDlearn_output\plots\plot_25_11_2024_07_37_11.pdf


[I 2025-06-13 13:00:23,936] Trial 0 finished with value: 160.92831419902544 and parameters: {'alpha': 0.4603478496995097, 'beta': 0.5117881123811611}. Best is trial 0 with value: 160.92831419902544.
[I 2025-06-13 13:00:24,433] Trial 1 finished with value: 198.2788869980495 and parameters: {'alpha': 0.8324271599279294, 'beta': 0.19769618267485745}. Best is trial 0 with value: 160.92831419902544.
[I 2025-06-13 13:00:24,681] Trial 9 finished with value: 251.7804487879672 and parameters: {'alpha': 0.8491166760701424, 'beta': 0.08473565179807573}. Best is trial 0 with value: 160.92831419902544.
[I 2025-06-13 13:00:24,721] Trial 8 finished with value: 210.1469101354012 and parameters: {'alpha': 0.331136107894768, 'beta': 0.13592669303621518}. Best is trial 0 with value: 160.92831419902544.
[I 2025-06-13 13:00:24,736] Trial 19 finished with value: 161.74831594120573 and parameters: {'alpha': 0.4137415748106956, 'beta': 0.37345037596804875}. Best is trial 0 with value: 160.92831419902544.
[I 2

saved: 13_RL_agent_TDlearn_output\plots\plot_25_11_2024_12_11_10.pdf


[I 2025-06-13 13:00:49,830] Trial 18 finished with value: 173.46941285754212 and parameters: {'alpha': 0.4181607089138693, 'beta': 0.5405867596093176}. Best is trial 18 with value: 173.46941285754212.
[I 2025-06-13 13:00:49,842] Trial 29 finished with value: 178.48841367920494 and parameters: {'alpha': 0.01808021942495753, 'beta': 0.8839494057815466}. Best is trial 18 with value: 173.46941285754212.
[I 2025-06-13 13:00:49,877] Trial 16 finished with value: 170.14015277819428 and parameters: {'alpha': 0.6876024434935438, 'beta': 0.5519193428157699}. Best is trial 16 with value: 170.14015277819428.
[I 2025-06-13 13:00:49,882] Trial 2 finished with value: 167.03380933573874 and parameters: {'alpha': 0.28157605064415, 'beta': 0.5629500556541129}. Best is trial 2 with value: 167.03380933573874.
[I 2025-06-13 13:00:49,895] Trial 17 finished with value: 240.07427576045026 and parameters: {'alpha': 0.23031286573192314, 'beta': 0.11914448076868817}. Best is trial 2 with value: 167.0338093357387

saved: 13_RL_agent_TDlearn_output\plots\plot_25_11_2024_18_41_38.pdf


[I 2025-06-13 13:01:15,060] Trial 9 finished with value: 162.333824733788 and parameters: {'alpha': 0.6164785580903468, 'beta': 0.5661751793193033}. Best is trial 9 with value: 162.333824733788.
[I 2025-06-13 13:01:15,074] Trial 0 finished with value: 266.9565581347251 and parameters: {'alpha': 0.871454725074551, 'beta': 0.07136611078331875}. Best is trial 9 with value: 162.333824733788.
[I 2025-06-13 13:01:15,079] Trial 23 finished with value: 181.23632784009797 and parameters: {'alpha': 0.5809409176312086, 'beta': 0.9445364645369034}. Best is trial 9 with value: 162.333824733788.
[I 2025-06-13 13:01:15,165] Trial 27 finished with value: 168.8988861857534 and parameters: {'alpha': 0.9094966610637336, 'beta': 0.39325024150749965}. Best is trial 9 with value: 162.333824733788.
[I 2025-06-13 13:01:15,308] Trial 25 finished with value: 175.23744703451723 and parameters: {'alpha': 0.40210064023826486, 'beta': 0.27878230788812486}. Best is trial 9 with value: 162.333824733788.
[I 2025-06-13

saved: 13_RL_agent_TDlearn_output\plots\plot_25_11_2024_20_12_41.pdf


[I 2025-06-13 13:01:41,113] Trial 26 finished with value: 159.43031939165598 and parameters: {'alpha': 0.4215608067093995, 'beta': 0.667850658680143}. Best is trial 26 with value: 159.43031939165598.
[I 2025-06-13 13:01:41,137] Trial 11 finished with value: 163.56027335760712 and parameters: {'alpha': 0.36326274469990755, 'beta': 0.7698131486393501}. Best is trial 26 with value: 159.43031939165598.
[I 2025-06-13 13:01:41,146] Trial 24 finished with value: 270.7793938817998 and parameters: {'alpha': 0.05088998680760008, 'beta': 0.05264561108423572}. Best is trial 26 with value: 159.43031939165598.
[I 2025-06-13 13:01:41,149] Trial 0 finished with value: 158.16170100525034 and parameters: {'alpha': 0.7147847383534713, 'beta': 0.6538030450485293}. Best is trial 0 with value: 158.16170100525034.
[I 2025-06-13 13:01:41,153] Trial 6 finished with value: 157.74323870898428 and parameters: {'alpha': 0.458585533167871, 'beta': 0.45396834489238147}. Best is trial 6 with value: 157.74323870898428

saved: 13_RL_agent_TDlearn_output\plots\plot_26_03_2025_16_21_25.pdf


[I 2025-06-13 13:02:07,267] Trial 24 finished with value: 174.27797493841908 and parameters: {'alpha': 0.9730088021366697, 'beta': 0.6369147843116174}. Best is trial 24 with value: 174.27797493841908.
[I 2025-06-13 13:02:07,273] Trial 0 finished with value: 252.38850591253197 and parameters: {'alpha': 0.244724407810668, 'beta': 0.13065736008964088}. Best is trial 24 with value: 174.27797493841908.
[I 2025-06-13 13:02:07,476] Trial 6 finished with value: 171.5588065251593 and parameters: {'alpha': 0.05452054544138159, 'beta': 0.7111684164166702}. Best is trial 6 with value: 171.5588065251593.
[I 2025-06-13 13:02:07,590] Trial 23 finished with value: 177.8208368444505 and parameters: {'alpha': 0.5629770409272973, 'beta': 0.5333310720542522}. Best is trial 6 with value: 171.5588065251593.
[I 2025-06-13 13:02:07,618] Trial 3 finished with value: 297.95874568298 and parameters: {'alpha': 0.9838999862745985, 'beta': 0.08851750480245374}. Best is trial 6 with value: 171.5588065251593.
[I 2025

saved: 13_RL_agent_TDlearn_output\plots\plot_26_11_2024_10_53_23.pdf


[I 2025-06-13 13:02:34,128] Trial 10 finished with value: 145.8592370200203 and parameters: {'alpha': 0.9236770697365048, 'beta': 0.14211214457350377}. Best is trial 10 with value: 145.8592370200203.
[I 2025-06-13 13:02:34,291] Trial 4 finished with value: 125.3055677270456 and parameters: {'alpha': 0.2702079136081482, 'beta': 0.25480366075786104}. Best is trial 4 with value: 125.3055677270456.
[I 2025-06-13 13:02:34,295] Trial 9 finished with value: 143.96201906222214 and parameters: {'alpha': 0.6154720390980517, 'beta': 0.15092432509033357}. Best is trial 4 with value: 125.3055677270456.
[I 2025-06-13 13:02:34,301] Trial 3 finished with value: 129.1666648583331 and parameters: {'alpha': 0.40808784339745086, 'beta': 0.22661743909720078}. Best is trial 4 with value: 125.3055677270456.
[I 2025-06-13 13:02:34,302] Trial 21 finished with value: 127.65302326104037 and parameters: {'alpha': 0.28931788106478695, 'beta': 0.4900321535733373}. Best is trial 4 with value: 125.3055677270456.
[I 2

saved: 13_RL_agent_TDlearn_output\plots\plot_26_11_2024_14_31_40.pdf


[I 2025-06-13 13:02:59,854] Trial 0 finished with value: 164.9965427312291 and parameters: {'alpha': 0.24890361615760503, 'beta': 0.4314574573638751}. Best is trial 0 with value: 164.9965427312291.
[I 2025-06-13 13:02:59,938] Trial 28 finished with value: 174.9295287237514 and parameters: {'alpha': 0.6309031669598648, 'beta': 0.8564780641141598}. Best is trial 0 with value: 164.9965427312291.
[I 2025-06-13 13:02:59,975] Trial 2 finished with value: 188.72814011227302 and parameters: {'alpha': 0.2717686522556071, 'beta': 0.23627694118044879}. Best is trial 0 with value: 164.9965427312291.
[I 2025-06-13 13:03:00,278] Trial 9 finished with value: 185.67410132349715 and parameters: {'alpha': 0.8646675429043923, 'beta': 0.9855407699148241}. Best is trial 0 with value: 164.9965427312291.
[I 2025-06-13 13:03:00,312] Trial 12 finished with value: 210.79887196048156 and parameters: {'alpha': 0.9762718896140555, 'beta': 0.2000582132825554}. Best is trial 0 with value: 164.9965427312291.
[I 2025-

saved: 13_RL_agent_TDlearn_output\plots\plot_28_11_2024_12_21_16.pdf


[I 2025-06-13 13:03:25,290] Trial 20 finished with value: 171.05684127072246 and parameters: {'alpha': 0.2653128556620066, 'beta': 0.38438431490889385}. Best is trial 20 with value: 171.05684127072246.
[I 2025-06-13 13:03:25,302] Trial 6 finished with value: 163.48597710618714 and parameters: {'alpha': 0.23542551435049783, 'beta': 0.6646042350771538}. Best is trial 6 with value: 163.48597710618714.
[I 2025-06-13 13:03:25,303] Trial 5 finished with value: 163.91831376576047 and parameters: {'alpha': 0.10598480025954117, 'beta': 0.6527864660590629}. Best is trial 6 with value: 163.48597710618714.
[I 2025-06-13 13:03:25,313] Trial 1 finished with value: 279.2445031151062 and parameters: {'alpha': 0.11826685844590155, 'beta': 0.06302808966480411}. Best is trial 6 with value: 163.48597710618714.
[I 2025-06-13 13:03:25,317] Trial 16 finished with value: 172.0248630314536 and parameters: {'alpha': 0.24841743849273384, 'beta': 0.8368475499610928}. Best is trial 6 with value: 163.48597710618714

saved: 13_RL_agent_TDlearn_output\plots\plot_28_11_2024_22_38_25.pdf


# now saving the model evaluation values

In [11]:
df_models_evaluation = pd.DataFrame({
    "participants": participants,
    "best_alpha": best_alpha_models,
    "best_beta": best_beta_models,
    "BIC": BIC_models,
    "AIC": AIC_models,
    "accuracy": accuracy_models,
    "precision": precision_models,
    "sensitivity_recall": sensitivity_recall_models,
    "specificity": specificity_models,
    "f1_score": f1_score_models,
    "mcFadden_r2": mcFadden_r2_models,
    "r2": r2_models
})

file_path = os.path.join(output_dir_model_evaluation, "models_evaluation.csv")
df_models_evaluation.to_csv(file_path, index=False)