# TD learning; the agent that can see

# Remember to check the number of samples for alpha and beta

now I'm gonna add numbers to the model. 

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import scipy.stats as stats
import seaborn as sns
from sklearn.metrics import confusion_matrix
# np.random.seed(42)
from joblib import Parallel, delayed
import matplotlib.tri as tri
import matplotlib.colors as mcolors
from scipy.interpolate import griddata
from scipy.interpolate import RBFInterpolator
import matplotlib.ticker as mticker
import itertools
from sklearn.metrics import r2_score
import optuna

# important directories

In [2]:
folder_path = 'data_risk_added'


output_dir_model_evaluation = "13_RL_agent_TDlearn_output"
os.makedirs(output_dir_model_evaluation, exist_ok=True)

output_dir_plots = os.path.join(output_dir_model_evaluation, "plots")
os.makedirs(output_dir_plots, exist_ok=True)

output_dir_model_behavior = os.path.join(output_dir_model_evaluation, "model_behavior")
os.makedirs(output_dir_model_behavior, exist_ok=True)

In [3]:

dataframes = [pd.read_excel(os.path.join(folder_path, file)) for file in os.listdir(folder_path) if file.endswith('.xlsx')]


n_participant = len(dataframes)
print(f"there are {n_participant} participants.")


dataframes[2]

there are 35 participants.


Unnamed: 0,arrowRT,distribution,interTrialInterval,outcome,myCard,yourCard,spaceRT,totalReward,trialIndex,trialType,choice,block,timeoutRepeat,is_within_IQR,risk
0,1067,uniform,894,win,3,9,3575,10.5,0,response,arrowdown,1,0,0,0.250
1,237,uniform,925,lose,2,1,903,10.0,1,response,arrowdown,1,0,1,0.125
2,231,uniform,973,win,8,2,1130,10.5,2,response,arrowup,1,0,1,0.125
3,602,uniform,768,win,7,2,537,11.0,3,response,arrowup,1,0,1,0.250
4,273,uniform,937,win,9,8,809,11.5,4,response,arrowup,1,0,1,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,567,high,787,win,9,7,1580,92.0,128,response,arrowup,4,0,0,0.000
266,151,low,854,win,6,3,8541,92.5,89,response,arrowup,4,0,0,0.146
267,178,low,863,win,8,5,370,93.0,77,response,arrowup,4,0,1,0.023
268,328,high,770,lose,6,4,644,92.5,95,response,arrowdown,4,0,1,0.385


### I want to make participant file name for the model_evaluation.csv and that is I'm gonna take each data name task_data_07_11_2024_17_23_43.xlsx and extract "07_11_2024_17_23_43" and this should be the participant name in the dataset. 

In [4]:
participants = [os.path.splitext(file)[0].replace("task_data_", "")
    for file in os.listdir(folder_path) if file.endswith('.xlsx')]

# policy initilization for the model
now I need to find the prior policy amounts. for that I am going to put the percentage of downarrow and up arrow for each distribution.

In [5]:
df_combined = pd.concat(dataframes, ignore_index=True)

df_combined = df_combined[df_combined['outcome'].str.lower() != 'na'].reset_index(drop=True)
 

desired_order = ["uniform", "low", "high"]  


cards_sorted = sorted(df_combined["myCard"].unique())
dist_sorted = [d for d in desired_order if d in df_combined["distribution"].unique()]
choice_sorted = sorted(df_combined["choice"].unique())


card_idx = {card: i for i, card in enumerate(cards_sorted)}
dist_idx = {dist: i for i, dist in enumerate(dist_sorted)}
choice_idx = {choice: i for i, choice in enumerate(choice_sorted)}


matrix_3d = np.zeros((len(cards_sorted), len(dist_sorted), len(choice_sorted)))


for _, row in df_combined.iterrows():
    i = card_idx[row["myCard"]]-1
    j = dist_idx[row["distribution"]]
    k = choice_idx[row["choice"]]
    matrix_3d[i, j, k] += 1  


total_per_card_dist = matrix_3d.sum(axis=2, keepdims=True)

# compute percentages, avoiding division by zero
with np.errstate(divide='ignore', invalid='ignore'):
    percentage_matrix = np.divide(matrix_3d, total_per_card_dist, where=total_per_card_dist != 0)

# convert to a DataFrame for easy visualization
percentage_list = []
for i, card in enumerate(cards_sorted):
    for j, dist in enumerate(dist_sorted):
        for k, choice in enumerate(choice_sorted):
            percentage_list.append({
                "myCard": card,
                "distribution": dist,  # Now follows "uniform", "low", "high" order
                "choice": choice,
                "percentage": percentage_matrix[i, j, k]
            })

df_percentages = pd.DataFrame(percentage_list)
df_percentages

Unnamed: 0,myCard,distribution,choice,percentage
0,1,uniform,arrowdown,0.965217
1,1,uniform,arrowup,0.034783
2,1,low,arrowdown,0.922399
3,1,low,arrowup,0.077601
4,1,high,arrowdown,0.958042
5,1,high,arrowup,0.041958
6,2,uniform,arrowdown,0.967836
7,2,uniform,arrowup,0.032164
8,2,low,arrowdown,0.748479
9,2,low,arrowup,0.251521


In [6]:
np.shape(percentage_matrix)

(9, 3, 2)

In [7]:
actions = { "arrowdown": 0, "arrowup": 1}
distributions_map = { "uniform": 0, "low": 1,  "high": 2}
card_numbers = list(range(1, 10))

policy_table = percentage_matrix 

Q_table_init = np.random.normal(0, 0.01, (len(card_numbers), len(distributions_map), len(actions)))
# having a q-table based on the policies
# Q_table_init = policy_table * np.mean(Q_table_init) 
Q_table = Q_table_init.copy()

#############################################################################################
# having a q-table that starts with 0! this was not a good initilization so i changed it.
# Q_table = np.zeros((len(distributions_map), len(actions)))  # 3 distributions × 2 actions
#############################################################################################

# print("policy: \n",np.shape(policy_table))
print("\n Q_table: \n",np.shape(Q_table))




 Q_table: 
 (9, 3, 2)


In [8]:
def epsilon_greedy(Q_values, epsilon):    
    probs = np.full_like(Q_values, epsilon / Q_values.shape[-1], dtype=float)  # initialize with exploration probability
    best_actions = np.argmax(Q_values, axis=-1)  # find the best action for each state
    np.put_along_axis(probs, np.expand_dims(best_actions, axis=-1), 1 - epsilon + (epsilon / Q_values.shape[-1]), axis=-1)
    return probs



def train_rescorla_wagner(df, alpha, beta, Q_init=None):
    if Q_init is None:
        Q_init = Q_table.copy()
    Q_values = Q_init.copy()
    q_value_pairs = []
    choices = []
    predicted_probs = []
    distributions = []
    card_numbers = []
    
    for _, row in df.iterrows():
        action = actions[row["choice"]] 
        distribution = distributions_map[row["distribution"]] 
        card_number = row["myCard"]-1 # since I'm using this as an index! I need to do -1 to make the 1 to 9 cards come to 0 to 8
        reward = 0.5 if row["outcome"] == "win" else -0.5


        probs = epsilon_greedy(Q_values, beta)
        predicted_probs.append(probs[card_number][distribution][action])
        
        prediction_error = reward - Q_values[card_number][distribution][action]
        Q_values[card_number][distribution][action] += alpha * prediction_error
        
        q_value_pairs.append(Q_values.copy())
        choices.append(action)
        distributions.append(distribution)
        card_numbers.append(card_number)
        

    return np.array(q_value_pairs), np.array(choices), np.array(predicted_probs), np.array(distributions), np.array(card_numbers)


# this is for the sake of parallel computing
def compute_log_likelihood(alpha, beta, df_all, Q_table):
    Q_init_participant = Q_table.copy()
    q_values, choices, predicted_probs, distributions, card_numbers = train_rescorla_wagner(df_all, alpha, beta, Q_init=Q_init_participant.copy())
    
    predicted_probs = np.clip(predicted_probs, 1e-6, 1)  # prevent log(0)
    log_likelihood = np.sum(np.log(predicted_probs))
    
    return (alpha, beta, log_likelihood)


In [9]:
num_of_samples = 100
# num_of_samples = 1000
alpha_min = 0.01
alpha_max = 1
beta_min = 0.01
beta_max  = 1
alpha_samples = np.random.uniform(alpha_min, alpha_max + np.finfo(float).eps, num_of_samples)
beta_samples = np.random.uniform(beta_min, beta_max + np.finfo(float).eps, num_of_samples)

In [10]:
BIC_models = []
AIC_models = []
best_alpha_models = []
best_beta_models = []
accuracy_models = []
precision_models = []
sensitivity_recall_models = []
specificity_models = []
f1_score_models = []
mcFadden_r2_models = []
r2_models = []

for idx, df_all in enumerate(dataframes):
    Q_init_participant = Q_table.copy()
    
    df_all = df_all[df_all['outcome'].str.lower() != 'na'].reset_index(drop=True)


    def objective(trial):
        alpha = trial.suggest_float("alpha", alpha_min, alpha_max)
        beta  = trial.suggest_float("beta", beta_min, beta_max)

        # negative log-likelihood (Optuna minimises)
        _, _, ll = compute_log_likelihood(alpha, beta,
                                        df_all,
                                        Q_init_participant.copy())
        return -ll

    study = optuna.create_study(direction="minimize",
                                sampler=optuna.samplers.TPESampler(seed=42))
    study.optimize(objective, n_trials=400, n_jobs=-1)




    best_alpha = study.best_params["alpha"]
    best_beta  = study.best_params["beta"]
    best_log_likelihood = -study.best_value

    # keep this for plotting later
    results_df = study.trials_dataframe()
    results_df["alpha"] = results_df["params_alpha"]
    results_df["beta"]  = results_df["params_beta"]
    results_df["log_likelihood"] = -results_df["value"]

    # model prediction 
    
    q_values, choices, predicted_probs, distributions, card_numbers = train_rescorla_wagner(df_all, best_alpha, best_beta, Q_init=Q_init_participant.copy())
    
    
    predicted_choices = []
    for trial in range(len(card_numbers)):
        test_action_probs = epsilon_greedy(q_values[trial], best_beta)
        p_arrowup = test_action_probs[card_numbers[trial]][distributions[trial]][actions["arrowup"]]
        p_arrow_down = test_action_probs[card_numbers[trial]][distributions[trial]][actions["arrowdown"]]
        # choosing 1 or 0 based on the softmax probabilities:
        predicted_choices.append(np.random.choice([1, 0], p=[p_arrowup, p_arrow_down]))

    # finding out model total reward based on the model's predicted choices
    total_reward = [] 
    for i in range(len(predicted_choices)):
        if len(total_reward)> 0:
            last_reward = total_reward[-1]  #  the last reward value
        else:
            last_reward = 10 # initial reward is $10
        
        if ((df_all.loc[i, 'myCard'] > df_all.loc[i, 'yourCard'] and predicted_choices[i] == 1) or
            (df_all.loc[i, 'myCard'] < df_all.loc[i, 'yourCard'] and predicted_choices[i] == 0)):
            total_reward.append(last_reward + 0.5)
        else:
            total_reward.append(last_reward - 0.5)

    
   
       # confusion matrix:
    conf_matrix = confusion_matrix(choices, predicted_choices)
    TN, FP, FN, TP = conf_matrix.ravel()  # unpacking the confusion matrix
    # acc
    accuracy = (TP + TN) / (TP + TN + FP + FN)
    # precision: From the ones that we’ve announced them as up/down, which ones are really up/down?
    precision = TP / (TP + FP) if (TP + FP) != 0 else 0
    # recall or sensitivity : true positive rate
    sensitivity_recall = TP / (TP + FN) if (TP + FN) != 0 else 0
    # specificity : true negative rate
    specificity = TN / (TN + FP) if (TN + FP) != 0 else 0
    # f1 Score
    f1_score = 2 * (precision * sensitivity_recall) / (precision + sensitivity_recall) if (precision + sensitivity_recall) != 0 else 0

    
    # bayes information criterion:
    n_trials = len(df_all)
    k = 2  # number of free parameters: alpha and beta
    BIC = k * np.log(n_trials) - 2 * best_log_likelihood # this is BIC formula based on the log lkelihode I found before

    

    # Akaike  information criterion(AIC):
    AIC = 2 * k - 2 * best_log_likelihood 


    # mcFadden r-squared:
    p_null = np.mean(choices)  # probability of choosing "1" in the dataset
    log_likelihood_null = np.sum(choices * np.log(p_null) + (1 - choices) * np.log(1 - p_null))
    mcFadden_r2 = 1 - (best_log_likelihood / log_likelihood_null)

    # r-squared
    r2 = r2_score(choices, predicted_choices)
    
    # saving models evaluation variables:
    best_alpha_models.append(best_alpha)
    best_beta_models.append(best_beta)
    BIC_models.append(BIC)
    AIC_models.append(AIC)
    accuracy_models.append(accuracy)
    precision_models.append(precision)
    sensitivity_recall_models.append(sensitivity_recall)
    specificity_models.append(specificity)
    f1_score_models.append(f1_score)
    mcFadden_r2_models.append(mcFadden_r2)
    r2_models.append(r2)


    ###########################################################################################
    ## visulization
    ###########################################################################################

    fig, axes = plt.subplots(1, 3, figsize=(19, 6))

    plots_smooth_level = 20


#############################################
    # Density Plot (KDE)
    sns.kdeplot(
        x=results_df["alpha"], 
        y=results_df["beta"], 
        fill=True, 
        cmap="viridis", 
        ax=axes[0], 
        bw_adjust=1.8,  # Increase for smoother density
        levels=plots_smooth_level,  # More contour levels
        thresh=0  # Ensure density is plotted across all values
    )
    mappable = axes[0].collections[0]
    cbar = fig.colorbar(mappable, ax=axes[0], label="density", fraction=0.046, pad=0.04)  
    cbar.ax.yaxis.set_major_formatter(mticker.FormatStrFormatter('%.2f'))  # 2 decimal places
    cbar.ax.set_ylabel("density", fontsize=12, fontweight='bold')
    cbar.ax.tick_params(labelsize=12)

    axes[0].set_xlim(alpha_min, alpha_max)
    axes[0].set_ylim(beta_min, beta_max)
    axes[0].set_xlabel("learning rate (α)", fontsize=14, fontweight='bold')
    axes[0].set_ylabel("epsilon (ε)", fontsize=14, fontweight='bold')
    axes[0].set_title("density of α and ε joint probability", fontsize=16, fontweight='bold')
    axes[0].tick_params(axis='both', labelsize=14)
#############################################


#############################################
    # Log Likelihood 

    
    alpha_step = 0.1
    beta_step = 0.1
    alpha_bins = np.arange(alpha_min, alpha_max+ alpha_step, alpha_step)  # bins from 0 to 1 with step 0.1
    beta_bins = np.arange(beta_min, beta_max + beta_step, beta_step)       # bins from 0 to 1 with step 0.1

    results_df["alpha_binned"] = pd.cut(results_df["alpha"], bins=alpha_bins, labels=alpha_bins[:-1], include_lowest=True)
    results_df["beta_binned"] = pd.cut(results_df["beta"], bins=beta_bins, labels=beta_bins[:-1], include_lowest=True)

    heatmap_data = results_df.groupby(
    ["beta_binned", "alpha_binned"], observed=False)["log_likelihood"].mean().unstack()

    heatmap_data.index = heatmap_data.index.astype(float)
    heatmap_data.columns = heatmap_data.columns.astype(float)

    sns.heatmap(
        heatmap_data, 
        cmap="Blues", 
        cbar=True,
        ax=axes[1]
    )
    axes[1].set_xticks(np.arange(len(heatmap_data.columns)))  
    axes[1].set_xticklabels([f"{x:.1f}" for x in heatmap_data.columns], rotation=45)

    axes[1].set_yticks(np.arange(len(heatmap_data.index))) 
    axes[1].set_yticklabels([f"{x:.1f}" for x in heatmap_data.index]) 


    axes[1].set_xlabel("learning rate (α)", fontsize=14, fontweight='bold')
    axes[1].set_ylabel("epsilon (ε)", fontsize=14, fontweight='bold')
    axes[1].set_title("log likelihood for combinations of α and ε", fontsize=16, fontweight='bold')
    axes[1].tick_params(axis='both', labelsize=14)
    axes[1].invert_yaxis()


    

#############################################


#############################################
    # Confusion Matrix
    heatmap_cmap_color = mcolors.LinearSegmentedColormap.from_list("warm_red", ["#fff5e6", "#ff5733"])
    sns.heatmap(
        conf_matrix, annot=True, fmt="d", cmap=heatmap_cmap_color,
        xticklabels=["arrowdown", "arrowup"], 
        yticklabels=["arrowdown", "arrowup"], 
        ax=axes[2], 
        cbar=False
    )

    axes[2].set_xlabel("prediction", fontsize=14, fontweight='bold')
    axes[2].set_ylabel("true label", fontsize=14, fontweight='bold')
    axes[2].set_title(f"confusion matrix (α={best_alpha:.2f}, ε={best_beta:.2f})", fontsize=16, fontweight='bold')
    axes[2].tick_params(axis='both', labelsize=14)


#############################################
    # saving figures
    plt.tight_layout(rect=[0, 0, 1, 0.9]) 
    fig.suptitle(f'participant {idx}', fontsize=18, fontweight='bold', y=0.95)

    filename = os.path.join(output_dir_plots, f"plot_{participants[idx]}.pdf")
    plt.savefig(filename, format='pdf')
    plt.close(fig)

    print(f"saved: {filename}")


#############################################

    # saving model behavior
    q_values_reshaped = [q_values[i].tolist() for i in range(n_trials)]  # convert each (9,3,2) array into a list format

    df_model_behavior = pd.DataFrame({
        "model_choices": predicted_choices,
        "participant_choices": choices,
        "model_total_reward": total_reward,
        "participant_total_reward": df_all["totalReward"],
        "q_val": q_values_reshaped  
    })

    file_path = os.path.join(output_dir_model_behavior, f"model_behavior_{participants[idx]}.csv")
    df_model_behavior.to_csv(file_path, index=False)



[I 2025-06-12 14:18:21,868] A new study created in memory with name: no-name-9b41bae4-3984-4542-9c8e-82989ac5dc43
[I 2025-06-12 14:18:23,198] Trial 9 finished with value: 166.76880162464465 and parameters: {'alpha': 0.800537582670248, 'beta': 0.8149601955297788}. Best is trial 9 with value: 166.76880162464465.
[I 2025-06-12 14:18:23,215] Trial 7 finished with value: 154.80609168114967 and parameters: {'alpha': 0.771159546160856, 'beta': 0.6256220430062562}. Best is trial 7 with value: 154.80609168114967.
[I 2025-06-12 14:18:23,219] Trial 5 finished with value: 147.9325242967474 and parameters: {'alpha': 0.10307659871522046, 'beta': 0.32455596697563305}. Best is trial 5 with value: 147.9325242967474.
[I 2025-06-12 14:18:23,224] Trial 0 finished with value: 170.73290386659818 and parameters: {'alpha': 0.11503544726303763, 'beta': 0.17233586809299875}. Best is trial 5 with value: 147.9325242967474.
[I 2025-06-12 14:18:23,227] Trial 11 finished with value: 144.35681169268284 and parameters

saved: 13_RL_agent_TDlearn_output\plots\plot_07_11_2024_17_23_43.pdf


[I 2025-06-12 14:18:47,250] Trial 2 finished with value: 172.61046842349577 and parameters: {'alpha': 0.9321805166716111, 'beta': 0.8563967552597215}. Best is trial 2 with value: 172.61046842349577.
[I 2025-06-12 14:18:47,513] Trial 9 finished with value: 162.72821015209962 and parameters: {'alpha': 0.8253600859845732, 'beta': 0.6997403729973446}. Best is trial 9 with value: 162.72821015209962.
[I 2025-06-12 14:18:47,547] Trial 20 finished with value: 170.64695629666699 and parameters: {'alpha': 0.5396105582929959, 'beta': 0.3268832323690445}. Best is trial 9 with value: 162.72821015209962.
[I 2025-06-12 14:18:47,641] Trial 12 finished with value: 141.28805655341836 and parameters: {'alpha': 0.3398388122143727, 'beta': 0.5302980994272226}. Best is trial 12 with value: 141.28805655341836.
[I 2025-06-12 14:18:47,653] Trial 1 finished with value: 169.3598984153187 and parameters: {'alpha': 0.35210975772797415, 'beta': 0.870356218719122}. Best is trial 12 with value: 141.28805655341836.
[I

saved: 13_RL_agent_TDlearn_output\plots\plot_08_11_2024_13_03_29.pdf


[I 2025-06-12 14:19:11,989] Trial 1 finished with value: 173.67856882178864 and parameters: {'alpha': 0.39110721846389185, 'beta': 0.9163657597388989}. Best is trial 1 with value: 173.67856882178864.
[I 2025-06-12 14:19:12,036] Trial 8 finished with value: 168.31292440347792 and parameters: {'alpha': 0.3185942011954505, 'beta': 0.8778747011375442}. Best is trial 8 with value: 168.31292440347792.
[I 2025-06-12 14:19:12,063] Trial 10 finished with value: 141.924782600421 and parameters: {'alpha': 0.30155476164152545, 'beta': 0.6498844452323957}. Best is trial 10 with value: 141.924782600421.
[I 2025-06-12 14:19:12,232] Trial 0 finished with value: 167.86670453136395 and parameters: {'alpha': 0.4594309363072611, 'beta': 0.11263045756902684}. Best is trial 10 with value: 141.924782600421.
[I 2025-06-12 14:19:12,236] Trial 3 finished with value: 137.17470275326377 and parameters: {'alpha': 0.4651825678430042, 'beta': 0.4670112029421565}. Best is trial 3 with value: 137.17470275326377.
[I 20

saved: 13_RL_agent_TDlearn_output\plots\plot_11_11_2024_16_46_44.pdf


[I 2025-06-12 14:19:35,851] Trial 4 finished with value: 164.6646651170683 and parameters: {'alpha': 0.8835274709174855, 'beta': 0.30562531254345854}. Best is trial 4 with value: 164.6646651170683.
[I 2025-06-12 14:19:35,858] Trial 29 finished with value: 163.09237905080874 and parameters: {'alpha': 0.16603633771721324, 'beta': 0.1855929265349744}. Best is trial 29 with value: 163.09237905080874.
[I 2025-06-12 14:19:35,862] Trial 3 finished with value: 162.20914993467994 and parameters: {'alpha': 0.19453039126146238, 'beta': 0.7954075677456842}. Best is trial 3 with value: 162.20914993467994.
[I 2025-06-12 14:19:35,892] Trial 12 finished with value: 188.69203078294976 and parameters: {'alpha': 0.18514245428992934, 'beta': 0.10381320691633204}. Best is trial 3 with value: 162.20914993467994.
[I 2025-06-12 14:19:35,943] Trial 14 finished with value: 159.29581565710606 and parameters: {'alpha': 0.5669923344264006, 'beta': 0.6934262542696029}. Best is trial 14 with value: 159.2958156571060

saved: 13_RL_agent_TDlearn_output\plots\plot_12_11_2024_00_15_17.pdf


[I 2025-06-12 14:19:59,747] Trial 0 finished with value: 150.22464582620518 and parameters: {'alpha': 0.492177911327974, 'beta': 0.641973586469433}. Best is trial 0 with value: 150.22464582620518.
[I 2025-06-12 14:19:59,760] Trial 5 finished with value: 154.65035006459357 and parameters: {'alpha': 0.9228828866261951, 'beta': 0.6994188550004377}. Best is trial 0 with value: 150.22464582620518.
[I 2025-06-12 14:19:59,791] Trial 7 finished with value: 134.82433378863223 and parameters: {'alpha': 0.23763239669194533, 'beta': 0.28513003689442523}. Best is trial 7 with value: 134.82433378863223.
[I 2025-06-12 14:19:59,824] Trial 3 finished with value: 167.10592190622182 and parameters: {'alpha': 0.49051054706796393, 'beta': 0.8487476342458629}. Best is trial 7 with value: 134.82433378863223.
[I 2025-06-12 14:19:59,830] Trial 1 finished with value: 148.8422506390471 and parameters: {'alpha': 0.42070976058195725, 'beta': 0.25066966593107504}. Best is trial 7 with value: 134.82433378863223.
[I 

saved: 13_RL_agent_TDlearn_output\plots\plot_13_11_2024_10_46_21.pdf


[I 2025-06-12 14:20:22,501] Trial 1 finished with value: 203.2528101529433 and parameters: {'alpha': 0.82583609016038, 'beta': 0.25062471844470036}. Best is trial 1 with value: 203.2528101529433.
[I 2025-06-12 14:20:22,551] Trial 21 finished with value: 180.3928820815665 and parameters: {'alpha': 0.7050136534557458, 'beta': 0.9230169012946691}. Best is trial 21 with value: 180.3928820815665.
[I 2025-06-12 14:20:22,553] Trial 2 finished with value: 227.91767732390673 and parameters: {'alpha': 0.7264232816883183, 'beta': 0.17105565243051232}. Best is trial 21 with value: 180.3928820815665.
[I 2025-06-12 14:20:22,564] Trial 9 finished with value: 189.50643392728517 and parameters: {'alpha': 0.1518645433554254, 'beta': 0.24194899225778344}. Best is trial 21 with value: 180.3928820815665.
[I 2025-06-12 14:20:22,567] Trial 8 finished with value: 182.77085160913686 and parameters: {'alpha': 0.5991212176414433, 'beta': 0.9522032728303039}. Best is trial 21 with value: 180.3928820815665.
[I 202

saved: 13_RL_agent_TDlearn_output\plots\plot_13_11_2024_14_45_52.pdf


[I 2025-06-12 14:20:45,232] Trial 0 finished with value: 138.70860578420258 and parameters: {'alpha': 0.32807203672267576, 'beta': 0.35219607716584783}. Best is trial 0 with value: 138.70860578420258.
[I 2025-06-12 14:20:45,234] Trial 7 finished with value: 151.89377107072033 and parameters: {'alpha': 0.8871362717976851, 'beta': 0.3961108608045486}. Best is trial 0 with value: 138.70860578420258.
[I 2025-06-12 14:20:45,239] Trial 2 finished with value: 150.97504053357673 and parameters: {'alpha': 0.699479809915743, 'beta': 0.5576315648462585}. Best is trial 0 with value: 138.70860578420258.
[I 2025-06-12 14:20:45,241] Trial 24 finished with value: 176.50018795887186 and parameters: {'alpha': 0.5646879614304674, 'beta': 0.1867162926666752}. Best is trial 0 with value: 138.70860578420258.
[I 2025-06-12 14:20:45,247] Trial 1 finished with value: 155.678810235425 and parameters: {'alpha': 0.6153045076674173, 'beta': 0.3303427587204092}. Best is trial 0 with value: 138.70860578420258.
[I 20

saved: 13_RL_agent_TDlearn_output\plots\plot_14_11_2024_21_46_47.pdf


[I 2025-06-12 14:21:07,699] Trial 17 finished with value: 186.5129280788974 and parameters: {'alpha': 0.5158277594294862, 'beta': 0.9943765516650593}. Best is trial 17 with value: 186.5129280788974.
[I 2025-06-12 14:21:07,770] Trial 0 finished with value: 170.97000441989394 and parameters: {'alpha': 0.46755672773831325, 'beta': 0.8315843165474317}. Best is trial 0 with value: 170.97000441989394.
[I 2025-06-12 14:21:07,772] Trial 2 finished with value: 174.75366489100472 and parameters: {'alpha': 0.7927717089311174, 'beta': 0.33341174228366977}. Best is trial 0 with value: 170.97000441989394.
[I 2025-06-12 14:21:07,780] Trial 1 finished with value: 151.431472259704 and parameters: {'alpha': 0.08915025444628039, 'beta': 0.5753292135570872}. Best is trial 1 with value: 151.431472259704.
[I 2025-06-12 14:21:07,783] Trial 11 finished with value: 215.8305295131617 and parameters: {'alpha': 0.7842400039936749, 'beta': 0.1528668344580999}. Best is trial 1 with value: 151.431472259704.
[I 2025-

saved: 13_RL_agent_TDlearn_output\plots\plot_15_11_2024_11_43_48.pdf


[I 2025-06-12 14:21:30,555] Trial 4 finished with value: 169.94166294180707 and parameters: {'alpha': 0.8039922483170237, 'beta': 0.831483737020885}. Best is trial 4 with value: 169.94166294180707.
[I 2025-06-12 14:21:30,654] Trial 0 finished with value: 166.4094785851603 and parameters: {'alpha': 0.9957422101568892, 'beta': 0.7843208731190409}. Best is trial 0 with value: 166.4094785851603.
[I 2025-06-12 14:21:30,669] Trial 18 finished with value: 158.44949319301708 and parameters: {'alpha': 0.5354469395194053, 'beta': 0.6139295511135864}. Best is trial 18 with value: 158.44949319301708.
[I 2025-06-12 14:21:30,677] Trial 1 finished with value: 201.93089810189684 and parameters: {'alpha': 0.8619783492640135, 'beta': 0.15681660754405288}. Best is trial 18 with value: 158.44949319301708.
[I 2025-06-12 14:21:30,680] Trial 8 finished with value: 155.74399293554916 and parameters: {'alpha': 0.16141920769836057, 'beta': 0.3512711697423372}. Best is trial 8 with value: 155.74399293554916.
[I 

saved: 13_RL_agent_TDlearn_output\plots\plot_17_11_2024_15_25_39.pdf


[I 2025-06-12 14:21:55,793] Trial 17 finished with value: 163.1455525007783 and parameters: {'alpha': 0.37999650174848915, 'beta': 0.6506643831019489}. Best is trial 17 with value: 163.1455525007783.
[I 2025-06-12 14:21:55,807] Trial 25 finished with value: 171.3821246788852 and parameters: {'alpha': 0.6751167883893219, 'beta': 0.4514196305449127}. Best is trial 17 with value: 163.1455525007783.
[I 2025-06-12 14:21:55,822] Trial 10 finished with value: 176.61956334540312 and parameters: {'alpha': 0.40874835770456136, 'beta': 0.3313270188843436}. Best is trial 17 with value: 163.1455525007783.
[I 2025-06-12 14:21:55,825] Trial 0 finished with value: 182.2510328255388 and parameters: {'alpha': 0.4774409175398827, 'beta': 0.9527595863310061}. Best is trial 17 with value: 163.1455525007783.
[I 2025-06-12 14:21:55,834] Trial 14 finished with value: 169.44293463195652 and parameters: {'alpha': 0.668600309605147, 'beta': 0.753638542933074}. Best is trial 17 with value: 163.1455525007783.
[I 2

saved: 13_RL_agent_TDlearn_output\plots\plot_17_11_2024_23_57_47.pdf


[I 2025-06-12 14:22:21,254] Trial 29 finished with value: 156.6415030591963 and parameters: {'alpha': 0.7890921638326965, 'beta': 0.22173866070855347}. Best is trial 29 with value: 156.6415030591963.
[I 2025-06-12 14:22:21,288] Trial 13 finished with value: 144.23696809554238 and parameters: {'alpha': 0.1740648300696114, 'beta': 0.5265525930942503}. Best is trial 13 with value: 144.23696809554238.
[I 2025-06-12 14:22:21,483] Trial 1 finished with value: 146.49128271792077 and parameters: {'alpha': 0.22048062147745776, 'beta': 0.618281842603813}. Best is trial 13 with value: 144.23696809554238.
[I 2025-06-12 14:22:21,494] Trial 7 finished with value: 145.68100292349783 and parameters: {'alpha': 0.6834437184341297, 'beta': 0.5679601777611317}. Best is trial 13 with value: 144.23696809554238.
[I 2025-06-12 14:22:21,497] Trial 4 finished with value: 144.68934981278557 and parameters: {'alpha': 0.595351861900321, 'beta': 0.356790437042626}. Best is trial 13 with value: 144.23696809554238.
[

saved: 13_RL_agent_TDlearn_output\plots\plot_18_03_2025_13_12_31.pdf


[I 2025-06-12 14:22:47,042] Trial 19 finished with value: 155.98960704349213 and parameters: {'alpha': 0.46288250947343396, 'beta': 0.4770744650153865}. Best is trial 19 with value: 155.98960704349213.
[I 2025-06-12 14:22:47,045] Trial 15 finished with value: 160.6096631328718 and parameters: {'alpha': 0.6424539494034068, 'beta': 0.4778608550110345}. Best is trial 19 with value: 155.98960704349213.
[I 2025-06-12 14:22:47,046] Trial 10 finished with value: 241.47387828005722 and parameters: {'alpha': 0.841875704127575, 'beta': 0.09011858548381856}. Best is trial 19 with value: 155.98960704349213.
[I 2025-06-12 14:22:47,052] Trial 13 finished with value: 186.63220173443395 and parameters: {'alpha': 0.8472558499602025, 'beta': 0.9956660878191167}. Best is trial 19 with value: 155.98960704349213.
[I 2025-06-12 14:22:47,059] Trial 28 finished with value: 156.57101163123514 and parameters: {'alpha': 0.22119273625817132, 'beta': 0.6789319291782198}. Best is trial 19 with value: 155.9896070434

saved: 13_RL_agent_TDlearn_output\plots\plot_18_03_2025_20_59_56.pdf


[I 2025-06-12 14:23:12,136] Trial 6 finished with value: 170.71155480100805 and parameters: {'alpha': 0.2866528753594995, 'beta': 0.8446233369511361}. Best is trial 6 with value: 170.71155480100805.
[I 2025-06-12 14:23:12,149] Trial 16 finished with value: 162.5109120576873 and parameters: {'alpha': 0.5669535504270728, 'beta': 0.6544188321121264}. Best is trial 16 with value: 162.5109120576873.
[I 2025-06-12 14:23:12,161] Trial 30 finished with value: 179.55153735485337 and parameters: {'alpha': 0.4445634027492504, 'beta': 0.9328148948389043}. Best is trial 16 with value: 162.5109120576873.
[I 2025-06-12 14:23:12,402] Trial 8 finished with value: 166.98458399764613 and parameters: {'alpha': 0.7053103887101689, 'beta': 0.4001230591647161}. Best is trial 16 with value: 162.5109120576873.
[I 2025-06-12 14:23:12,442] Trial 5 finished with value: 161.82092156640442 and parameters: {'alpha': 0.1715202765009808, 'beta': 0.7281600781583213}. Best is trial 5 with value: 161.82092156640442.
[I 2

saved: 13_RL_agent_TDlearn_output\plots\plot_18_11_2024_13_31_43.pdf


[I 2025-06-12 14:23:37,436] Trial 28 finished with value: 162.09376330386948 and parameters: {'alpha': 0.7561242563202467, 'beta': 0.3686846224747657}. Best is trial 28 with value: 162.09376330386948.
[I 2025-06-12 14:23:37,452] Trial 10 finished with value: 156.8887845810682 and parameters: {'alpha': 0.6709343949799212, 'beta': 0.49157557653027606}. Best is trial 10 with value: 156.8887845810682.
[I 2025-06-12 14:23:37,498] Trial 6 finished with value: 162.66982932996592 and parameters: {'alpha': 0.032583847477895264, 'beta': 0.42643554858048477}. Best is trial 10 with value: 156.8887845810682.
[I 2025-06-12 14:23:37,638] Trial 9 finished with value: 150.2671440359893 and parameters: {'alpha': 0.3313921658096631, 'beta': 0.513342015511687}. Best is trial 9 with value: 150.2671440359893.
[I 2025-06-12 14:23:37,665] Trial 1 finished with value: 156.60202720749368 and parameters: {'alpha': 0.8282122404317928, 'beta': 0.5454092714429031}. Best is trial 9 with value: 150.2671440359893.
[I 

saved: 13_RL_agent_TDlearn_output\plots\plot_18_11_2024_15_43_17.pdf


[I 2025-06-12 14:24:03,125] Trial 12 finished with value: 176.81127768164225 and parameters: {'alpha': 0.07861414317313989, 'beta': 0.11139805951350723}. Best is trial 12 with value: 176.81127768164225.
[I 2025-06-12 14:24:03,153] Trial 1 finished with value: 159.62218507123598 and parameters: {'alpha': 0.7261287125952326, 'beta': 0.43644033920466235}. Best is trial 1 with value: 159.62218507123598.
[I 2025-06-12 14:24:03,160] Trial 27 finished with value: 147.22081916668822 and parameters: {'alpha': 0.3535132672575182, 'beta': 0.5213952588254214}. Best is trial 27 with value: 147.22081916668822.
[I 2025-06-12 14:24:03,167] Trial 31 finished with value: 161.72335960716612 and parameters: {'alpha': 0.5057755605985741, 'beta': 0.37380205816301937}. Best is trial 27 with value: 147.22081916668822.
[I 2025-06-12 14:24:03,210] Trial 0 finished with value: 145.0239961278076 and parameters: {'alpha': 0.1806056530134701, 'beta': 0.3489262354188726}. Best is trial 0 with value: 145.023996127807

saved: 13_RL_agent_TDlearn_output\plots\plot_19_11_2024_14_28_20.pdf


[I 2025-06-12 14:24:28,805] Trial 10 finished with value: 140.0080792677221 and parameters: {'alpha': 0.05895483365377766, 'beta': 0.3601929603333086}. Best is trial 10 with value: 140.0080792677221.
[I 2025-06-12 14:24:28,861] Trial 27 finished with value: 145.40277120274385 and parameters: {'alpha': 0.07749848107038822, 'beta': 0.6136473217278526}. Best is trial 10 with value: 140.0080792677221.
[I 2025-06-12 14:24:28,822] Trial 11 finished with value: 172.32743763947087 and parameters: {'alpha': 0.8298435374400961, 'beta': 0.20535799432494645}. Best is trial 10 with value: 140.0080792677221.
[I 2025-06-12 14:24:28,823] Trial 15 finished with value: 166.98723620212334 and parameters: {'alpha': 0.3532648594849024, 'beta': 0.8503523856142822}. Best is trial 10 with value: 140.0080792677221.
[I 2025-06-12 14:24:28,843] Trial 1 finished with value: 270.73347577646075 and parameters: {'alpha': 0.7063133566081893, 'beta': 0.0349297683881419}. Best is trial 10 with value: 140.0080792677221.

saved: 13_RL_agent_TDlearn_output\plots\plot_19_11_2024_17_03_01.pdf


[I 2025-06-12 14:24:54,216] Trial 6 finished with value: 157.89631324634234 and parameters: {'alpha': 0.5068823273390216, 'beta': 0.7113652656779517}. Best is trial 6 with value: 157.89631324634234.
[I 2025-06-12 14:24:54,296] Trial 13 finished with value: 194.813841451617 and parameters: {'alpha': 0.8113144419761756, 'beta': 0.12825667636772467}. Best is trial 6 with value: 157.89631324634234.
[I 2025-06-12 14:24:54,324] Trial 0 finished with value: 150.230033889605 and parameters: {'alpha': 0.35015336973687655, 'beta': 0.6829919422433589}. Best is trial 0 with value: 150.230033889605.
[I 2025-06-12 14:24:54,327] Trial 23 finished with value: 164.70573146188187 and parameters: {'alpha': 0.5765811159377222, 'beta': 0.8002526639582478}. Best is trial 0 with value: 150.230033889605.
[I 2025-06-12 14:24:54,382] Trial 27 finished with value: 160.3046914171565 and parameters: {'alpha': 0.45333439911931617, 'beta': 0.2381734955225056}. Best is trial 0 with value: 150.230033889605.
[I 2025-06

saved: 13_RL_agent_TDlearn_output\plots\plot_19_11_2024_19_42_32.pdf


[I 2025-06-12 14:25:19,746] Trial 13 finished with value: 250.5675585346223 and parameters: {'alpha': 0.6531470773255212, 'beta': 0.07488308862210723}. Best is trial 13 with value: 250.5675585346223.
[I 2025-06-12 14:25:19,759] Trial 7 finished with value: 182.93394512539396 and parameters: {'alpha': 0.9045097604143704, 'beta': 0.9640269656081689}. Best is trial 7 with value: 182.93394512539396.
[I 2025-06-12 14:25:19,857] Trial 0 finished with value: 287.395434655665 and parameters: {'alpha': 0.8373893803190374, 'beta': 0.04362328676120965}. Best is trial 7 with value: 182.93394512539396.
[I 2025-06-12 14:25:19,907] Trial 2 finished with value: 149.0494212576839 and parameters: {'alpha': 0.3440244130785689, 'beta': 0.4938024645754164}. Best is trial 2 with value: 149.0494212576839.
[I 2025-06-12 14:25:20,003] Trial 4 finished with value: 158.68597031832718 and parameters: {'alpha': 0.8715530110825547, 'beta': 0.5214649759147681}. Best is trial 2 with value: 149.0494212576839.
[I 2025-

saved: 13_RL_agent_TDlearn_output\plots\plot_20_11_2024_09_23_29.pdf


[I 2025-06-12 14:25:45,193] Trial 0 finished with value: 168.08291366220982 and parameters: {'alpha': 0.7899828479505967, 'beta': 0.2646413050179981}. Best is trial 0 with value: 168.08291366220982.
[I 2025-06-12 14:25:45,384] Trial 18 finished with value: 178.28339008464778 and parameters: {'alpha': 0.9069373954943882, 'beta': 0.927579684166602}. Best is trial 0 with value: 168.08291366220982.
[I 2025-06-12 14:25:45,414] Trial 31 finished with value: 160.50494124150097 and parameters: {'alpha': 0.29393107049005174, 'beta': 0.8216426060167301}. Best is trial 31 with value: 160.50494124150097.
[I 2025-06-12 14:25:45,435] Trial 26 finished with value: 147.96156615514053 and parameters: {'alpha': 0.10555119058481442, 'beta': 0.750907510215744}. Best is trial 26 with value: 147.96156615514053.
[I 2025-06-12 14:25:45,712] Trial 15 finished with value: 155.70914206673285 and parameters: {'alpha': 0.5973713810171615, 'beta': 0.6288092996876989}. Best is trial 26 with value: 147.96156615514053

saved: 13_RL_agent_TDlearn_output\plots\plot_20_11_2024_14_51_17.pdf


[I 2025-06-12 14:26:11,362] Trial 26 finished with value: 151.9915239210277 and parameters: {'alpha': 0.1951994541768973, 'beta': 0.393669569820789}. Best is trial 26 with value: 151.9915239210277.
[I 2025-06-12 14:26:11,383] Trial 27 finished with value: 218.8319264060769 and parameters: {'alpha': 0.9243560200400457, 'beta': 0.1019181746660979}. Best is trial 26 with value: 151.9915239210277.
[I 2025-06-12 14:26:11,427] Trial 4 finished with value: 188.2862855908787 and parameters: {'alpha': 0.6874543435828582, 'beta': 0.17692703380011118}. Best is trial 26 with value: 151.9915239210277.
[I 2025-06-12 14:26:11,500] Trial 9 finished with value: 195.3105919776511 and parameters: {'alpha': 0.041641179451535525, 'beta': 0.16925850092321912}. Best is trial 26 with value: 151.9915239210277.
[I 2025-06-12 14:26:11,514] Trial 18 finished with value: 160.02002848250584 and parameters: {'alpha': 0.5109604399450733, 'beta': 0.706866792793568}. Best is trial 26 with value: 151.9915239210277.
[I 2

saved: 13_RL_agent_TDlearn_output\plots\plot_20_11_2024_15_14_56.pdf


[I 2025-06-12 14:26:37,033] Trial 0 finished with value: 163.21411096118476 and parameters: {'alpha': 0.512458342371096, 'beta': 0.3363899920266575}. Best is trial 0 with value: 163.21411096118476.
[I 2025-06-12 14:26:37,035] Trial 3 finished with value: 159.01571117376525 and parameters: {'alpha': 0.5992711889599144, 'beta': 0.3937899041620347}. Best is trial 3 with value: 159.01571117376525.
[I 2025-06-12 14:26:37,038] Trial 13 finished with value: 163.95232685112276 and parameters: {'alpha': 0.6849190073521282, 'beta': 0.7618535344330208}. Best is trial 3 with value: 159.01571117376525.
[I 2025-06-12 14:26:37,041] Trial 21 finished with value: 150.21401534508942 and parameters: {'alpha': 0.1544947307109758, 'beta': 0.2936800978679474}. Best is trial 21 with value: 150.21401534508942.
[I 2025-06-12 14:26:37,069] Trial 17 finished with value: 139.5740194879999 and parameters: {'alpha': 0.031234787018022542, 'beta': 0.537734624403704}. Best is trial 17 with value: 139.5740194879999.
[I

saved: 13_RL_agent_TDlearn_output\plots\plot_20_11_2024_15_41_35.pdf


[I 2025-06-12 14:27:02,286] Trial 1 finished with value: 172.0078732111591 and parameters: {'alpha': 0.6170787323008553, 'beta': 0.7918992007055656}. Best is trial 1 with value: 172.0078732111591.
[I 2025-06-12 14:27:02,303] Trial 26 finished with value: 170.11039407509801 and parameters: {'alpha': 0.1707396632969784, 'beta': 0.59390957660193}. Best is trial 26 with value: 170.11039407509801.
[I 2025-06-12 14:27:02,309] Trial 11 finished with value: 235.21100449243534 and parameters: {'alpha': 0.5430316014798853, 'beta': 0.14868836762452883}. Best is trial 26 with value: 170.11039407509801.
[I 2025-06-12 14:27:02,311] Trial 15 finished with value: 170.94187457234744 and parameters: {'alpha': 0.10121940475290339, 'beta': 0.5573019995175874}. Best is trial 26 with value: 170.11039407509801.
[I 2025-06-12 14:27:02,313] Trial 10 finished with value: 179.85461265763723 and parameters: {'alpha': 0.33452690536921437, 'beta': 0.9137154670465071}. Best is trial 26 with value: 170.11039407509801

saved: 13_RL_agent_TDlearn_output\plots\plot_20_11_2024_16_58_23.pdf


[I 2025-06-12 14:27:28,154] Trial 19 finished with value: 147.39274779650248 and parameters: {'alpha': 0.2614349676542593, 'beta': 0.5297715776262607}. Best is trial 19 with value: 147.39274779650248.
[I 2025-06-12 14:27:28,177] Trial 28 finished with value: 162.7323959500502 and parameters: {'alpha': 0.9482277705615675, 'beta': 0.6626391208131013}. Best is trial 19 with value: 147.39274779650248.
[I 2025-06-12 14:27:28,181] Trial 29 finished with value: 202.6438623172741 and parameters: {'alpha': 0.7815673554154469, 'beta': 0.1830494416453355}. Best is trial 19 with value: 147.39274779650248.
[I 2025-06-12 14:27:28,187] Trial 30 finished with value: 162.3875380669174 and parameters: {'alpha': 0.5106693946633644, 'beta': 0.6494965432646344}. Best is trial 19 with value: 147.39274779650248.
[I 2025-06-12 14:27:28,188] Trial 14 finished with value: 165.44544750751157 and parameters: {'alpha': 0.9254640294901421, 'beta': 0.7341088337977402}. Best is trial 19 with value: 147.39274779650248

saved: 13_RL_agent_TDlearn_output\plots\plot_22_03_2025_00_10_37.pdf


[I 2025-06-12 14:27:53,569] Trial 13 finished with value: 165.04806199200772 and parameters: {'alpha': 0.8955921049051081, 'beta': 0.8469293250145722}. Best is trial 13 with value: 165.04806199200772.
[I 2025-06-12 14:27:53,590] Trial 11 finished with value: 148.87483233948925 and parameters: {'alpha': 0.8266637637350317, 'beta': 0.6972974388003842}. Best is trial 11 with value: 148.87483233948925.
[I 2025-06-12 14:27:53,604] Trial 15 finished with value: 137.3714605530069 and parameters: {'alpha': 0.5983778761595432, 'beta': 0.5336373362427798}. Best is trial 15 with value: 137.3714605530069.
[I 2025-06-12 14:27:53,624] Trial 14 finished with value: 135.7238808877068 and parameters: {'alpha': 0.3994352692775213, 'beta': 0.16964733960859338}. Best is trial 14 with value: 135.7238808877068.
[I 2025-06-12 14:27:53,802] Trial 4 finished with value: 117.70451113847503 and parameters: {'alpha': 0.2534391541048808, 'beta': 0.37694296450070375}. Best is trial 4 with value: 117.70451113847503.

saved: 13_RL_agent_TDlearn_output\plots\plot_22_11_2024_12_34_30.pdf


[I 2025-06-12 14:28:19,784] Trial 7 finished with value: 186.4599683098184 and parameters: {'alpha': 0.9958448790390414, 'beta': 0.9944032912373659}. Best is trial 7 with value: 186.4599683098184.
[I 2025-06-12 14:28:19,809] Trial 25 finished with value: 171.87407486397774 and parameters: {'alpha': 0.5235073956125018, 'beta': 0.8545765549004407}. Best is trial 25 with value: 171.87407486397774.
[I 2025-06-12 14:28:19,812] Trial 29 finished with value: 158.45998403038783 and parameters: {'alpha': 0.7032674241991245, 'beta': 0.6143800149121261}. Best is trial 29 with value: 158.45998403038783.
[I 2025-06-12 14:28:19,916] Trial 12 finished with value: 178.56569457327754 and parameters: {'alpha': 0.9688800404222261, 'beta': 0.24745436801602425}. Best is trial 29 with value: 158.45998403038783.
[I 2025-06-12 14:28:20,018] Trial 15 finished with value: 167.76766008664697 and parameters: {'alpha': 0.5641236659578557, 'beta': 0.32327037885064586}. Best is trial 29 with value: 158.4599840303878

saved: 13_RL_agent_TDlearn_output\plots\plot_22_11_2024_14_36_42.pdf


[I 2025-06-12 14:28:45,048] Trial 13 finished with value: 162.5992038666579 and parameters: {'alpha': 0.10790735031749506, 'beta': 0.8109913940521574}. Best is trial 13 with value: 162.5992038666579.
[I 2025-06-12 14:28:45,096] Trial 2 finished with value: 171.45871636512192 and parameters: {'alpha': 0.6921528438971641, 'beta': 0.8774717589628177}. Best is trial 13 with value: 162.5992038666579.
[I 2025-06-12 14:28:45,223] Trial 15 finished with value: 149.25231522270064 and parameters: {'alpha': 0.45004513611785213, 'beta': 0.3915695865071551}. Best is trial 15 with value: 149.25231522270064.
[I 2025-06-12 14:28:45,335] Trial 10 finished with value: 152.33107502552576 and parameters: {'alpha': 0.6849760263108307, 'beta': 0.30980678450996996}. Best is trial 15 with value: 149.25231522270064.
[I 2025-06-12 14:28:45,497] Trial 7 finished with value: 159.8662800793993 and parameters: {'alpha': 0.09599453149466708, 'beta': 0.7846234423138582}. Best is trial 15 with value: 149.2523152227006

saved: 13_RL_agent_TDlearn_output\plots\plot_22_11_2024_15_19_47.pdf


[I 2025-06-12 14:29:10,993] Trial 29 finished with value: 164.79790995208214 and parameters: {'alpha': 0.761944188418593, 'beta': 0.2350746373469851}. Best is trial 29 with value: 164.79790995208214.
[I 2025-06-12 14:29:11,021] Trial 10 finished with value: 142.67227908241398 and parameters: {'alpha': 0.15160054024440184, 'beta': 0.33126546802875867}. Best is trial 13 with value: 141.92205015313718.
[I 2025-06-12 14:29:11,010] Trial 13 finished with value: 141.92205015313718 and parameters: {'alpha': 0.09173125509072136, 'beta': 0.4663732829327194}. Best is trial 13 with value: 141.92205015313718.
[I 2025-06-12 14:29:11,224] Trial 8 finished with value: 157.97656671260128 and parameters: {'alpha': 0.7522787300165406, 'beta': 0.2867236063135851}. Best is trial 13 with value: 141.92205015313718.
[I 2025-06-12 14:29:11,255] Trial 4 finished with value: 161.00027065300594 and parameters: {'alpha': 0.8340300427913089, 'beta': 0.7613752185024383}. Best is trial 13 with value: 141.92205015313

saved: 13_RL_agent_TDlearn_output\plots\plot_25_11_2024_07_37_11.pdf


[I 2025-06-12 14:29:37,491] Trial 4 finished with value: 167.69920515761555 and parameters: {'alpha': 0.040267724530805876, 'beta': 0.8125657415316369}. Best is trial 4 with value: 167.69920515761555.
[I 2025-06-12 14:29:37,525] Trial 0 finished with value: 178.55686044387193 and parameters: {'alpha': 0.7133882886798486, 'beta': 0.9183144010245717}. Best is trial 4 with value: 167.69920515761555.
[I 2025-06-12 14:29:37,740] Trial 29 finished with value: 167.17962669478152 and parameters: {'alpha': 0.289116361358492, 'beta': 0.800841759060401}. Best is trial 29 with value: 167.17962669478152.
[I 2025-06-12 14:29:37,757] Trial 30 finished with value: 291.1790194085455 and parameters: {'alpha': 0.16024211790662315, 'beta': 0.030755238648569244}. Best is trial 28 with value: 153.49993472084964.
[I 2025-06-12 14:29:37,743] Trial 16 finished with value: 189.8079785462823 and parameters: {'alpha': 0.9392995656344371, 'beta': 0.23136958799097895}. Best is trial 13 with value: 161.6368535847997

saved: 13_RL_agent_TDlearn_output\plots\plot_25_11_2024_12_11_10.pdf


[I 2025-06-12 14:30:03,415] Trial 4 finished with value: 170.94005162991692 and parameters: {'alpha': 0.3371406872261487, 'beta': 0.7873823464716111}. Best is trial 4 with value: 170.94005162991692.
[I 2025-06-12 14:30:03,419] Trial 0 finished with value: 351.27636738108214 and parameters: {'alpha': 0.11202887935516807, 'beta': 0.027000972714219293}. Best is trial 4 with value: 170.94005162991692.
[I 2025-06-12 14:30:03,434] Trial 13 finished with value: 181.41754865282232 and parameters: {'alpha': 0.7801292873537553, 'beta': 0.3687684651638728}. Best is trial 4 with value: 170.94005162991692.
[I 2025-06-12 14:30:03,437] Trial 11 finished with value: 169.2823639989021 and parameters: {'alpha': 0.28408977637454963, 'beta': 0.7615876776063674}. Best is trial 11 with value: 169.2823639989021.
[I 2025-06-12 14:30:03,554] Trial 31 finished with value: 185.15056384399728 and parameters: {'alpha': 0.2897394099418003, 'beta': 0.3060491896162738}. Best is trial 11 with value: 169.2823639989021.

saved: 13_RL_agent_TDlearn_output\plots\plot_25_11_2024_18_41_38.pdf


[I 2025-06-12 14:30:29,028] Trial 4 finished with value: 185.5331856694767 and parameters: {'alpha': 0.21988016819082437, 'beta': 0.24198226901602773}. Best is trial 4 with value: 185.5331856694767.
[I 2025-06-12 14:30:29,081] Trial 15 finished with value: 165.08390126111655 and parameters: {'alpha': 0.7817882279287753, 'beta': 0.673341165599069}. Best is trial 15 with value: 165.08390126111655.
[I 2025-06-12 14:30:29,155] Trial 10 finished with value: 179.18496555810844 and parameters: {'alpha': 0.6525331066538339, 'beta': 0.9198571787937541}. Best is trial 15 with value: 165.08390126111655.
[I 2025-06-12 14:30:29,217] Trial 0 finished with value: 179.50602095663663 and parameters: {'alpha': 0.33519065015107863, 'beta': 0.2977917522660562}. Best is trial 15 with value: 165.08390126111655.
[I 2025-06-12 14:30:29,244] Trial 26 finished with value: 179.96430408233212 and parameters: {'alpha': 0.6602114663206433, 'beta': 0.928511905137334}. Best is trial 15 with value: 165.08390126111655.

saved: 13_RL_agent_TDlearn_output\plots\plot_25_11_2024_20_12_41.pdf


[I 2025-06-12 14:30:54,613] Trial 28 finished with value: 179.53821209353487 and parameters: {'alpha': 0.9533967334606555, 'beta': 0.20223741779925575}. Best is trial 28 with value: 179.53821209353487.
[I 2025-06-12 14:30:54,624] Trial 21 finished with value: 149.20852166354967 and parameters: {'alpha': 0.27972851106367747, 'beta': 0.45016098010649774}. Best is trial 21 with value: 149.20852166354967.
[I 2025-06-12 14:30:54,631] Trial 17 finished with value: 163.9902870187214 and parameters: {'alpha': 0.6965484103739926, 'beta': 0.7755869446456418}. Best is trial 21 with value: 149.20852166354967.
[I 2025-06-12 14:30:54,648] Trial 31 finished with value: 153.8593194594419 and parameters: {'alpha': 0.047186192156074294, 'beta': 0.33306928001089947}. Best is trial 21 with value: 149.20852166354967.
[I 2025-06-12 14:30:54,711] Trial 1 finished with value: 155.71619948849366 and parameters: {'alpha': 0.5321166165789301, 'beta': 0.6290006425408714}. Best is trial 21 with value: 149.20852166

saved: 13_RL_agent_TDlearn_output\plots\plot_26_03_2025_16_21_25.pdf


[I 2025-06-12 14:31:20,033] Trial 20 finished with value: 184.74620016813694 and parameters: {'alpha': 0.03725704350662646, 'beta': 0.46671297071613677}. Best is trial 20 with value: 184.74620016813694.
[I 2025-06-12 14:31:20,118] Trial 4 finished with value: 178.00369139281878 and parameters: {'alpha': 0.3780048652318522, 'beta': 0.8710762059163247}. Best is trial 4 with value: 178.00369139281878.
[I 2025-06-12 14:31:20,159] Trial 8 finished with value: 174.68803079212145 and parameters: {'alpha': 0.6938595612712795, 'beta': 0.5588046155824837}. Best is trial 8 with value: 174.68803079212145.
[I 2025-06-12 14:31:20,164] Trial 6 finished with value: 197.818472155154 and parameters: {'alpha': 0.19431739143047475, 'beta': 0.30233434864877456}. Best is trial 8 with value: 174.68803079212145.
[I 2025-06-12 14:31:20,180] Trial 5 finished with value: 172.0391882823527 and parameters: {'alpha': 0.06782501143403244, 'beta': 0.6325131072166535}. Best is trial 5 with value: 172.0391882823527.
[I

saved: 13_RL_agent_TDlearn_output\plots\plot_26_11_2024_10_53_23.pdf


[I 2025-06-12 14:31:44,800] Trial 21 finished with value: 132.81186749888434 and parameters: {'alpha': 0.48166879084750525, 'beta': 0.43580177822044225}. Best is trial 21 with value: 132.81186749888434.
[I 2025-06-12 14:31:45,832] Trial 15 finished with value: 128.00992928315407 and parameters: {'alpha': 0.2877350728449673, 'beta': 0.38720723544564073}. Best is trial 15 with value: 128.00992928315407.
[I 2025-06-12 14:31:45,874] Trial 14 finished with value: 134.60981462733292 and parameters: {'alpha': 0.2641364015373462, 'beta': 0.5544063397042187}. Best is trial 15 with value: 128.00992928315407.
[I 2025-06-12 14:31:45,900] Trial 25 finished with value: 139.5906499114368 and parameters: {'alpha': 0.884150580774254, 'beta': 0.22533901329854594}. Best is trial 15 with value: 128.00992928315407.
[I 2025-06-12 14:31:45,912] Trial 29 finished with value: 148.43386351537265 and parameters: {'alpha': 0.6224142345893653, 'beta': 0.6992760222986564}. Best is trial 15 with value: 128.009929283

saved: 13_RL_agent_TDlearn_output\plots\plot_26_11_2024_14_31_40.pdf


[I 2025-06-12 14:32:11,450] Trial 4 finished with value: 179.39996274298773 and parameters: {'alpha': 0.2441047997356627, 'beta': 0.9285979864457099}. Best is trial 4 with value: 179.39996274298773.
[I 2025-06-12 14:32:11,519] Trial 12 finished with value: 188.76608784161462 and parameters: {'alpha': 0.20243359631992167, 'beta': 0.226903677868089}. Best is trial 4 with value: 179.39996274298773.
[I 2025-06-12 14:32:11,568] Trial 19 finished with value: 214.35638192690544 and parameters: {'alpha': 0.4164326348376537, 'beta': 0.14424305144860106}. Best is trial 4 with value: 179.39996274298773.
[I 2025-06-12 14:32:11,582] Trial 13 finished with value: 160.7984170025797 and parameters: {'alpha': 0.32610322218374554, 'beta': 0.6077509437954973}. Best is trial 13 with value: 160.7984170025797.
[I 2025-06-12 14:32:11,587] Trial 16 finished with value: 172.11812412812947 and parameters: {'alpha': 0.6324873289802035, 'beta': 0.8101427664823593}. Best is trial 13 with value: 160.7984170025797.


saved: 13_RL_agent_TDlearn_output\plots\plot_28_11_2024_12_21_16.pdf


[I 2025-06-12 14:32:37,081] Trial 8 finished with value: 194.07706786781765 and parameters: {'alpha': 0.07940554288214376, 'beta': 0.23925794119045954}. Best is trial 8 with value: 194.07706786781765.
[I 2025-06-12 14:32:37,360] Trial 7 finished with value: 164.88885434377548 and parameters: {'alpha': 0.3615203934380737, 'beta': 0.48724072057067597}. Best is trial 7 with value: 164.88885434377548.
[I 2025-06-12 14:32:37,364] Trial 12 finished with value: 163.21101996525385 and parameters: {'alpha': 0.46285267564138016, 'beta': 0.5783308927689608}. Best is trial 12 with value: 163.21101996525385.
[I 2025-06-12 14:32:37,426] Trial 3 finished with value: 166.98949331303749 and parameters: {'alpha': 0.16470297933445255, 'beta': 0.419995624175207}. Best is trial 12 with value: 163.21101996525385.
[I 2025-06-12 14:32:37,444] Trial 11 finished with value: 283.6870238956028 and parameters: {'alpha': 0.5156267185914086, 'beta': 0.0621660140492443}. Best is trial 12 with value: 163.2110199652538

saved: 13_RL_agent_TDlearn_output\plots\plot_28_11_2024_22_38_25.pdf


# now saving the model evaluation values

In [11]:
df_models_evaluation = pd.DataFrame({
    "participants": participants,
    "best_alpha": best_alpha_models,
    "best_beta": best_beta_models,
    "BIC": BIC_models,
    "AIC": AIC_models,
    "accuracy": accuracy_models,
    "precision": precision_models,
    "sensitivity_recall": sensitivity_recall_models,
    "specificity": specificity_models,
    "f1_score": f1_score_models,
    "mcFadden_r2": mcFadden_r2_models,
    "r2": r2_models
})

file_path = os.path.join(output_dir_model_evaluation, "models_evaluation.csv")
df_models_evaluation.to_csv(file_path, index=False)