# TD learning; the agent that can see

# Remember to check the number of samples for alpha and beta

now I'm gonna add numbers to the model. 

In [4]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import scipy.stats as stats
import seaborn as sns
from sklearn.metrics import confusion_matrix
np.random.seed(42)
from joblib import Parallel, delayed
import matplotlib.tri as tri
import matplotlib.colors as mcolors
from scipy.interpolate import griddata
from scipy.interpolate import RBFInterpolator
import matplotlib.ticker as mticker
import itertools
from sklearn.metrics import r2_score
import optuna

# important directories

In [5]:
folder_path = 'data_risk_added'


output_dir_model_evaluation = "13_RL_agent_TDlearn_output"
os.makedirs(output_dir_model_evaluation, exist_ok=True)

output_dir_plots = os.path.join(output_dir_model_evaluation, "plots")
os.makedirs(output_dir_plots, exist_ok=True)

output_dir_model_behavior = os.path.join(output_dir_model_evaluation, "model_behavior")
os.makedirs(output_dir_model_behavior, exist_ok=True)

In [6]:

dataframes = [pd.read_excel(os.path.join(folder_path, file)) for file in os.listdir(folder_path) if file.endswith('.xlsx')]


n_participant = len(dataframes)
print(f"there are {n_participant} participants.")


dataframes[0]

there are 35 participants.


Unnamed: 0,arrowRT,distribution,interTrialInterval,outcome,myCard,yourCard,spaceRT,totalReward,trialIndex,trialType,choice,block,timeoutRepeat,is_within_IQR,risk
0,2609,uniform,789,lose,4,2,335,9.5,0,response,arrowdown,1,0,1,0.375
1,597,uniform,853,win,9,4,407,10,1,response,arrowup,1,0,1,0.000
2,188,uniform,904,win,4,7,504,10.5,2,response,arrowdown,1,0,1,0.375
3,423,uniform,916,win,2,4,434,11,3,response,arrowdown,1,0,1,0.125
4,549,uniform,806,win,5,7,287,11.5,4,response,arrowdown,1,0,1,0.500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
272,761,high,913,win,7,8,382,80,125,response,arrowdown,4,0,1,0.447
273,596,low,921,win,4,3,318,80.5,83,response,arrowup,4,0,1,0.385
274,414,low,950,win,2,7,335,81,77,response,arrowdown,4,0,1,0.243
275,1371,uniform,842,win,6,4,615,81.5,35,response,arrowup,4,0,1,0.375


### I want to make participant file name for the model_evaluation.csv and that is I'm gonna take each data name task_data_07_11_2024_17_23_43.xlsx and extract "07_11_2024_17_23_43" and this should be the participant name in the dataset. 

In [7]:
participants = [os.path.splitext(file)[0].replace("task_data_", "")
    for file in os.listdir(folder_path) if file.endswith('.xlsx')]

# policy initilization for the model
now I need to find the prior policy amounts. for that I am going to put the percentage of downarrow and up arrow for each distribution.

In [8]:
df_combined = pd.concat(dataframes, ignore_index=True)

df_combined = df_combined[df_combined['outcome'].str.lower() != 'na'].reset_index(drop=True)
 

desired_order = ["uniform", "low", "high"]  


cards_sorted = sorted(df_combined["myCard"].unique())
dist_sorted = [d for d in desired_order if d in df_combined["distribution"].unique()]
choice_sorted = sorted(df_combined["choice"].unique())


card_idx = {card: i for i, card in enumerate(cards_sorted)}
dist_idx = {dist: i for i, dist in enumerate(dist_sorted)}
choice_idx = {choice: i for i, choice in enumerate(choice_sorted)}


matrix_3d = np.zeros((len(cards_sorted), len(dist_sorted), len(choice_sorted)))


for _, row in df_combined.iterrows():
    i = card_idx[row["myCard"]]-1
    j = dist_idx[row["distribution"]]
    k = choice_idx[row["choice"]]
    matrix_3d[i, j, k] += 1  


total_per_card_dist = matrix_3d.sum(axis=2, keepdims=True)

# compute percentages, avoiding division by zero
with np.errstate(divide='ignore', invalid='ignore'):
    percentage_matrix = np.divide(matrix_3d, total_per_card_dist, where=total_per_card_dist != 0)

# convert to a DataFrame for easy visualization
percentage_list = []
for i, card in enumerate(cards_sorted):
    for j, dist in enumerate(dist_sorted):
        for k, choice in enumerate(choice_sorted):
            percentage_list.append({
                "myCard": card,
                "distribution": dist,  # Now follows "uniform", "low", "high" order
                "choice": choice,
                "percentage": percentage_matrix[i, j, k]
            })

df_percentages = pd.DataFrame(percentage_list)
df_percentages

Unnamed: 0,myCard,distribution,choice,percentage
0,1,uniform,arrowdown,0.965217
1,1,uniform,arrowup,0.034783
2,1,low,arrowdown,0.922399
3,1,low,arrowup,0.077601
4,1,high,arrowdown,0.958042
5,1,high,arrowup,0.041958
6,2,uniform,arrowdown,0.967836
7,2,uniform,arrowup,0.032164
8,2,low,arrowdown,0.748479
9,2,low,arrowup,0.251521


In [9]:
np.shape(percentage_matrix)

(9, 3, 2)

In [10]:
actions = { "arrowdown": 0, "arrowup": 1}
distributions_map = { "uniform": 0, "low": 1,  "high": 2}
card_numbers = list(range(1, 10))

policy_table = percentage_matrix 

Q_table_init = np.random.normal(0, 0.1, (len(card_numbers), len(distributions_map), len(actions)))
# having a q-table based on the policies
Q_table_init = policy_table * np.mean(Q_table_init) 
Q_table = Q_table_init.copy()

#############################################################################################
# having a q-table that starts with 0! this was not a good initilization so i changed it.
# Q_table = np.zeros((len(distributions_map), len(actions)))  # 3 distributions × 2 actions
#############################################################################################

print("policy: \n",np.shape(policy_table))
print("\n Q_table: \n",np.shape(Q_table))



policy: 
 (9, 3, 2)

 Q_table: 
 (9, 3, 2)


In [11]:
def epsilon_greedy(Q_values, epsilon):    
    probs = np.full_like(Q_values, epsilon / Q_values.shape[-1], dtype=float)  # initialize with exploration probability
    best_actions = np.argmax(Q_values, axis=-1)  # find the best action for each state
    np.put_along_axis(probs, np.expand_dims(best_actions, axis=-1), 1 - epsilon + (epsilon / Q_values.shape[-1]), axis=-1)
    return probs



def train_rescorla_wagner(df, alpha, beta, Q_init=None):
    if Q_init is None:
        Q_init = Q_table.copy()
    Q_values = Q_init.copy()
    q_value_pairs = []
    choices = []
    predicted_probs = []
    distributions = []
    card_numbers = []
    
    for _, row in df.iterrows():
        action = actions[row["choice"]] 
        distribution = distributions_map[row["distribution"]] 
        card_number = row["myCard"]-1 # since I'm using this as an index! I need to do -1 to make the 1 to 9 cards come to 0 to 8
        reward = 0.5 if row["outcome"] == "win" else -0.5


        probs = epsilon_greedy(Q_values, beta)
        predicted_probs.append(probs[card_number][distribution][action])
        
        prediction_error = reward - Q_values[card_number][distribution][action]
        Q_values[card_number][distribution][action] += alpha * prediction_error
        
        q_value_pairs.append(Q_values.copy())
        choices.append(action)
        distributions.append(distribution)
        card_numbers.append(card_number)
        

    return np.array(q_value_pairs), np.array(choices), np.array(predicted_probs), np.array(distributions), np.array(card_numbers)


# this is for the sake of parallel computing
def compute_log_likelihood(alpha, beta, df_all, Q_table):
    Q_init_participant = Q_table.copy()
    q_values, choices, predicted_probs, distributions, card_numbers = train_rescorla_wagner(df_all, alpha, beta, Q_init=Q_init_participant.copy())
    
    predicted_probs = np.clip(predicted_probs, 1e-6, 1)  # prevent log(0)
    log_likelihood = np.sum(np.log(predicted_probs))
    
    return (alpha, beta, log_likelihood)


In [12]:
num_of_samples = 100
# num_of_samples = 1000
alpha_min = 0
alpha_max = 1
beta_min = 0
beta_max  = 1
alpha_samples = np.random.uniform(alpha_min, alpha_max + np.finfo(float).eps, num_of_samples)
beta_samples = np.random.uniform(beta_min, beta_max + np.finfo(float).eps, num_of_samples)

In [None]:
BIC_models = []
AIC_models = []
best_alpha_models = []
best_beta_models = []
accuracy_models = []
precision_models = []
sensitivity_recall_models = []
specificity_models = []
f1_score_models = []
mcFadden_r2_models = []
r2_models = []

for idx, df_all in enumerate(dataframes):
    Q_init_participant = Q_table.copy()
    
    df_all = df_all[df_all['outcome'].str.lower() != 'na'].reset_index(drop=True)


    def objective(trial):
        alpha = trial.suggest_float("alpha", alpha_min, alpha_max)
        beta  = trial.suggest_float("beta", beta_min, beta_max)

        # negative log-likelihood (Optuna minimises)
        _, _, ll = compute_log_likelihood(alpha, beta,
                                        df_all,
                                        Q_init_participant.copy())
        return -ll

    study = optuna.create_study(direction="minimize",
                                sampler=optuna.samplers.TPESampler(seed=42))
    study.optimize(objective, n_trials=400, n_jobs=-1)

    best_alpha = study.best_params["alpha"]
    best_beta  = study.best_params["beta"]
    best_log_likelihood = -study.best_value

    # keep this for plotting later
    results_df = study.trials_dataframe()
    results_df["alpha"] = results_df["params_alpha"]
    results_df["beta"]  = results_df["params_beta"]
    results_df["log_likelihood"] = -results_df["value"]

    # model prediction 
    
    q_values, choices, predicted_probs, distributions, card_numbers = train_rescorla_wagner(df_all, best_alpha, best_beta, Q_init=Q_init_participant.copy())
    
    
    predicted_choices = []
    for trial in range(len(card_numbers)):
        test_action_probs = epsilon_greedy(q_values[trial], best_beta)
        p_arrowup = test_action_probs[card_numbers[trial]][distributions[trial]][actions["arrowup"]]
        p_arrow_down = test_action_probs[card_numbers[trial]][distributions[trial]][actions["arrowdown"]]
        # choosing 1 or 0 based on the softmax probabilities:
        predicted_choices.append(np.random.choice([1, 0], p=[p_arrowup, p_arrow_down]))

    # finding out model total reward based on the model's predicted choices
    total_reward = [] 
    for i in range(len(predicted_choices)):
        if len(total_reward)> 0:
            last_reward = total_reward[-1]  #  the last reward value
        else:
            last_reward = 10 # initial reward is $10
        
        if ((df_all.loc[i, 'myCard'] > df_all.loc[i, 'yourCard'] and predicted_choices[i] == 1) or
            (df_all.loc[i, 'myCard'] < df_all.loc[i, 'yourCard'] and predicted_choices[i] == 0)):
            total_reward.append(last_reward + 0.5)
        else:
            total_reward.append(last_reward - 0.5)

    
   
       # confusion matrix:
    conf_matrix = confusion_matrix(choices, predicted_choices)
    TN, FP, FN, TP = conf_matrix.ravel()  # unpacking the confusion matrix
    # acc
    accuracy = (TP + TN) / (TP + TN + FP + FN)
    # precision: From the ones that we’ve announced them as up/down, which ones are really up/down?
    precision = TP / (TP + FP) if (TP + FP) != 0 else 0
    # recall or sensitivity : true positive rate
    sensitivity_recall = TP / (TP + FN) if (TP + FN) != 0 else 0
    # specificity : true negative rate
    specificity = TN / (TN + FP) if (TN + FP) != 0 else 0
    # f1 Score
    f1_score = 2 * (precision * sensitivity_recall) / (precision + sensitivity_recall) if (precision + sensitivity_recall) != 0 else 0

    
    # bayes information criterion:
    n_trials = len(df_all)
    k = 2  # number of free parameters: alpha and beta
    BIC = k * np.log(n_trials) - 2 * best_log_likelihood # this is BIC formula based on the log lkelihode I found before

    

    # Akaike  information criterion(AIC):
    AIC = 2 * k - 2 * best_log_likelihood 


    # mcFadden r-squared:
    p_null = np.mean(choices)  # probability of choosing "1" in the dataset
    log_likelihood_null = np.sum(choices * np.log(p_null) + (1 - choices) * np.log(1 - p_null))
    mcFadden_r2 = 1 - (best_log_likelihood / log_likelihood_null)

    # r-squared
    r2 = r2_score(choices, predicted_choices)
    
    # saving models evaluation variables:
    best_alpha_models.append(best_alpha)
    best_beta_models.append(best_beta)
    BIC_models.append(BIC)
    AIC_models.append(AIC)
    accuracy_models.append(accuracy)
    precision_models.append(precision)
    sensitivity_recall_models.append(sensitivity_recall)
    specificity_models.append(specificity)
    f1_score_models.append(f1_score)
    mcFadden_r2_models.append(mcFadden_r2)
    r2_models.append(r2)


    ###########################################################################################
    ## visulization
    ###########################################################################################

    fig, axes = plt.subplots(1, 3, figsize=(19, 6))

    plots_smooth_level = 20


#############################################
    # Density Plot (KDE)
    sns.kdeplot(
        x=results_df["alpha"], 
        y=results_df["beta"], 
        fill=True, 
        cmap="viridis", 
        ax=axes[0], 
        bw_adjust=1.8,  # Increase for smoother density
        levels=plots_smooth_level,  # More contour levels
        thresh=0  # Ensure density is plotted across all values
    )
    mappable = axes[0].collections[0]
    cbar = fig.colorbar(mappable, ax=axes[0], label="density", fraction=0.046, pad=0.04)  
    cbar.ax.yaxis.set_major_formatter(mticker.FormatStrFormatter('%.2f'))  # 2 decimal places
    cbar.ax.set_ylabel("density", fontsize=12, fontweight='bold')
    cbar.ax.tick_params(labelsize=12)

    axes[0].set_xlim(alpha_min, alpha_max)
    axes[0].set_ylim(beta_min, beta_max)
    axes[0].set_xlabel("learning rate (α)", fontsize=14, fontweight='bold')
    axes[0].set_ylabel("epsilon (ε)", fontsize=14, fontweight='bold')
    axes[0].set_title("density of α and ε joint probability", fontsize=16, fontweight='bold')
    axes[0].tick_params(axis='both', labelsize=14)
#############################################


#############################################
    # Log Likelihood 

    
    alpha_step = 0.1
    beta_step = 0.1
    alpha_bins = np.arange(alpha_min, alpha_max+ alpha_step, alpha_step)  # bins from 0 to 1 with step 0.1
    beta_bins = np.arange(beta_min, beta_max + beta_step, beta_step)       # bins from 0 to 1 with step 0.1

    results_df["alpha_binned"] = pd.cut(results_df["alpha"], bins=alpha_bins, labels=alpha_bins[:-1], include_lowest=True)
    results_df["beta_binned"] = pd.cut(results_df["beta"], bins=beta_bins, labels=beta_bins[:-1], include_lowest=True)

    heatmap_data = results_df.groupby(
    ["beta_binned", "alpha_binned"], observed=False)["log_likelihood"].mean().unstack()

    heatmap_data.index = heatmap_data.index.astype(float)
    heatmap_data.columns = heatmap_data.columns.astype(float)

    sns.heatmap(
        heatmap_data, 
        cmap="Blues", 
        cbar=True,
        ax=axes[1]
    )
    axes[1].set_xticks(np.arange(len(heatmap_data.columns)))  
    axes[1].set_xticklabels([f"{x:.1f}" for x in heatmap_data.columns], rotation=45)

    axes[1].set_yticks(np.arange(len(heatmap_data.index))) 
    axes[1].set_yticklabels([f"{x:.1f}" for x in heatmap_data.index]) 


    axes[1].set_xlabel("learning rate (α)", fontsize=14, fontweight='bold')
    axes[1].set_ylabel("epsilon (ε)", fontsize=14, fontweight='bold')
    axes[1].set_title("log likelihood for combinations of α and ε", fontsize=16, fontweight='bold')
    axes[1].tick_params(axis='both', labelsize=14)
    axes[1].invert_yaxis()


    

#############################################


#############################################
    # Confusion Matrix
    heatmap_cmap_color = mcolors.LinearSegmentedColormap.from_list("warm_red", ["#fff5e6", "#ff5733"])
    sns.heatmap(
        conf_matrix, annot=True, fmt="d", cmap=heatmap_cmap_color,
        xticklabels=["arrowdown", "arrowup"], 
        yticklabels=["arrowdown", "arrowup"], 
        ax=axes[2], 
        cbar=False
    )

    axes[2].set_xlabel("prediction", fontsize=14, fontweight='bold')
    axes[2].set_ylabel("true label", fontsize=14, fontweight='bold')
    axes[2].set_title(f"confusion matrix (α={best_alpha:.2f}, ε={best_beta:.2f})", fontsize=16, fontweight='bold')
    axes[2].tick_params(axis='both', labelsize=14)


#############################################
    # saving figures
    plt.tight_layout(rect=[0, 0, 1, 0.9]) 
    fig.suptitle(f'participant {idx}', fontsize=18, fontweight='bold', y=0.95)

    filename = os.path.join(output_dir_plots, f"plot_{participants[idx]}.pdf")
    plt.savefig(filename, format='pdf')
    plt.close(fig)

    print(f"saved: {filename}")


#############################################

    # saving model behavior
    q_values_reshaped = [q_values[i].tolist() for i in range(n_trials)]  # convert each (9,3,2) array into a list format

    df_model_behavior = pd.DataFrame({
        "model_choices": predicted_choices,
        "participant_choices": choices,
        "model_total_reward": total_reward,
        "participant_total_reward": df_all["totalReward"],
        "q_val": q_values_reshaped  
    })

    file_path = os.path.join(output_dir_model_behavior, f"model_behavior_{participants[idx]}.csv")
    df_model_behavior.to_csv(file_path, index=False)



[I 2025-06-10 14:04:28,765] A new study created in memory with name: no-name-fc1e2994-4cf0-436c-80b7-c2f56a5975d7
[I 2025-06-10 14:04:30,206] Trial 6 finished with value: 183.37755054341807 and parameters: {'alpha': 0.9542582447418688, 'beta': 0.9690447252812161}. Best is trial 6 with value: 183.37755054341807.
[I 2025-06-10 14:04:30,231] Trial 0 finished with value: 189.02134035576213 and parameters: {'alpha': 0.18337240800615395, 'beta': 0.13641216149549296}. Best is trial 6 with value: 183.37755054341807.
[I 2025-06-10 14:04:30,252] Trial 3 finished with value: 156.63996744321855 and parameters: {'alpha': 0.6019635664095095, 'beta': 0.5525626494222762}. Best is trial 3 with value: 156.63996744321855.
[I 2025-06-10 14:04:30,279] Trial 8 finished with value: 156.18140257177186 and parameters: {'alpha': 0.2926920246120265, 'beta': 0.32404235829749906}. Best is trial 8 with value: 156.18140257177186.
[I 2025-06-10 14:04:30,279] Trial 5 finished with value: 165.99435826280478 and paramet

saved: 13_RL_agent_TDlearn_output\plots\plot_07_11_2024_17_23_43.pdf


[I 2025-06-10 14:04:59,271] Trial 1 finished with value: 176.22552984495104 and parameters: {'alpha': 0.19437488508699585, 'beta': 0.9263451278815003}. Best is trial 1 with value: 176.22552984495104.
[I 2025-06-10 14:04:59,501] Trial 2 finished with value: 166.53580907432652 and parameters: {'alpha': 0.6333972351445204, 'beta': 0.7457266146225078}. Best is trial 2 with value: 166.53580907432652.
[I 2025-06-10 14:05:00,104] Trial 5 finished with value: 242.7051532345139 and parameters: {'alpha': 0.3563282042086916, 'beta': 0.040106341577509586}. Best is trial 2 with value: 166.53580907432652.
[I 2025-06-10 14:05:00,130] Trial 6 finished with value: 186.5333781362669 and parameters: {'alpha': 0.6084010603328659, 'beta': 0.9945583143598818}. Best is trial 2 with value: 166.53580907432652.
[I 2025-06-10 14:05:00,159] Trial 3 finished with value: 171.95611530095908 and parameters: {'alpha': 0.6058770573403934, 'beta': 0.835889264394192}. Best is trial 2 with value: 166.53580907432652.
[I 20

saved: 13_RL_agent_TDlearn_output\plots\plot_08_11_2024_13_03_29.pdf


[I 2025-06-10 14:05:27,774] Trial 2 finished with value: 257.7788722845855 and parameters: {'alpha': 0.9370815604636865, 'beta': 0.057511989959794785}. Best is trial 0 with value: 155.79417789230243.
[I 2025-06-10 14:05:28,096] Trial 4 finished with value: 330.7286615311358 and parameters: {'alpha': 0.7404351368848182, 'beta': 0.01949597751463039}. Best is trial 0 with value: 155.79417789230243.
[I 2025-06-10 14:05:28,259] Trial 3 finished with value: 164.72205767518645 and parameters: {'alpha': 0.34650153488531055, 'beta': 0.8304489342764275}. Best is trial 0 with value: 155.79417789230243.
[I 2025-06-10 14:05:28,287] Trial 5 finished with value: 185.56314848692438 and parameters: {'alpha': 0.23093467844519744, 'beta': 0.9899997609725391}. Best is trial 0 with value: 155.79417789230243.
[I 2025-06-10 14:05:28,345] Trial 6 finished with value: 189.10149868267368 and parameters: {'alpha': 0.6584727927882255, 'beta': 0.182230845717676}. Best is trial 0 with value: 155.79417789230243.
[I 

saved: 13_RL_agent_TDlearn_output\plots\plot_11_11_2024_16_46_44.pdf


[I 2025-06-10 14:05:56,649] Trial 1 finished with value: 530.5039733036542 and parameters: {'alpha': 0.7002315697555356, 'beta': 0.002431731864434794}. Best is trial 0 with value: 188.7703018261856.
[I 2025-06-10 14:05:56,792] Trial 2 finished with value: 335.03178384663545 and parameters: {'alpha': 0.003861729107535572, 'beta': 0.3742027894867658}. Best is trial 0 with value: 188.7703018261856.
[I 2025-06-10 14:05:56,886] Trial 3 finished with value: 156.4557681322934 and parameters: {'alpha': 0.236187937578205, 'beta': 0.599805941326296}. Best is trial 3 with value: 156.4557681322934.
[I 2025-06-10 14:05:57,057] Trial 4 finished with value: 207.58953807932517 and parameters: {'alpha': 0.26796054843836625, 'beta': 0.12966881472316338}. Best is trial 3 with value: 156.4557681322934.
[I 2025-06-10 14:05:57,269] Trial 5 finished with value: 163.38179478065246 and parameters: {'alpha': 0.7650181207303886, 'beta': 0.5524769045494655}. Best is trial 3 with value: 156.4557681322934.
[I 2025-

saved: 13_RL_agent_TDlearn_output\plots\plot_12_11_2024_00_15_17.pdf


[I 2025-06-10 14:06:26,302] Trial 2 finished with value: 167.34622998203793 and parameters: {'alpha': 0.8257260034761543, 'beta': 0.8173420723359038}. Best is trial 0 with value: 147.56020893494514.
[I 2025-06-10 14:06:26,504] Trial 4 finished with value: 184.69089171716524 and parameters: {'alpha': 0.19825881615847007, 'beta': 0.9835855159062186}. Best is trial 0 with value: 147.56020893494514.
[I 2025-06-10 14:06:26,680] Trial 5 finished with value: 176.82433554984698 and parameters: {'alpha': 0.6654970370099836, 'beta': 0.9144791331602161}. Best is trial 0 with value: 147.56020893494514.
[I 2025-06-10 14:06:26,817] Trial 6 finished with value: 143.70445808311092 and parameters: {'alpha': 0.11973175738525643, 'beta': 0.505433171384061}. Best is trial 6 with value: 143.70445808311092.
[I 2025-06-10 14:06:26,906] Trial 7 finished with value: 168.75599757126375 and parameters: {'alpha': 0.28601417742005075, 'beta': 0.8630352555463775}. Best is trial 6 with value: 143.70445808311092.
[I 

saved: 13_RL_agent_TDlearn_output\plots\plot_13_11_2024_10_46_21.pdf


[I 2025-06-10 14:06:57,269] Trial 15 finished with value: 199.49773168411411 and parameters: {'alpha': 0.5400900455026731, 'beta': 0.3578529117590037}. Best is trial 15 with value: 199.49773168411411.
[I 2025-06-10 14:06:57,289] Trial 7 finished with value: 313.6411755743364 and parameters: {'alpha': 0.08871315507059074, 'beta': 0.06820716218827128}. Best is trial 15 with value: 199.49773168411411.
[I 2025-06-10 14:06:57,301] Trial 23 finished with value: 190.4975506182991 and parameters: {'alpha': 0.03474681477230024, 'beta': 0.5506816817028563}. Best is trial 23 with value: 190.4975506182991.
[I 2025-06-10 14:06:57,309] Trial 3 finished with value: 180.25591604016944 and parameters: {'alpha': 0.3405403995209729, 'beta': 0.9062606952295084}. Best is trial 3 with value: 180.25591604016944.
[I 2025-06-10 14:06:57,315] Trial 13 finished with value: 176.18857676839554 and parameters: {'alpha': 0.6000713797987759, 'beta': 0.7680805264212647}. Best is trial 13 with value: 176.18857676839554

saved: 13_RL_agent_TDlearn_output\plots\plot_13_11_2024_14_45_52.pdf


[I 2025-06-10 14:07:23,842] Trial 13 finished with value: 156.58652296663143 and parameters: {'alpha': 0.7893141482651149, 'beta': 0.5407455091645464}. Best is trial 13 with value: 156.58652296663143.
[I 2025-06-10 14:07:23,858] Trial 27 finished with value: 159.23155026735293 and parameters: {'alpha': 0.7752763101111939, 'beta': 0.415926764605022}. Best is trial 13 with value: 156.58652296663143.
[I 2025-06-10 14:07:23,880] Trial 17 finished with value: 163.85862976856464 and parameters: {'alpha': 0.32394278246880204, 'beta': 0.8082299338510859}. Best is trial 13 with value: 156.58652296663143.
[I 2025-06-10 14:07:23,885] Trial 29 finished with value: 182.0393824659604 and parameters: {'alpha': 0.6075799201987999, 'beta': 0.9575331815139614}. Best is trial 13 with value: 156.58652296663143.
[I 2025-06-10 14:07:23,889] Trial 16 finished with value: 211.395452397433 and parameters: {'alpha': 0.8462479646866758, 'beta': 0.12716924606356428}. Best is trial 13 with value: 156.5865229666314

saved: 13_RL_agent_TDlearn_output\plots\plot_14_11_2024_21_46_47.pdf


[I 2025-06-10 14:07:49,023] Trial 11 finished with value: 180.43950752748805 and parameters: {'alpha': 0.09561453946895071, 'beta': 0.18014553542446998}. Best is trial 11 with value: 180.43950752748805.
[I 2025-06-10 14:07:49,160] Trial 0 finished with value: 170.07760088028985 and parameters: {'alpha': 0.9788756861657423, 'beta': 0.7874048903912784}. Best is trial 0 with value: 170.07760088028985.
[I 2025-06-10 14:07:49,186] Trial 15 finished with value: 166.69221443789118 and parameters: {'alpha': 0.4905276066531806, 'beta': 0.40426063326166717}. Best is trial 15 with value: 166.69221443789118.
[I 2025-06-10 14:07:49,197] Trial 2 finished with value: 159.7830424172045 and parameters: {'alpha': 0.292371282570811, 'beta': 0.6355135187086453}. Best is trial 2 with value: 159.7830424172045.
[I 2025-06-10 14:07:49,201] Trial 9 finished with value: 166.19778545007927 and parameters: {'alpha': 0.6649809580483376, 'beta': 0.6909997738099563}. Best is trial 2 with value: 159.7830424172045.
[I

saved: 13_RL_agent_TDlearn_output\plots\plot_15_11_2024_11_43_48.pdf


[I 2025-06-10 14:08:15,047] Trial 28 finished with value: 168.57055328780433 and parameters: {'alpha': 0.9931632292078133, 'beta': 0.4361834543949543}. Best is trial 28 with value: 168.57055328780433.
[I 2025-06-10 14:08:15,077] Trial 2 finished with value: 175.5466688401467 and parameters: {'alpha': 0.9020682803555142, 'beta': 0.8762007682409867}. Best is trial 28 with value: 168.57055328780433.
[I 2025-06-10 14:08:15,077] Trial 19 finished with value: 236.35996074376538 and parameters: {'alpha': 0.7875064119964297, 'beta': 0.12083302387400308}. Best is trial 28 with value: 168.57055328780433.
[I 2025-06-10 14:08:15,077] Trial 22 finished with value: 211.0225348269279 and parameters: {'alpha': 0.7248350504370619, 'beta': 0.1786311430110351}. Best is trial 28 with value: 168.57055328780433.
[I 2025-06-10 14:08:15,094] Trial 6 finished with value: 169.0250823882655 and parameters: {'alpha': 0.3020573722933536, 'beta': 0.8199328702079324}. Best is trial 28 with value: 168.57055328780433.

saved: 13_RL_agent_TDlearn_output\plots\plot_17_11_2024_15_25_39.pdf


[I 2025-06-10 14:08:40,253] Trial 18 finished with value: 188.21527372025636 and parameters: {'alpha': 0.5957312086117981, 'beta': 0.3735946982544449}. Best is trial 18 with value: 188.21527372025636.
[I 2025-06-10 14:08:40,282] Trial 23 finished with value: 180.39583488154528 and parameters: {'alpha': 0.24432621788673847, 'beta': 0.923054913736325}. Best is trial 23 with value: 180.39583488154528.
[I 2025-06-10 14:08:40,298] Trial 28 finished with value: 205.7059861141392 and parameters: {'alpha': 0.3138103641478691, 'beta': 0.24062382780361857}. Best is trial 23 with value: 180.39583488154528.
[I 2025-06-10 14:08:40,298] Trial 9 finished with value: 171.946662948035 and parameters: {'alpha': 0.8150725849679815, 'beta': 0.6427659295035657}. Best is trial 9 with value: 171.946662948035.
[I 2025-06-10 14:08:40,499] Trial 5 finished with value: 296.7589268108692 and parameters: {'alpha': 0.4073504094638437, 'beta': 0.06550308051498499}. Best is trial 9 with value: 171.946662948035.
[I 20

saved: 13_RL_agent_TDlearn_output\plots\plot_17_11_2024_23_57_47.pdf


[I 2025-06-10 14:09:05,904] Trial 16 finished with value: 179.11240827575247 and parameters: {'alpha': 0.22215488969487673, 'beta': 0.9400478277354971}. Best is trial 16 with value: 179.11240827575247.
[I 2025-06-10 14:09:05,921] Trial 26 finished with value: 149.1990151776039 and parameters: {'alpha': 0.6412286461422299, 'beta': 0.4509558550178835}. Best is trial 26 with value: 149.1990151776039.
[I 2025-06-10 14:09:05,966] Trial 27 finished with value: 206.40397312812829 and parameters: {'alpha': 0.022408505211407692, 'beta': 0.2942319758975951}. Best is trial 26 with value: 149.1990151776039.
[I 2025-06-10 14:09:05,966] Trial 13 finished with value: 310.43030698675153 and parameters: {'alpha': 0.2873127478863652, 'beta': 0.01606079316750353}. Best is trial 26 with value: 149.1990151776039.
[I 2025-06-10 14:09:05,976] Trial 12 finished with value: 169.8208966135466 and parameters: {'alpha': 0.47891624130843613, 'beta': 0.862689010462981}. Best is trial 26 with value: 149.199015177603

saved: 13_RL_agent_TDlearn_output\plots\plot_18_03_2025_13_12_31.pdf


[I 2025-06-10 14:09:31,708] Trial 20 finished with value: 164.67801806710477 and parameters: {'alpha': 0.1685646966902964, 'beta': 0.7579431614405064}. Best is trial 20 with value: 164.67801806710477.
[I 2025-06-10 14:09:31,977] Trial 8 finished with value: 180.56943450442412 and parameters: {'alpha': 0.13052452933473213, 'beta': 0.23687634607537744}. Best is trial 20 with value: 164.67801806710477.
[I 2025-06-10 14:09:32,018] Trial 14 finished with value: 168.41719277385727 and parameters: {'alpha': 0.741322036159393, 'beta': 0.43868566280367594}. Best is trial 20 with value: 164.67801806710477.
[I 2025-06-10 14:09:32,112] Trial 18 finished with value: 174.8409724582188 and parameters: {'alpha': 0.6784198544291545, 'beta': 0.8669872202630727}. Best is trial 20 with value: 164.67801806710477.
[I 2025-06-10 14:09:32,127] Trial 28 finished with value: 170.44904403408881 and parameters: {'alpha': 0.032841753064787205, 'beta': 0.6438123137950597}. Best is trial 20 with value: 164.678018067

saved: 13_RL_agent_TDlearn_output\plots\plot_18_03_2025_20_59_56.pdf


[I 2025-06-10 14:09:57,169] Trial 0 finished with value: 165.43900874195316 and parameters: {'alpha': 0.6108635516220806, 'beta': 0.5449542584213947}. Best is trial 0 with value: 165.43900874195316.
[I 2025-06-10 14:09:57,783] Trial 20 finished with value: 164.95115334709814 and parameters: {'alpha': 0.6642699092186283, 'beta': 0.6105564454426944}. Best is trial 20 with value: 164.95115334709814.
[I 2025-06-10 14:09:57,856] Trial 24 finished with value: 303.1128852061824 and parameters: {'alpha': 0.01914412937299459, 'beta': 0.06862437154788703}. Best is trial 20 with value: 164.95115334709814.
[I 2025-06-10 14:09:57,909] Trial 5 finished with value: 177.1274546974513 and parameters: {'alpha': 0.5911378030013557, 'beta': 0.3535586459137575}. Best is trial 20 with value: 164.95115334709814.
[I 2025-06-10 14:09:57,917] Trial 17 finished with value: 165.5514217023395 and parameters: {'alpha': 0.15553905189799344, 'beta': 0.764167788712446}. Best is trial 20 with value: 164.95115334709814.

saved: 13_RL_agent_TDlearn_output\plots\plot_18_11_2024_13_31_43.pdf


[I 2025-06-10 14:10:23,104] Trial 28 finished with value: 168.0083840131067 and parameters: {'alpha': 0.580668292975473, 'beta': 0.7552440663180646}. Best is trial 28 with value: 168.0083840131067.
[I 2025-06-10 14:10:23,154] Trial 18 finished with value: 188.96704533391238 and parameters: {'alpha': 0.08970709657773879, 'beta': 0.3036709834478142}. Best is trial 28 with value: 168.0083840131067.
[I 2025-06-10 14:10:23,154] Trial 17 finished with value: 175.78021055966485 and parameters: {'alpha': 0.46209065943031047, 'beta': 0.8902208349963435}. Best is trial 28 with value: 168.0083840131067.
[I 2025-06-10 14:10:23,174] Trial 2 finished with value: 282.24963189267794 and parameters: {'alpha': 0.7126720425018055, 'beta': 0.06338795985779133}. Best is trial 28 with value: 168.0083840131067.
[I 2025-06-10 14:10:23,176] Trial 20 finished with value: 207.30923585170927 and parameters: {'alpha': 0.32102771213303904, 'beta': 0.14910257600965815}. Best is trial 28 with value: 168.0083840131067

saved: 13_RL_agent_TDlearn_output\plots\plot_18_11_2024_15_43_17.pdf


[I 2025-06-10 14:10:48,822] Trial 25 finished with value: 173.14163132391036 and parameters: {'alpha': 0.9952615694390099, 'beta': 0.3622656029622896}. Best is trial 25 with value: 173.14163132391036.
[I 2025-06-10 14:10:48,827] Trial 2 finished with value: 184.67815305445495 and parameters: {'alpha': 0.2402942950511201, 'beta': 0.1731229304306403}. Best is trial 25 with value: 173.14163132391036.
[I 2025-06-10 14:10:48,829] Trial 4 finished with value: 162.95739445321402 and parameters: {'alpha': 0.24159598332576493, 'beta': 0.28996682665417706}. Best is trial 4 with value: 162.95739445321402.
[I 2025-06-10 14:10:48,835] Trial 21 finished with value: 164.7940908134421 and parameters: {'alpha': 0.5180363656443439, 'beta': 0.48992361975876153}. Best is trial 4 with value: 162.95739445321402.
[I 2025-06-10 14:10:48,837] Trial 3 finished with value: 181.15174889729855 and parameters: {'alpha': 0.8652725158857646, 'beta': 0.9425210586806968}. Best is trial 4 with value: 162.95739445321402.

saved: 13_RL_agent_TDlearn_output\plots\plot_19_11_2024_14_28_20.pdf


[I 2025-06-10 14:11:15,036] Trial 3 finished with value: 384.7511361288331 and parameters: {'alpha': 0.874460848211365, 'beta': 0.009684093071045452}. Best is trial 3 with value: 384.7511361288331.
[I 2025-06-10 14:11:15,082] Trial 7 finished with value: 149.8587379621822 and parameters: {'alpha': 0.12085005549276251, 'beta': 0.34586387821790376}. Best is trial 23 with value: 145.6514829753512.
[I 2025-06-10 14:11:15,052] Trial 12 finished with value: 166.15516259633432 and parameters: {'alpha': 0.08143868887749561, 'beta': 0.8276814791365882}. Best is trial 12 with value: 166.15516259633432.
[I 2025-06-10 14:11:15,081] Trial 23 finished with value: 145.6514829753512 and parameters: {'alpha': 0.24535169259135503, 'beta': 0.4893569334357559}. Best is trial 23 with value: 145.6514829753512.
[I 2025-06-10 14:11:15,052] Trial 0 finished with value: 227.93573725155372 and parameters: {'alpha': 0.30736659389583487, 'beta': 0.055656724445506844}. Best is trial 0 with value: 227.93573725155372

saved: 13_RL_agent_TDlearn_output\plots\plot_19_11_2024_17_03_01.pdf


[I 2025-06-10 14:11:40,586] Trial 0 finished with value: 158.7074544658006 and parameters: {'alpha': 0.6121272645173278, 'beta': 0.577682269894881}. Best is trial 0 with value: 158.7074544658006.
[I 2025-06-10 14:11:40,614] Trial 25 finished with value: 169.21648567009964 and parameters: {'alpha': 0.9087651926927933, 'beta': 0.3250424705912274}. Best is trial 0 with value: 158.7074544658006.
[I 2025-06-10 14:11:40,624] Trial 11 finished with value: 235.35017119756682 and parameters: {'alpha': 0.2806342340457946, 'beta': 0.03979721259943736}. Best is trial 0 with value: 158.7074544658006.
[I 2025-06-10 14:11:40,625] Trial 6 finished with value: 136.4525672367332 and parameters: {'alpha': 0.0591666382319439, 'beta': 0.3242983079934397}. Best is trial 6 with value: 136.4525672367332.
[I 2025-06-10 14:11:40,627] Trial 17 finished with value: 236.50302677497697 and parameters: {'alpha': 0.17017589392765298, 'beta': 0.031110473407760297}. Best is trial 6 with value: 136.4525672367332.
[I 202

saved: 13_RL_agent_TDlearn_output\plots\plot_19_11_2024_19_42_32.pdf


[I 2025-06-10 14:12:06,756] Trial 0 finished with value: 169.29556356704117 and parameters: {'alpha': 0.7046981470002124, 'beta': 0.7618864665446416}. Best is trial 0 with value: 169.29556356704117.
[I 2025-06-10 14:12:06,795] Trial 6 finished with value: 159.32971353256983 and parameters: {'alpha': 0.20802007486967256, 'beta': 0.6170175429458784}. Best is trial 6 with value: 159.32971353256983.
[I 2025-06-10 14:12:06,832] Trial 15 finished with value: 170.43015504088726 and parameters: {'alpha': 0.4431421841427975, 'beta': 0.8333605333384447}. Best is trial 6 with value: 159.32971353256983.
[I 2025-06-10 14:12:06,836] Trial 1 finished with value: 250.4199310710454 and parameters: {'alpha': 0.8170367864848664, 'beta': 0.10702872252567452}. Best is trial 6 with value: 159.32971353256983.
[I 2025-06-10 14:12:06,872] Trial 28 finished with value: 168.2861568581838 and parameters: {'alpha': 0.7167358022882276, 'beta': 0.48727467566971483}. Best is trial 6 with value: 159.32971353256983.
[I

saved: 13_RL_agent_TDlearn_output\plots\plot_20_11_2024_09_23_29.pdf


[I 2025-06-10 14:12:31,522] Trial 28 finished with value: 179.13730276623141 and parameters: {'alpha': 0.4613052312789675, 'beta': 0.9314736796059464}. Best is trial 28 with value: 179.13730276623141.
[I 2025-06-10 14:12:31,545] Trial 26 finished with value: 161.9579551875018 and parameters: {'alpha': 0.6017176456226223, 'beta': 0.513638659034079}. Best is trial 26 with value: 161.9579551875018.
[I 2025-06-10 14:12:31,560] Trial 25 finished with value: 162.6197671714151 and parameters: {'alpha': 0.6388702779295605, 'beta': 0.48734662240308646}. Best is trial 26 with value: 161.9579551875018.
[I 2025-06-10 14:12:31,747] Trial 0 finished with value: 162.6351064118775 and parameters: {'alpha': 0.64021926367262, 'beta': 0.6591157018431195}. Best is trial 26 with value: 161.9579551875018.
[I 2025-06-10 14:12:31,763] Trial 4 finished with value: 168.29312895644642 and parameters: {'alpha': 0.98348809083897, 'beta': 0.38305588353801534}. Best is trial 26 with value: 161.9579551875018.
[I 2025

saved: 13_RL_agent_TDlearn_output\plots\plot_20_11_2024_14_51_17.pdf


[I 2025-06-10 14:12:56,633] Trial 0 finished with value: 151.32239683444783 and parameters: {'alpha': 0.1964764129513823, 'beta': 0.48086564138229404}. Best is trial 0 with value: 151.32239683444783.
[I 2025-06-10 14:12:57,067] Trial 6 finished with value: 204.5518500199924 and parameters: {'alpha': 0.5223984193293143, 'beta': 0.12439263102371156}. Best is trial 0 with value: 151.32239683444783.
[I 2025-06-10 14:12:57,067] Trial 3 finished with value: 238.0333977945191 and parameters: {'alpha': 0.6965161079950486, 'beta': 0.07050541664291377}. Best is trial 0 with value: 151.32239683444783.
[I 2025-06-10 14:12:57,067] Trial 4 finished with value: 185.67100767084145 and parameters: {'alpha': 0.31474321195773247, 'beta': 0.9890074655846633}. Best is trial 0 with value: 151.32239683444783.
[I 2025-06-10 14:12:57,083] Trial 2 finished with value: 220.41368417522895 and parameters: {'alpha': 0.009400114330459619, 'beta': 0.5263673180938484}. Best is trial 0 with value: 151.32239683444783.
[

saved: 13_RL_agent_TDlearn_output\plots\plot_20_11_2024_15_14_56.pdf


[I 2025-06-10 14:13:22,521] Trial 21 finished with value: 314.69282781141584 and parameters: {'alpha': 0.5128047438033776, 'beta': 0.04112374158858889}. Best is trial 21 with value: 314.69282781141584.
[I 2025-06-10 14:13:22,538] Trial 9 finished with value: 156.39423251544332 and parameters: {'alpha': 0.37123937830545795, 'beta': 0.4585976265996281}. Best is trial 9 with value: 156.39423251544332.
[I 2025-06-10 14:13:22,791] Trial 27 finished with value: 165.39826981685866 and parameters: {'alpha': 0.7272205635962865, 'beta': 0.6853813183614608}. Best is trial 9 with value: 156.39423251544332.
[I 2025-06-10 14:13:22,807] Trial 1 finished with value: 164.07819810323 and parameters: {'alpha': 0.9334810604786865, 'beta': 0.590141968772942}. Best is trial 9 with value: 156.39423251544332.
[I 2025-06-10 14:13:22,807] Trial 8 finished with value: 157.09101964281942 and parameters: {'alpha': 0.28190724958246105, 'beta': 0.4040510496141827}. Best is trial 9 with value: 156.39423251544332.
[I 

saved: 13_RL_agent_TDlearn_output\plots\plot_20_11_2024_15_41_35.pdf


[I 2025-06-10 14:13:46,908] Trial 0 finished with value: 198.89284708234868 and parameters: {'alpha': 0.7289996323562912, 'beta': 0.333401048455945}. Best is trial 0 with value: 198.89284708234868.
[I 2025-06-10 14:13:46,958] Trial 6 finished with value: 173.96629971431204 and parameters: {'alpha': 0.9938643924028061, 'beta': 0.715245130536448}. Best is trial 6 with value: 173.96629971431204.
[I 2025-06-10 14:13:46,977] Trial 12 finished with value: 243.91123458994446 and parameters: {'alpha': 0.7462402714551664, 'beta': 0.1723889026088118}. Best is trial 6 with value: 173.96629971431204.
[I 2025-06-10 14:13:46,978] Trial 26 finished with value: 178.2107480472317 and parameters: {'alpha': 0.636361652402525, 'beta': 0.8647436258263091}. Best is trial 6 with value: 173.96629971431204.
[I 2025-06-10 14:13:46,979] Trial 22 finished with value: 320.25583929298745 and parameters: {'alpha': 0.3703748539741323, 'beta': 0.07361675544840063}. Best is trial 6 with value: 173.96629971431204.
[I 20

saved: 13_RL_agent_TDlearn_output\plots\plot_20_11_2024_16_58_23.pdf


[I 2025-06-10 14:14:11,898] Trial 31 finished with value: 163.73285950641161 and parameters: {'alpha': 0.2150419034182679, 'beta': 0.29852933239211543}. Best is trial 31 with value: 163.73285950641161.
[I 2025-06-10 14:14:11,914] Trial 5 finished with value: 172.39268004965118 and parameters: {'alpha': 0.3991789354558708, 'beta': 0.8665625243677169}. Best is trial 31 with value: 163.73285950641161.
[I 2025-06-10 14:14:11,929] Trial 14 finished with value: 207.5175991905582 and parameters: {'alpha': 0.31303278292956527, 'beta': 0.1238405525898506}. Best is trial 31 with value: 163.73285950641161.
[I 2025-06-10 14:14:11,929] Trial 29 finished with value: 168.8342242936057 and parameters: {'alpha': 0.6360741801440759, 'beta': 0.5006007278699789}. Best is trial 31 with value: 163.73285950641161.
[I 2025-06-10 14:14:11,959] Trial 19 finished with value: 185.66285621907033 and parameters: {'alpha': 0.3328105540042121, 'beta': 0.9882383611161035}. Best is trial 31 with value: 163.732859506411

saved: 13_RL_agent_TDlearn_output\plots\plot_22_03_2025_00_10_37.pdf


[I 2025-06-10 14:14:37,165] Trial 5 finished with value: 126.92428894300332 and parameters: {'alpha': 0.13468887230230564, 'beta': 0.40666078756946145}. Best is trial 5 with value: 126.92428894300332.
[I 2025-06-10 14:14:37,210] Trial 26 finished with value: 139.33956341072437 and parameters: {'alpha': 0.30808298841027193, 'beta': 0.22769344355913845}. Best is trial 5 with value: 126.92428894300332.
[I 2025-06-10 14:14:37,223] Trial 30 finished with value: 147.20529838839622 and parameters: {'alpha': 0.6052164613760508, 'beta': 0.33682314831564997}. Best is trial 5 with value: 126.92428894300332.
[I 2025-06-10 14:14:37,286] Trial 18 finished with value: 138.28596379716006 and parameters: {'alpha': 0.4086063340231606, 'beta': 0.5055012238852936}. Best is trial 5 with value: 126.92428894300332.
[I 2025-06-10 14:14:37,351] Trial 0 finished with value: 179.56863082290408 and parameters: {'alpha': 0.48788096052725227, 'beta': 0.9492117975280608}. Best is trial 5 with value: 126.924288943003

saved: 13_RL_agent_TDlearn_output\plots\plot_22_11_2024_12_34_30.pdf


[I 2025-06-10 14:15:02,769] Trial 1 finished with value: 181.80580190530026 and parameters: {'alpha': 0.26512154331822, 'beta': 0.25101598541366266}. Best is trial 1 with value: 181.80580190530026.
[I 2025-06-10 14:15:02,790] Trial 15 finished with value: 185.67630478770292 and parameters: {'alpha': 0.9568178388536772, 'beta': 0.9863782440691402}. Best is trial 1 with value: 181.80580190530026.
[I 2025-06-10 14:15:02,792] Trial 24 finished with value: 258.6065762705252 and parameters: {'alpha': 0.24007427199884523, 'beta': 0.06976472475479423}. Best is trial 1 with value: 181.80580190530026.
[I 2025-06-10 14:15:02,792] Trial 8 finished with value: 163.2117127178603 and parameters: {'alpha': 0.24952770031093185, 'beta': 0.7106671398904283}. Best is trial 8 with value: 163.2117127178603.
[I 2025-06-10 14:15:02,792] Trial 9 finished with value: 169.7287715055977 and parameters: {'alpha': 0.7444192582428503, 'beta': 0.4187286288568499}. Best is trial 8 with value: 163.2117127178603.
[I 202

saved: 13_RL_agent_TDlearn_output\plots\plot_22_11_2024_14_36_42.pdf


[I 2025-06-10 14:15:28,641] Trial 0 finished with value: 220.24081502638592 and parameters: {'alpha': 0.017179165723343215, 'beta': 0.36796747972001265}. Best is trial 0 with value: 220.24081502638592.
[I 2025-06-10 14:15:28,706] Trial 11 finished with value: 259.3572276520426 and parameters: {'alpha': 0.9359519834913754, 'beta': 0.053129068032872095}. Best is trial 0 with value: 220.24081502638592.
[I 2025-06-10 14:15:28,707] Trial 26 finished with value: 164.3009765301465 and parameters: {'alpha': 0.35774905471454255, 'beta': 0.3089420835274892}. Best is trial 26 with value: 164.3009765301465.
[I 2025-06-10 14:15:28,707] Trial 31 finished with value: 171.28187226493003 and parameters: {'alpha': 0.13824334969036456, 'beta': 0.23260293187967718}. Best is trial 26 with value: 164.3009765301465.
[I 2025-06-10 14:15:28,707] Trial 16 finished with value: 580.4970686290612 and parameters: {'alpha': 0.7000290107314665, 'beta': 0.000500976023211086}. Best is trial 26 with value: 164.300976530

saved: 13_RL_agent_TDlearn_output\plots\plot_22_11_2024_15_19_47.pdf


[I 2025-06-10 14:15:53,752] Trial 4 finished with value: 171.5288011331922 and parameters: {'alpha': 0.06123378262848855, 'beta': 0.8505647644543269}. Best is trial 4 with value: 171.5288011331922.
[I 2025-06-10 14:15:54,448] Trial 3 finished with value: 192.97363147481127 and parameters: {'alpha': 0.136093632675666, 'beta': 0.1328992907124843}. Best is trial 4 with value: 171.5288011331922.
[I 2025-06-10 14:15:54,466] Trial 13 finished with value: 176.8331992314691 and parameters: {'alpha': 0.673432082051934, 'beta': 0.9145599162444463}. Best is trial 4 with value: 171.5288011331922.
[I 2025-06-10 14:15:54,497] Trial 1 finished with value: 165.65752841971604 and parameters: {'alpha': 0.4823238223328109, 'beta': 0.8065925140641877}. Best is trial 1 with value: 165.65752841971604.
[I 2025-06-10 14:15:54,501] Trial 7 finished with value: 180.3938859588793 and parameters: {'alpha': 0.6960653846998358, 'beta': 0.9458713569850187}. Best is trial 1 with value: 165.65752841971604.
[I 2025-06-

saved: 13_RL_agent_TDlearn_output\plots\plot_25_11_2024_07_37_11.pdf


[I 2025-06-10 14:16:19,411] Trial 16 finished with value: 210.57697810524922 and parameters: {'alpha': 0.5673180704839801, 'beta': 0.18678541374228164}. Best is trial 16 with value: 210.57697810524922.
[I 2025-06-10 14:16:19,428] Trial 6 finished with value: 182.4694041222089 and parameters: {'alpha': 0.7483797178013681, 'beta': 0.9540544031564456}. Best is trial 6 with value: 182.4694041222089.
[I 2025-06-10 14:16:19,632] Trial 29 finished with value: 167.5432798657644 and parameters: {'alpha': 0.5581110287293078, 'beta': 0.4782519350657689}. Best is trial 29 with value: 167.5432798657644.
[I 2025-06-10 14:16:19,719] Trial 0 finished with value: 161.56870206809725 and parameters: {'alpha': 0.36594713049063254, 'beta': 0.6024755120304041}. Best is trial 0 with value: 161.56870206809725.
[I 2025-06-10 14:16:19,741] Trial 1 finished with value: 158.91839840139312 and parameters: {'alpha': 0.20882834756321167, 'beta': 0.5031726755225884}. Best is trial 1 with value: 158.91839840139312.
[I

saved: 13_RL_agent_TDlearn_output\plots\plot_25_11_2024_12_11_10.pdf


[I 2025-06-10 14:16:45,426] Trial 18 finished with value: 173.31827709721603 and parameters: {'alpha': 0.9402656512770218, 'beta': 0.8027182398099428}. Best is trial 18 with value: 173.31827709721603.
[I 2025-06-10 14:16:45,440] Trial 28 finished with value: 323.1634626060212 and parameters: {'alpha': 0.4686587314802364, 'beta': 0.05148208704767676}. Best is trial 18 with value: 173.31827709721603.
[I 2025-06-10 14:16:45,456] Trial 30 finished with value: 185.64947298160158 and parameters: {'alpha': 0.9744899009925689, 'beta': 0.9840141856262709}. Best is trial 18 with value: 173.31827709721603.
[I 2025-06-10 14:16:45,456] Trial 11 finished with value: 246.13685498094503 and parameters: {'alpha': 0.2692879912172498, 'beta': 0.11823718181202858}. Best is trial 18 with value: 173.31827709721603.
[I 2025-06-10 14:16:45,594] Trial 0 finished with value: 170.15995744818923 and parameters: {'alpha': 0.3578742884848348, 'beta': 0.45124498501534205}. Best is trial 0 with value: 170.15995744818

saved: 13_RL_agent_TDlearn_output\plots\plot_25_11_2024_18_41_38.pdf


[I 2025-06-10 14:17:10,656] Trial 6 finished with value: 166.14466938626168 and parameters: {'alpha': 0.46033718913764776, 'beta': 0.515747959245523}. Best is trial 6 with value: 166.14466938626168.
[I 2025-06-10 14:17:11,328] Trial 16 finished with value: 178.48917502865328 and parameters: {'alpha': 0.36138186843061426, 'beta': 0.3420683330995472}. Best is trial 6 with value: 166.14466938626168.
[I 2025-06-10 14:17:11,334] Trial 0 finished with value: 201.35100729018308 and parameters: {'alpha': 0.7799844529595982, 'beta': 0.23400268853578143}. Best is trial 6 with value: 166.14466938626168.
[I 2025-06-10 14:17:11,352] Trial 1 finished with value: 177.5891127059803 and parameters: {'alpha': 0.13127254339430272, 'beta': 0.4051895157521027}. Best is trial 6 with value: 166.14466938626168.
[I 2025-06-10 14:17:11,361] Trial 13 finished with value: 170.4535831090096 and parameters: {'alpha': 0.3442422804236518, 'beta': 0.8015964912778132}. Best is trial 6 with value: 166.14466938626168.
[I

saved: 13_RL_agent_TDlearn_output\plots\plot_25_11_2024_20_12_41.pdf


[I 2025-06-10 14:17:35,644] Trial 0 finished with value: 185.13285652225454 and parameters: {'alpha': 0.32379762050352967, 'beta': 0.9849461431407062}. Best is trial 0 with value: 185.13285652225454.
[I 2025-06-10 14:17:35,761] Trial 17 finished with value: 156.73609109590407 and parameters: {'alpha': 0.7642945684555887, 'beta': 0.44651602940530855}. Best is trial 17 with value: 156.73609109590407.
[I 2025-06-10 14:17:35,993] Trial 1 finished with value: 185.51147704455957 and parameters: {'alpha': 0.87544406880325, 'beta': 0.9870242150027912}. Best is trial 17 with value: 156.73609109590407.
[I 2025-06-10 14:17:36,098] Trial 2 finished with value: 165.28504937224446 and parameters: {'alpha': 0.08689322487433848, 'beta': 0.7868385625325711}. Best is trial 17 with value: 156.73609109590407.
[I 2025-06-10 14:17:36,140] Trial 14 finished with value: 160.52656976225546 and parameters: {'alpha': 0.7017001062279621, 'beta': 0.36982332054518297}. Best is trial 17 with value: 156.7360910959040

saved: 13_RL_agent_TDlearn_output\plots\plot_26_03_2025_16_21_25.pdf


[I 2025-06-10 14:18:01,717] Trial 19 finished with value: 306.0477473919396 and parameters: {'alpha': 0.08342069950927433, 'beta': 0.09280614670916776}. Best is trial 19 with value: 306.0477473919396.
[I 2025-06-10 14:18:01,829] Trial 18 finished with value: 358.7155699666695 and parameters: {'alpha': 0.10811054971620904, 'beta': 0.0519206051014941}. Best is trial 19 with value: 306.0477473919396.
[I 2025-06-10 14:18:01,849] Trial 1 finished with value: 184.3920935991371 and parameters: {'alpha': 0.36986441199862063, 'beta': 0.47071872667945736}. Best is trial 1 with value: 184.3920935991371.
[I 2025-06-10 14:18:01,970] Trial 0 finished with value: 176.35405355127213 and parameters: {'alpha': 0.1156823599134923, 'beta': 0.7363972990798077}. Best is trial 0 with value: 176.35405355127213.
[I 2025-06-10 14:18:02,075] Trial 15 finished with value: 353.1068174090832 and parameters: {'alpha': 0.03821203385594607, 'beta': 0.05732956308353965}. Best is trial 0 with value: 176.35405355127213.


saved: 13_RL_agent_TDlearn_output\plots\plot_26_11_2024_10_53_23.pdf


[I 2025-06-10 14:18:27,346] Trial 9 finished with value: 153.78677575842286 and parameters: {'alpha': 0.3067361943684185, 'beta': 0.7430108021919318}. Best is trial 9 with value: 153.78677575842286.
[I 2025-06-10 14:18:27,362] Trial 5 finished with value: 165.007900521974 and parameters: {'alpha': 0.030866291096437348, 'beta': 0.8080576680652789}. Best is trial 9 with value: 153.78677575842286.
[I 2025-06-10 14:18:27,381] Trial 31 finished with value: 174.9217842343071 and parameters: {'alpha': 0.06068704486373433, 'beta': 0.09528463265556786}. Best is trial 9 with value: 153.78677575842286.
[I 2025-06-10 14:18:27,387] Trial 6 finished with value: 180.0550796443663 and parameters: {'alpha': 0.12722233362443403, 'beta': 0.9545110671709879}. Best is trial 9 with value: 153.78677575842286.
[I 2025-06-10 14:18:27,625] Trial 7 finished with value: 136.6257782942334 and parameters: {'alpha': 0.39673537846747275, 'beta': 0.43397509872313367}. Best is trial 7 with value: 136.6257782942334.
[I 

saved: 13_RL_agent_TDlearn_output\plots\plot_26_11_2024_14_31_40.pdf


[I 2025-06-10 14:18:52,943] Trial 31 finished with value: 181.9422433704632 and parameters: {'alpha': 0.8332943930841828, 'beta': 0.934278140351119}. Best is trial 31 with value: 181.9422433704632.
[I 2025-06-10 14:18:52,998] Trial 11 finished with value: 185.91855770913313 and parameters: {'alpha': 0.719889815598425, 'beta': 0.4042262250039341}. Best is trial 10 with value: 175.21121655059383.
[I 2025-06-10 14:18:52,974] Trial 7 finished with value: 187.0638788620858 and parameters: {'alpha': 0.030768433756662827, 'beta': 0.9985643654752685}. Best is trial 31 with value: 181.9422433704632.
[I 2025-06-10 14:18:52,974] Trial 10 finished with value: 175.21121655059383 and parameters: {'alpha': 0.5864036718315601, 'beta': 0.8106160456963961}. Best is trial 10 with value: 175.21121655059383.
[I 2025-06-10 14:18:52,974] Trial 23 finished with value: 221.10106735089903 and parameters: {'alpha': 0.7392958096337588, 'beta': 0.221994193832883}. Best is trial 31 with value: 181.9422433704632.
[I

saved: 13_RL_agent_TDlearn_output\plots\plot_28_11_2024_12_21_16.pdf


[I 2025-06-10 14:19:17,030] Trial 17 finished with value: 169.44775946936835 and parameters: {'alpha': 0.7217065388742873, 'beta': 0.7652939175017901}. Best is trial 17 with value: 169.44775946936835.
[I 2025-06-10 14:19:17,083] Trial 2 finished with value: 174.6478627995986 and parameters: {'alpha': 0.1788886810220648, 'beta': 0.8565854297291557}. Best is trial 17 with value: 169.44775946936835.
[I 2025-06-10 14:19:17,089] Trial 14 finished with value: 179.022068039408 and parameters: {'alpha': 0.565455320188583, 'beta': 0.9141046982998936}. Best is trial 17 with value: 169.44775946936835.
[I 2025-06-10 14:19:17,104] Trial 19 finished with value: 170.5514740321493 and parameters: {'alpha': 0.5421015217287679, 'beta': 0.7882302194120081}. Best is trial 17 with value: 169.44775946936835.
[I 2025-06-10 14:19:17,196] Trial 20 finished with value: 563.9343415484256 and parameters: {'alpha': 0.012029808916497964, 'beta': 0.0292527077183673}. Best is trial 17 with value: 169.44775946936835.


saved: 13_RL_agent_TDlearn_output\plots\plot_28_11_2024_22_38_25.pdf


# now saving the model evaluation values

In [14]:
df_models_evaluation = pd.DataFrame({
    "participants": participants,
    "best_alpha": best_alpha_models,
    "best_beta": best_beta_models,
    "BIC": BIC_models,
    "AIC": AIC_models,
    "accuracy": accuracy_models,
    "precision": precision_models,
    "sensitivity_recall": sensitivity_recall_models,
    "specificity": specificity_models,
    "f1_score": f1_score_models,
    "mcFadden_r2": mcFadden_r2_models,
    "r2": r2_models
})

file_path = os.path.join(output_dir_model_evaluation, "models_evaluation.csv")
df_models_evaluation.to_csv(file_path, index=False)