# Parameter recovery for both models

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import scipy.stats as stats
import seaborn as sns
from sklearn.metrics import confusion_matrix
from joblib import Parallel, delayed
import matplotlib.tri as tri
import matplotlib.colors as mcolors
from scipy.interpolate import griddata
from scipy.interpolate import RBFInterpolator
import matplotlib.ticker as mticker
import itertools
from sklearn.metrics import r2_score
import optuna


# important directories

In [2]:
# Folder that receives this notebook's results; created if it does not exist yet.
output_dir = "27_RL_agent_TDlearn_output_both_param_recovery"
os.makedirs(output_dir, exist_ok=True)

# Input folders: participant task CSVs, and the CSVs that are matched to them by file name.
folder_path_participants = 'data_risk_added_epileptic'
folder_path_colors_numbers = '13_RL_agent_TDlearn_output/model_behavior'

# Filled by the loading loop below, one DataFrame per file read.
df_participants, df_colors_numbers = [], []


def find_matching_csv(folder_path, df_list, name=None):
    """Load every CSV in ``folder_path`` whose file name contains ``name``.

    Each matching file is read with ``pd.read_csv`` and appended to
    ``df_list``, which is modified in place.

    Parameters
    ----------
    folder_path : str
        Directory to scan (its direct entries only).
    df_list : list
        List that receives one DataFrame per matching file.
    name : str, optional
        Substring to look for in the file name. When omitted, the
        module-level ``clean_name`` variable is used, which keeps the
        original two-argument call working.

    Returns
    -------
    int
        Number of files that matched and were appended.

    Raises
    ------
    NameError
        If ``name`` is omitted and no global ``clean_name`` exists yet.
    """
    if name is None:
        # Legacy behaviour: the caller sets the global before calling.
        name = clean_name

    n_matched = 0
    # os.listdir returns entries in arbitrary order; sorting makes the append
    # order reproducible when more than one file matches.
    for csv_file in sorted(os.listdir(folder_path)):
        if name in csv_file and csv_file.endswith('.csv'):
            df_list.append(pd.read_csv(os.path.join(folder_path, csv_file)))
            n_matched += 1
    return n_matched





# Load each participant's task file together with its model-behaviour file.
# Later cells pair df_participants[i] with df_colors_numbers[i] by position,
# so every participant must add exactly one entry to each list.
for file_name in os.listdir(folder_path_participants):
    if file_name.endswith('.csv'):
        file_path = os.path.join(folder_path_participants, file_name)
        df = pd.read_csv(file_path)
        # Drop trials whose outcome is the string 'na' (any letter case).
        # NOTE(review): pd.read_csv turns a raw "NA" into NaN by default, and
        # NaN rows pass this filter -- confirm how missing outcomes are
        # encoded in the task files.
        df = df[df['outcome'].str.lower() != 'na'].reset_index(drop=True)
        df_participants.append(df)

        # find_matching_csv looks this global up to decide which files match.
        clean_name = file_name.removeprefix("task_data_").removesuffix(".csv")
        n_models_before = len(df_colors_numbers)
        find_matching_csv(folder_path_colors_numbers, df_colors_numbers)
        n_models_added = len(df_colors_numbers) - n_models_before

        # Zero or several matches would silently shift every later
        # participant/model pairing, so stop here instead.
        if n_models_added != 1:
            raise ValueError(
                f"Expected exactly one model CSV containing '{clean_name}' in "
                f"'{folder_path_colors_numbers}', found {n_models_added}."
            )


In [3]:
# Label every trial with the kind of block it belongs to.
for df in df_participants:
    df['block_type'] = None
    block_col = df['block']
    dist_col = df['distribution']

    # Blocks 1 and 4 carry a fixed label.
    for block_number, fixed_label in ((1, 'uniform'), (4, 'mix')):
        df.loc[block_col == block_number, 'block_type'] = fixed_label

    # Blocks 2 and 3 take their label from the trial's distribution.
    in_middle_blocks = block_col.isin([2, 3])
    for dist_label in ('low', 'high'):
        df.loc[in_middle_blocks & (dist_col == dist_label), 'block_type'] = dist_label
    



# Copy the trial description from each participant table onto the model table
# stored at the same list position (pandas aligns the values on the row index).
shared_columns = ['myCard', 'yourCard', 'distribution', 'block_type']
for i, participant_df in enumerate(df_participants):
    model_df = df_colors_numbers[i]
    for column in shared_columns:
        model_df[column] = participant_df[column]

In [4]:
# Turn the numeric model choices into the key names used by the task data.
choice_labels = {1: 'arrowup', 0: 'arrowdown'}
for df in df_colors_numbers:
    relabelled = df['model_choices'].replace(choice_labels)
    df['model_choices'] = relabelled


In [5]:
# Score each model choice against the dealt cards: 'arrowup' wins when myCard
# is higher, 'arrowdown' wins when it is lower; anything else (ties included)
# is a loss. Done column-wise, so it does not rely on a 0..n-1 row index.
for df in df_colors_numbers:
    my_card = df['myCard']
    your_card = df['yourCard']
    model_choice = df['model_choices']

    is_win = (((my_card > your_card) & (model_choice == 'arrowup')) |
              ((my_card < your_card) & (model_choice == 'arrowdown')))
    df['outcome'] = np.where(is_win, 'win', 'lose')

In [6]:
# Spot-check one participant table (list index 2), including the added block_type column.
df_participants[2]

Unnamed: 0,arrowRT,distribution,interTrialInterval,outcome,myCard,yourCard,spaceRT,totalReward,trialIndex,trialType,choice,block,timeoutRepeat,is_within_IQR,risk,block_type
0,2135,uniform,835,win,4,7,14079,10.5,0,response,arrowdown,1,0,0,0.375,uniform
1,1203,uniform,926,win,9,4,1804,11,1,response,arrowup,1,0,1,0.000,uniform
2,1035,uniform,934,win,1,9,766,11.5,2,response,arrowdown,1,0,1,0.000,uniform
3,827,uniform,991,lose,2,1,1189,11,3,response,arrowdown,1,0,1,0.125,uniform
4,1306,uniform,970,win,9,3,1887,11.5,4,response,arrowup,1,0,1,0.000,uniform
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,939,uniform,773,win,4,6,802,87,24,response,arrowdown,4,0,1,0.375,mix
266,1210,high,811,win,6,8,1345,87.5,122,response,arrowdown,4,0,1,0.385,mix
267,275,high,838,win,3,5,254,88,123,response,arrowdown,4,0,1,0.071,mix
268,907,uniform,992,win,2,4,774,88.5,14,response,arrowdown,4,0,1,0.125,mix


In [7]:
# Spot-check the model table at the same list index (2): copied trial columns and recomputed outcome.
df_colors_numbers[2]

Unnamed: 0,model_choices,participant_choices,model_total_reward,participant_total_reward,q_val,myCard,yourCard,distribution,block_type,outcome
0,arrowup,0,9.5,10.5,"[[[0.007151827818520263, 0.010887971741451993]...",4,7,uniform,uniform,lose
1,arrowup,1,10.0,11.0,"[[[0.007151827818520263, 0.010887971741451993]...",9,4,uniform,uniform,win
2,arrowdown,0,10.5,11.5,"[[[0.04853290651797284, 0.010887971741451993],...",1,9,uniform,uniform,win
3,arrowup,0,11.0,11.0,"[[[0.04853290651797284, 0.010887971741451993],...",2,1,uniform,uniform,win
4,arrowup,1,11.5,11.5,"[[[0.04853290651797284, 0.010887971741451993],...",9,3,uniform,uniform,win
...,...,...,...,...,...,...,...,...,...,...
265,arrowdown,0,63.0,87.0,"[[[0.3121740438868729, 0.010887971741451993], ...",4,6,uniform,mix,win
266,arrowdown,0,63.5,87.5,"[[[0.3121740438868729, 0.010887971741451993], ...",6,8,high,mix,win
267,arrowdown,0,64.0,88.0,"[[[0.3121740438868729, 0.010887971741451993], ...",3,5,high,mix,win
268,arrowdown,0,64.5,88.5,"[[[0.3121740438868729, 0.010887971741451993], ...",2,4,uniform,mix,win


In [8]:
# Participant IDs: every CSV name in the participants folder, without its
# extension and with each "task_data_" occurrence removed.
participants = [
    os.path.splitext(csv_name)[0].replace("task_data_", "")
    for csv_name in os.listdir(folder_path_participants)
    if csv_name.endswith('.csv')
]

In [9]:
# Integer encodings used to index the Q-table.
actions = { "arrowdown": 0, "arrowup": 1}
distributions_map = { "uniform": 0, "low": 1,  "high": 2}
card_numbers = list(range(1, 10))

# Seed for the initial Q-table so that every Restart & Run All starts the
# fits from the same table.
Q_INIT_SEED = 42

# Small Gaussian noise around zero, one value per (card, distribution, action).
# An all-zero table was tried first and was not a good initialisation, so it
# was replaced by this one.
Q_table_init = np.random.default_rng(Q_INIT_SEED).normal(
    0, 0.01, (len(card_numbers), len(distributions_map), len(actions)))

# Working copy; Q_table_init itself is left untouched.
Q_table = Q_table_init.copy()

print("\n Q_table: \n",np.shape(Q_table))




 Q_table: 
 (9, 3, 2)


In [10]:
def epsilon_greedy(Q_values, epsilon):
    """Return epsilon-greedy action probabilities for an array of Q-values.

    The last axis of ``Q_values`` indexes the actions. Every action gets
    ``epsilon / n_actions``; the arg-max action along the last axis gets
    ``1 - epsilon + epsilon / n_actions`` instead. The returned float array
    has the same shape as ``Q_values``.
    """
    n_actions = Q_values.shape[-1]
    explore_share = epsilon / n_actions

    # Start with the exploration share everywhere ...
    action_probs = np.full_like(Q_values, explore_share, dtype=float)

    # ... then overwrite the greedy action's entry in each state.
    greedy_index = np.expand_dims(np.argmax(Q_values, axis=-1), axis=-1)
    np.put_along_axis(action_probs, greedy_index, 1 - epsilon + explore_share, axis=-1)
    return action_probs



def train_rescorla_wagner(df, alpha, beta, Q_init=None):
    """Replay a trial sequence through a delta-rule learner with an epsilon-greedy policy.

    For every row of ``df`` the function first records the probability the
    policy gave to the action that was taken (using the Q-values *before* the
    update), then moves that entry of the Q-table toward the reward.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``model_choices`` (keys of ``actions``),
        ``distribution`` (keys of ``distributions_map``), ``myCard``
        (1-based card number) and ``outcome`` (``"win"`` gives +0.5,
        anything else -0.5).
    alpha : float
        Learning rate of the update.
    beta : float
        Passed to ``epsilon_greedy`` as its ``epsilon``; despite the name it
        is an exploration rate, not a softmax temperature.
    Q_init : numpy.ndarray, optional
        Starting table indexed ``[card, distribution, action]``. It is copied,
        never modified. Defaults to the global ``Q_table``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(q_value_pairs, choices, predicted_probs, distributions, card_numbers)``,
        one entry per trial. ``q_value_pairs[t]`` is the whole table *after*
        the update made on trial ``t``; ``card_numbers`` holds 0-based indices.
    """
    if Q_init is None:
        Q_init = Q_table
    Q_values = Q_init.copy()

    q_value_pairs = []
    choices = []
    predicted_probs = []
    distributions = []
    card_numbers = []

    # Walk the four needed columns in lockstep instead of building a Series per row.
    trial_columns = zip(df["model_choices"], df["distribution"], df["myCard"], df["outcome"])
    for choice_label, distribution_label, my_card, outcome in trial_columns:
        action = actions[choice_label]
        distribution = distributions_map[distribution_label]
        card_number = my_card - 1  # cards 1..9 -> table indices 0..8
        reward = 0.5 if outcome == "win" else -0.5

        # Only the visited state's probabilities are needed, so the policy is
        # evaluated on that state's action values alone.
        state_probs = epsilon_greedy(Q_values[card_number, distribution], beta)
        predicted_probs.append(state_probs[action])

        prediction_error = reward - Q_values[card_number, distribution, action]
        Q_values[card_number, distribution, action] += alpha * prediction_error

        q_value_pairs.append(Q_values.copy())
        choices.append(action)
        distributions.append(distribution)
        card_numbers.append(card_number)

    return (np.array(q_value_pairs), np.array(choices), np.array(predicted_probs),
            np.array(distributions), np.array(card_numbers))


# Kept as a standalone function so a single (alpha, beta) evaluation can be
# handed to parallel workers.
def compute_log_likelihood(alpha, beta, df_all, Q_table):
    """Return ``(alpha, beta, log_likelihood)`` of the choices in ``df_all``.

    The trial sequence is replayed with ``train_rescorla_wagner`` starting
    from a copy of ``Q_table``; the log-likelihood is the sum of the logs of
    the per-trial probabilities of the chosen actions.
    """
    _, _, trial_probs, _, _ = train_rescorla_wagner(
        df_all, alpha, beta, Q_init=Q_table.copy())

    # Floor the probabilities at 1e-6 so that a zero cannot yield log(0).
    floored_probs = np.clip(trial_probs, 1e-6, 1)
    return (alpha, beta, np.log(floored_probs).sum())


In [11]:
# Number of uniform draws per parameter (1000 is the alternative setting).
num_of_samples = 100

# Bounds for both parameters; the Optuna objective reuses them.
alpha_min, alpha_max = 0.01, 1
beta_min, beta_max = 0.01, 1

# np.random.uniform excludes its upper limit, so the limit is raised by one
# machine epsilon.
_upper_nudge = np.finfo(float).eps
alpha_samples = np.random.uniform(low=alpha_min, high=alpha_max + _upper_nudge, size=num_of_samples)
beta_samples = np.random.uniform(low=beta_min, high=beta_max + _upper_nudge, size=num_of_samples)

In [12]:
# Per-participant results, appended in the order of df_colors_numbers.
BIC_models = []
AIC_models = []
best_alpha_models = []
best_beta_models = []
accuracy_models = []
precision_models = []
sensitivity_recall_models = []
specificity_models = []
f1_score_models = []
mcFadden_r2_models = []
r2_models = []

for idx, df_all in enumerate(df_colors_numbers):
    print(f"Processing participant {idx + 1} of {len(df_colors_numbers)}")
    Q_init_participant = Q_table.copy()

    def objective(trial):
        """Negative log-likelihood of df_all for the suggested (alpha, beta)."""
        alpha = trial.suggest_float("alpha", alpha_min, alpha_max)
        beta  = trial.suggest_float("beta", beta_min, beta_max)

        # negative log-likelihood (Optuna minimises)
        _, _, ll = compute_log_likelihood(alpha, beta,
                                        df_all,
                                        Q_init_participant.copy())
        return -ll

    # NOTE(review): trials run concurrently with n_jobs=-1, so the seeded
    # sampler alone does not make repeated runs identical; run sequentially
    # if exact repeatability is needed.
    study = optuna.create_study(direction="minimize",
                                sampler=optuna.samplers.TPESampler(seed=42))
    study.optimize(objective, n_trials=400, n_jobs=-1)

    best_alpha = study.best_params["alpha"]
    best_beta  = study.best_params["beta"]
    best_log_likelihood = -study.best_value

    # keep this for plotting later (overwritten on every iteration, so it
    # holds the most recent participant only)
    results_df = study.trials_dataframe()
    results_df["alpha"] = results_df["params_alpha"]
    results_df["beta"]  = results_df["params_beta"]
    results_df["log_likelihood"] = -results_df["value"]

    # model prediction with the best-fitting parameters
    q_values, choices, predicted_probs, distributions, card_numbers = train_rescorla_wagner(
        df_all, best_alpha, best_beta, Q_init=Q_init_participant.copy())

    # q_values[t] is the table AFTER learning from trial t. Predicting trial t
    # from it would let that trial's own outcome leak into its prediction, so
    # the table from before the trial is used: the initial table for the first
    # trial, q_values[t - 1] afterwards.
    predicted_choices = []
    for trial in range(len(card_numbers)):
        q_before_trial = Q_init_participant if trial == 0 else q_values[trial - 1]
        test_action_probs = epsilon_greedy(q_before_trial, best_beta)
        p_arrowup = test_action_probs[card_numbers[trial]][distributions[trial]][actions["arrowup"]]
        # choosing 1 or 0 from the epsilon-greedy probabilities:
        predicted_choice = int(p_arrowup >= 0.5)  # 1 if up-prob >= 0.5 else 0
        predicted_choices.append(predicted_choice)

    # running total reward the predicted choices would have earned
    # (computed here but not stored in the evaluation lists below)
    total_reward = []
    for i in range(len(predicted_choices)):
        if len(total_reward) > 0:
            last_reward = total_reward[-1]  # the last reward value
        else:
            last_reward = 10  # initial reward is $10

        if ((df_all.loc[i, 'myCard'] > df_all.loc[i, 'yourCard'] and predicted_choices[i] == 1) or
            (df_all.loc[i, 'myCard'] < df_all.loc[i, 'yourCard'] and predicted_choices[i] == 0)):
            total_reward.append(last_reward + 0.5)
        else:
            total_reward.append(last_reward - 0.5)

    # confusion matrix: labels are fixed so the result is always 2x2 and the
    # four-way unpacking works even when only one class occurs
    conf_matrix = confusion_matrix(choices, predicted_choices, labels=[0, 1])
    TN, FP, FN, TP = conf_matrix.ravel()
    # accuracy
    accuracy = (TP + TN) / (TP + TN + FP + FN)
    # precision: of the trials predicted as "up", how many really were "up"
    precision = TP / (TP + FP) if (TP + FP) != 0 else 0
    # recall or sensitivity: true positive rate
    sensitivity_recall = TP / (TP + FN) if (TP + FN) != 0 else 0
    # specificity: true negative rate
    specificity = TN / (TN + FP) if (TN + FP) != 0 else 0
    # f1 score
    f1_score = 2 * (precision * sensitivity_recall) / (precision + sensitivity_recall) if (precision + sensitivity_recall) != 0 else 0

    # Bayesian information criterion, from the best log-likelihood found above
    n_trials = len(df_all)
    k = 2  # number of free parameters: alpha and beta
    BIC = k * np.log(n_trials) - 2 * best_log_likelihood

    # Akaike information criterion (AIC)
    AIC = 2 * k - 2 * best_log_likelihood

    # McFadden r-squared against a constant-probability null model
    p_null = np.mean(choices)  # probability of choosing "1" in the dataset
    if 0 < p_null < 1:
        log_likelihood_null = np.sum(choices * np.log(p_null) + (1 - choices) * np.log(1 - p_null))
        mcFadden_r2 = 1 - (best_log_likelihood / log_likelihood_null)
    else:
        # all choices identical: the null likelihood would need log(0), so
        # the statistic is undefined
        log_likelihood_null = np.nan
        mcFadden_r2 = np.nan

    # r-squared
    r2 = r2_score(choices, predicted_choices)
    print(best_alpha)
    print(best_beta)

    # saving models evaluation variables:
    best_alpha_models.append(best_alpha)
    best_beta_models.append(best_beta)
    BIC_models.append(BIC)
    AIC_models.append(AIC)
    accuracy_models.append(accuracy)
    precision_models.append(precision)
    sensitivity_recall_models.append(sensitivity_recall)
    specificity_models.append(specificity)
    f1_score_models.append(f1_score)
    mcFadden_r2_models.append(mcFadden_r2)
    r2_models.append(r2)





[I 2025-06-18 15:50:52,684] A new study created in memory with name: no-name-2d2bd398-dfa1-45e8-8528-1d87a96650e0
[I 2025-06-18 15:50:52,753] Trial 0 finished with value: 164.96344199782834 and parameters: {'alpha': 0.6835818862853577, 'beta': 0.6137396229724431}. Best is trial 0 with value: 164.96344199782834.
[I 2025-06-18 15:50:52,820] Trial 1 finished with value: 232.87980930606528 and parameters: {'alpha': 0.5595030879894537, 'beta': 0.1323734201391714}. Best is trial 0 with value: 164.96344199782834.


Processing participant 1 of 7


[I 2025-06-18 15:50:52,943] Trial 2 finished with value: 167.19251523154963 and parameters: {'alpha': 0.8320287383136853, 'beta': 0.4863299267620073}. Best is trial 0 with value: 164.96344199782834.
[I 2025-06-18 15:50:52,946] Trial 3 finished with value: 164.35447096332288 and parameters: {'alpha': 0.1842225308395812, 'beta': 0.6713971755614194}. Best is trial 3 with value: 164.35447096332288.
[I 2025-06-18 15:50:53,370] Trial 4 finished with value: 167.80194361923142 and parameters: {'alpha': 0.4913666699873027, 'beta': 0.44927709918158276}. Best is trial 3 with value: 164.35447096332288.
[I 2025-06-18 15:50:53,391] Trial 6 finished with value: 175.5708141146792 and parameters: {'alpha': 0.919927215510076, 'beta': 0.873218466179969}. Best is trial 3 with value: 164.35447096332288.
[I 2025-06-18 15:50:53,449] Trial 10 finished with value: 180.61154759463864 and parameters: {'alpha': 0.059348528971724004, 'beta': 0.9341196743775612}. Best is trial 3 with value: 164.35447096332288.
[I 2

0.17102974855931385
0.5773280634486041
Processing participant 2 of 7


[I 2025-06-18 15:51:20,704] Trial 3 finished with value: 341.03272093015255 and parameters: {'alpha': 0.3763900385499399, 'beta': 0.039573701950008415}. Best is trial 0 with value: 174.68097748526407.
[I 2025-06-18 15:51:20,808] Trial 7 finished with value: 179.93051867418336 and parameters: {'alpha': 0.5515110433472187, 'beta': 0.9097844255902421}. Best is trial 0 with value: 174.68097748526407.
[I 2025-06-18 15:51:20,813] Trial 4 finished with value: 170.58637691668235 and parameters: {'alpha': 0.06086735323143235, 'beta': 0.7574360231994437}. Best is trial 4 with value: 170.58637691668235.
[I 2025-06-18 15:51:20,937] Trial 8 finished with value: 213.2467639668497 and parameters: {'alpha': 0.56018648600928, 'beta': 0.23483644806780604}. Best is trial 4 with value: 170.58637691668235.
[I 2025-06-18 15:51:21,151] Trial 10 finished with value: 332.5839984416768 and parameters: {'alpha': 0.8028035113620159, 'beta': 0.05018337950525685}. Best is trial 4 with value: 170.58637691668235.
[I 

0.03807621964774212
0.6228363266245037
Processing participant 3 of 7


[I 2025-06-18 15:51:48,267] Trial 5 finished with value: 175.93620972540688 and parameters: {'alpha': 0.12030238729775279, 'beta': 0.8894827676695302}. Best is trial 1 with value: 172.8066925628397.
[I 2025-06-18 15:51:48,332] Trial 4 finished with value: 164.57658372461432 and parameters: {'alpha': 0.6077653301523144, 'beta': 0.6490921099374557}. Best is trial 4 with value: 164.57658372461432.
[I 2025-06-18 15:51:48,506] Trial 8 finished with value: 306.99553437492125 and parameters: {'alpha': 0.9593770198870711, 'beta': 0.04551933177294444}. Best is trial 4 with value: 164.57658372461432.
[I 2025-06-18 15:51:48,509] Trial 7 finished with value: 186.7214539437133 and parameters: {'alpha': 0.3215505251709702, 'beta': 0.2756312250642803}. Best is trial 4 with value: 164.57658372461432.
[I 2025-06-18 15:51:48,558] Trial 11 finished with value: 177.66661827251568 and parameters: {'alpha': 0.4298292667864785, 'beta': 0.33585500351789155}. Best is trial 4 with value: 164.57658372461432.
[I 

0.3837153863405248
0.5862789029952634
Processing participant 4 of 7


[I 2025-06-18 15:52:16,663] Trial 2 finished with value: 241.75312399096222 and parameters: {'alpha': 0.3685989937785419, 'beta': 0.2755493112759786}. Best is trial 1 with value: 216.64099218245536.
[I 2025-06-18 15:52:16,873] Trial 6 finished with value: 188.16215335601467 and parameters: {'alpha': 0.43030983710551923, 'beta': 0.6075989587972482}. Best is trial 6 with value: 188.16215335601467.
[I 2025-06-18 15:52:16,876] Trial 4 finished with value: 212.7405661865931 and parameters: {'alpha': 0.3524300873962027, 'beta': 0.39207520154755493}. Best is trial 6 with value: 188.16215335601467.
[I 2025-06-18 15:52:16,957] Trial 8 finished with value: 257.47301279299404 and parameters: {'alpha': 0.6831660058103495, 'beta': 0.2352293342065868}. Best is trial 6 with value: 188.16215335601467.
[I 2025-06-18 15:52:16,968] Trial 7 finished with value: 192.82442508139502 and parameters: {'alpha': 0.4774317883586942, 'beta': 0.5572349144422498}. Best is trial 6 with value: 188.16215335601467.
[I 2

0.12964318183234896
0.7703426190964238
Processing participant 5 of 7


[I 2025-06-18 15:52:44,910] Trial 1 finished with value: 182.1845882083602 and parameters: {'alpha': 0.7485732366343146, 'beta': 0.9273011098661424}. Best is trial 1 with value: 182.1845882083602.
[I 2025-06-18 15:52:44,940] Trial 0 finished with value: 217.93068160943184 and parameters: {'alpha': 0.9069426736896268, 'beta': 0.26821214673589894}. Best is trial 1 with value: 182.1845882083602.
[I 2025-06-18 15:52:45,015] Trial 4 finished with value: 180.17787866592423 and parameters: {'alpha': 0.8152826965021213, 'beta': 0.8899905138851469}. Best is trial 4 with value: 180.17787866592423.
[I 2025-06-18 15:52:45,065] Trial 3 finished with value: 371.6629181204715 and parameters: {'alpha': 0.6444380365675038, 'beta': 0.04334209616192507}. Best is trial 4 with value: 180.17787866592423.
[I 2025-06-18 15:52:45,120] Trial 7 finished with value: 183.19448978950538 and parameters: {'alpha': 0.8522325551784833, 'beta': 0.49929688948824613}. Best is trial 4 with value: 180.17787866592423.
[I 202

0.08452781655506755
0.6512860617412161
Processing participant 6 of 7


[I 2025-06-18 15:53:12,621] Trial 4 finished with value: 176.3750096823814 and parameters: {'alpha': 0.0942716811084626, 'beta': 0.6971596840670039}. Best is trial 4 with value: 176.3750096823814.
[I 2025-06-18 15:53:12,820] Trial 7 finished with value: 359.5463923520106 and parameters: {'alpha': 0.07788956593622931, 'beta': 0.051455901160295206}. Best is trial 4 with value: 176.3750096823814.
[I 2025-06-18 15:53:12,823] Trial 6 finished with value: 176.55644810309335 and parameters: {'alpha': 0.8340748170807462, 'beta': 0.6370943031432316}. Best is trial 4 with value: 176.3750096823814.
[I 2025-06-18 15:53:12,884] Trial 8 finished with value: 189.67602673119438 and parameters: {'alpha': 0.28891266108209956, 'beta': 0.4650107027704921}. Best is trial 4 with value: 176.3750096823814.
[I 2025-06-18 15:53:13,046] Trial 9 finished with value: 392.96849660770437 and parameters: {'alpha': 0.36142277403313533, 'beta': 0.035946354174921966}. Best is trial 4 with value: 176.3750096823814.
[I 20

0.4424385839704874
0.6978964823757422
Processing participant 7 of 7


[I 2025-06-18 15:53:40,504] Trial 3 finished with value: 170.43928614776297 and parameters: {'alpha': 0.2662293059668641, 'beta': 0.6506721097073896}. Best is trial 3 with value: 170.43928614776297.
[I 2025-06-18 15:53:40,738] Trial 6 finished with value: 171.4448267894334 and parameters: {'alpha': 0.6211546420379681, 'beta': 0.7343495770019605}. Best is trial 3 with value: 170.43928614776297.
[I 2025-06-18 15:53:40,749] Trial 4 finished with value: 350.4337045104884 and parameters: {'alpha': 0.543852360887387, 'beta': 0.03883229940277232}. Best is trial 3 with value: 170.43928614776297.
[I 2025-06-18 15:53:40,814] Trial 7 finished with value: 208.6794519362489 and parameters: {'alpha': 0.49579185029643863, 'beta': 0.24450244985001685}. Best is trial 3 with value: 170.43928614776297.
[I 2025-06-18 15:53:40,833] Trial 5 finished with value: 217.87096324129928 and parameters: {'alpha': 0.5740427377880186, 'beta': 0.21206982223939091}. Best is trial 3 with value: 170.43928614776297.
[I 20

0.03658462799833899
0.6291841065498327


# Save the model evaluation values

In [13]:
# One column per evaluation measure, one row per participant; all lists are
# expected to have the same length and order.
evaluation_columns = {
    "participants": participants,
    "best_alpha": best_alpha_models,
    "best_beta": best_beta_models,
    "BIC": BIC_models,
    "AIC": AIC_models,
    "accuracy": accuracy_models,
    "precision": precision_models,
    "sensitivity_recall": sensitivity_recall_models,
    "specificity": specificity_models,
    "f1_score": f1_score_models,
    "mcFadden_r2": mcFadden_r2_models,
    "r2": r2_models,
}
df_models_evaluation = pd.DataFrame(evaluation_columns)

# Write the table into the notebook's output folder, without the row index.
file_path = os.path.join(output_dir, "models_evaluation_greedy.csv")
df_models_evaluation.to_csv(file_path, index=False)