In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import seaborn as sns 
from matplotlib.ticker import MaxNLocator
from matplotlib.ticker import FixedLocator
import statsmodels.api as sm

In [2]:
folder_path = 'data'
dataframes = []

# os.listdir() returns names in arbitrary order, so iterate over the sorted
# names: positional look-ups such as dataframes[5] then refer to the same
# file on every run and on every machine.
for file_name in sorted(os.listdir(folder_path)):
    # Keep only Excel workbooks. Names starting with '~$' are the temporary
    # lock files Excel writes while a workbook is open; they end in '.xlsx'
    # too but cannot be parsed by pd.read_excel.
    if not file_name.endswith('.xlsx') or file_name.startswith('~$'):
        continue

    file_path = os.path.join(folder_path, file_name)
    df = pd.read_excel(file_path)

    # Shift every spaceRT down by 1000.
    # NOTE(review): presumably removes a fixed 1000 ms offset in the raw
    # recording -- confirm against the task code.
    df['spaceRT'] = df['spaceRT'] - 1000

    # Outlier threshold for spaceRT: mean + 2 * std over all trials in this
    # file. The value is identical in every row of a file; it is stored as a
    # column only so that it travels with the DataFrame. It is meant for
    # spotting unusually long spaceRTs (the original note suggests these are
    # rest pauses) so they can be excluded from later spaceRT calculations.
    mean_spaceRT = df['spaceRT'].mean()
    std_spaceRT = df['spaceRT'].std()
    df['spaceRT_2sd'] = mean_spaceRT + 2 * std_spaceRT

    dataframes.append(df)

dataframes[5].head(3)

Unnamed: 0,arrowRT,distribution,interTrialInterval,outcome,myCard,yourCard,spaceRT,totalReward,trialIndex,trialType,choice,block,timeoutRepeat,spaceRT_2sd
0,1067,uniform,894,win,3,9,3575,10.5,0,response,arrowdown,1,0,3984.543912
1,237,uniform,925,lose,2,1,903,10.0,1,response,arrowdown,1,0,3984.543912
2,231,uniform,973,win,8,2,1130,10.5,2,response,arrowup,1,0,3984.543912


In [3]:
# Label every trial with the type of block it belongs to.
for df in dataframes:
    block_no = df['block']
    deck = df['distribution']
    is_block_2_or_3 = block_no.isin([2, 3])

    # Start with no label; rows that match none of the rules below stay None.
    df['block_type'] = None

    # Blocks 1 and 4 have a fixed type.
    df.loc[block_no == 1, 'block_type'] = 'uniform'
    df.loc[block_no == 4, 'block_type'] = 'mix'

    # Blocks 2 and 3 take their type from the trial's distribution.
    for label in ('low', 'high'):
        df.loc[is_block_2_or_3 & (deck == label), 'block_type'] = label

dataframes[0].head(3)

Unnamed: 0,arrowRT,distribution,interTrialInterval,outcome,myCard,yourCard,spaceRT,totalReward,trialIndex,trialType,choice,block,timeoutRepeat,spaceRT_2sd,block_type
0,2207,uniform,848,lose,5,7,7655,9.5,0,response,arrowup,1,0,17893.185321,uniform
1,1018,uniform,865,win,3,9,4118,10.0,1,response,arrowdown,1,0,17893.185321,uniform
2,1327,uniform,783,lose,9,1,232,9.5,2,response,arrowdown,1,0,17893.185321,uniform


In [4]:
# 1) Risk tables: one value per card (1-9) for each deck type.
cards = range(1, 10)

risk_uniform = dict(zip(
    cards,
    [0.0, 0.125, 0.25, 0.375, 0.50, 0.375, 0.25, 0.125, 0.0],
))

risk_low = dict(zip(
    cards,
    [0.000, 0.243, 0.447, 0.385, 0.250, 0.146, 0.071, 0.023, 0.000],
))

risk_high = dict(zip(
    cards,
    [0.000, 0.023, 0.071, 0.146, 0.250, 0.385, 0.447, 0.243, 0.000],
))

# 2) Master table keyed by the value found in the 'distribution' column.
#    There is no 'mix' entry: a distribution value without a table gets NaN.
risk_map = {
    'uniform': risk_uniform,
    'low': risk_low,
    'high': risk_high,
}


def lookup_risk(row):
    """Return the risk for one trial row, or NaN if no table/card matches."""
    table = risk_map.get(row['distribution'], {})
    return table.get(row['myCard'], np.nan)


# 3) Add a 'risk' column to every DataFrame in the list.
for df in dataframes:
    df['risk'] = df.apply(lookup_risk, axis=1)


dataframes[1]


Unnamed: 0,arrowRT,distribution,interTrialInterval,outcome,myCard,yourCard,spaceRT,totalReward,trialIndex,trialType,choice,block,timeoutRepeat,spaceRT_2sd,block_type,risk
0,na,uniform,755,na,7,4,10400,na,0,timeout,na,1,0,2103.735439,uniform,0.250
1,na,uniform,989,na,4,1,4150,na,1,timeout,na,1,0,2103.735439,uniform,0.375
2,229,uniform,817,lose,4,9,1402,9.5,2,response,arrowup,1,0,2103.735439,uniform,0.375
3,655,uniform,922,lose,6,7,847,9,3,response,arrowup,1,0,2103.735439,uniform,0.375
4,1510,uniform,965,win,2,8,845,9.5,4,response,arrowdown,1,0,2103.735439,uniform,0.125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
270,1546,low,863,win,7,2,8,61,53,response,arrowup,4,0,2103.735439,mix,0.071
271,518,uniform,960,lose,3,1,531,60.5,1,response,arrowdown,4,0,2103.735439,mix,0.250
272,780,uniform,759,win,2,7,210,61,41,response,arrowdown,4,0,2103.735439,mix,0.125
273,791,uniform,756,win,7,2,808,61.5,5,response,arrowup,4,0,2103.735439,mix,0.250


In [5]:
output_folder = 'data_risk_added'
# exist_ok=True creates the folder if needed and is a no-op when it already
# exists, so the cell can be re-run safely.
os.makedirs(output_folder, exist_ok=True)


def compute_risk(row):
    """Return the risk value for a single trial row.

    Looks up row['distribution'] in risk_map, then row['myCard'] in the
    selected table. Returns NaN when the distribution has no table or the
    card is not in it.
    """
    dist = row['distribution']
    card = row['myCard']
    return risk_map.get(dist, {}).get(card, np.nan)


# Re-read every raw workbook, add the outlier flag and the risk column, and
# write the result under the same file name into output_folder.
for file_name in sorted(os.listdir(folder_path)):
    # Skip non-workbooks and Excel's '~$' lock files (present while a
    # workbook is open in Excel; they cannot be parsed by pd.read_excel).
    if not file_name.endswith('.xlsx') or file_name.startswith('~$'):
        continue

    file_path = os.path.join(folder_path, file_name)
    df = pd.read_excel(file_path)

    # Same shift as applied when the files were first loaded.
    df['spaceRT'] = df['spaceRT'] - 1000

    # Flag spaceRT values inside the fences [Q1 - 1.2*IQR, Q3 + 1.2*IQR]
    # with 1, everything else with 0 (between() includes both bounds).
    # NOTE(review): the multiplier is 1.2, not the conventional 1.5 --
    # confirm this is intentional.
    q1 = df['spaceRT'].quantile(0.25)
    q3 = df['spaceRT'].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.2 * iqr
    upper_bound = q3 + 1.2 * iqr
    df['is_within_IQR'] = df['spaceRT'].between(lower_bound, upper_bound).astype(int)

    df['risk'] = df.apply(compute_risk, axis=1)

    # Save without the pandas index so it is not written as an extra column.
    out_path = os.path.join(output_folder, file_name)
    df.to_excel(out_path, index=False)

print("All files processed and saved in", output_folder)


All files processed and saved in data_risk_added


# Remember to copy the "data_risk_added" folder into the "RL_agent" folder.