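In this notebook, the data are cleaned. First, data files from participants who did not complete the experiment (files with 575 rows or fewer) are deleted. Next, each remaining participant's total regret is computed, and participants whose regret is an outlier (three or more standard deviations from the mean) are removed.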

In [1]:
import pandas as pd
import os
import numpy as np
import scipy.stats
pd.set_option("display.max_rows", None, "display.max_columns", None)

In [129]:
# Remove the data files of participants who did not finish the experiment.
# A file is kept only when it holds more than 575 rows; anything shorter is
# deleted from disk, so this cell is destructive and cannot be undone by
# re-running the notebook.
for i in range(200):
    path = f'All Data/{i}.csv'
    if not os.path.exists(path):
        # No file was recorded under this participant number.
        continue

    n_rows = pd.read_csv(path).shape[0]
    if n_rows > 575:
        print(f'{i}.csv, {n_rows} is okay. Retained!')
        continue

    os.remove(path)
    print(f'{i}.csv, {n_rows} is not okay. Deleted!')

1.csv, 20 is not okay. Deleted!
2.csv, 3 is not okay. Deleted!
3.csv, 77 is not okay. Deleted!
4.csv, 325 is not okay. Deleted!
5.csv, 6 is not okay. Deleted!
6.csv, 1 is not okay. Deleted!
7.csv, 5 is not okay. Deleted!
8.csv, 1 is not okay. Deleted!
9.csv, 5 is not okay. Deleted!
10.csv, 5 is not okay. Deleted!
11.csv, 9 is not okay. Deleted!
12.csv, 31 is not okay. Deleted!
13.csv, 19 is not okay. Deleted!
14.csv, 19 is not okay. Deleted!
15.csv, 6 is not okay. Deleted!
16.csv, 114 is not okay. Deleted!
17.csv, 5 is not okay. Deleted!
18.csv, 14 is not okay. Deleted!
19.csv, 1 is not okay. Deleted!
20.csv, 8 is not okay. Deleted!
21.csv, 81 is not okay. Deleted!
22.csv, 4 is not okay. Deleted!
23.csv, 4 is not okay. Deleted!
24.csv, 4 is not okay. Deleted!
25.csv, 4 is not okay. Deleted!
26.csv, 1 is not okay. Deleted!
27.csv, 5 is not okay. Deleted!
28.csv, 5 is not okay. Deleted!
29.csv, 1 is not okay. Deleted!
30.csv, 5 is not okay. Deleted!
31.csv, 5 is not okay. Deleted!
32.csv

In [2]:
def makeRew(trend, stable, clip, numTrials, s):
    """Generate one seeded four-armed bandit game whose arm means drift over time.

    The arm means start as a random permutation of (-60, -20, 20, 60). On every
    later trial each mean is multiplied by 0.9836, shifted by its trend term and
    perturbed with Gaussian noise; the reward of each arm is then drawn around
    the new mean with standard deviation 4.

    Parameters
    ----------
    trend : int
        When 1, arms that start above zero are shifted by -0.5 per trial and
        arms that start below zero by +0.5. Any other value means no shift.
    stable : int
        When 0, the noise added to the means has standard deviation 16 while
        the loop index lies in 51..100 or 151..200, and 4 otherwise. Any other
        value keeps it at 4 throughout.
    clip : int
        When 1, means and rewards produced inside the loop are clipped to
        [-100, 100]. The first trial's values are never clipped.
    numTrials : int
        Number of trials (rows) to generate.
    s : int
        Seed passed to ``np.random.seed``.

    Returns
    -------
    tuple
        ``(mus, rews)`` as arrays with one row per trial and one column per
        arm, provided every arm has the highest reward on at least one trial;
        otherwise ``(None, None)``.

    Notes
    -----
    The global NumPy RNG is seeded with ``s`` on entry and re-seeded with
    ``np.random.seed(None)`` before returning.
    """
    np.random.seed(s)

    reward_sd = 4
    decay = 0.9836

    # Starting means, in random arm order.
    mu = np.array([-60, -20, 20, 60])
    np.random.shuffle(mu)

    # Per-arm additive drift, decided once from the sign of the starting means.
    kappas = np.zeros(4)
    if trend == 1:
        kappas[mu > 0] = -0.5
        kappas[mu < 0] = 0.5

    # First trial: rewards are drawn directly around the starting means.
    mus = [mu]
    rews = [np.random.normal(mu, 4)]

    # step runs from 0 to numTrials - 2, one iteration per remaining trial.
    for step in range(numTrials - 1):
        volatile = (stable == 0) and (51 <= step <= 100 or 151 <= step <= 200)
        walk_sd = 16 if volatile else 4

        # Decay towards zero, apply the trend, then take a random-walk step.
        mu = np.random.normal(mu * decay + kappas, walk_sd)
        if clip == 1:
            mu = np.clip(mu, -100, 100)
        mus.append(mu)

        rew = np.random.normal(mu, reward_sd)
        if clip == 1:
            rew = np.clip(rew, -100, 100)
        rews.append(rew)

    # Stop later random draws from depending on the game seed.
    np.random.seed(None)

    # Accept the game only if each of the four arms pays best at least once.
    best_arm = np.argmax(rews, 1)
    if all(arm in best_arm for arm in range(4)):
        return np.array(mus), np.array(rews)
    return None, None

def calc_regret(rews, chosen_idx):
    """Return the total regret of a sequence of choices.

    For every row of ``rews`` the regret is the largest reward in that row
    minus the reward of the chosen column; the per-row regrets are summed.

    Parameters
    ----------
    rews : 2-D array
        Rewards, one row per trial and one column per option.
    chosen_idx : indexable
        Chosen column for each row of ``rews``; each entry is passed through
        ``int()`` before it is used as an index.

    Returns
    -------
    Sum of the per-trial regrets (the integer 0 when ``rews`` has no rows).
    """
    best_per_trial = np.max(rews, axis=1)

    total = 0
    for trial in range(rews.shape[0]):
        picked = int(chosen_idx[trial])
        total += best_per_trial[trial] - rews[trial, picked]
    return total

In [3]:
# Regenerate the eight games from their fixed seeds. Variables are named
# m<trend><stable><num> (arm means) and r<trend><stable><num> (rewards); the
# last digit tells apart the two games generated for the same trend/stable
# setting. Clipping is switched off for every game.
numTrials = 200
m000, r000 = makeRew(trend=0, stable=0, clip=0, numTrials=numTrials, s=207)
m001, r001 = makeRew(trend=0, stable=0, clip=0, numTrials=numTrials, s=368)
m010, r010 = makeRew(trend=0, stable=1, clip=0, numTrials=numTrials, s=82)
m011, r011 = makeRew(trend=0, stable=1, clip=0, numTrials=numTrials, s=411)
m100, r100 = makeRew(trend=1, stable=0, clip=0, numTrials=numTrials, s=49)
m101, r101 = makeRew(trend=1, stable=0, clip=0, numTrials=numTrials, s=67)
m110, r110 = makeRew(trend=1, stable=1, clip=0, numTrials=numTrials, s=75)
m111, r111 = makeRew(trend=1, stable=1, clip=0, numTrials=numTrials, s=287)

In [4]:
# Compute the total regret of every participant whose data file still exists
# and collect the results in `final` (one row per participant number, a single
# column labelled 0).

# Response keys in the order used by resp_map: resp_map[k] replaces RESPONSE_KEYS[k].
RESPONSE_KEYS = ('q', 'p', 'm', 'z')
# Responses are read from row 6 onwards, taking every third row.
# NOTE(review): presumably the skipped rows are not choice trials - confirm
# against the experiment script.
FIRST_RESPONSE_ROW = 6
RESPONSE_ROW_STEP = 3

# Reward tables keyed by '<trend><stable><num>'. An explicit lookup replaces
# eval() on a variable name assembled from file contents and fails with a
# plain KeyError when a file names a game that does not exist.
reward_tables = {
    '000': r000, '001': r001, '010': r010, '011': r011,
    '100': r100, '101': r101, '110': r110, '111': r111,
}

regrets = {}
for j in range(200):
    path = f'All Data/{j}.csv'
    if not os.path.exists(path):
        continue
    df = pd.read_csv(path)

    # The game settings are read from the row labelled 1 of each file.
    trend = int(df.loc[1, 'trend'])
    stable = int(df.loc[1, 'stable'])
    num = int(df.loc[1, 'num'])
    # NOTE(review): eval() executes whatever text is stored in this CSV cell.
    # If resp_map is always a plain Python literal, switch to
    # ast.literal_eval; left unchanged because the stored format is not
    # visible from this notebook.
    resp_map = eval(df.loc[1, 'resp_map'])

    responses = df['response'].iloc[FIRST_RESPONSE_ROW::RESPONSE_ROW_STEP]

    # Fail here, with the offending file named, instead of later inside
    # calc_regret when int() meets an unmapped key.
    unknown = responses.notna() & ~responses.isin(RESPONSE_KEYS)
    if unknown.any():
        raise ValueError(
            f'{path}: unexpected response key(s) '
            f'{sorted(responses[unknown].unique())}'
        )

    # Translate keys to arm indices in one pass; missing responses become -1.
    key_to_arm = {key: resp_map[k] for k, key in enumerate(RESPONSE_KEYS)}
    chosen_idx = responses.map(key_to_arm).fillna(-1).astype(int).to_numpy()

    rews = reward_tables[f'{trend}{stable}{num}']
    rews = rews[:chosen_idx.shape[0], :]

    # Trials without a response contribute nothing to the regret.
    answered = chosen_idx != -1
    rews = rews[answered]
    chosen_idx = chosen_idx[answered]

    reg = calc_regret(rews, chosen_idx)
    regrets[j] = reg

# Build the frame once rather than enlarging it row by row.
final = pd.DataFrame({0: pd.Series(regrets, dtype=float)})

print(final)

               0
32   2079.887920
41   1042.495617
51   1563.637235
52   3211.175139
57   1729.738612
63   3225.915294
73   2151.965688
76   1109.097514
81   3224.751451
85   4294.488237
86   3355.275764
90   2365.680261
91   2025.982496
98   1337.457840
99   2688.699061
125  1084.908766
139  4668.684941
140  6015.136316
143  1382.826646
148  2060.768602
149  2727.133466
150  1454.394712
152  2676.698832
156  1588.816799
159  2253.510856
161  3608.234056
163  3595.450897
169   911.770341
170  1302.086605
178  1282.843238


In [7]:
# Remove participants whose total regret is an outlier: a row is kept only
# when the absolute z-score of its regret is below the cutoff.
Z_CUTOFF = 3

x = scipy.stats.zscore(final)

# Reduce the z-scores to an explicit one-entry-per-row boolean mask. Indexing
# a DataFrame with a 2-D boolean array only filters rows incidentally, and a
# boolean DataFrame would instead blank cells to NaN without dropping rows.
keep_row = (np.abs(np.asarray(x)) < Z_CUTOFF).all(axis=1)
new_final = final[keep_row]

print(final.shape)
print(new_final.shape)

# Participant numbers that were dropped.
print(np.setdiff1d(final.index.values, new_final.index.values))

(30, 1)
(29, 1)
[140]


Of the thirty participants who completed the experiment, one (participant 140) was removed as an outlier, based on the regret that was calculated.