## Group participants into adaptive, maladaptive and others using kmeans


In [21]:
from sklearn.cluster import KMeans
import pandas as pd

from vars import clicked_pid

## Cluster them according to the bonus they received

In [30]:
# Read the participant table (data/human/strategy_discovery/participants.csv)
# and keep only the participants who clicked anything at all, i.e. those whose
# pid appears in clicked_pid.
participants_csv = '../../data/human/strategy_discovery/participants.csv'
df = pd.read_csv(participants_csv)
has_clicked = df.pid.isin(clicked_pid)
df = df[has_clicked]

# single-column feature matrix (the bonus) used for the clustering
X = df[['bonus']].to_numpy()

In [74]:
# Cluster the participants into two groups by bonus.
# n_init is set explicitly (10 is the default named in the FutureWarning this
# cell used to emit) and random_state is fixed so that re-running the cell
# reproduces the same clustering.
kmeans = KMeans(n_clusters=2, n_init=10, max_iter=100000, random_state=0).fit(X)

# attach each participant's cluster label; assign() returns a new frame, so
# nothing is written into the filtered slice created in the previous cell
df = df.assign(group=kmeans.labels_)

# k-means label ids carry no meaning of their own, so the high-bonus cluster is
# picked by its mean bonus instead of by a hard-coded label id
high_bonus_group = df.groupby('group').bonus.mean().idxmax()

# show the complete pid list of the high-bonus group without truncation
df.loc[df.group == high_bonus_group, 'pid'].tolist()



  super()._check_params_vs_input(X, default_n_init=10)


[2,
 28,
 32,
 35,
 36,
 38,
 39,
 48,
 59,
 63,
 75,
 82,
 89,
 92,
 95,
 105,
 115,
 125,
 151,
 154,
 166,
 172,
 173,
 175,
 195,
 196,
 203,
 205,
 215,
 234,
 239,
 259,
 261,
 271,
 281,
 289,
 294,
 312,
 318,
 320,
 327,
 344,
 352,
 353,
 354,
 360,
 378]

In [72]:
# show the mean bonus of each group
df.groupby('group')['bonus'].mean()

group
0    0.065133
1    3.293191
Name: bonus, dtype: float64

## Cluster them according to their score

In [29]:
# Group participants into adaptive, mod, maladaptive according to their score.
# load data from data/human/strategy_discovery/mouselab-mdp.csv
df = pd.read_csv('../../data/human/strategy_discovery/mouselab-mdp.csv')

# filter for participants who clicked anything at all using the clicked_pid list
df = df[df.pid.isin(clicked_pid)]

## optional restriction to later trials (disabled); as written the bounds keep
## trial_index 61..120
#df = df[(df.trial_index > 60) & (df.trial_index <= 120)]

# one row per pid holding that participant's average score
df = df.groupby('pid').score.mean().reset_index()

# cluster the participants on their average score
X = df[['score']].values

# n_init is set explicitly (10 is the default named in the FutureWarning this
# cell used to emit) and random_state is fixed so that the label ids indexed by
# the following cells are the same on every run
kmeans = KMeans(n_clusters=3, n_init=10, max_iter=100000, random_state=0).fit(X)

# store the cluster label of each participant
df['group'] = kmeans.labels_

# save as csv
# df.to_csv('score_grouped.csv', index=False)

  super()._check_params_vs_input(X, default_n_init=10)


In [30]:
# show the mean score of each group
df.groupby('group')['score'].mean()

group
0   -20.454380
1     5.502627
2    -8.819320
Name: score, dtype: float64

In [31]:
# show all the pids that were assigned to group 2
pids_by_group = df.groupby('group')['pid'].apply(lambda pids: pids.tolist())
pids_by_group[2]

[2,
 6,
 11,
 17,
 28,
 29,
 32,
 35,
 36,
 38,
 39,
 48,
 49,
 53,
 55,
 59,
 63,
 70,
 74,
 75,
 79,
 82,
 84,
 86,
 89,
 92,
 95,
 103,
 105,
 109,
 111,
 112,
 115,
 118,
 125,
 147,
 151,
 154,
 165,
 166,
 171,
 172,
 173,
 174,
 175,
 181,
 186,
 193,
 195,
 196,
 200,
 202,
 203,
 205,
 214,
 215,
 224,
 234,
 239,
 241,
 249,
 251,
 256,
 259,
 260,
 261,
 265,
 271,
 279,
 281,
 282,
 289,
 294,
 300,
 312,
 316,
 318,
 320,
 323,
 327,
 335,
 339,
 340,
 344,
 352,
 353,
 354,
 357,
 359,
 360,
 369,
 378]

In [33]:
# number of participants that were assigned to group 2
group_2_pids = df.groupby('group')['pid'].apply(lambda pids: pids.tolist())[2]
len(group_2_pids)

179

## How many people actually found the optimal strategy? 

In [4]:
# reload the trial data and keep only participants listed in clicked_pid
# (those who clicked anything at all)
df = pd.read_csv('../../data/human/strategy_discovery/mouselab-mdp.csv')
df = df[df.pid.isin(clicked_pid)]

# trials on which the score was 13, 14 or 15
df_optimal = df[df.score.isin([13, 14, 15])]

# participants with at least one such trial: how many, and who
optimal_pids = df_optimal.pid.unique()
print(len(optimal_pids))
print(optimal_pids)

172
[  2   8  13  14  20  22  26  28  30  32  33  34  35  36  37  38  39  41
  42  48  49  53  55  56  57  58  59  61  62  63  66  69  70  72  74  75
  78  82  83  84  85  86  87  89  90  92  95 104 105 109 111 112 115 117
 118 119 122 125 128 130 136 137 140 141 147 151 152 153 154 155 162 165
 166 168 169 171 172 173 174 175 176 181 182 183 184 186 192 193 194 195
 196 200 202 203 205 209 210 212 213 215 216 220 223 224 225 234 239 240
 246 249 251 252 254 256 259 260 261 265 267 269 271 272 274 276 277 278
 279 281 282 285 286 289 290 291 292 293 294 303 304 311 312 314 315 316
 317 318 320 321 323 324 325 327 328 336 339 340 342 343 344 346 352 353
 354 357 358 359 360 369 371 373 374 378]


In [15]:
# participants whose score was 13, 14 or 15 on at least two trials
optimal_trials = df[df.score.isin([13, 14, 15])]

# after the count, the `score` column holds the number of such trials per pid
optimal_counts = optimal_trials.groupby('pid').score.count().reset_index()
df_optimal = optimal_counts[optimal_counts.score >= 2]

repeat_optimal_pids = df_optimal.pid.unique()
print(len(repeat_optimal_pids))
print(repeat_optimal_pids)

150
[  2   8  13  14  20  26  28  30  32  33  34  35  36  37  38  39  41  48
  49  53  55  56  57  59  61  62  63  70  72  74  75  78  82  83  84  85
  86  87  89  90  92  95 104 105 109 111 112 115 117 118 122 125 130 140
 141 147 151 152 154 162 165 166 168 169 171 172 173 175 176 181 182 183
 184 186 193 194 195 196 200 202 203 205 209 210 212 213 215 216 220 223
 224 225 234 239 240 246 249 251 252 254 256 259 260 261 265 267 269 271
 274 276 277 279 281 282 285 289 290 291 292 294 303 304 312 316 317 318
 320 321 323 325 327 328 336 339 340 342 343 344 352 353 354 357 358 359
 360 369 371 373 374 378]


In [16]:
# mb_pid is used below, so it is imported here together with the other lists;
# the original import line omitted it, which raises NameError in a fresh kernel.
# Assumes vars defines mb_pid next to the other *_pid lists -- TODO confirm.
from vars import mb_pid, mf_pid, hybrid_pid, habitual_pid


def _report_overlap(label, model_pids, optimal_pids):
    """Print and return the pids shared by optimal_pids and model_pids.

    Prints `label` with the size of the overlap, then the overlap as a
    fraction of optimal_pids (nan when optimal_pids is empty, instead of
    raising ZeroDivisionError).

    label        -- text printed in front of the overlap count
    model_pids   -- iterable of pids belonging to one model group
    optimal_pids -- set of pids taken from df_optimal
    """
    overlap = optimal_pids.intersection(model_pids)
    print(label, len(overlap))
    print(len(overlap) / len(optimal_pids) if optimal_pids else float('nan'))
    return overlap


# pids in df_optimal (participants who found the optimal strategy)
optimal_pids = set(df_optimal.pid.unique())

# overlap of df_optimal with each model group
overlap_mb = _report_overlap("MB", mb_pid, optimal_pids)
overlap_mf = _report_overlap("MF", mf_pid, optimal_pids)
overlap_hybrid = _report_overlap("hybrid", hybrid_pid, optimal_pids)
overlap_habitual = _report_overlap("Habitual", habitual_pid, optimal_pids)


MB 20
0.13333333333333333
MF 34
0.22666666666666666
hybrid 42
0.28
Habitual 54
0.36


In [17]:
# Look at the trial on which participants first achieved the optimal score
# (13, 14 or 15) and print the average of that trial index per model group.


def _first_optimal_trial(trials, pids):
    """Return one row per pid with the smallest trial_index scoring 13, 14 or 15.

    Only rows of `trials` whose pid is in `pids` are considered; pids without
    such a trial do not appear in the result.
    """
    hits = trials[trials.pid.isin(pids) & trials.score.isin([13, 14, 15])]
    return hits.groupby('pid').trial_index.min().reset_index()


# average first-optimal trial for mb_pid
df_mb = _first_optimal_trial(df, mb_pid)
print(df_mb.trial_index.mean())

# average first-optimal trial for mf_pid
df_mf = _first_optimal_trial(df, mf_pid)
print(df_mf.trial_index.mean())

# average first-optimal trial for hybrid_pid
df_hybrid = _first_optimal_trial(df, hybrid_pid)
print(df_hybrid.trial_index.mean())

# average first-optimal trial for habitual_pid
df_habitual = _first_optimal_trial(df, habitual_pid)
print(df_habitual.trial_index.mean())


53.80769230769231
29.54054054054054
31.20408163265306
28.45


## Divide the habitual participants into very adaptive, adaptive and maladaptive by using kmeans on the score


In [35]:
import matplotlib.pyplot as plt
# Hard-coded pid list used below to select the habitual participants.
# NOTE(review): this rebinds the `habitual_pid` name that other cells import
# from `vars`; confirm that the two lists are meant to be the same.
habitual_pid = [1, 10, 11, 14, 20, 22, 25, 26, 27, 29, 33, 36, 37, 38, 39, 40, 46, 50, 51, 52, 55, 56, 59, 65, 70, 72,
                89, 90, 95, 98, 101, 111, 115, 118, 119, 125, 129, 134, 135, 140, 142, 148, 151, 152, 154, 162, 170,
                180, 186, 192, 193, 196, 202, 204, 205, 209, 210, 214, 215, 217, 234, 235, 237, 239, 240, 241, 249, 253,
                254, 256, 257, 265, 268, 271, 275, 276, 277, 278, 282, 289, 300, 304, 308, 311, 312, 313, 315, 321, 322,
                323, 329, 330, 331, 332, 336, 339, 343, 348, 354, 358, 363, 364, 370]

# Second hard-coded pid list; it is not referenced anywhere in the visible part
# of this notebook. Presumably the model-free and hybrid participants combined
# -- TODO confirm.
mfhybrid_pid = [2, 8, 24, 28, 43, 48, 49, 54, 62, 68, 73, 75, 77, 80, 85, 91, 93, 96, 99, 102, 107, 109, 110, 113, 116, 117,
          120, 122, 123, 124, 126, 131, 137, 145, 147, 149, 153, 156, 159, 166, 169, 171, 172, 178, 181, 183, 185, 187,
          190, 199, 200, 207, 212, 213, 220, 221, 226, 229, 233, 242, 244, 246, 247, 252, 261, 263, 266, 274, 279, 286,
          287, 294, 295, 296, 306, 309, 319, 333, 337, 340, 342, 352, 353, 365, 367, 369, 372, 376, 378, 3, 4, 6, 7, 9, 16, 17, 18, 19, 23, 30, 32, 34, 35, 41, 45, 53, 57, 58, 63, 67, 71, 76, 78, 82, 83, 86, 92,
              106, 128, 133, 138, 139, 141, 143, 146, 155, 161, 164, 165, 167, 173, 174, 175, 177, 184, 189, 194, 195,
              201, 203, 206, 211, 216, 218, 219, 223, 228, 231, 232, 236, 238, 250, 255, 259, 260, 262, 267, 280, 281,
              291, 292, 293, 299, 305, 310, 316, 317, 318, 320, 324, 327, 328, 341, 344, 346, 347, 349, 350, 355, 356,
              357, 359, 360, 361, 362, 371, 373, 374, 375, 377]

# For the habitual participants, use kmeans to group them according to their
# average score.
df = pd.read_csv('../../data/human/strategy_discovery/mouselab-mdp.csv')
df = df[df.pid.isin(habitual_pid)]

# one row per pid holding that participant's average score
df = df.groupby('pid').score.mean().reset_index()

# n_init is set explicitly (10 is the default named in the FutureWarning this
# cell used to emit) and random_state is fixed so that the label ids used by
# the following cells are the same on every run
X = df[['score']].values
kmeans = KMeans(n_clusters=3, n_init=10, max_iter=100000, random_state=0).fit(X)

# store the cluster label of each participant
df['group'] = kmeans.labels_

# show the mean score of each group (the bare expressions that used to sit
# here produced no output because they were not the last line of the cell)
print(df.groupby('group').score.mean())


group
0   -14.605891
1     4.343798
2   -50.625000
Name: score, dtype: float64


  super()._check_params_vs_input(X, default_n_init=10)


In [36]:
# show len of each group
print(len(df[df.group == 0]))
print(len(df[df.group == 1]))
print(len(df[df.group == 2]))

58
43
2


In [None]:
# k-means label ids are arbitrary, so binding names to fixed ids can attach the
# wrong name to a cluster. Rank the three groups by mean score instead:
# highest -> adaptive, middle -> mod adaptive, lowest -> maladaptive.
groups_by_score = df.groupby('group').score.mean().sort_values(ascending=False).index
best_group, middle_group, worst_group = groups_by_score

# despite the df_ prefix these are plain lists of pids
df_adaptive = df[df.group == best_group].pid.tolist()
df_modadaptive = df[df.group == middle_group].pid.tolist()
df_maladaptive = df[df.group == worst_group].pid.tolist()

In [None]:

####
# Plot the mean score per trial for the adaptive and mod-adaptive pid lists.
df = pd.read_csv('../../data/human/strategy_discovery/mouselab-mdp.csv')


def _mean_score_by_trial(trials, pids):
    """Return the mean score per trial_index over the rows whose pid is in pids."""
    return trials[trials.pid.isin(pids)].groupby('trial_index').score.mean().reset_index()


df_adaptive_ = _mean_score_by_trial(df, df_adaptive)
df_modadaptive_ = _mean_score_by_trial(df, df_modadaptive)

# raw (unaggregated) trials of the maladaptive pids; their curve is not drawn
df_maladaptive_ = df[df.pid.isin(df_maladaptive)]

curves = (
    (df_adaptive_, f'Adaptive, n={len(df_adaptive)}', 'blue'),
    (df_modadaptive_, f'Mod adaptive, n={len(df_modadaptive)}', 'green'),
)
for curve, legend_label, line_color in curves:
    plt.plot(curve['trial_index'], curve['score'], label=legend_label, color=line_color)

# df_maladaptive_ = df_maladaptive_.groupby('trial_index').score.mean().reset_index()
# plt.plot(df_maladaptive_['trial_index'], df_maladaptive_['score'], label=f'Maladaptive, n={len(df_maladaptive)}', color='red')

plt.ylim(-80, 20)
plt.legend()
plt.show()
plt.close()


## Check how many participants who clicked everything in the first trial are best explained by the habitual model

In [9]:
from vars import examined_all_pid, habitual_pid

# pids that appear in both habitual_pid and examined_all_pid
habitual_set = set(habitual_pid)
overlap = habitual_set.intersection(examined_all_pid)
print(overlap)

{1, 129, 257, 135, 142, 20, 148, 22, 151, 276, 282, 50, 313, 331, 332, 210, 343, 88, 358, 234, 363, 111, 370}


In [5]:
# pids that are in habitual_pid but NOT in examined_all_pid
# (despite its name, `not_habitual` therefore holds habitual pids)
habitual_set = set(habitual_pid)
not_habitual = habitual_set.difference(examined_all_pid)
print(not_habitual)

{256, 265, 10, 11, 12, 268, 14, 15, 271, 273, 275, 277, 278, 25, 26, 27, 29, 285, 288, 33, 289, 290, 36, 37, 38, 39, 40, 42, 298, 300, 46, 47, 304, 51, 52, 308, 55, 56, 311, 312, 59, 61, 315, 65, 321, 322, 323, 70, 326, 72, 329, 330, 336, 339, 89, 90, 348, 95, 351, 98, 354, 101, 103, 364, 115, 118, 119, 125, 130, 134, 140, 144, 152, 154, 162, 170, 180, 182, 186, 192, 193, 196, 202, 204, 205, 209, 214, 215, 217, 224, 235, 237, 239, 240, 241, 249, 253, 254}


In [1]:
# hard-coded list of pids in the `clicked` group
clicked = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
               29, 30, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
               56, 57, 58, 59, 61, 62, 63, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83,
               84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 95, 96, 98, 99, 101, 102, 103, 104, 105, 106, 107, 109, 110, 111,
               112, 113, 115, 116, 117, 118, 119, 120, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 133, 134, 135,
               136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 159,
               160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 180, 181,
               182, 183, 184, 185, 186, 187, 189, 190, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204,
               205, 206, 207, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 223, 224, 225, 226, 228,
               229, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 244, 246, 247, 248, 249, 250, 251, 252,
               253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274,
               275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295,
               296, 298, 299, 300, 301, 302, 303, 304, 305, 306, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318,
               319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 335, 336, 337, 339, 340, 341,
               342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362,
               363, 364, 365, 367, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379]
# hard-coded list of pids in the `examined_all_pid` group
examined_all_pid = [1, 2, 20, 22, 24, 45, 50, 68, 80, 88, 111, 129, 135, 142, 148, 151, 164, 166, 172, 189, 210, 213,
                    228, 234, 257, 264, 276, 282, 313, 324, 331, 332, 338, 340, 343, 344, 358, 363, 370, 372]

# pids that are in `clicked` but not in `examined_all_pid`
clicked_set = set(clicked)
non_overlap = clicked_set.difference(examined_all_pid)
print(non_overlap)

{3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 23, 25, 26, 27, 28, 29, 30, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, 49, 51, 52, 53, 54, 55, 56, 57, 58, 59, 61, 62, 63, 65, 66, 67, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 81, 82, 83, 84, 85, 86, 87, 89, 90, 91, 92, 93, 95, 96, 98, 99, 101, 102, 103, 104, 105, 106, 107, 109, 110, 112, 113, 115, 116, 117, 118, 119, 120, 122, 123, 124, 125, 126, 127, 128, 130, 131, 133, 134, 136, 137, 138, 139, 140, 141, 143, 144, 145, 146, 147, 149, 152, 153, 154, 155, 156, 159, 160, 161, 162, 163, 165, 167, 168, 169, 170, 171, 173, 174, 175, 176, 177, 178, 180, 181, 182, 183, 184, 185, 186, 187, 190, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 209, 211, 212, 214, 215, 216, 217, 218, 219, 220, 221, 223, 224, 225, 226, 229, 231, 232, 233, 235, 236, 237, 238, 239, 240, 241, 242, 244, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 258, 259, 260, 261, 262, 263, 265, 266, 