# Anonymization of user data

Script that takes as input the raw data from AMT and anonymizes it:
it replaces 'worker_id' with a randomized integer code and removes 'assignment_id', 'hit_id', and 'qualification_score'.
This script is for internal use and will not be shared.


## Load packages

In [17]:
import pandas as pd
import numpy as np
from IPython.display import display

# Set Jupyter and Pandas to show 3 decimal places, does not work for lists of numbers
%precision 3
pd.options.display.float_format = '{:,.3f}'.format
np.set_printoptions(precision=3)

## File names

In [18]:
# Original data: will not be shared.
originalfilename = 'data/users-table-day2.csv'
# Anonymized data: will be used later.
newfilenameanonymized = 'data/users-table-day2-anonymized.csv'
# For internal use: stores the transformation from anonymized to original worker_ids.
internalencodingfilename = 'data/users-table-day2-original-worker_ids.csv'

## Creating two new files

Load the original data, anonymize it, and save both the anonymized data and the worker-id encoding.


In [19]:
from random import shuffle

# --- Load the raw data (still contains the original worker/assignment/HIT identifiers)
df = pd.read_csv(originalfilename)
print(f'Original data: {originalfilename}')
display(df)

# Rows without a worker_id cannot be anonymized: they are not part of the
# categories, would map to NaN and silently turn the id column into floats.
# Stop before any output file is written.
if df['worker_id'].isna().any():
    raise ValueError(f"{originalfilename} contains rows without a worker_id; cannot anonymize them")

# --- Anonymize the workers with randomized categories (https://pandas.pydata.org/docs/user_guide/categorical.html#working-with-categories)
worker_ids = df['worker_id'].astype('category').cat.categories   # the distinct worker_ids
codes = list(range(len(worker_ids)))
shuffle(codes)      # randomize the assignment so that a code is NOT simply the alphabetical rank of the worker_id
worker_id_to_code = dict(zip(worker_ids, codes))

# Save the encoding for internal use, one "<original worker_id>,<code>" line per worker.
with open(internalencodingfilename, 'w') as f:
    f.writelines("%s,%s\n" % (worker_id, code) for worker_id, code in worker_id_to_code.items())

# Replace the worker_ids with their randomized codes and remove the identifying columns.
df['worker_id'] = df['worker_id'].map(worker_id_to_code)
df = df.drop(columns=['assignment_id', 'hit_id', 'qualification_score'])

# NOTE(review): the rows keep their original order, and the 'Unnamed: 0' column read
# from the raw file is written out again. In the displayed raw data both follow the
# alphabetical order of the original worker_ids, which would undo the effect of the
# shuffle above -- confirm whether the anonymized file should be re-ordered/re-indexed.

# --- store anonymized data and print
df.to_csv(newfilenameanonymized, index=False)
print(f'New data: {newfilenameanonymized}')
display(df)

Original data: data/users-table-day2.csv


Unnamed: 0,worker_id,assignment_id,hit_id,qualification_score,current_section,current_page,sequence_num,pattern_order,start_datetime,end_datetime,...,q30_time,q31,q31_start,q31_end,q31_time,q32,q32_start,q32_end,q32_time,feedback
0,A10HW8JXM17XLD,32SVAV9L3HA6N0DHMWHWFNBTG0OA3M,3KVQ0UJWPZM3M85UBIWFNPA9NMR5WA,,RESULTS,1,1.000,"[1, 1, 2, 1, 1, 2, 3, 4, 2, 4, 4, 2, 4, 3, 3, ...",2023-07-06 15:33:53.819738,2023-07-06 15:56:48.16091,...,13226.000,2.000,2023-07-06 15:54:44.186955,2023-07-06 15:54:57.575035,13388.000,4.000,2023-07-06 15:55:00.530759,2023-07-06 15:55:13.292598,12761.000,I found the tutorial to be very helpful in und...
1,A117C4QPUJ39TH,317HQ483I9TSXMPPG3TO5ZXIYPJNII,3KVQ0UJWPZM3M85UBIWFNPA9NMR5WA,,RESULTS,1,1.000,"[3, 1, 3, 3, 2, 2, 4, 4, 2, 1, 1, 2, 1, 3, 4, ...",2023-07-06 15:35:09.753802,2023-07-06 15:47:48.277039,...,19847.000,4.000,2023-07-06 15:47:19.25664,2023-07-06 15:47:24.331627,5074.000,1.000,2023-07-06 15:47:26.069748,2023-07-06 15:47:33.66475,7595.000,good
2,A11MDNSWQJLCM6,3VE8AYVF8OYNKO0V9TYKK5UVTD2F8R,3KVQ0UJWPZM3M85UBIWFNPA9NMR5WA,,QUESTIONS,29,1.000,"[1, 3, 4, 2, 2, 4, 1, 1, 3, 2, 3, 1, 2, 3, 4, ...",2023-07-07 18:11:08.79005,,...,,,,,,,,,,
3,A11V9IHTXCR3CX,336KAV9KYSTIY0F1Y2H7ZZO8DO02Y8,3KVQ0UJWPZM3M85UBIWFNPA9NMR5WA,,RESULTS,1,1.000,"[1, 3, 3, 2, 3, 2, 2, 4, 2, 4, 4, 3, 4, 1, 1, ...",2023-07-07 18:11:52.077358,2023-07-07 18:37:30.385374,...,16843.000,3.000,2023-07-07 18:36:52.45584,2023-07-07 18:36:57.673103,5217.000,1.000,2023-07-07 18:37:00.669486,2023-07-07 18:37:12.637864,11968.000,Good
4,A11W7R2O4RQSSS,3UXUOQ9OKGYOOJE4O8KMS4N9L2AA7R,3KVQ0UJWPZM3M85UBIWFNPA9NMR5WA,,INSTRUCTIONS,1,,,2023-07-06 15:33:56.35275,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166,AX3KFUB3F8VOU,3A4TN5196MJ9ANHELV2PPC2RZ8RCHX,3KVQ0UJWPZM3M85UBIWFNPA9NMR5WA,,RESULTS,1,0.000,"[4, 4, 3, 3, 4, 4, 3, 2, 1, 1, 2, 2, 1, 1, 2, ...",2023-07-06 16:05:50.918429,2023-07-06 16:18:44.773885,...,5546.000,2.000,2023-07-06 16:18:14.652581,2023-07-06 16:18:25.846863,11194.000,4.000,2023-07-06 16:18:27.91728,2023-07-06 16:18:32.772318,4855.000,none
167,AXAO7UJYYEFCO,3DR23U6WE7FECUSNHGP9TJ5IVEZET2,3KVQ0UJWPZM3M85UBIWFNPA9NMR5WA,,INSTRUCTIONS,1,,,2023-07-06 15:33:53.496439,,...,,,,,,,,,,
168,AXOE3WA9YT5MF,3FUI0JHJPZZ63FEWDH2MSFXNA1533E,3KVQ0UJWPZM3M85UBIWFNPA9NMR5WA,,RESULTS,1,0.000,"[3, 3, 3, 4, 2, 2, 4, 3, 1, 4, 4, 1, 2, 1, 1, ...",2023-07-07 19:01:38.084371,2023-07-07 19:19:47.324967,...,7472.000,3.000,2023-07-07 19:19:10.726193,2023-07-07 19:19:32.963664,22237.000,1.000,2023-07-07 19:19:34.721309,2023-07-07 19:19:39.613308,4891.000,good
169,AZ69TBTDH7AZS,3DL65MZB8FGXQDQCVUUJ495E7SZCEH,3KVQ0UJWPZM3M85UBIWFNPA9NMR5WA,,RESULTS,1,1.000,"[4, 3, 1, 4, 1, 4, 2, 2, 4, 3, 3, 2, 2, 1, 3, ...",2023-07-06 15:34:02.551439,2023-07-06 16:19:00.534461,...,12782.000,4.000,2023-07-06 16:18:21.318188,2023-07-06 16:18:31.224703,9906.000,2.000,2023-07-06 16:18:31.963803,2023-07-06 16:18:39.588003,7624.000,


New data: data/users-table-day2-anonymized.csv


Unnamed: 0,worker_id,current_section,current_page,sequence_num,pattern_order,start_datetime,end_datetime,tutorial_time,total_time_on_questions_and_answers,total_question_time,...,q30_time,q31,q31_start,q31_end,q31_time,q32,q32_start,q32_end,q32_time,feedback
0,84,RESULTS,1,1.000,"[1, 1, 2, 1, 1, 2, 3, 4, 2, 4, 4, 2, 4, 3, 3, ...",2023-07-06 15:33:53.819738,2023-07-06 15:56:48.16091,17.000,951.000,852.000,...,13226.000,2.000,2023-07-06 15:54:44.186955,2023-07-06 15:54:57.575035,13388.000,4.000,2023-07-06 15:55:00.530759,2023-07-06 15:55:13.292598,12761.000,I found the tutorial to be very helpful in und...
1,69,RESULTS,1,1.000,"[3, 1, 3, 3, 2, 2, 4, 4, 2, 1, 1, 2, 1, 3, 4, ...",2023-07-06 15:35:09.753802,2023-07-06 15:47:48.277039,6.000,594.000,442.000,...,19847.000,4.000,2023-07-06 15:47:19.25664,2023-07-06 15:47:24.331627,5074.000,1.000,2023-07-06 15:47:26.069748,2023-07-06 15:47:33.66475,7595.000,good
2,122,QUESTIONS,29,1.000,"[1, 3, 4, 2, 2, 4, 1, 1, 3, 2, 3, 1, 2, 3, 4, ...",2023-07-07 18:11:08.79005,,2.000,,,...,,,,,,,,,,
3,71,RESULTS,1,1.000,"[1, 3, 3, 2, 3, 2, 2, 4, 2, 4, 4, 3, 4, 1, 1, ...",2023-07-07 18:11:52.077358,2023-07-07 18:37:30.385374,42.000,1019.000,825.000,...,16843.000,3.000,2023-07-07 18:36:52.45584,2023-07-07 18:36:57.673103,5217.000,1.000,2023-07-07 18:37:00.669486,2023-07-07 18:37:12.637864,11968.000,Good
4,104,INSTRUCTIONS,1,,,2023-07-06 15:33:56.35275,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166,109,RESULTS,1,0.000,"[4, 4, 3, 3, 4, 4, 3, 2, 1, 1, 2, 2, 1, 1, 2, ...",2023-07-06 16:05:50.918429,2023-07-06 16:18:44.773885,9.000,605.000,483.000,...,5546.000,2.000,2023-07-06 16:18:14.652581,2023-07-06 16:18:25.846863,11194.000,4.000,2023-07-06 16:18:27.91728,2023-07-06 16:18:32.772318,4855.000,none
167,27,INSTRUCTIONS,1,,,2023-07-06 15:33:53.496439,,,,,...,,,,,,,,,,
168,64,RESULTS,1,0.000,"[3, 3, 3, 4, 2, 2, 4, 3, 1, 4, 4, 1, 2, 1, 1, ...",2023-07-07 19:01:38.084371,2023-07-07 19:19:47.324967,77.000,562.000,496.000,...,7472.000,3.000,2023-07-07 19:19:10.726193,2023-07-07 19:19:32.963664,22237.000,1.000,2023-07-07 19:19:34.721309,2023-07-07 19:19:39.613308,4891.000,good
169,86,RESULTS,1,1.000,"[4, 3, 1, 4, 1, 4, 2, 2, 4, 3, 3, 2, 2, 1, 3, ...",2023-07-06 15:34:02.551439,2023-07-06 16:19:00.534461,32.000,915.000,857.000,...,12782.000,4.000,2023-07-06 16:18:21.318188,2023-07-06 16:18:31.224703,9906.000,2.000,2023-07-06 16:18:31.963803,2023-07-06 16:18:39.588003,7624.000,


# Evaluate random success rate

In [20]:
# What is the probability that a user who answers every question by random guessing
# gets at least k of the n questions right? Upper tail of the binomial distribution.
from scipy.stats import binom


def prob_at_least(k, n, p):
    """Return P(X >= k) for X ~ Binomial(n, p).

    Parameters
    ----------
    k : int
        Minimum number of successes.
    n : int
        Number of independent trials.
    p : float
        Success probability of a single trial.

    Returns
    -------
    float
        Probability of observing k or more successes.
    """
    # sf(k-1) = P(X > k-1) = P(X >= k); unlike 1 - cdf(k-1) it does not lose
    # precision to cancellation when the tail probability is tiny.
    return binom.sf(k - 1, n, p)


n = 32      # number of questions
p = 1/4     # success probability of a single random guess
for k in (16, 24):      # 16/32 = 50% correct, 24/32 = 75% correct
    print(prob_at_least(k, n, p))

0.0020029562073948792
4.178747903260671e-09


In [27]:
# What is the probability that among n participants, there are two who have the same treatment?
# Birthday-problem approximation: p ~ 1 - exp(-n^2 / (2*N)) for N equally likely treatments.
n = 50                          # number of participants
num_treatments = 2520 * 2520    # N as implied by the formula; presumably the number of possible treatments -- TODO confirm
# -expm1(-x) equals 1 - exp(-x) but keeps full precision for the small exponent used here.
p = -np.expm1(-n * n / (2 * num_treatments))
# NOTE(review): this prints p scaled by 5000, not the probability p itself;
# the meaning of the factor 5000 is not documented in this notebook -- confirm.
print(p*5000)

0.9840931156501753
