# Anonymization of user data

Script that takes as input the raw data from AMT and anonymizes the data:
Replaces the worker ID and removes 'assignment_id', 'hit_id', and 'qualification_score'.
This script is for internal use and will not be shared.


## Load packages

In [None]:
import pandas as pd
import numpy as np
from IPython.display import display

# Display settings: show floats with 3 decimal places in Jupyter, pandas and NumPy output.
# - `%precision 3` is an IPython magic, so this cell only runs inside IPython/Jupyter.
#   As noted by the original author, it does not work for lists of numbers.
# - The pandas option formats floats in displayed DataFrames with a thousands
#   separator and 3 decimals ('{:,.3f}').
# - The NumPy option sets the print precision of arrays to 3 digits.
%precision 3
pd.options.display.float_format = '{:,.3f}'.format
np.set_printoptions(precision=3)

## File names

In [None]:
# Input: the original data, including the real worker IDs. Will not be shared.
originalfilename = 'data/users-table-day2.csv'

# Output: the anonymized data, which will be used later.
newfilenameanonymized = 'data/users-table-day2-anonymized.csv'

# Output, for internal use only: stores the correspondence between the
# original worker_ids and their anonymized replacements.
internalencodingfilename = 'data/users-table-day2-original-worker_ids.csv'

## Creating two new files

Load the original data, anonymize it, and save both the anonymized data and the worker-ID encoding.


In [None]:
import csv
from random import shuffle

# --- Load the original (non-anonymized) data and show it
df = pd.read_csv(originalfilename)
print(f'Original data: {originalfilename}')
display(df)

# --- Anonymize the workers with randomized category codes
# (https://pandas.pydata.org/docs/user_guide/categorical.html#working-with-categories)
# The categories are the distinct worker_ids; pandas orders them lexically, so
# their default integer codes would mirror the alphabetical order of the IDs.
# Shuffling the codes breaks that link, so an anonymized number reveals nothing
# about where the original ID sorts.
c = df['worker_id'].astype('category')
keys = list(range(len(c.cat.categories)))
shuffle(keys)                               # unseeded: every run produces a new assignment
d = dict(zip(c.cat.categories, keys))       # original worker_id -> random code

# --- Save the encoding (internal use only) as headerless "worker_id,code" rows.
# csv.writer quotes IDs containing a comma or quote, which plain string
# formatting would silently turn into a malformed row.
with open(internalencodingfilename, 'w', encoding='utf-8') as f:
    csv.writer(f, lineterminator='\n').writerows(d.items())

df['worker_id'] = df['worker_id'].map(d)    # replace worker_ids with their random codes

# --- Blank the AMT identifier columns.
# They are emptied rather than dropped so that the anonymized file keeps the
# original schema. Assigning the whole column (instead of a masked .loc write)
# also avoids setting strings into a column that was parsed as numeric.
# NOTE(review): the notebook introduction also lists 'qualification_score' as
# removed, but it is left untouched here -- confirm which one is intended.
for column in ('assignment_id', 'hit_id'):
    if column not in df.columns:
        raise KeyError(column)              # fail loudly instead of silently adding a column
    df[column] = ''

# --- Store the anonymized data and show it
df.to_csv(newfilenameanonymized, index=False)
print(f'New data: {newfilenameanonymized}')
display(df)

# Evaluate random success rate

In [None]:
# What is the probability that a user who guesses randomly answers at least k
# of n questions correctly? The number of correct answers is binomial.
from scipy.stats import binom


def guess_success_probability(k, n, p):
    """Return P(X >= k) for X ~ Binomial(n, p).

    Args:
        k: minimum number of correct answers.
        n: total number of questions.
        p: probability of guessing a single question correctly.

    Returns:
        The upper-tail probability of getting k or more answers right.

    The survival function ``sf(k - 1)`` equals ``1 - cdf(k - 1)`` but is
    evaluated directly, so very small tail probabilities are not lost to
    cancellation against 1.
    """
    return binom.sf(k - 1, n, p)


n = 32
p = 1/4     # presumably four answer options per question -- confirm

# k = 16 -> 50% correct (16/32)
# k = 24 -> 75% correct (24/32)
for k in (16, 24):
    print(guess_success_probability(k, n, p))

In [None]:
# Chance that, among n participants, at least two received the same treatment.
# Uses the birthday-problem style approximation 1 - exp(-n^2 / (2 * N)).
# NOTE(review): N = 2520 * 2520 is read off the denominator and presumably is
# the number of possible treatments -- confirm.
n = 50
collision_exponent = -n * n / (2 * 2520 * 2520)
p = 1 - np.exp(collision_exponent)
# NOTE(review): the factor 5000 is not explained in this notebook -- confirm
# what the scaled value is meant to represent.
print(p * 5000)