In [158]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.utils import resample
import ast
%matplotlib inline

### Load data (pre-processed in clean.ipynb) and define useful constants

In [171]:
# Load the pre-processed results and put them in chronological order.
# (Alternative input, currently unused: "results/results_cleaned.csv".)
res = pd.read_csv("results/results_cleaned_no_outliers.csv")
# reset_index() without drop=True renumbers the rows 0..n-1 after the sort
# and keeps the pre-sort row labels in a new 'index' column.
res = res.sort_values('timestamp').reset_index()

# required fields
ids = ['timestamp', 'postId', 'workerId', 'mode']

# interactions: the required fields plus the interaction label, the
# per-mode URL-change columns, and the model / pop_time columns
inter = ids + ['interaction', 'UrlChanges', 'highUrlChanges',
               'lowUrlChanges', 'medUrlChanges', 'model', 'pop_time']

# the three values of the 'mode' column used throughout the notebook
modes = ["List", "Category", "Pairwise"]

### First count adds/removes without considering swaps

In [172]:
# Columns that describe one interaction in each mode.
inter_l = ['timestamp','workerId','mode','interaction','UrlChanges']
inter_c = ['timestamp','workerId','mode','interaction','highUrlChanges', 'lowUrlChanges', 'medUrlChanges']
inter_p = ['timestamp','workerId','mode','interaction','highUrlChanges', 'lowUrlChanges']

# One frame per mode.  Rows that repeat the same worker / interaction /
# URL state are collapsed to their first occurrence.
ints = {}
ints['l'] = res[inter_l][res['mode']=='List'].drop_duplicates(subset=['workerId','interaction','UrlChanges'])
ints['c'] = res[inter_c][res['mode']=='Category'].drop_duplicates(subset=['workerId','interaction','highUrlChanges',
                                                                'lowUrlChanges', 'medUrlChanges'])
ints['p'] = res[inter_p][res['mode']=='Pairwise'].drop_duplicates(subset=['workerId','interaction','highUrlChanges',
                                                                'lowUrlChanges'])
for i in ints:
    # Order each frame by worker, then by time.  A two-key sort gives the
    # same layout as sorting inside a per-worker groupby, without relying on
    # groupby.apply() keeping the grouping column (deprecated in newer pandas).
    # dropna() returns a new frame, so its result must be assigned; rows with
    # no workerId are dropped as well because a groupby would skip them.
    ints[i] = (
        ints[i]
        .dropna(subset=['workerId', 'interaction'])
        .sort_values(['workerId', 'timestamp'])
        .reset_index(drop=True)
    )

# Interaction labels that count as an add / a remove in each mode.
frames = {'List': ints['l'], 'Category': ints['c'], 'Pairwise': ints['p']}
add_labels = {
    'List': ['ADD'],
    'Category': ['LOW ADD', 'HIGH ADD', 'MED ADD'],
    'Pairwise': ['LEFT ADD', 'RIGHT ADD'],
}
remove_labels = {
    'List': ['REMOVE'],
    'Category': ['LOW REMOVE', 'HIGH REMOVE', 'MED REMOVE'],
    'Pairwise': ['LEFT REMOVE', 'RIGHT REMOVE'],
}

# Per-worker counts, keyed by mode.  Each value is a Series indexed by
# workerId.  A worker with no matching rows has no entry (not a zero).
adds = {}
removes = {}
ranks = {}
for mode, frame in frames.items():
    kind = frame['interaction']
    adds[mode] = frame[kind.isin(add_labels[mode])].groupby('workerId')['interaction'].count()
    removes[mode] = frame[kind.isin(remove_labels[mode])].groupby('workerId')['interaction'].count()
    ranks[mode] = frame[kind == 'RANK'].groupby('workerId')['interaction'].count()

In [175]:
# Keep a handle on the add counts computed before swaps are tagged.  This is
# an alias, not a copy: it stays valid because the later recount cell rebinds
# `adds` to a new dict instead of mutating this one.
adds1 = adds

In [176]:
# Write the per-worker add and remove counts to CSV in long format:
# one row per worker, with the count in 'measure' and the mode in 'condition'.
# Columns are written as (measure, condition), the order the existing files use.
# NOTE(review): if (condition, measure) is the order downstream readers
# expect, reorder here explicitly -- confirm before changing.
for path, counts in (("results/adds.csv", adds),
                     ("results/removes.csv", removes)):
    df = pd.concat([
        pd.DataFrame({'measure': counts[mode].values, 'condition': mode},
                     columns=['measure', 'condition'])
        for mode in ('List', 'Category', 'Pairwise')
    ])
    df.to_csv(path, index=False)

In [177]:
# Write the per-worker RANK counts to CSV in long format: one row per worker,
# with the count in 'measure' and the mode in 'condition'.
# Columns are written as (measure, condition), the order the existing file uses.
# NOTE(review): if (condition, measure) is the order downstream readers
# expect, reorder here explicitly -- confirm before changing.
df = pd.concat([
    pd.DataFrame({'measure': ranks[mode].values, 'condition': mode},
                 columns=['measure', 'condition'])
    for mode in ('List', 'Category', 'Pairwise')
])
df.to_csv("results/ranks.csv", index=False)

### Next, tag swaps and count again
- swap (List) - an item is removed from the list and put back in the list
- swap (Category) - an item is removed from any bucket and placed in a different bucket
- swap (Pairwise) - an item is removed from any pair and put in a different pair, or moved from high to low in the same pair

In [178]:
# get all add/remove interactions for CATEGORICAL
inter_c = ['timestamp','workerId','mode','interaction','highUrlChanges', 'lowUrlChanges', 'medUrlChanges']
cat = res[inter_c][res['mode']=='Category'].drop_duplicates(subset=['workerId','interaction','highUrlChanges',
                                                                'lowUrlChanges', 'medUrlChanges'])
# .copy() so the column assignments below write to an independent frame
# rather than to a filtered slice (avoids SettingWithCopyWarning).
cat = cat[cat['interaction'].isin(['LOW ADD', 'HIGH ADD', 'MED ADD','LOW REMOVE', 'HIGH REMOVE', 'MED REMOVE'])].copy()
workers_c = cat['workerId'].unique()
print(len(workers_c), " workers")

# parse ids from url strings: each cell holds the text of a Python list
# literal and becomes an integer ndarray.  The builtin `int` is used because
# the `np.int` alias was removed in NumPy 1.24 (it was the same type).
for s in ['lowUrlChanges','medUrlChanges','highUrlChanges']:
    cat[s] = cat[s].apply(lambda x: np.array(ast.literal_eval(x)).astype(int))
# equal shapes mean no row has a missing value
print(cat.shape)
print(cat.dropna().shape)

45  workers
(3683, 7)
(3683, 7)


In [179]:
# Count swaps in CATEGORICAL mode
#
# For each worker, walk the consecutive add/remove rows in `cat`.  When a
# REMOVE row is immediately followed by a row in which the same item shows up
# in a *different* bucket, the second row is relabelled 'SWAP' in `res` and
# the REMOVE row is deleted from `res`.
#
# Row labels are taken from the worker's own rows, so a REMOVE/ADD pair that
# forms the last two rows of a worker's history is also examined.
#
# NOTE: this cell mutates `res` in place.  Re-running it without re-running
# the cell that builds `cat` will try to drop labels that are already gone.
buckets = ['lowUrlChanges', 'medUrlChanges', 'highUrlChanges']
remove_kinds = ['LOW REMOVE', 'HIGH REMOVE', 'MED REMOVE']
for w in workers_c:
    c = cat[cat['workerId'] == w]
    labels = c.index.values            # row labels, shared with `res`
    actions = c['interaction'].values
    states = [c[b].values for b in buckets]

    # changes[i][j]: ids that differ in bucket j between row i and row i+1
    # (symmetric difference).  The interaction that produced the change is
    # the one logged on row i+1.
    changes = [
        [np.setxor1d(col[i], col[i + 1]) for col in states]
        for i in range(len(labels) - 1)
    ]

    to_swap = []
    to_drop = []
    # i indexes the change produced by row i+1; the following change is
    # produced by row i+2, so i runs while i+2 is still a valid row.
    for i in range(len(changes) - 1):
        # whenever an item is removed
        if actions[i + 1] not in remove_kinds:
            continue
        following = changes[i + 1]
        for j, removed in enumerate(changes[i]):
            if removed.size == 0:
                continue
            # get item that was removed
            v = removed[0]
            # check if it is added on the next interaction in a different bucket
            for k, added in enumerate(following):
                if j != k and added.size != 0 and added[0] == v:
                    to_swap.append(labels[i + 2])
                    to_drop.append(labels[i + 1])
    # update results
    res.loc[to_swap, 'interaction'] = 'SWAP'
    res.drop(to_drop, inplace=True)

In [180]:
# get all add/remove interactions for LIST
inter_l = ['timestamp','workerId','mode','interaction','UrlChanges']
lst = res[inter_l][res['mode']=='List'].drop_duplicates(subset=['workerId','interaction','UrlChanges'])
# .copy() so the column assignment below writes to an independent frame
# rather than to a filtered slice (avoids SettingWithCopyWarning).
lst = lst[lst['interaction'].isin(['ADD', 'REMOVE'])].copy()
workers_l = lst['workerId'].unique()
print(len(workers_l), " workers")

# parse ids from url strings: each cell holds the text of a Python list
# literal and becomes an integer ndarray.  The builtin `int` is used because
# the `np.int` alias was removed in NumPy 1.24 (it was the same type).
lst['UrlChanges'] = lst['UrlChanges'].apply(lambda x: np.array(ast.literal_eval(x)).astype(int))

# equal shapes mean no row has a missing value
print(lst.shape)
print(lst.dropna().shape)

48  workers
(1032, 5)
(1032, 5)


In [181]:
# count swaps in LIST mode
#
# For each worker, walk the consecutive add/remove rows in `lst`.  Two
# patterns are tagged as 'SWAP' in `res`:
#   1. two consecutive rows whose lists are non-empty and of equal length
#      (treated as a reorder) -- the second row is tagged;
#   2. a REMOVE row immediately followed by a row that brings the same item
#      back -- the second row is tagged and the REMOVE row is deleted.
#
# Row labels are taken from the worker's own rows, so a REMOVE/ADD pair that
# forms the last two rows of a worker's history is also examined.
#
# NOTE: this cell mutates `res` in place.  Re-running it without re-running
# the cell that builds `lst` will try to drop labels that are already gone.
for w in workers_l:
    c = lst[lst['workerId'] == w]
    labels = c.index.values            # row labels, shared with `res`
    actions = c['interaction'].values
    states = c['UrlChanges'].values

    to_swap = []
    to_drop = []
    # changes[i]: ids that differ between row i and row i+1 (symmetric
    # difference).  The interaction that produced it is logged on row i+1.
    changes = []
    for i in range(len(labels) - 1):
        cur, nxt = states[i], states[i + 1]
        # check if lengths are the same but order changed
        if cur.size != 0 and cur.size == nxt.size:
            to_swap.append(labels[i + 1])
        changes.append(np.setxor1d(cur, nxt))

    # i indexes the change produced by row i+1; the following change is
    # produced by row i+2, so i runs while i+2 is still a valid row.
    for i in range(len(changes) - 1):
        # whenever an item is removed
        if actions[i + 1] != 'REMOVE':
            continue
        removed = changes[i]
        added = changes[i + 1]
        # check if the removed item is the one changed by the next interaction
        if removed.size != 0 and added.size != 0 and added[0] == removed[0]:
            to_swap.append(labels[i + 2])
            to_drop.append(labels[i + 1])
    # update results
    res.loc[to_swap, 'interaction'] = 'SWAP'
    res.drop(to_drop, inplace=True)

In [182]:
# get all add/remove interactions for PAIRWISE
inter_p = ['timestamp','workerId','mode','interaction','highUrlChanges', 'lowUrlChanges']
pair = res[inter_p][res['mode']=='Pairwise'].drop_duplicates(subset=['workerId','interaction','highUrlChanges',
                                                                'lowUrlChanges'])
# .copy() so the column assignments below write to an independent frame
# rather than to a filtered slice (avoids SettingWithCopyWarning).
pair = pair[pair['interaction'].isin(['LEFT ADD', 'RIGHT ADD','LEFT REMOVE', 'RIGHT REMOVE'])].copy()
workers_p = pair['workerId'].unique()
print(len(workers_p), " workers")

# parse ids from url strings: each cell holds the text of a Python list
# literal and becomes an integer ndarray.  The builtin `int` is used because
# the `np.int` alias was removed in NumPy 1.24 (it was the same type).
for s in ['lowUrlChanges','highUrlChanges']:
    pair[s] = pair[s].apply(lambda x: np.array(ast.literal_eval(x)).astype(int))
# equal shapes mean no row has a missing value
print(pair.shape)
print(pair.dropna().shape)

50  workers
(717, 6)
(717, 6)


In [183]:
# Count swaps in PAIRWISE mode
#
# For each worker, walk the consecutive add/remove rows in `pair`.  When a
# REMOVE row is immediately followed by a row in which the same item shows up
# again (on either side, low or high), the second row is relabelled 'SWAP' in
# `res` and the REMOVE row is deleted from `res`.
#
# Row labels are taken from the worker's own rows, so a REMOVE/ADD pair that
# forms the last two rows of a worker's history is also examined.
#
# NOTE: this cell mutates `res` in place.  Re-running it without re-running
# the cell that builds `pair` will try to drop labels that are already gone.
sides = ['lowUrlChanges', 'highUrlChanges']
remove_kinds = ['LEFT REMOVE', 'RIGHT REMOVE']
for w in workers_p:
    c = pair[pair['workerId'] == w]
    labels = c.index.values            # row labels, shared with `res`
    actions = c['interaction'].values
    states = [c[side].values for side in sides]

    # changes[i][j]: ids that differ on side j between row i and row i+1
    # (symmetric difference).  The interaction that produced the change is
    # the one logged on row i+1.
    changes = [
        [np.setxor1d(col[i], col[i + 1]) for col in states]
        for i in range(len(labels) - 1)
    ]

    to_swap = []
    to_drop = []
    # i indexes the change produced by row i+1; the following change is
    # produced by row i+2, so i runs while i+2 is still a valid row.
    for i in range(len(changes) - 1):
        # whenever an item is removed
        if actions[i + 1] not in remove_kinds:
            continue
        following = changes[i + 1]
        for removed in changes[i]:
            if removed.size == 0:
                continue
            # get item that was removed
            v = removed[0]
            # check if it is added on the next interaction (either side)
            for added in following:
                if added.size != 0 and added[0] == v:
                    to_swap.append(labels[i + 2])
                    to_drop.append(labels[i + 1])
    # update results
    res.loc[to_swap, 'interaction'] = 'SWAP'
    res.drop(to_drop, inplace=True)

In [184]:
# Recount after swap tagging: `res` now carries 'SWAP' labels, and the
# REMOVE rows that belonged to a swap have been dropped from it.
inter_l = ['timestamp','workerId','mode','interaction','UrlChanges']
inter_c = ['timestamp','workerId','mode','interaction','highUrlChanges', 'lowUrlChanges', 'medUrlChanges']
inter_p = ['timestamp','workerId','mode','interaction','highUrlChanges', 'lowUrlChanges']

# One frame per mode.  Rows that repeat the same worker / interaction /
# URL state are collapsed to their first occurrence.
ints = {}
ints['l'] = res[inter_l][res['mode']=='List'].drop_duplicates(subset=['workerId','interaction','UrlChanges'])
ints['c'] = res[inter_c][res['mode']=='Category'].drop_duplicates(subset=['workerId','interaction','highUrlChanges',
                                                                'lowUrlChanges', 'medUrlChanges'])
ints['p'] = res[inter_p][res['mode']=='Pairwise'].drop_duplicates(subset=['workerId','interaction','highUrlChanges',
                                                                'lowUrlChanges'])
for i in ints:
    # Order each frame by worker, then by time.  A two-key sort gives the
    # same layout as sorting inside a per-worker groupby, without relying on
    # groupby.apply() keeping the grouping column (deprecated in newer pandas).
    # dropna() returns a new frame, so its result must be assigned; rows with
    # no workerId are dropped as well because a groupby would skip them.
    ints[i] = (
        ints[i]
        .dropna(subset=['workerId', 'interaction'])
        .sort_values(['workerId', 'timestamp'])
        .reset_index(drop=True)
    )

# Interaction labels that count as an add / a remove in each mode.
frames = {'List': ints['l'], 'Category': ints['c'], 'Pairwise': ints['p']}
add_labels = {
    'List': ['ADD'],
    'Category': ['LOW ADD', 'HIGH ADD', 'MED ADD'],
    'Pairwise': ['LEFT ADD', 'RIGHT ADD'],
}
remove_labels = {
    'List': ['REMOVE'],
    'Category': ['LOW REMOVE', 'HIGH REMOVE', 'MED REMOVE'],
    'Pairwise': ['LEFT REMOVE', 'RIGHT REMOVE'],
}

# Per-worker counts, keyed by mode.  Each value is a Series indexed by
# workerId.  A worker with no matching rows has no entry (not a zero).
# `adds` and `removes` are rebound to new dicts here, so any earlier alias
# of the pre-swap counts is left untouched.
adds = {}
removes = {}
swaps = {}
for mode, frame in frames.items():
    kind = frame['interaction']
    adds[mode] = frame[kind.isin(add_labels[mode])].groupby('workerId')['interaction'].count()
    removes[mode] = frame[kind.isin(remove_labels[mode])].groupby('workerId')['interaction'].count()
    swaps[mode] = frame[kind == 'SWAP'].groupby('workerId')['interaction'].count()

In [187]:
# Side-by-side Category add counts per worker: before swap tagging (ADD)
# and after it (ADD_NOSWAP).  Rows follow the workers present in the
# pre-swap counts; the second column is aligned to them by workerId.
df = adds1['Category'].to_frame('ADD')
df['ADD_NOSWAP'] = adds['Category']
df

Unnamed: 0_level_0,ADD,ADD_NOSWAP
workerId,Unnamed: 1_level_1,Unnamed: 2_level_1
A11BSFO4LMHPXQ,12,12
A126IMAJ4EEE7,26,25
A15SUPIZ05ZFCD,41,41
A17V85U8PXS4LJ,56,50
A196XR61DIW5GU,12,11
A19XQH5DG3UO0,3,3
A1FLEFIVFT809G,85,80
A1HG89IPHXW7LO,4,4
A1JGA15NKUP0BB,3,3
A1KM8AW99FFRFZ,9,7


In [188]:
# Write the per-worker add and remove counts (recomputed after swap tagging)
# to CSV in long format: one row per worker, with the count in 'measure' and
# the mode in 'condition'.
# Columns are written as (measure, condition), the order the existing files use.
# NOTE(review): if (condition, measure) is the order downstream readers
# expect, reorder here explicitly -- confirm before changing.
for path, counts in (("results/adds_noswap.csv", adds),
                     ("results/removes_noswap.csv", removes)):
    df = pd.concat([
        pd.DataFrame({'measure': counts[mode].values, 'condition': mode},
                     columns=['measure', 'condition'])
        for mode in ('List', 'Category', 'Pairwise')
    ])
    df.to_csv(path, index=False)