# Data pre-processing for the Division of Linguistic Labor Experiment

### Creating DataFrame with players' performances during training and game rounds in both treatments

The goal here is to create the dataframes out of the experiment's raw data. In the first dataframe, each row represents the behavior of a player with respect to one of the five dogs presented during a round of a stage of the experiment, as well as the identifying characteristics of such a dog. 

The variables are the following:

1. treatment: solo or paired.
2. stage: experiment's stage (training or game).
3. dyad: dyad's name (str or NaN if treatment is solo).
4. player: player's id.
5. expert_in: refers to the kind of dog  (terriers or hounds) on which the player trained during training rounds.
6. novice_in: refers to the kind of dog  (terriers or hounds) on which the player did not train during the training rounds.
7. round: round of the respective stage.
8. object: refers to the particular dog, referred to by the image's file name.
9. position: refers to the frame (1,2,...,5) on which this dog's image was shown. 
10. kind: dog's kind.
11. classif: refers to the label the player used to classify the dog.
12. accuracy: whether classification was correct (0=incorrect/1=correct).
13. expert_dog: whether dog is from a breed on which subject received training (true, false or NaN if stage is training round).
14. queried: number of times the player asked partner about dog (NaN if no query was made or if treatment is solo).
15. answered: proportion of times query was answered by partner.
16. answer_correct: proportion of times the answer was correct.
17. Yes/No: proportion of 'Yes' answers by partner.


In [1]:
import pandas as pd
import numpy as np

In [2]:
data_single = pd.read_csv('paraK_single.csv')
data_single['treatment'] = 'solo'
data_single.head(2)

Unnamed: 0,Dyad,Player,Raza,Stage,Round,Object,Label,Kind,Correct,treatment
0,32519-32519,325192931513079,terrier,5,1,C11.jpg,C,C,1,solo
1,32519-32519,325192931513079,terrier,5,1,A1.jpg,C,A,0,solo


In [3]:
data_group = pd.read_csv('paraK_group.csv')
data_group['treatment'] = 'paired'
data_group.head(2)

Unnamed: 0,Dyad,Player,Raza,Stage,Round,Object,Label,Kind,Correct,treatment
0,45981-38174,459819203408978,hound,1,1,B5.jpg,B,B,1,paired
1,45981-38174,459819203408978,hound,1,1,D2.jpg,B,D,0,paired


In [4]:
data = pd.concat([data_single, data_group])
dict_training_game = {5: 'Training rounds', 1:'Training rounds', 6:'Game rounds', 2: 'Game rounds'}
data['Stage'] = data['Stage'].map(dict_training_game)
data['RazaN'] = data['Raza'].apply(lambda x: 'terrier' if x=='hound' else 'hound')
data['indice'] = data.index
data['position'] = data['indice'].apply(lambda x: 'frame' + str(x%5 +1))
del data['indice']
data = data[['treatment', 'Stage', 'Dyad', 'Player', 'Raza', 'RazaN', 'Round', 'Object', 'position', 'Kind', 'Label', 'Correct']]
data.columns = ['treatment', 'stage', 'dyad', 'player', 'expert_in', 'novice_in', 'round', 'object', 'position', 'kind', 'classif', 'accuracy']
data['expert_dog'] = data.apply(lambda x: True if (((x['expert_in']=='hound') and ((x['kind']=='B') or (x['kind']=='D'))) or 
                      ((x['expert_in']=='terrier') and ((x['kind']=='A') or (x['kind']=='C')))) else False, axis=1)
data.head(2)

Unnamed: 0,treatment,stage,dyad,player,expert_in,novice_in,round,object,position,kind,classif,accuracy,expert_dog
0,solo,Training rounds,32519-32519,325192931513079,terrier,hound,1,C11.jpg,frame1,C,C,1,True
1,solo,Training rounds,32519-32519,325192931513079,terrier,hound,1,A1.jpg,frame2,A,C,0,True


In [5]:
data_comunicacion = pd.read_csv('comunicacion-raw.csv')
data_comunicacion['etapa'] = 'Game rounds'
data_comunicacion = data_comunicacion[['Player', 'etapa', 'Round', 'Rotulo', 'Recibido', 'Correctitud', 'Perro']]
data_comunicacion.columns = ['player', 'stage', 'round', 'label', 'answer', 'answer_correct', 'position']
data_comunicacion['position'] = data_comunicacion['position'].apply(lambda x: 'frame' + str(x)[-1])
data_comunicacion['answered'] = [0 if x=='-' else 1 for x in data_comunicacion['answer']]
data_comunicacion['yes/no'] = [1 if x=='Si' else 0 for x in data_comunicacion['answer']]
data_comunicacion.head()


Unnamed: 0,player,stage,round,label,answer,answer_correct,position,answered,yes/no
0,38174543864515,Game rounds,2,D,Si,1.0,frame1,1,1
1,38174543864515,Game rounds,2,B,Si,1.0,frame5,1,1
2,459819203408978,Game rounds,2,C,Si,0.0,frame4,1,1
3,459819203408978,Game rounds,3,A,Si,1.0,frame1,1,1
4,459819203408978,Game rounds,3,C,Si,1.0,frame2,1,1


In [6]:
data_comunicacion = data_comunicacion.groupby(['player', 'round', 'position']).\
    agg({'label':'count', 'answer_correct':'mean', 'answered':'mean', 'yes/no':'mean'}).reset_index()
data_comunicacion['answer_correct'] = data_comunicacion.apply(lambda x: np.nan if x['answered']==0 else x['answer_correct'], axis=1)
data_comunicacion['stage'] = 'Game rounds'
data_comunicacion.columns = ['player', 'round', 'position', 'queried', 'answer_correct', 'answered', 'yes/no', 'stage']
data_comunicacion.head()


Unnamed: 0,player,round,position,queried,answer_correct,answered,yes/no,stage
0,21409433577947,1,frame2,1,,0.0,0.0,Game rounds
1,21409433577947,1,frame3,1,1.0,1.0,0.0,Game rounds
2,21409433577947,2,frame1,3,1.0,1.0,0.0,Game rounds
3,21409433577947,2,frame2,1,1.0,1.0,0.0,Game rounds
4,21409433577947,3,frame2,2,1.0,1.0,0.5,Game rounds


In [7]:
df1 = pd.merge(data, data_comunicacion, how='left', on=['player', 'stage', 'round', 'position'])
df1.head()

Unnamed: 0,treatment,stage,dyad,player,expert_in,novice_in,round,object,position,kind,classif,accuracy,expert_dog,queried,answer_correct,answered,yes/no
0,solo,Training rounds,32519-32519,325192931513079,terrier,hound,1,C11.jpg,frame1,C,C,1,True,,,,
1,solo,Training rounds,32519-32519,325192931513079,terrier,hound,1,A1.jpg,frame2,A,C,0,True,,,,
2,solo,Training rounds,32519-32519,325192931513079,terrier,hound,1,A7.jpg,frame3,A,A,1,True,,,,
3,solo,Training rounds,32519-32519,325192931513079,terrier,hound,1,A6.jpg,frame4,A,A,1,True,,,,
4,solo,Training rounds,32519-32519,325192931513079,terrier,hound,1,A5.jpg,frame5,A,A,1,True,,,,


In [8]:
df1.to_csv('performances.csv', index=False)

---

### Creating DataFrame from questionary on label understanding

The goal here is to process the tables that come from the experiment's raw data, in order to create a dataframe where each row represents the score on label understanding per player on each kind of dog, with the following columns:

1. treatment: single or dyad. 
* dyad: dyad's name.
* player: player's id.
* kind: the dog's kind.
* expertise: whether the player was trained on the dog or not (experts or novices).
* report: player reported score on label understanding.
* accuracy: mean accuracy over game rounds.
* queried: proportion of times the player queried their partner on this kind of dog.
* answered: proportion of times queries on this kind of dog were answered by partner.
* player_responded: proportion of times player responded to queries on this kind of dog.


In [9]:
reporte_comprension_individual = pd.read_csv('./calificacion_single.csv')
reporte_comprension_individual['treatment'] = 'solo'
reporte_comprension_individual.columns = ['player', 'expert_in', 'GradingA', 'GradingB', 'GradingC', 'GradingD', 'treatment']
reporte_comprension_individual.head()

Unnamed: 0,player,expert_in,GradingA,GradingB,GradingC,GradingD,treatment
0,325192931513079,terrier,4,4,4,4,solo
1,571663380480362,terrier,6,5,6,6,solo
2,573037277775110,hound,6,5,6,7,solo
3,540382156604799,terrier,6,4,6,4,solo
4,880772070057086,hound,5,6,5,6,solo


In [10]:
reporte_comprension_parejas = pd.read_csv('./calificacion_group.csv')
reporte_comprension_parejas['treatment'] = 'paired'
reporte_comprension_parejas.columns = ['player', 'expert_in', 'GradingA', 'GradingB', 'GradingC', 'GradingD', 'treatment']
reporte_comprension_parejas.head()

Unnamed: 0,player,expert_in,GradingA,GradingB,GradingC,GradingD,treatment
0,38174543864515,terrier,4,3,7,5,paired
1,459819203408978,hound,3,6,4,7,paired
2,137619579400459,terrier,7,4,7,4,paired
3,835804561913525,hound,4,7,4,7,paired
4,42404690668940,terrier,5,3,7,7,paired


In [11]:
reporte_comprension = pd.concat([reporte_comprension_parejas, reporte_comprension_individual])
reporte_comprension = reporte_comprension[['treatment', 'player', 'expert_in', 'GradingA', 'GradingB', 'GradingC', 'GradingD']]
reporte_comprension.head()

Unnamed: 0,treatment,player,expert_in,GradingA,GradingB,GradingC,GradingD
0,paired,38174543864515,terrier,4,3,7,5
1,paired,459819203408978,hound,3,6,4,7
2,paired,137619579400459,terrier,7,4,7,4
3,paired,835804561913525,hound,4,7,4,7
4,paired,42404690668940,terrier,5,3,7,7


In [12]:
df_1 = reporte_comprension[reporte_comprension['expert_in']=='terrier']
df_1 = pd.melt(df_1, ['player', 'treatment'], ['GradingB', 'GradingD'])
df_1['variable'] = df_1['variable'].apply(lambda x: x[-1])
df_1 = df_1[['treatment', 'player', 'value', 'variable']]
df_1.columns = ['treatment', 'player', 'report', 'kind']
df_2 = reporte_comprension[reporte_comprension['expert_in']=='hound']
df_2 = pd.melt(df_2, ['player', 'treatment'], ['GradingA', 'GradingC'])
df_2['variable'] = df_2['variable'].apply(lambda x: x[-1])
df_2 = df_2[['treatment', 'player', 'value', 'variable']]
df_2.columns = ['treatment', 'player', 'report', 'kind']
df_novatos = pd.concat([df_1, df_2])
df_novatos['expertise'] = 'novices'
df_novatos.head()

Unnamed: 0,treatment,player,report,kind,expertise
0,paired,38174543864515,3,B,novices
1,paired,137619579400459,4,B,novices
2,paired,42404690668940,3,B,novices
3,paired,951034904396380,4,B,novices
4,paired,362421570668147,4,B,novices


In [13]:
df_1 = reporte_comprension[reporte_comprension['expert_in']=='terrier']
df_1 = pd.melt(df_1, ['player', 'treatment'], ['GradingA', 'GradingC'])
df_1['variable'] = df_1['variable'].apply(lambda x: x[-1])
df_1 = df_1[['treatment', 'player', 'value', 'variable']]
df_1.columns = ['treatment', 'player', 'report', 'kind']
df_2 = reporte_comprension[reporte_comprension['expert_in']=='hound']
df_2 = pd.melt(df_2, ['player', 'treatment'], ['GradingB', 'GradingD'])
df_2['variable'] = df_2['variable'].apply(lambda x: x[-1])
df_2 = df_2[['treatment', 'player', 'value', 'variable']]
df_2.columns = ['treatment', 'player', 'report', 'kind']
df_expertos = pd.concat([df_1, df_2])
df_expertos['expertise'] = 'experts'
df_expertos.head()

Unnamed: 0,treatment,player,report,kind,expertise
0,paired,38174543864515,4,A,experts
1,paired,137619579400459,7,A,experts
2,paired,42404690668940,5,A,experts
3,paired,951034904396380,6,A,experts
4,paired,362421570668147,7,A,experts


In [14]:
reporte_comprension = pd.concat([df_expertos, df_novatos])
reporte_comprension.head()

Unnamed: 0,treatment,player,report,kind,expertise
0,paired,38174543864515,4,A,experts
1,paired,137619579400459,7,A,experts
2,paired,42404690668940,5,A,experts
3,paired,951034904396380,6,A,experts
4,paired,362421570668147,7,A,experts


In [15]:
df_comp = df1[df1['stage']=='Game rounds']
df_comp = df_comp[['treatment', 'player', 'kind', 'round', 'accuracy']]
df_comp = df_comp.groupby(['treatment', 'player', 'kind'])['accuracy'].mean().reset_index()
df_comp.head()

Unnamed: 0,treatment,player,kind,accuracy
0,paired,21409433577947,A,0.931034
1,paired,21409433577947,B,0.828571
2,paired,21409433577947,C,0.1
3,paired,21409433577947,D,0.571429
4,paired,32111755046981,A,0.678571


In [16]:
reporte_comprension = pd.merge(df_comp, reporte_comprension, on=['treatment', 'player', 'kind'], how = 'outer')
reporte_comprension = reporte_comprension[['treatment', 'player', 'kind', 'expertise', 'report', 'accuracy']]
reporte_comprension.tail()


Unnamed: 0,treatment,player,kind,expertise,report,accuracy
331,solo,948958320489333,D,novices,3.0,0.59375
332,solo,981987360384333,A,novices,6.0,0.904762
333,solo,981987360384333,B,experts,7.0,1.0
334,solo,981987360384333,C,novices,6.0,0.902439
335,solo,981987360384333,D,experts,7.0,1.0


In [17]:
df1['query'] = df1['queried'].apply(lambda x: 1 if x > 0 else 0)
df_comp = df1.groupby(['player', 'kind'])['query'].mean().reset_index(name='queried')
#df_comp = df_comp[df_comp['queried']>0]
#del df_comp['queried']
#df_comp.columns = ['player', 'kind', 'queried']
df_comp.tail()

Unnamed: 0,player,kind,queried
331,951034904396380,D,0.6
332,981987360384333,A,0.0
333,981987360384333,B,0.0
334,981987360384333,C,0.0
335,981987360384333,D,0.0


In [18]:
reporte_comprension = pd.merge(reporte_comprension, df_comp, on=['player', 'kind'], how='outer')
reporte_comprension.head()


Unnamed: 0,treatment,player,kind,expertise,report,accuracy,queried
0,paired,21409433577947,A,experts,7.0,0.931034,0.022472
1,paired,21409433577947,B,novices,7.0,0.828571,0.628571
2,paired,21409433577947,C,experts,1.0,0.1,0.011111
3,paired,21409433577947,D,novices,7.0,0.571429,0.666667
4,paired,32111755046981,A,novices,5.0,0.678571,0.107143


In [19]:
df_comp = df1.groupby(['player', 'kind'])['answered'].mean().reset_index()
#df_comp = df_comp[df_comp['answered']=='Yes']
#del df_comp['answered']
#df_comp.columns = ['player', 'kind', 'answered']
df_comp.head()

Unnamed: 0,player,kind,answered
0,21409433577947,A,0.75
1,21409433577947,B,0.954545
2,21409433577947,C,1.0
3,21409433577947,D,1.0
4,22670179051182,A,


In [20]:
reporte_comprension = pd.merge(reporte_comprension, df_comp, on=['player', 'kind'], how='outer')
reporte_comprension.head()


Unnamed: 0,treatment,player,kind,expertise,report,accuracy,queried,answered
0,paired,21409433577947,A,experts,7.0,0.931034,0.022472,0.75
1,paired,21409433577947,B,novices,7.0,0.828571,0.628571,0.954545
2,paired,21409433577947,C,experts,1.0,0.1,0.011111,1.0
3,paired,21409433577947,D,novices,7.0,0.571429,0.666667,1.0
4,paired,32111755046981,A,novices,5.0,0.678571,0.107143,0.666667


In [21]:
parejas_dict = {}
for pareja, grp in df1[df1['treatment']=='paired'].groupby('dyad'):
    jugadores = [int(x) for x in grp.player.unique()]
    parejas_dict[jugadores[0]] = jugadores[1]
    parejas_dict[jugadores[1]] = jugadores[0]

reporte_comprension_expertos = pd.DataFrame(reporte_comprension[reporte_comprension['treatment']=='paired'])
reporte_comprension_expertos['partner'] = reporte_comprension_expertos['player'].map(parejas_dict)
#reporte_comprension['partner'] = reporte_comprension['partner'].astype(int)
reporte_comprension_expertos['llave'] = reporte_comprension_expertos.apply(lambda x: (x['partner'], x['kind']), axis=1)
dict_respuesta = dict(zip(reporte_comprension_expertos['llave'], reporte_comprension_expertos['answered']))
del reporte_comprension_expertos['llave']
reporte_comprension_expertos['llave'] = reporte_comprension_expertos.apply(lambda x: (x['player'], x['kind']), axis=1)
reporte_comprension_expertos['player_responded'] = reporte_comprension_expertos['llave'].map(dict_respuesta)
del reporte_comprension_expertos['llave']
del reporte_comprension_expertos['partner']
reporte_comprension_expertos = reporte_comprension_expertos[['player', 'kind', 'player_responded']]
reporte_comprension_expertos.head()


Unnamed: 0,player,kind,player_responded
0,21409433577947,A,0.941176
1,21409433577947,B,
2,21409433577947,C,0.888889
3,21409433577947,D,
4,32111755046981,A,


In [22]:
reporte_comprension = pd.merge(reporte_comprension, reporte_comprension_expertos, on=['player', 'kind'], how='outer')
reporte_comprension.head()


Unnamed: 0,treatment,player,kind,expertise,report,accuracy,queried,answered,player_responded
0,paired,21409433577947,A,experts,7.0,0.931034,0.022472,0.75,0.941176
1,paired,21409433577947,B,novices,7.0,0.828571,0.628571,0.954545,
2,paired,21409433577947,C,experts,1.0,0.1,0.011111,1.0,0.888889
3,paired,21409433577947,D,novices,7.0,0.571429,0.666667,1.0,
4,paired,32111755046981,A,novices,5.0,0.678571,0.107143,0.666667,


In [23]:
reporte_comprension.to_csv('./rep-understanding.csv', index=False)