# Data pre-processing for the Division of Linguistic Labor Experiment

In [2]:
import pandas as pd

### Creating DataFrame with performance from training and game rounds for both treatments

The goal here is to process the tables that come from the experiment's raw data, in order to create a dataframe where each row represents a dog presented during a round per each player and each stage of the experiment, with the following columns:

1. treatment: single or dyad. 
* dyad: dyad's name.
* player: player's id.
* expert_in: refers to the player's expert-dogs (terriers or hounds), that is, the kinds of dogs on which the player trained during training rounds.
* novice_in: refers to the player's novice-dogs (terriers or hounds).
* stage: experiment's stage (training or game).
* round: round of the respective stage.
* kind: dog's kind.
* classif: refers to the label the player used to classify the dog.
* accuracy: whether classification was correct (0=incorrect/1=correct).
* query: whether the player asked partner about dog (NaN if no query was made or if player belongs to treatment==single).
* label: label used by player to query partner (NaN if player didn't query or treatment==single).
* answered: whether query was answered by partner (0=No/1=Yes).
* answer: partner's anwer ('Yes' or 'No').
* answer_correct: whether partner's answer was correct or not (0=incorrect/1=correct).


In [3]:
data_single = pd.read_csv('paraK_single.csv')
data_single['treatment'] = 'single'
data_single.head(2)

Unnamed: 0,Dyad,Player,Raza,Stage,Round,Object,Label,Kind,Correct,treatment
0,32519-32519,325192931513079,terrier,5,1,C11.jpg,C,C,1,single
1,32519-32519,325192931513079,terrier,5,1,A1.jpg,C,A,0,single


In [4]:
data_group = pd.read_csv('paraK_group.csv')
data_group['treatment'] = 'Dyad'
data_group.head(2)

Unnamed: 0,Dyad,Player,Raza,Stage,Round,Object,Label,Kind,Correct,treatment
0,45981-38174,459819203408978,hound,1,1,B5.jpg,B,B,1,Dyad
1,45981-38174,459819203408978,hound,1,1,D2.jpg,B,D,0,Dyad


In [6]:
data = pd.concat([data_single, data_group])
dict_training_game = {5: 'Training rounds', 1:'Training rounds', 6:'Game rounds', 2: 'Game rounds'}
data['Stage'] = data['Stage'].map(dict_training_game)
del data['Object']
data['RazaN'] = data['Raza'].apply(lambda x: 'terrier' if x=='hound' else 'hound')
data['indice'] = data.index
data['Perro'] = data['indice'].apply(lambda x: 'Perro' + str(x%5 +1))
del data['indice']
data = data[['treatment', 'Dyad', 'Player', 'Raza', 'RazaN', 'Stage', 'Round', 'Label', 'Kind', 'Correct', 'Perro']]
data.columns = ['treatment', 'dyad', 'player', 'expert_in', 'novice_in', 'stage', 'round', 'kind', 
                'classif', 'accuracy', 'perro']
data.head(2)

Unnamed: 0,treatment,dyad,player,expert_in,novice_in,stage,round,kind,classif,accuracy,perro
0,single,32519-32519,325192931513079,terrier,hound,Training rounds,1,C,C,1,Perro1
1,single,32519-32519,325192931513079,terrier,hound,Training rounds,1,C,A,0,Perro2


In [7]:
data_comunicacion = pd.read_csv('comunicacion-raw.csv')
data_comunicacion['etapa'] = 'Game rounds'
data_comunicacion = data_comunicacion[['Player', 'etapa', 'Round', 'Rotulo', 'Recibido', 'Correctitud', 'Perro']]
data_comunicacion.columns = ['player', 'stage', 'round', 'label', 'answer', 'answer_correct', 'perro']
data_comunicacion.head()


Unnamed: 0,player,stage,round,label,answer,answer_correct,perro
0,38174543864515,Game rounds,2,D,Si,1.0,Perro1
1,38174543864515,Game rounds,2,B,Si,1.0,Perro5
2,459819203408978,Game rounds,2,C,Si,0.0,Perro4
3,459819203408978,Game rounds,3,A,Si,1.0,Perro1
4,459819203408978,Game rounds,3,C,Si,1.0,Perro2


In [8]:
df1 = pd.merge(data, data_comunicacion, how='left', on=['player', 'stage', 'round', 'perro'])
df1['query'] = df1['label'].apply(lambda x: 'Si' if (x=='A') or (x=='B') or (x=='C') or (x=='D') else 'No')
df1['answered'] = df1['answer'].apply(lambda x: 'Si' if (x=='Si') or (x=='No') else 'No')
df1 = df1[[
    'treatment', 'dyad', 'player', 'expert_in', 'novice_in', 'stage', 'round',
    'kind', 'classif', 'accuracy', 'query', 'label', 'answered', 'answer', 'answer_correct'
]]
df1.head()

Unnamed: 0,treatment,dyad,player,expert_in,novice_in,stage,round,kind,classif,accuracy,query,label,answered,answer,answer_correct
0,single,32519-32519,325192931513079,terrier,hound,Training rounds,1,C,C,1,No,,No,,
1,single,32519-32519,325192931513079,terrier,hound,Training rounds,1,C,A,0,No,,No,,
2,single,32519-32519,325192931513079,terrier,hound,Training rounds,1,A,A,1,No,,No,,
3,single,32519-32519,325192931513079,terrier,hound,Training rounds,1,A,A,1,No,,No,,
4,single,32519-32519,325192931513079,terrier,hound,Training rounds,1,A,A,1,No,,No,,


In [32]:
data.to_csv('performance.csv', index=False)

### Creating DataFrame from questionary on label understanding

The goal here is to process the tables that come from the experiment's raw data, in order to create a dataframe where each row represents the score on label understanding per player on each kind of dog, with the following columns:

1. treatment: single or dyad. 
* dyad: dyad's name.
* player: player's id.
* expert_in: refers to the player's expert-dogs (terriers or hounds), that is, the kinds of dogs on which the player trained during training rounds.
* novice_in: refers to the player's novice-dogs (terriers or hounds).
*  


In [33]:
reporte_comprension_individual = pd.read_csv('./calificacion_single.csv')
reporte_comprension_individual['Treatment'] = 'Single'
reporte_comprension_individual.columns = ['jugador', 'experto_en', 'GradingA', 'GradingB', 'GradingC', 'GradingD', 'Treatment']
reporte_comprension_individual.head()

Unnamed: 0,jugador,experto_en,GradingA,GradingB,GradingC,GradingD,Treatment
0,325192931513079,terrier,4,4,4,4,Single
1,571663380480362,terrier,6,5,6,6,Single
2,573037277775110,hound,6,5,6,7,Single
3,540382156604799,terrier,6,4,6,4,Single
4,880772070057086,hound,5,6,5,6,Single


In [34]:
reporte_comprension_parejas = pd.read_csv('./calificacion_group.csv')
reporte_comprension_parejas['Treatment'] = 'Dyad'
reporte_comprension_parejas.columns = ['jugador', 'experto_en', 'GradingA', 'GradingB', 'GradingC', 'GradingD', 'Treatment']
reporte_comprension_parejas.head()

Unnamed: 0,jugador,experto_en,GradingA,GradingB,GradingC,GradingD,Treatment
0,38174543864515,terrier,4,3,7,5,Dyad
1,459819203408978,hound,3,6,4,7,Dyad
2,137619579400459,terrier,7,4,7,4,Dyad
3,835804561913525,hound,4,7,4,7,Dyad
4,42404690668940,terrier,5,3,7,7,Dyad


In [24]:
reporte_comprension = pd.concat([reporte_comprension_parejas, reporte_comprension_individual])
reporte_comprension.head()

Unnamed: 0,jugador,experto_en,GradingA,GradingB,GradingC,GradingD,Treatment
0,38174543864515,terrier,4,3,7,5,Dyad
1,459819203408978,hound,3,6,4,7,Dyad
2,137619579400459,terrier,7,4,7,4,Dyad
3,835804561913525,hound,4,7,4,7,Dyad
4,42404690668940,terrier,5,3,7,7,Dyad


In [25]:
reporte_comprension.to_csv('./rep-comprension.csv', index=False)