# Data Analysis of Division of Linguistic Labor Experiment

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from statsmodels.formula.api import ols
from scipy import stats

In [2]:
data = pd.read_csv('performance.csv')
data.head()

Unnamed: 0,pareja,jugador,experto_en,novato_en,etapa,ronda,perro,clasificacion,raza,clasificacion_correcta,tratamiento
0,32519-32519,325192931513079,terrier,hound,Training rounds,1,Perro1,C,C,1,Single
1,32519-32519,325192931513079,terrier,hound,Training rounds,1,Perro2,C,A,0,Single
2,32519-32519,325192931513079,terrier,hound,Training rounds,1,Perro3,A,A,1,Single
3,32519-32519,325192931513079,terrier,hound,Training rounds,1,Perro4,A,A,1,Single
4,32519-32519,325192931513079,terrier,hound,Training rounds,1,Perro5,A,A,1,Single


In [3]:
print(data[data['Treatment']=='Single']['Player'].unique().shape)
print(data[data['Treatment']=='Dyad']['Player'].unique().shape)

KeyError: 'Treatment'

### Data frames

In [None]:
data.head()
data_group = data[(data['Treatment']=='Dyad') & (data['Stage']=='Game rounds')]
data_group = data_group[['Dyad', 'Expert', 'Novice', 'Player', 'Round', 'Perro', 'Kind', 'Label', 'Correct']]
data_group.head()

In [None]:
data_comunicacion = pd.read_csv('comunicacion.csv')
data_comunicacion = data_comunicacion[['Dyad', 'Player', 'Round', 'Perro', 'Kind', 'suposicion', 'Rotulo', 'Recibido', 'Correctitud']]
data_comunicacion.head()

In [None]:
df1 = pd.merge(data_group, data_comunicacion, how='left', on=['Player', 'Round', 'Perro'])
df1 = df1[['Dyad_x', 'Expert', 'Novice', 'Player', 'Round', 'Perro', 'Kind_x', 'Label', 'Correct', 'suposicion', 'Rotulo', 'Recibido', 'Correctitud']]
df1['respondido'] = df1['Recibido'].apply(lambda x: 'Si' if (x=='Si') or (x=='No') else 'No')
df1.columns = ['pareja', 'experto_en', 'novato_en', 'jugador', 'ronda', 'perro', 'raza', 'clasificacion', 'clasificacion_correcta', 'suposicion', 'rotulo_enviado', 'respuesta', 'respuesta_correcta', 'respondido']
df1.head()


In [None]:
dict_razas = {'A':'Cairn\n Terrier', 'C':'Norwich\n Terrier', 'B':'Irish\n Wolfhound', 'D':'Scottish\n Deerhound'}
dict_orden = {'A':1, 'C':2, 'B':3, 'D':4}
df = df1[df1['rotulo_enviado'].isnull()==False]
df = df.groupby('raza')['respondido'].value_counts(normalize=True).reset_index(name='cuenta')
df = df[df['respondido'] == 'Si']
df['indice'] = df['raza']
df['raza'] = df['indice'].map(dict_razas)
df['indice'] = df['indice'].map(dict_orden)
df.sort_values(by='indice', inplace=True)
del df['respondido']
del df['indice']
df.columns = ['raza', 'respondido']
df_respondido = df
df_respondido

In [None]:
dict_razas = {'A':'Cairn\n Terrier', 'C':'Norwich\n Terrier', 'B':'Irish\n Wolfhound', 'D':'Scottish\n Deerhound'}
dict_orden = {'A':1, 'C':2, 'B':3, 'D':4}
df = df1[df1['rotulo_enviado'].isnull()==False]
df = df.groupby('raza')['respuesta_correcta'].value_counts(dropna=True, normalize=True).reset_index(name='cuenta')
df = df[df['respuesta_correcta'] == 1]
df['indice'] = df['raza']
df['raza'] = df['indice'].map(dict_razas)
df['indice'] = df['indice'].map(dict_orden)
del df['respuesta_correcta']
del df['indice']
df.columns = ['raza', 'respuesta_correcta']
df_correcto = df
df_correcto

In [None]:
df_accu = data.copy()
df_accu = df_accu[df_accu['Stage'] == 'Game rounds']
del df_accu['Raza']
del df_accu['Stage']
del df_accu['Perro']
del df_accu['Object']
df_accu.columns = ['pareja', 'jugador', 'ronda', 'clasificado_como', 'raza', 'clasificacion_correcta', 'Treatment', 'experto_en', 'novato_en']
df_accu['tipo_ejemplar'] = df_accu['raza'].apply(lambda x: 'terrier' if (x=='A' or x=='C') else 'hound')
dfN = df_accu[(df_accu['novato_en'] == df_accu['tipo_ejemplar'])]
#dfN = df_accu_novatos.groupby(['Treatment', 'ronda'])['clasificacion_correcta'].mean().reset_index(name='Accuracy')
dfE = df_accu[(df_accu['experto_en'] == df_accu['tipo_ejemplar'])]
#dfE = df_accu_expertos.groupby(['Treatment', 'ronda'])['clasificacion_correcta'].mean().reset_index(name='Accuracy')
dfN.head()

----

In [None]:
reporte_comprension = pd.read_csv('./rep-comprension.csv')
reporte_comprension.head()

In [None]:
df_1 = reporte_comprension[reporte_comprension['experto_en']=='terrier']
df_1 = df_1[['jugador', 'Treatment', 'experto_en', 'GradingB', 'GradingD']]
df_1 = pd.melt(df_1, ['jugador', 'Treatment'], ['GradingB', 'GradingD'])
df_1['variable'] = df_1['variable'].apply(lambda x: x[-1])
df_1 = df_1[['Treatment', 'jugador', 'value', 'variable']]
df_1.columns = ['Treatment', 'jugador', 'Report', 'raza']
df_1.head()

In [None]:
df_1 = reporte_comprension[reporte_comprension['experto_en']=='terrier']
df_1 = df_1[['jugador', 'Treatment', 'experto_en', 'GradingB', 'GradingD']]
df_1 = pd.melt(df_1, ['jugador', 'Treatment'], ['GradingB', 'GradingD'])
df_1['variable'] = df_1['variable'].apply(lambda x: x[-1])
df_1 = df_1[['Treatment', 'jugador', 'value', 'variable']]
df_1.columns = ['Treatment', 'jugador', 'Report', 'raza']

df_2 = reporte_comprension[reporte_comprension['experto_en']=='hound']
df_2 = df_2[['jugador', 'Treatment', 'experto_en', 'GradingA', 'GradingC']]
df_2 = pd.melt(df_2, ['jugador', 'Treatment'], ['GradingA', 'GradingC'])
df_2['variable'] = df_2['variable'].apply(lambda x: x[-1])
df_2 = df_2[['Treatment', 'jugador', 'value', 'variable']]
df_2.columns = ['Treatment', 'jugador', 'Report', 'raza']

df_novatos = pd.concat([df_1, df_2])
df_novatos['Expertise'] = 'Novices'
df_novatos.head()


In [None]:
df_1 = reporte_comprension[reporte_comprension['experto_en']=='terrier']
df_1 = df_1[['jugador', 'Treatment', 'experto_en', 'GradingA', 'GradingC']]
df_1 = pd.melt(df_1, ['jugador', 'Treatment'], ['GradingA', 'GradingC'])
df_1['variable'] = df_1['variable'].apply(lambda x: x[-1])
df_1 = df_1[['Treatment', 'jugador', 'value', 'variable']]
df_1.columns = ['Treatment', 'jugador', 'Report', 'raza']

df_2 = reporte_comprension[reporte_comprension['experto_en']=='hound']
df_2 = df_2[['jugador', 'Treatment', 'experto_en', 'GradingB', 'GradingD']]
df_2 = pd.melt(df_2, ['jugador', 'Treatment'], ['GradingB', 'GradingD'])
df_2['variable'] = df_2['variable'].apply(lambda x: x[-1])
df_2 = df_2[['Treatment', 'jugador', 'value', 'variable']]
df_2.columns = ['Treatment', 'jugador', 'Report', 'raza']

df_expertos = pd.concat([df_1, df_2])
df_expertos['Expertise'] = 'Experts'
df_expertos.head()

In [None]:
reporte_comprension = pd.concat([df_expertos, df_novatos])
reporte_comprension.head()

In [None]:
# Expertos entre tratamientos
x = reporte_comprension[(reporte_comprension['Treatment']=='Single') & (reporte_comprension['Expertise']=='Experts')]['Report']
y = reporte_comprension[(reporte_comprension['Treatment']=='Dyad') & (reporte_comprension['Expertise']=='Experts')]['Report']
print("Diferencia de medias entre expertos en los dos tratamientos:", stats.ttest_ind(x,y).pvalue)

# Novatos entre tratamientos
x = reporte_comprension[(reporte_comprension['Treatment']=='Single') & (reporte_comprension['Expertise']=='Novices')]['Report']
y = reporte_comprension[(reporte_comprension['Treatment']=='Dyad') & (reporte_comprension['Expertise']=='Novices')]['Report']
print("Diferencia de medias entre novatos en los dos tratamientos:", stats.ttest_ind(x,y).pvalue)

# Individual entre experticia
x = reporte_comprension[(reporte_comprension['Treatment']=='Single') & (reporte_comprension['Expertise']=='Experts')]['Report']
y = reporte_comprension[(reporte_comprension['Treatment']=='Single') & (reporte_comprension['Expertise']=='Novices')]['Report']
print("Diferencia de medias entre experticia en individuos:", stats.ttest_ind(x,y).pvalue)

# Parejas entre experticia
x = reporte_comprension[(reporte_comprension['Treatment']=='Dyad') & (reporte_comprension['Expertise']=='Experts')]['Report']
y = reporte_comprension[(reporte_comprension['Treatment']=='Dyad') & (reporte_comprension['Expertise']=='Novices')]['Report']
print("Diferencia de medias entre experticia en parejas:", stats.ttest_ind(x,y).pvalue)


In [None]:
df_comp = data[data['Stage']=='Game rounds']
#df_comp = df_comp[df_comp['Round'] > 15]
df_comp = df_comp[['Player', 'Kind', 'Round', 'Correct', 'Treatment']]
df_comp.columns = ['jugador', 'raza', 'ronda', 'clasificacion_correcta', 'Treatment']
df_comp = df_comp.groupby(['Treatment', 'jugador', 'raza'])['clasificacion_correcta'].mean().reset_index()
reporte_comprension = pd.merge(df_comp, reporte_comprension, on=['Treatment', 'jugador', 'raza'])#, how = 'outer')
reporte_comprension.tail()


In [None]:
df_comp = df1.copy()
#df_comp = df_comp[df_comp['Round'] > 15]
df_comp = df_comp[['jugador', 'raza', 'ronda', 'rotulo_enviado']]
df_comp['envio'] = df_comp['rotulo_enviado'].apply(lambda x: 'Si' if (x=='A') or (x=='B') or (x=='C') or (x=='D') else 'No')
df_comp = df_comp.groupby(['jugador', 'raza'])['envio'].value_counts(normalize=True).reset_index(name='query')
df_comp = df_comp[df_comp['envio']=='Si']
del df_comp['envio']
reporte_comprension_novatos = pd.merge(reporte_comprension[reporte_comprension['Expertise']=='Novices'], df_comp, on=['jugador', 'raza'])
reporte_comprension_novatos.head()


In [None]:
dfA = df1[df1['rotulo_enviado'].isnull()==False]
df = dfA.groupby(['jugador', 'raza'])['respondido'].value_counts(normalize=True).reset_index(name='answered')
df = df[df['respondido']=='Si']
del df['respondido']
reporte_comprension_novatos = pd.merge(reporte_comprension_novatos, df, on=['jugador', 'raza'])
reporte_comprension_novatos.head()


In [None]:
df_novatos_single = df_novatos[df_novatos['Treatment']=='Single']
df_novatos_single = df_novatos_single.groupby('jugador')['Report'].mean().reset_index()
df_novatos_single.head()


In [None]:
dfN_ = dfN[dfN['Treatment']=='Single']
dfN_ = dfN_.groupby('jugador')['clasificacion_correcta'].mean().reset_index()
df_novatos_single = pd.merge(df_novatos_single, dfN_, on='jugador')
df_novatos_single.head()


In [None]:
dict_razas = {'A':'Cairn\n Terrier', 'C':'Norwich\n Terrier', 'B':'Irish\n Wolf-\n hound', 'D':'Scottish\n Deer-\nhound'}
dict_orden = {'A':1, 'C':2, 'B':3, 'D':4}
df_novatos_terriers = df1[df1['novato_en'] == 'terrier']
df = df_novatos_terriers.groupby('raza')['rotulo_enviado'].value_counts(normalize=True, dropna=False).reset_index(name='conteo')
df['preguntar'] = df['rotulo_enviado'].apply(lambda x: 0 if pd.isnull(x) else 1)
df = df[df['preguntar'] == 0]
df['pregunto'] = (1 - df['conteo']) * 100
df['indice'] = df['raza']
df['raza'] = df['indice'].map(dict_razas)
df['indice'] = df['indice'].map(dict_orden)
df.sort_values(by='indice', inplace=True)
df['Novice in'] = 'terriers'
df_preguntas_terriers = df
df_novatos_hounds = df1[df1['novato_en'] == 'hound']
df = df_novatos_hounds.groupby('raza')['rotulo_enviado'].value_counts(normalize=True, dropna=False).reset_index(name='conteo')
df['preguntar'] = df['rotulo_enviado'].apply(lambda x: 0 if pd.isnull(x) else 1)
df = df[df['preguntar'] == 0]
df['pregunto'] = (1 - df['conteo']) * 100
df['indice'] = df['raza']
df['raza'] = df['indice'].map(dict_razas)
df['indice'] = df['indice'].map(dict_orden)
df.sort_values(by='indice', inplace=True)
df['Novice in'] = 'hounds'
df_preguntas_hounds = df
df_preguntas = pd.concat([df_preguntas_terriers, df_preguntas_hounds])
df_preguntas.head()

In [None]:
data_training = data[data['Stage']=='Training rounds']

fig, ax = plt.subplots(1, 3, figsize=(10,3), dpi=600)
sns.lineplot(x='Round', y='Correct', hue='Treatment', data=data_training, ci=95, ax=ax[0])
sns.lineplot(x='ronda', y='clasificacion_correcta', hue='Treatment', data=dfE, ci=95, ax=ax[1])
sns.lineplot(x='ronda', y='clasificacion_correcta', hue='Treatment', data=dfN, ci=95, ax=ax[2])
ax[0].set_ylim(0.4, 1)
ax[0].set_xlabel('Training rounds')
ax[0].set_ylabel('% Classification success')
ax[0].set_ylim(0.4, 1)
ax[1].set_title("Expert-Dogs")
ax[1].set_xlabel("Game rounds")
ax[1].set_ylabel('% Classification success')
ax[1].set_ylim(0.4, 1)
ax[2].set_title("Novice-Dogs")
ax[2].set_xlabel("Game rounds")
ax[2].set_ylabel("")
ax[2].set_ylim(0.4, 1)
ax[2].get_legend().remove()

fig.tight_layout()


In [None]:
fig, ax = plt.subplots(2, 2, figsize=(6, 6), dpi=600)

sns.barplot(x='raza', y='pregunto', hue='Novice in', data=df_preguntas, ax=ax[0, 0])
ax[0, 0].set_xlabel("Dog shown to player")
ax[0, 0].set(ylim=(0, 60))
ax[0, 0].set_ylabel("% of queries")
ax[0, 0].legend(title="Novices in", ncol=2, loc='upper center')#bbox_to_anchor=(0., 1.02, 1., .102), loc='lower left', ncol=2, mode="expand", borderaxespad=0.)

df_respuestas = pd.merge(df_respondido, df_correcto, on='raza')
respondido = df_respuestas['respondido'].mean()
correcto = df_respuestas['respuesta_correcta'].mean()
sns.barplot(x=['answered', 'correct'], y=[100, 100], color = "cyan", ax=ax[0, 1])
sns.barplot(x=['answered', 'correct'], y=[respondido*100, correcto*100], color = "green", ax=ax[0, 1])
ax[0, 1].set_title("Experts' answers")
ax[0, 1].set_ylabel("% of messages")
topbar = plt.Rectangle((0,0),1,1,fc="cyan", edgecolor = 'none')
bottombar = plt.Rectangle((0,0),1,1,fc='green',  edgecolor = 'none')
ax[0, 1].legend([bottombar, topbar], ['Yes', 'No'], loc='lower right', ncol = 1, prop={'size':10})


df_msgs = df1.groupby(['jugador', 'ronda'])['rotulo_enviado'].count().reset_index(name='#msg')
sns.lineplot(x='ronda', y='#msg', data=df_msgs, ax=ax[1, 0], ci=95)
ax[1, 0].set_xlabel("Game rounds")
ax[1, 0].set(ylim=(0, 2))
ax[1, 0].set_ylabel("Av. # of queries")
ax[1, 0].set_title("Number of queries per round")

sns.regplot(x='query', y='clasificacion_correcta', data=reporte_comprension_novatos, ax=ax[1, 1])
ax[1, 1].set_title('Accuracy vs. Queries')
ax[1, 1].set_xlabel('Rate of dogs queried')
ax[1, 1].set_ylabel('Accuracy')
ax[1, 1].set(xlim=(0, 1.05))

fig.tight_layout()


In [None]:
reporte_comprension_novatos[['query', 'clasificacion_correcta']].corr()

In [None]:
parejas_dict = {}
for pareja, grp in df1.groupby('pareja'):
    jugadores = grp.jugador.unique()
    parejas_dict[jugadores[0]] = jugadores[1]
    parejas_dict[jugadores[1]] = jugadores[0]
    
#parejas_dict

In [None]:
dfA = df1[df1['rotulo_enviado'].isnull()==False]
df = dfA.groupby(['jugador', 'raza'])['respondido'].value_counts(normalize=True).reset_index(name='answered')
df = df[df['respondido']=='Si']
del df['respondido']
df['jugador'] = df['jugador'].map(parejas_dict)
df['llave'] = df.apply(lambda x: (x['jugador'], x['raza']), axis=1)
print(df.head())
dict_respuesta = dict(zip(df['llave'], df['answered']))
#dict_respuesta

In [None]:
reporte_comprension_expertos = pd.DataFrame(reporte_comprension[reporte_comprension['Expertise']=='Experts'])
reporte_comprension_expertos['llave'] = reporte_comprension_expertos.apply(lambda x: (x['jugador'], x['raza']), axis=1)
reporte_comprension_expertos['answered'] = reporte_comprension_expertos['llave'].map(dict_respuesta)
del reporte_comprension_expertos['llave']
reporte_comprension_expertos = reporte_comprension_expertos[reporte_comprension_expertos['Treatment']=='Dyad']
reporte_comprension_expertos = reporte_comprension_expertos.dropna()
reporte_comprension_expertos.head()


In [None]:
fig, axes = plt.subplots(1, 3, figsize=(9,3), dpi=600)
sns.boxplot(x='Expertise', y='Report', hue='Treatment', data=reporte_comprension, ax=axes[0])
sns.regplot(y='Report', x='clasificacion_correcta', data=reporte_comprension_expertos, ax=axes[1])
sns.regplot(y='Report', x='answered', data=reporte_comprension_expertos, ax=axes[2])
axes[0].set_title('Distribution of Report')
axes[0].set_ylabel('Report on term understanding')
axes[0].set(ylim=(0.75, 7.25))
axes[1].set_title('Report vs. Accuracy')
axes[1].set_xlabel('Accuracy')
axes[1].set_ylabel('')
axes[1].set(xlim=(0.05, 1.05))
axes[1].set(ylim=(0.75, 7.25))
axes[2].set_title('Report vs. Answers')
axes[2].set_xlabel('Rate of queries answered')
axes[2].set_ylabel('')
axes[2].set(xlim=(0.05, 1.05))
axes[2].set(ylim=(0.75, 7.25))
fig.tight_layout()

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(6,6), dpi=600)
sns.regplot(x='clasificacion_correcta', y='Report', data=df_novatos_single, ax=axes[0,0])
sns.regplot(x='clasificacion_correcta', y='Report', data=reporte_comprension_novatos, ax=axes[0,1])
sns.regplot(x='query', y='Report', data=reporte_comprension_novatos, ax=axes[1,0])
sns.regplot(x='answered', y='Report', data=reporte_comprension_novatos, ax=axes[1,1])
axes[0,0].set_title('Treatment Single\n Report vs. Accuracy')
axes[0,0].set_xlabel('Accuracy')
axes[0,0].set_ylabel('')
axes[0,0].set(xlim=(0.45, 1.05))
axes[0,0].set(ylim=(0.75, 7.25))
axes[0,1].set_title('Treatment Dyad\n Report vs. Accuracy')
axes[0,1].set_xlabel('Accuracy')
axes[0,1].set_ylabel('')
axes[0,1].set(xlim=(0.45, 1.05))
#axes[0,0].set(ylim=(1, 7))

axes[1,0].set_title('Report vs. Queries')
axes[1,0].set_xlabel('Rate of dogs queried')
axes[1,0].set_ylabel('')
axes[1,0].set(xlim=(0, 1.05))
#axes[0,1].set(ylim=(1, 7))
axes[1,1].set_title('Report vs. Answers')
axes[1,1].set_xlabel('Rate of queries answered')
axes[1,1].set_ylabel('')
axes[1,1].set(xlim=(0, 1.05))
#axes[0,2].set(ylim=(1, 7))

fig.tight_layout()

### Regression models

In [None]:
df_novatos_single[['Report', 'clasificacion_correcta']].corr()

In [None]:
model = ols("Report ~ clasificacion_correcta", data=df_novatos_single)
results = model.fit()
results.summary()

In [None]:
corr_mat = reporte_comprension_novatos[['Report', 'clasificacion_correcta', 'query', 'answered']].corr()
corr_mat

In [None]:
sns.heatmap(corr_mat,cmap=sns.diverging_palette(220, 10, as_cmap=True))
plt.title("Correlation Matrix")

In [None]:
model = ols("Report ~ clasificacion_correcta + query + answered", data=reporte_comprension_novatos)
results = model.fit()
results.summary()

In [None]:
model = ols("Report ~ clasificacion_correcta + query * answered", data=reporte_comprension_novatos)
results = model.fit()
results.summary()

In [None]:
model = ols("clasificacion_correcta ~ query + answered", data=reporte_comprension_novatos)
results = model.fit()
results.summary()

In [None]:
corr_mat = df[['Report', 'clasificacion_correcta', 'answered']].corr()
corr_mat

In [None]:
sns.heatmap(corr_mat,cmap=sns.diverging_palette(220, 10, as_cmap=True))
plt.title("Correlation Matrix")

In [None]:
model = ols("Report ~ clasificacion_correcta + answered", data=df)
results = model.fit()
results.summary()

In [None]:
variables = ['Report', 'clasificacion_correcta', 'answered']
for var in variables:
    datos = df[var]
    datos_ajustados, fitted_lambda = stats.boxcox(datos)
    print(f"Box-Cox {var} = {round(fitted_lambda,2)}")
    df[var + '_adj'] = datos_ajustados

df.head()

In [None]:
original_data = df['answered']
  
# transform training data & save lambda value 
fitted_data, fitted_lambda = stats.boxcox(original_data) 
  
# creating axes to draw plots 
fig, ax = plt.subplots(1, 2) 
  
# plotting the original data(non-normal) and  
# fitted data (normal) 
sns.distplot(original_data, hist = False, kde = True, 
            kde_kws = {'shade': True, 'linewidth': 2},  
            label = "Non-Normal", color ="green", ax = ax[0]) 
  
sns.distplot(fitted_data, hist = False, kde = True, 
            kde_kws = {'shade': True, 'linewidth': 2},  
            label = "Normal", color ="green", ax = ax[1])

In [None]:
model = ols("Report_adj ~ clasificacion_correcta_adj + answered_adj", data=df)
results = model.fit()
results.summary()

In [None]:
fig, axes = plt.subplots(1, 3, figsize=(12,4))
sns.distplot(reporte_comprension_novatos['Report'], ax=axes[0])
sns.distplot(reporte_comprension_novatos['clasificacion_correcta'], ax=axes[1])
sns.distplot(reporte_comprension_novatos['%query'], ax=axes[2])

In [None]:
fig, axes = plt.subplots(1, 3, figsize=(9,3), dpi=300)
sns.regplot(y='Report', x='clasificacion_correcta', data=reporte_comprension_novatos, ax=axes[1])
sns.regplot(y='Report', x='%query', data=reporte_comprension_novatos, ax=axes[2])
