In [1]:
import pyrqa
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import scipy

In [2]:
# Load the results table from CSV. The file's unnamed leading column is read
# in as 'Unnamed: 0' (visible in the column listing of the next cell).
data = pd.read_csv('./data/rqa_results-2.csv')

In [3]:
# List the column names available in the loaded table.
data.columns

Index(['Unnamed: 0', 'determinism', 'recurrence_rate', 'entropy',
       'longest_diagonal_line', 'average_diagonal_line', 'laminarity',
       'divergence', 'no_messages', 'diversity', 'diversity_bin', 'time',
       'high_performance', 'female_majority', 'revise_score', 'full_credit',
       'half_credit', 'task', 'task_numeric'],
      dtype='object')

In [4]:
# Remove the unnamed leading column (read in as 'Unnamed: 0').
# Rebinding the result instead of using inplace=True, together with
# errors='ignore', keeps this cell safe to re-run: a second execution is a
# no-op rather than a KeyError on the already-removed column.
data = data.drop(columns='Unnamed: 0', errors='ignore')

In [5]:
# Preview the first 10 rows of the table.
data.head(10)

Unnamed: 0,determinism,recurrence_rate,entropy,longest_diagonal_line,average_diagonal_line,laminarity,divergence,no_messages,diversity,diversity_bin,time,high_performance,female_majority,revise_score,full_credit,half_credit,task,task_numeric
0,0.561449,0.26693,1.083095,8,2.630303,0.725369,0.125,78,3,1,1169,0,1,0.0,0,0,Professor,0
1,0.40748,0.215632,0.768094,8,2.435294,0.681693,0.125,71,2,0,731,0,1,0.0,0,0,Professor,0
2,0.276596,0.202938,0.655482,3,2.363636,0.538462,0.333333,33,3,1,684,0,0,0.0,0,0,Professor,0
3,0.455988,0.208333,0.711437,6,2.323529,0.604082,0.166667,84,4,1,1699,0,0,0.0,0,0,Professor,0
4,0.467005,0.20794,0.755876,4,2.358974,0.663636,0.25,46,1,0,869,0,1,0.0,0,0,Professor,0
5,0.589666,0.290905,1.185261,9,2.771429,0.754513,0.111111,69,3,1,1295,0,1,0.0,0,0,Professor,0
6,0.612751,0.284089,1.352437,11,3.052941,0.811619,0.090909,79,4,1,856,1,0,2.0,1,1,Professor,0
7,0.470994,0.270756,0.992356,6,2.544776,0.615233,0.166667,75,3,1,752,0,1,0.0,0,0,Professor,0
8,0.422727,0.183642,0.685185,7,2.325,0.620798,0.142857,72,2,0,1452,0,1,0.5,0,0,Professor,0
9,0.622807,0.290352,1.139996,8,2.704762,0.816273,0.125,81,2,0,742,1,1,2.0,1,1,Professor,0


In [6]:
# Columns to run the tests on: every column except the text label 'task'
# (its numeric counterpart 'task_numeric' is kept). Selecting by name instead
# of slicing off the last two positions and re-appending one of them keeps the
# list correct even if the column order of the CSV changes.
measures = [column for column in data.columns if column != 'task']

In [7]:
# Show the list of columns that the loops below iterate over.
measures

['determinism',
 'recurrence_rate',
 'entropy',
 'longest_diagonal_line',
 'average_diagonal_line',
 'laminarity',
 'divergence',
 'no_messages',
 'diversity',
 'diversity_bin',
 'time',
 'high_performance',
 'female_majority',
 'revise_score',
 'full_credit',
 'half_credit',
 'task_numeric']

In [8]:
# One-way ANOVA of every column in `measures` across the four task types.
# `feature_anova` maps a grouping variable to its list of p-values, stored in
# the same order as `measures`.
# NOTE(review): the displayed p-value for 'task_numeric' is 0.0, presumably
# because that column is just an encoding of the grouping column -- confirm
# before interpreting it.
feature_anova = {}
task_levels = ('PartyVenue', 'Professor', 'apartment', 'candidate')
results = []
for measure in measures:
    values_by_task = data.groupby(['task'])[measure].apply(list)
    anova = scipy.stats.f_oneway(*(values_by_task[level] for level in task_levels))
    results.append(anova.pvalue)
feature_anova['task'] = results



In [9]:
# One-way ANOVA of every column in `measures` across diversity levels 1-4;
# the p-values are stored in the same order as `measures`.
diversity_levels = (1, 2, 3, 4)
results = []
for measure in measures:
    values_by_level = data.groupby(['diversity'])[measure].apply(list)
    samples = [values_by_level[level] for level in diversity_levels]
    results.append(scipy.stats.f_oneway(*samples).pvalue)
feature_anova['diversity'] = results

In [10]:
# Mean of every tested column within each diversity level.
by_diversity = data.groupby(['diversity'])
by_diversity[measures].mean()

Unnamed: 0_level_0,determinism,recurrence_rate,entropy,longest_diagonal_line,average_diagonal_line,laminarity,divergence,no_messages,diversity,diversity_bin,time,high_performance,female_majority,revise_score,full_credit,half_credit,task_numeric
diversity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,0.511155,0.237073,0.951707,6.285714,2.890521,0.707824,0.215873,39.857143,1.0,0.0,715.142857,0.142857,0.857143,0.214286,0.0,0.142857,1.571429
2,0.494225,0.247301,0.916197,6.404762,2.715159,0.665768,0.218953,49.952381,2.0,0.0,727.333333,0.214286,0.714286,0.39881,0.142857,0.214286,1.428571
3,0.459664,0.247818,0.810624,5.651515,2.432765,0.654339,0.231162,59.69697,3.0,1.0,892.181818,0.318182,0.575758,0.429924,0.060606,0.30303,1.666667
4,0.537963,0.243923,1.177452,10.642857,3.132262,0.699755,0.129659,74.0,4.0,1.0,1090.857143,0.285714,0.5,0.5625,0.214286,0.285714,1.571429


In [11]:
# One-way ANOVA of every column in `measures` between the two
# female_majority groups (0 and 1); p-values are ordered like `measures`.
results = []
for measure in measures:
    values_by_flag = data.groupby(['female_majority'])[measure].apply(list)
    group_zero, group_one = values_by_flag[0], values_by_flag[1]
    results.append(scipy.stats.f_oneway(group_zero, group_one).pvalue)
feature_anova['female_majority'] = results



In [12]:
# Inspect the p-values collected so far; each list follows the order of `measures`.
feature_anova

{'task': [0.4140064409416647,
  0.23655114124622129,
  0.11021516969209363,
  0.4419534485201636,
  0.15034631187219255,
  0.23067887028412323,
  0.42699244288734184,
  0.6488659776833008,
  0.9168457026591816,
  0.6642659650094789,
  0.32306526044125283,
  0.0018506016001086957,
  0.22140309840385092,
  0.001334177355695501,
  0.06484052761707572,
  0.003401818262476812,
  0.0],
 'diversity': [0.25479039809814213,
  0.959151308592697,
  0.030325880144353848,
  0.0007112715289587272,
  0.004498722565711939,
  0.5252828027384496,
  0.1648422016847999,
  0.03364259758806267,
  0.0,
  0.0,
  0.014920851192695345,
  0.5757780013564215,
  0.20169380285677552,
  0.7465481634140867,
  0.1927143544477594,
  0.6634791923968448,
  0.7465134477320103],
 'female_majority': [0.7149185430682055,
  0.24801271768689662,
  0.6688686766594323,
  0.48366578043902,
  0.9856444368987245,
  0.20015128554692901,
  0.7335573389218166,
  0.1812919575087236,
  0.03344747316940811,
  0.0500603692189969,
  0.1333

In [13]:
# One-way ANOVA of every column in `measures` between the two
# high_performance groups (0 and 1); p-values are ordered like `measures`.
results = []
for measure in measures:
    values_by_flag = data.groupby(['high_performance'])[measure].apply(list)
    group_zero, group_one = values_by_flag[0], values_by_flag[1]
    results.append(scipy.stats.f_oneway(group_zero, group_one).pvalue)
feature_anova['high_performance'] = results

In [14]:
# One-way ANOVA of every column in `measures` between the two
# diversity_bin groups (0 and 1); p-values are ordered like `measures`.
results = []
for measure in measures:
    values_by_bin = data.groupby(['diversity_bin'])[measure].apply(list)
    bin_zero, bin_one = values_by_bin[0], values_by_bin[1]
    results.append(scipy.stats.f_oneway(bin_zero, bin_one).pvalue)
feature_anova['diversity_bin'] = results

In [15]:
# Display all collected p-value lists, keyed by grouping variable.
feature_anova

{'task': [0.4140064409416647,
  0.23655114124622129,
  0.11021516969209363,
  0.4419534485201636,
  0.15034631187219255,
  0.23067887028412323,
  0.42699244288734184,
  0.6488659776833008,
  0.9168457026591816,
  0.6642659650094789,
  0.32306526044125283,
  0.0018506016001086957,
  0.22140309840385092,
  0.001334177355695501,
  0.06484052761707572,
  0.003401818262476812,
  0.0],
 'diversity': [0.25479039809814213,
  0.959151308592697,
  0.030325880144353848,
  0.0007112715289587272,
  0.004498722565711939,
  0.5252828027384496,
  0.1648422016847999,
  0.03364259758806267,
  0.0,
  0.0,
  0.014920851192695345,
  0.5757780013564215,
  0.20169380285677552,
  0.7465481634140867,
  0.1927143544477594,
  0.6634791923968448,
  0.7465134477320103],
 'female_majority': [0.7149185430682055,
  0.24801271768689662,
  0.6688686766594323,
  0.48366578043902,
  0.9856444368987245,
  0.20015128554692901,
  0.7335573389218166,
  0.1812919575087236,
  0.03344747316940811,
  0.0500603692189969,
  0.1333

In [16]:
# Assemble the p-values into a table: one row per grouping variable
# (the keys of `feature_anova`), one column per entry of `measures`.
anova_results = pd.DataFrame.from_dict(feature_anova, orient='index', columns=measures)

In [17]:
def _highlight_significant(row):
    """Return one CSS string per cell: yellow background where the p-value is <= 0.05."""
    css = []
    for p_value in row:
        css.append("background: yellow" if p_value <= 0.05 else "")
    return css


# Render the p-value table, applying the highlight row by row.
anova_results.style.apply(_highlight_significant, axis=1)

Unnamed: 0,determinism,recurrence_rate,entropy,longest_diagonal_line,average_diagonal_line,laminarity,divergence,no_messages,diversity,diversity_bin,time,high_performance,female_majority,revise_score,full_credit,half_credit,task_numeric
task,0.414006,0.236551,0.110215,0.441953,0.150346,0.230679,0.426992,0.648866,0.916846,0.664266,0.323065,0.001851,0.221403,0.001334,0.064841,0.003402,0.0
diversity,0.25479,0.959151,0.030326,0.000711,0.004499,0.525283,0.164842,0.033643,0.0,0.0,0.014921,0.575778,0.201694,0.746548,0.192714,0.663479,0.746513
female_majority,0.714919,0.248013,0.668869,0.483666,0.985644,0.200151,0.733557,0.181292,0.033447,0.05006,0.133328,0.422013,0.0,0.384022,0.193504,0.33533,0.670973
high_performance,0.221933,0.007199,0.137403,0.499779,0.730242,0.799658,0.296169,0.005485,0.241816,0.181631,0.00686,0.0,0.422013,0.0,0.0,0.0,0.844415
diversity_bin,0.387213,0.89227,0.553937,0.858495,0.167333,0.685132,0.855008,0.017078,0.0,0.0,0.005784,0.181631,0.05006,0.525582,0.525909,0.233333,0.307131


In [18]:
# Save the p-value table (rows: grouping variables, columns: measures) to CSV.
anova_results.to_csv('./data/anova_res.csv')

In [19]:
# Mean of every tested column for each (female_majority, diversity_bin) combination.
group_keys = ['female_majority', 'diversity_bin']
data.groupby(group_keys)[measures].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,determinism,recurrence_rate,entropy,longest_diagonal_line,average_diagonal_line,laminarity,divergence,no_messages,diversity,diversity_bin,time,high_performance,female_majority,revise_score,full_credit,half_credit,task_numeric
female_majority,diversity_bin,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,0,0.478789,0.251583,0.921756,5.846154,2.589757,0.616782,0.251437,46.384615,1.923077,0.0,679.538462,0.384615,0.0,0.692308,0.307692,0.384615,1.461538
0,1,0.474966,0.254346,0.910616,7.171429,2.640815,0.658283,0.21017,67.628571,3.2,1.0,1009.657143,0.285714,0.0,0.417857,0.085714,0.285714,1.542857
1,0,0.503091,0.243766,0.921094,6.583333,2.794541,0.691635,0.206624,49.277778,1.833333,0.0,742.222222,0.138889,1.0,0.256944,0.055556,0.138889,1.444444
1,1,0.472123,0.241529,0.846977,6.022222,2.488569,0.665401,0.21591,57.977778,3.155556,1.0,862.622222,0.333333,1.0,0.480556,0.088889,0.311111,1.733333


In [20]:
# Mean of a selected subset of columns for each female_majority value.
selected_columns = [
    "determinism",
    "recurrence_rate",
    "entropy",
    "longest_diagonal_line",
    "average_diagonal_line",
    "laminarity",
    "divergence",
    "female_majority",
    "high_performance",
]
data.groupby(['female_majority'])[selected_columns].mean()

Unnamed: 0_level_0,determinism,recurrence_rate,entropy,longest_diagonal_line,average_diagonal_line,laminarity,divergence,female_majority,high_performance
female_majority,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0.476001,0.253598,0.913633,6.8125,2.626987,0.647043,0.221347,0.0,0.3125
1,0.485886,0.242524,0.879918,6.271605,2.624557,0.677061,0.211783,1.0,0.246914
