In [1]:
import numpy as np
import pandas as pd
import os
from collections import Counter
from scipy.stats import chi2_contingency
from scipy.stats import mannwhitneyu
from cliffs_delta import cliffs_delta

## Read data

In [2]:
def string_to_list(col):
    if '.' in col:
        return [float(item) for item in col.strip('[]').split(',')]
    else:
        return [int(item) for item in col.strip('[]').split(',')]

In [3]:
basedir = os.path.join("..","data")
developer_data = pd.read_csv(os.path.join(basedir, 'contributor_features.csv'), index_col = 0)
developer_sequence = pd.read_csv(os.path.join(basedir,'contributor_activity_sequence.csv'), index_col=0)
for col_name in developer_sequence:
    if 'sequence' in col_name:
        developer_sequence[col_name] = developer_sequence[col_name].apply(string_to_list)


In [4]:
filtered_developer_period_df = pd.read_csv(os.path.join(basedir,'contributor_period_activity.csv'), index_col = 0)

## workload composition 

### Distribution of major worload composition patterns across profiles

In [5]:
def major_pattern(lst):
    counter = Counter(lst)
    max_count = max(counter.values())
    modes = [key for key, count in counter.items() if count == max_count]
    return min(modes)

In [6]:
developer_sequence['major pattern'] = developer_sequence['workload_sequence'].apply(major_pattern)
developer_sequence.groupby(['profile','major pattern'])['major pattern'].count()/developer_sequence['profile'].value_counts()

profile  major pattern
ca       1                0.175910
         2                0.156846
         3                0.441075
         4                0.204506
         5                0.021664
cw       1                0.187551
         2                0.175266
         3                0.418509
         4                0.190827
         5                0.027846
pa       1                0.299421
         2                0.187345
         3                0.354839
         4                0.156328
         5                0.002068
pw       1                0.277695
         2                0.176580
         3                0.392193
         4                0.148327
         5                0.005204
dtype: float64

### Chi-square test and Cramer's V index

In [7]:
pa = developer_sequence.loc[developer_sequence['profile']=='pa']
pw = developer_sequence.loc[developer_sequence['profile']=='pw']
ca = developer_sequence.loc[developer_sequence['profile']=='ca']
cw = developer_sequence.loc[developer_sequence['profile']=='cw']
core = developer_sequence.loc[developer_sequence['profile'].isin(['ca','cw'])]
peri = developer_sequence.loc[developer_sequence['profile'].isin(['pa','pw'])]

In [8]:
def cramers_v(contingency_table, chi2):
    contingency_table = np.array(contingency_table)
    n = np.sum(contingency_table)
    v = np.sqrt(chi2 / (n * (min(np.array(contingency_table).shape) - 1)))
    return v

In [10]:
# compare core and periphral contributors
x1 = core['major pattern'].value_counts().sort_index().to_list()
x2 = peri['major pattern'].value_counts().sort_index().to_list()
contingency_table = [x1, x2]
chi2, p, dof, expected = chi2_contingency(contingency_table)
v = cramers_v(contingency_table, chi2)
print("Chi-Square Statistic:", chi2)
print("P-value:", p)
print("Degrees of Freedom:", dof)
print("Cramer's V:", v)

Chi-Square Statistic: 175.82633356744728
P-value: 5.871686473518987e-37
Degrees of Freedom: 4
Cramer's V: 0.15328656330144372


In [11]:
# compare core-afterhour and core-workhour contributors
x1 = ca['major pattern'].value_counts().sort_index().to_list()
x2 = cw['major pattern'].value_counts().sort_index().to_list()
contingency_table = [x1, x2]
chi2, p, dof, expected = chi2_contingency(contingency_table)
v = cramers_v(contingency_table, chi2)
print("Chi-Square Statistic:", chi2)
print("P-value:", p)
print("Degrees of Freedom:", dof)
print("Cramer's V:", v)

Chi-Square Statistic: 3.8307128777585735
P-value: 0.42940087789801107
Degrees of Freedom: 4
Cramer's V: 0.04016132141754813


In [12]:
# compare periphral-afterhour and periphral-workhour contributors
x1 = pa['major pattern'].value_counts().sort_index()
x2 = pw['major pattern'].value_counts().sort_index()
contingency_table = [x1, x2]
chi2, p, dof, expected = chi2_contingency(contingency_table)
v = cramers_v(contingency_table, chi2)
print("Chi-Square Statistic:", chi2)
print("P-value:", p)
print("Degrees of Freedom:", dof)
print("Cramer's V:", v)

Chi-Square Statistic: 11.547677458325337
P-value: 0.021051766936966296
Degrees of Freedom: 4
Cramer's V: 0.04754686403153862


## work preference

In [13]:
work_preference_features = ['binned_entropy', 'c3(1)', 'c3(2)', 'c3(3)', 'number_cwt_peaks','longest_strike_above_mean', 
                            'longest_strike_below_mean','diverse', 'balance']
pa = filtered_developer_period_df.loc[filtered_developer_period_df['profile']=='pa']
pw = filtered_developer_period_df.loc[filtered_developer_period_df['profile']=='pw']
ca = filtered_developer_period_df.loc[filtered_developer_period_df['profile']=='ca']
cw = filtered_developer_period_df.loc[filtered_developer_period_df['profile']=='cw']
core = filtered_developer_period_df.loc[filtered_developer_period_df['profile'].isin(['ca','cw'])]
peri = filtered_developer_period_df.loc[filtered_developer_period_df['profile'].isin(['pa','pw'])]

In [14]:
# compare core and peripheral contributors
for fea in work_preference_features:
    x1, x2 = core[fea],peri[fea]
    stat, pval = mannwhitneyu(x1, x2)
    d, res = cliffs_delta(x1, x2)
    if pval < 0.05: # significantly different
        print(f'{fea} {res} ({round(d,3)})' )
    else:  # not significantly different
        print(f'{fea} not significant' )

binned_entropy large (0.497)
c3(1) medium (0.343)
c3(2) small (0.292)
c3(3) small (0.292)
number_cwt_peaks large (0.491)
longest_strike_above_mean medium (0.377)
longest_strike_below_mean medium (-0.444)
diverse small (0.284)
balance medium (-0.339)


In [15]:
# compare core-afterhour and core-workhour contributors
for fea in work_preference_features:
    x1, x2 = ca[fea],cw[fea]
    stat, pval = mannwhitneyu(x1, x2)
    d, res = cliffs_delta(x1, x2)
    if pval < 0.05: # significantly different
        print(f'{fea} {res} ({round(d,3)})' )
    else:  # not significantly different
        print(f'{fea} not significant' )

binned_entropy negligible (-0.055)
c3(1) negligible (-0.079)
c3(2) negligible (-0.066)
c3(3) negligible (-0.07)
number_cwt_peaks negligible (-0.034)
longest_strike_above_mean negligible (-0.052)
longest_strike_below_mean not significant
diverse negligible (-0.144)
balance negligible (0.042)


In [16]:
# compare peripheral-afterhour and peripheral-workhour contributors
for fea in work_preference_features:
    x1, x2 = pa[fea],pw[fea]
    stat, pval = mannwhitneyu(x1, x2)
    d, res = cliffs_delta(x1, x2)
    if pval < 0.05: # significantly different
        print(f'{fea} {res} ({round(d,3)})' )
    else:  # not significantly different
        print(f'{fea} not significant' )

binned_entropy not significant
c3(1) negligible (0.016)
c3(2) negligible (0.01)
c3(3) negligible (0.012)
number_cwt_peaks not significant
longest_strike_above_mean negligible (0.026)
longest_strike_below_mean not significant
diverse not significant
balance negligible (-0.045)


### technical importance

In [18]:
pa = developer_sequence.loc[developer_sequence['profile']=='pa']
pw = developer_sequence.loc[developer_sequence['profile']=='pw']
ca = developer_sequence.loc[developer_sequence['profile']=='ca']
cw = developer_sequence.loc[developer_sequence['profile']=='cw']
core = developer_sequence.loc[developer_sequence['profile'].isin(['ca','cw'])]
peri = developer_sequence.loc[developer_sequence['profile'].isin(['pa','pw'])]

In [22]:
technical_importance_metric = [ 'max_period_centrality', 'max_centrality_period','max_commit_centrality', 'max_centrality_day']

In [24]:
# compare core and peripheral contributors
for fea in technical_importance_metric:
    x1, x2 = core[fea],peri[fea]
    stat, pval = mannwhitneyu(x1, x2)
    d, res = cliffs_delta(x1, x2)
    if pval < 0.05: # significantly different
        print(f'{fea} {res} ({round(d,3)})' )
    else:  # not significantly different
        print(f'{fea} not significant' )

max_period_centrality large (0.613)
max_centrality_period medium (0.447)
max_commit_centrality large (0.595)
max_centrality_day large (0.522)


In [25]:
# compare core-afterhour and core-workhour contributors
for fea in technical_importance_metric:
    x1, x2 = ca[fea],pa[fea]
    stat, pval = mannwhitneyu(x1, x2)
    d, res = cliffs_delta(x1, x2)
    if pval < 0.05: # significantly different
        print(f'{fea} {res} ({round(d,3)})' )
    else:  # not significantly different
        print(f'{fea} not significant' )

max_period_centrality large (0.617)
max_centrality_period medium (0.469)
max_commit_centrality large (0.602)
max_centrality_day large (0.542)


In [23]:
# compare peripheral-afterhour and peripheral-workhour contributors
for fea in technical_importance_metric:
    x1, x2 = pa[fea],pw[fea]
    stat, pval = mannwhitneyu(x1, x2)
    d, res = cliffs_delta(x1, x2)
    if pval < 0.05: # significantly different
        print(f'{fea} {res} ({round(d,3)})' )
    else:  # not significantly different
        print(f'{fea} not significant' )

max_period_centrality negligible (-0.077)
max_centrality_period not significant
max_commit_centrality negligible (-0.083)
max_centrality_day not significant
