In [1]:
import numpy as np
import pandas as pd
import os
from collections import Counter
from scipy.stats import chi2_contingency
from scipy.stats import mannwhitneyu
from cliffs_delta import cliffs_delta

## Read data

In [2]:
def string_to_list(col):
    """Parse the text form of a flat numeric list (e.g. ``"[1, 2, 3]"``) into a list.

    Parameters
    ----------
    col : str
        Comma-separated numbers, optionally wrapped in square brackets.

    Returns
    -------
    list
        Floats when the text contains a ``'.'`` anywhere, otherwise ints.
        An empty list when the text holds no items (such as ``"[]"``).

    Raises
    ------
    ValueError
        If an item cannot be converted to the selected numeric type.
    """
    body = col.strip('[]').strip()
    if not body:
        # "[]" used to reach int('') and raise; an empty sequence is a valid value.
        return []
    # A single '.' anywhere makes the whole list float, as before.
    convert = float if '.' in col else int
    return [convert(item) for item in body.split(',')]

In [3]:
# Data files live in ../data relative to this notebook.
basedir = os.path.join("..","data")
features_path = os.path.join(basedir, 'contributor_features.csv')
sequence_path = os.path.join(basedir,'contributor_activity_sequence.csv')

# Both tables use their first CSV column as the index.
developer_data = pd.read_csv(features_path, index_col=0)
developer_sequence = pd.read_csv(sequence_path, index_col=0)

# Columns whose name contains 'sequence' hold list-like text; parse them into Python lists.
sequence_columns = [name for name in developer_sequence.columns if 'sequence' in name]
for col_name in sequence_columns:
    developer_sequence[col_name] = developer_sequence[col_name].apply(string_to_list)


In [4]:
# Per-period activity table; the first CSV column becomes the index.
filtered_developer_period_df = pd.read_csv(
    os.path.join(basedir,'contributor_period_activity.csv'),
    index_col=0,
)

## workload composition 

### Distribution of major workload composition patterns across profiles

In [7]:
def major_pattern(lst):
    """Return the most frequent value in ``lst``.

    When several values share the highest frequency, the largest of them
    is returned. An empty ``lst`` raises ``ValueError`` (from ``max``).
    """
    frequencies = Counter(lst)
    highest = max(frequencies.values())
    # Ties on frequency are broken by taking the maximum value.
    return max(value for value, freq in frequencies.items() if freq == highest)

In [None]:
# Label each contributor with the most frequent value of their workload sequence.
developer_sequence['major pattern'] = developer_sequence['workload_sequence'].apply(major_pattern)

# Share of each major pattern within a profile:
# per-(profile, pattern) row counts divided by the per-profile row counts.
pattern_counts = developer_sequence.groupby(['profile','major pattern'])['major pattern'].count()
profile_sizes = developer_sequence['profile'].value_counts()
pattern_counts / profile_sizes

### Chi-square test and Cramer's V index

In [9]:
# Slice the sequence table by contributor profile.
# `core` pools the 'ca' and 'cw' rows, `peri` pools the 'pa' and 'pw' rows.
seq_profile = developer_sequence['profile']
pa = developer_sequence.loc[seq_profile == 'pa']
pw = developer_sequence.loc[seq_profile == 'pw']
ca = developer_sequence.loc[seq_profile == 'ca']
cw = developer_sequence.loc[seq_profile == 'cw']
core = developer_sequence.loc[seq_profile.isin(['ca','cw'])]
peri = developer_sequence.loc[seq_profile.isin(['pa','pw'])]

In [10]:
def cramers_v(contingency_table, chi2):
    """Compute Cramer's V from a contingency table and its chi-square statistic.

    V = sqrt(chi2 / (n * (k - 1))), where ``n`` is the sum of all cells and
    ``k`` is the smallest dimension of the table.

    Parameters
    ----------
    contingency_table : array-like
        Observed counts; converted to a NumPy array.
    chi2 : float
        Chi-square statistic computed for the same table.

    Returns
    -------
    float
        The Cramer's V value.
    """
    table = np.asarray(contingency_table)
    total = table.sum()
    smallest_dim = min(table.shape)
    return np.sqrt(chi2 / (total * (smallest_dim - 1)))

In [None]:
# compare core and peripheral contributors
core_counts = core['major pattern'].value_counts()
peri_counts = peri['major pattern'].value_counts()
# Both rows must cover the same categories in the same order. A pattern that is
# absent from one group is counted as 0 instead of silently shifting the columns
# (or producing rows of unequal length).
categories = core_counts.index.union(peri_counts.index).sort_values()
x1 = core_counts.reindex(categories, fill_value=0).to_list()
x2 = peri_counts.reindex(categories, fill_value=0).to_list()
contingency_table = [x1, x2]
chi2, p, dof, expected = chi2_contingency(contingency_table)
v = cramers_v(contingency_table, chi2)
print("Chi-Square Statistic:", chi2)
print("P-value:", p)
print("Degrees of Freedom:", dof)
print("Cramer's V:", v)

In [None]:
# compare core-afterhour and core-workhour contributors
ca_counts = ca['major pattern'].value_counts()
cw_counts = cw['major pattern'].value_counts()
# Both rows must cover the same categories in the same order. A pattern that is
# absent from one group is counted as 0 instead of silently shifting the columns
# (or producing rows of unequal length).
categories = ca_counts.index.union(cw_counts.index).sort_values()
x1 = ca_counts.reindex(categories, fill_value=0).to_list()
x2 = cw_counts.reindex(categories, fill_value=0).to_list()
contingency_table = [x1, x2]
chi2, p, dof, expected = chi2_contingency(contingency_table)
v = cramers_v(contingency_table, chi2)
print("Chi-Square Statistic:", chi2)
print("P-value:", p)
print("Degrees of Freedom:", dof)
print("Cramer's V:", v)

In [None]:
# compare peripheral-afterhour and peripheral-workhour contributors
pa_counts = pa['major pattern'].value_counts()
pw_counts = pw['major pattern'].value_counts()
# Both rows must cover the same categories in the same order. A pattern that is
# absent from one group is counted as 0 instead of silently shifting the columns
# (or producing rows of unequal length).
categories = pa_counts.index.union(pw_counts.index).sort_values()
# Plain lists, matching the other chi-square cells in this notebook.
x1 = pa_counts.reindex(categories, fill_value=0).to_list()
x2 = pw_counts.reindex(categories, fill_value=0).to_list()
contingency_table = [x1, x2]
chi2, p, dof, expected = chi2_contingency(contingency_table)
v = cramers_v(contingency_table, chi2)
print("Chi-Square Statistic:", chi2)
print("P-value:", p)
print("Degrees of Freedom:", dof)
print("Cramer's V:", v)

## work preference

In [15]:
# Columns of the period-activity table compared between profiles in this section.
work_preference_features = [
    'binned_entropy',
    'c3(1)',
    'c3(2)',
    'c3(3)',
    'number_cwt_peaks',
    'longest_strike_above_mean',
    'longest_strike_below_mean',
    'diverse',
    'balance',
]

# Re-bind the profile subsets to the period-activity table (they previously
# pointed at the sequence table). `core` pools 'ca' + 'cw', `peri` pools 'pa' + 'pw'.
period_profile = filtered_developer_period_df['profile']
pa = filtered_developer_period_df.loc[period_profile == 'pa']
pw = filtered_developer_period_df.loc[period_profile == 'pw']
ca = filtered_developer_period_df.loc[period_profile == 'ca']
cw = filtered_developer_period_df.loc[period_profile == 'cw']
core = filtered_developer_period_df.loc[period_profile.isin(['ca','cw'])]
peri = filtered_developer_period_df.loc[period_profile.isin(['pa','pw'])]

In [None]:
# compare core and peripheral contributors
for fea in work_preference_features:
    core_values = core[fea]
    peri_values = peri[fea]
    # The Mann-Whitney U p-value decides significance at the 0.05 level;
    # cliffs_delta supplies the value `d` and the label `res` that get printed.
    u_stat, p_value = mannwhitneyu(core_values, peri_values)
    d, res = cliffs_delta(core_values, peri_values)
    is_significant = p_value < 0.05
    print(f'{fea} {res} ({round(d,3)})' if is_significant else f'{fea} not significant')

In [None]:
# compare core-afterhour and core-workhour contributors
for fea in work_preference_features:
    ca_values = ca[fea]
    cw_values = cw[fea]
    # The Mann-Whitney U p-value decides significance at the 0.05 level;
    # cliffs_delta supplies the value `d` and the label `res` that get printed.
    u_stat, p_value = mannwhitneyu(ca_values, cw_values)
    d, res = cliffs_delta(ca_values, cw_values)
    is_significant = p_value < 0.05
    print(f'{fea} {res} ({round(d,3)})' if is_significant else f'{fea} not significant')

In [None]:
# compare peripheral-afterhour and peripheral-workhour contributors
for fea in work_preference_features:
    pa_values = pa[fea]
    pw_values = pw[fea]
    # The Mann-Whitney U p-value decides significance at the 0.05 level;
    # cliffs_delta supplies the value `d` and the label `res` that get printed.
    u_stat, p_value = mannwhitneyu(pa_values, pw_values)
    d, res = cliffs_delta(pa_values, pw_values)
    is_significant = p_value < 0.05
    print(f'{fea} {res} ({round(d,3)})' if is_significant else f'{fea} not significant')

## technical importance

In [19]:
# Point the profile subsets back at the sequence table for this section.
# `core` pools the 'ca' and 'cw' rows, `peri` pools the 'pa' and 'pw' rows.
seq_profile = developer_sequence['profile']
pa = developer_sequence.loc[seq_profile == 'pa']
pw = developer_sequence.loc[seq_profile == 'pw']
ca = developer_sequence.loc[seq_profile == 'ca']
cw = developer_sequence.loc[seq_profile == 'cw']
core = developer_sequence.loc[seq_profile.isin(['ca','cw'])]
peri = developer_sequence.loc[seq_profile.isin(['pa','pw'])]

In [20]:
# Columns compared between profile groups in the technical-importance tests.
technical_importance_metric = [
    'max_period_centrality',
    'max_centrality_period',
    'max_commit_centrality',
    'max_centrality_day',
]

In [None]:
# compare core and peripheral contributors
for fea in technical_importance_metric:
    core_values = core[fea]
    peri_values = peri[fea]
    # The Mann-Whitney U p-value decides significance at the 0.05 level;
    # cliffs_delta supplies the value `d` and the label `res` that get printed.
    u_stat, p_value = mannwhitneyu(core_values, peri_values)
    d, res = cliffs_delta(core_values, peri_values)
    is_significant = p_value < 0.05
    print(f'{fea} {res} ({round(d,3)})' if is_significant else f'{fea} not significant')

In [None]:
# compare core-afterhour and core-workhour contributors
for fea in technical_importance_metric:
    # The second sample must be `cw` (core-workhour). The previous code passed `pa`,
    # which compared core-afterhour against peripheral-afterhour contributors.
    x1, x2 = ca[fea], cw[fea]
    # The Mann-Whitney U p-value decides significance at the 0.05 level;
    # cliffs_delta supplies the value `d` and the label `res` that get printed.
    stat, pval = mannwhitneyu(x1, x2)
    d, res = cliffs_delta(x1, x2)
    if pval < 0.05: # significantly different
        print(f'{fea} {res} ({round(d,3)})' )
    else:  # not significantly different
        print(f'{fea} not significant' )

In [None]:
# compare peripheral-afterhour and peripheral-workhour contributors
for fea in technical_importance_metric:
    pa_values = pa[fea]
    pw_values = pw[fea]
    # The Mann-Whitney U p-value decides significance at the 0.05 level;
    # cliffs_delta supplies the value `d` and the label `res` that get printed.
    u_stat, p_value = mannwhitneyu(pa_values, pw_values)
    d, res = cliffs_delta(pa_values, pw_values)
    is_significant = p_value < 0.05
    print(f'{fea} {res} ({round(d,3)})' if is_significant else f'{fea} not significant')

# ML Components

In [46]:
# Load the labelled ML components and pull out the 'label' column per owner profile.
labeled_components = pd.read_csv(os.path.join(basedir, 'ML_component_labels.csv'))
owner_profile = labeled_components['owner profile']
ca_label = labeled_components.loc[owner_profile == 'ca', 'label']
cw_label = labeled_components.loc[owner_profile == 'cw', 'label']
pa_label = labeled_components.loc[owner_profile == 'pa', 'label']
pw_label = labeled_components.loc[owner_profile == 'pw', 'label']

# Pooled groups: core = ca + cw, peripheral = pa + pw.
core_label = pd.concat([ca_label, cw_label])
peri_label = pd.concat([pa_label, pw_label])

In [None]:
# Share of each label among components whose owner profile is 'ca'.
ca_label.value_counts().div(len(ca_label))

In [None]:
# Share of each label among components whose owner profile is 'cw'.
cw_label.value_counts().div(len(cw_label))

In [None]:
# Share of each label among components whose owner profile is 'pa'.
pa_label.value_counts().div(len(pa_label))

In [None]:
# Share of each label among components whose owner profile is 'pw'.
pw_label.value_counts().div(len(pw_label))

In [None]:
# compare core and peripheral contributors
core_label_counts = core_label.value_counts()
peri_label_counts = peri_label.value_counts()
# Both rows must cover the same labels in the same order. A label that is
# absent from one group is counted as 0 instead of silently shifting the columns
# (or producing rows of unequal length).
label_categories = core_label_counts.index.union(peri_label_counts.index).sort_values()
x1 = core_label_counts.reindex(label_categories, fill_value=0).to_list()
x2 = peri_label_counts.reindex(label_categories, fill_value=0).to_list()
contingency_table = [x1, x2]
chi2, p, dof, expected = chi2_contingency(contingency_table)
v = cramers_v(contingency_table, chi2)
print("Chi-Square Statistic:", chi2)
print("P-value:", p)
print("Degrees of Freedom:", dof)
print("Cramer's V:", v)

In [None]:
# compare ca and cw contributors
ca_label_counts = ca_label.value_counts()
cw_label_counts = cw_label.value_counts()
# Both rows must cover the same labels in the same order. A label that is
# absent from one group is counted as 0 instead of silently shifting the columns
# (or producing rows of unequal length).
label_categories = ca_label_counts.index.union(cw_label_counts.index).sort_values()
x1 = ca_label_counts.reindex(label_categories, fill_value=0).to_list()
x2 = cw_label_counts.reindex(label_categories, fill_value=0).to_list()
contingency_table = [x1, x2]
chi2, p, dof, expected = chi2_contingency(contingency_table)
v = cramers_v(contingency_table, chi2)
print("Chi-Square Statistic:", chi2)
print("P-value:", p)
print("Degrees of Freedom:", dof)
print("Cramer's V:", v)

In [None]:
# compare pa and pw contributors
pa_label_counts = pa_label.value_counts()
pw_label_counts = pw_label.value_counts()
# Both rows must cover the same labels in the same order. A label that is
# absent from one group is counted as 0 instead of silently shifting the columns
# (or producing rows of unequal length).
label_categories = pa_label_counts.index.union(pw_label_counts.index).sort_values()
x1 = pa_label_counts.reindex(label_categories, fill_value=0).to_list()
x2 = pw_label_counts.reindex(label_categories, fill_value=0).to_list()
contingency_table = [x1, x2]
chi2, p, dof, expected = chi2_contingency(contingency_table)
v = cramers_v(contingency_table, chi2)
print("Chi-Square Statistic:", chi2)
print("P-value:", p)
print("Degrees of Freedom:", dof)
print("Cramer's V:", v)