In [1]:
import os

import numpy as np
import pandas as pd
import rpy2.robjects as ro
from rpy2.robjects import numpy2ri
from rpy2.robjects.packages import importr, isinstalled

In [None]:
# Install the R package `randtests` from CRAN only when it is missing.
# Because the install is skipped when the package is already present, this
# cell is safe to re-run (e.g. under "Restart Kernel -> Run All").
utils = importr('utils')
base = importr('base')
if not isinstalled('randtests'):
    # ind=1 selects the first entry of R's CRAN mirror list.
    utils.chooseCRANmirror(ind=1)
    utils.install_packages('randtests')

In [3]:
# Activate rpy2's numpy2ri conversion (presumably so NumPy arrays can be handed
# to R functions directly) and load the R package `randtests`; the `randtests`
# handle is what cos_stuart() uses to call cox_stuart_test.
numpy2ri.activate()
randtests = importr("randtests")

## Read data

In [4]:
def string_to_list(col):
    """Parse the text form of a flat numeric list, e.g. ``"[1, 2, 3]"``.

    Parameters
    ----------
    col : str
        Comma-separated numbers, optionally wrapped in square brackets.

    Returns
    -------
    list
        ``float`` values when the text contains a ``'.'``; otherwise ``int``
        values, falling back to ``float`` when an item is not a plain integer
        literal (e.g. ``1e-05`` or ``nan``). Empty text or ``"[]"`` gives ``[]``.

    Raises
    ------
    ValueError
        If an item cannot be parsed as a number at all.
    """
    # Drop empty fragments so "[]" (and a trailing comma) parse to nothing
    # instead of failing on int('') / float('').
    items = [item.strip() for item in col.strip().strip('[]').split(',')]
    items = [item for item in items if item]
    if '.' in col:
        return [float(item) for item in items]
    try:
        return [int(item) for item in items]
    except ValueError:
        # Dot-less float spellings such as "1e-05", "nan" or "inf".
        return [float(item) for item in items]

In [5]:
# Input CSVs live in ../data relative to the notebook.
basedir = os.path.join("..", "data")
projects = ['tensorflow', 'pytorch', 'keras', 'mxnet', 'theano', 'onnx']

# Read the three tables in order, each with its first CSV column as the index.
developer_data, filtered_developer_period_df, developer_sequence = (
    pd.read_csv(os.path.join(basedir, csv_name), index_col=0)
    for csv_name in (
        'contributor_features.csv',
        'contributor_period_activity.csv',
        'contributor_activity_sequence.csv',
    )
)

# Columns whose name contains 'sequence' hold lists serialised as text;
# turn each cell back into a Python list.
sequence_columns = [name for name in developer_sequence.columns if 'sequence' in name]
for col_name in sequence_columns:
    developer_sequence[col_name] = developer_sequence[col_name].apply(string_to_list)

## Cox-Stuart trend test

In [24]:
def cos_stuart(ts, alternative):
    """Return the p-value of randtests' Cox-Stuart test for ``ts``.

    Parameters
    ----------
    ts : sequence
        The series to test.
    alternative : str
        Alternative hypothesis passed to the R function:
        'left.sided' or 'right.sided'.

    Returns
    -------
    The first element of the R result's "p.value" entry, or ``None`` when
    ``ts`` has fewer than two elements (no test is run in that case).
    """
    if len(ts) >= 2:
        test_output = randtests.cox_stuart_test(ts, alternative)
        return test_output.rx2("p.value")[0]
    return None

In [58]:
def signf_code(chisq):
    """Map a p-value to its significance marker.

    Returns '***' for values <= 0.001, '**' for <= 0.01, '*' for <= 0.05,
    '.' for <= 0.1 and a single space otherwise.
    """
    # Cutoffs are checked from strictest to loosest; the first match wins.
    for cutoff, marker in ((0.001, '***'), (0.01, '**'), (0.05, '*'), (0.1, '.')):
        if chisq <= cutoff:
            return marker
    return ' '

### Evolution of workload composition

#### project-level trend

In [60]:
# Build one row per project. For every workload-composition pattern
# (wcp_code 1-5) the row holds two per-period lists, ordered by sorted period:
#   pattern<k>_count - number of rows with that wcp_code in the period
#   pattern<k>_ratio - that count divided by all rows of the period
project_period_records = []
for project in projects:
    project_df = filtered_developer_period_df.loc[filtered_developer_period_df['project'] == project]
    periods = sorted(project_df['period'].unique())
    project_sequence_dict = {'project': project}
    for pattern in range(1, 6):
        project_sequence_dict[f'pattern{pattern}_count'] = []
        project_sequence_dict[f'pattern{pattern}_ratio'] = []
    for p in periods:
        period_df = project_df.loc[project_df['period'] == p]
        period_size = len(period_df)
        for pattern in range(1, 6):
            pattern_count = len(period_df.loc[period_df['wcp_code'] == pattern])
            project_sequence_dict[f'pattern{pattern}_count'].append(pattern_count)
            # A period can select no rows (e.g. a NaN period never equals
            # itself); record 0 rather than dividing by zero.
            project_sequence_dict[f'pattern{pattern}_ratio'].append(
                pattern_count / period_size if period_size > 0 else 0
            )
    project_period_records.append(project_sequence_dict)

# A list of row dicts goes straight to the DataFrame constructor.
project_period_df = pd.DataFrame(project_period_records)

In [61]:
def cos_stuart_project_result(ts):
    """Label the trend of one series with its direction and significance.

    Runs ``cos_stuart`` with both one-sided alternatives and reports a trend
    when the corresponding p-value is at most 0.05.

    Parameters
    ----------
    ts : sequence of numbers
        The series to test; converted to a NumPy array before testing.

    Returns
    -------
    str
        ``'(<code>)↗'`` when the 'right.sided' test is significant,
        ``'(<code>)↘'`` when the 'left.sided' test is significant
        (``<code>`` comes from ``signf_code``), and ``'-'`` when neither is
        significant or the series is too short to be tested.
    """
    values = np.array(ts)
    down = cos_stuart(values, 'left.sided')
    up = cos_stuart(values, 'right.sided')
    # cos_stuart returns None for series with fewer than two points; comparing
    # None with a float would raise TypeError, so report "no trend" instead.
    if up is None or down is None:
        return '-'
    if up <= 0.05:
        return f'({signf_code(up)})↗'
    if down <= 0.05:
        return f'({signf_code(down)})↘'
    # Neither test is significant (this also covers NaN p-values).
    return '-'
    

In [63]:
## overall trend
# Every column after 'project' holds one per-period series per project;
# replace each series with the trend label computed over the whole series.
project_trend_df = project_period_df.assign(**{
    col: project_period_df[col].apply(cos_stuart_project_result)
    for col in project_period_df.columns[1:]
})
project_trend_df

Unnamed: 0,project,pattern1_count,pattern1_ratio,pattern2_count,pattern2_ratio,pattern3_count,pattern3_ratio,pattern4_count,pattern4_ratio,pattern5_count,pattern5_ratio
0,tensorflow,-,(***)↘,-,(***)↘,(***)↗,(***)↗,-,-,-,-
1,pytorch,-,(***)↘,(*)↗,(*)↗,(*)↗,-,(*)↗,-,(***)↗,(***)↗
2,keras,(***)↘,(***)↘,(**)↘,-,-,-,(*)↘,-,-,-
3,mxnet,(**)↘,(***)↘,-,-,-,-,-,(**)↗,(*)↗,(***)↗
4,theano,(*)↘,(*)↘,(**)↘,-,-,-,(*)↘,(*)↘,(*)↗,(*)↗
5,onnx,-,-,(**)↘,-,-,-,-,(*)↗,-,-


In [68]:
## early to middle stage trend
# Same as the overall trend, but each series is cut to its leading
# int(len/1.5) points (the first two thirds) before testing.
project_trend_df = project_period_df.assign(**{
    col: project_period_df[col].apply(
        lambda x: cos_stuart_project_result(x[:int(len(x)/1.5)])
    )
    for col in project_period_df.columns[1:]
})
project_trend_df

Unnamed: 0,project,pattern1_count,pattern1_ratio,pattern2_count,pattern2_ratio,pattern3_count,pattern3_ratio,pattern4_count,pattern4_ratio,pattern5_count,pattern5_ratio
0,tensorflow,-,(**)↘,-,-,(**)↗,(**)↗,(**)↗,-,(*)↗,(*)↗
1,pytorch,-,(**)↘,-,-,-,-,(**)↗,(**)↗,(**)↗,(**)↗
2,keras,(*)↘,(**)↘,-,-,-,-,-,(*)↗,-,(*)↗
3,mxnet,-,(*)↘,(**)↗,(**)↗,-,-,(**)↗,(*)↗,(**)↗,(**)↗
4,theano,(**)↗,-,(*)↗,-,(**)↗,-,(*)↗,-,(**)↗,(**)↗
5,onnx,-,-,-,-,-,-,-,-,-,(*)↗


In [69]:
## middle to late stage trend
# Same as the overall trend, but each series drops its leading int(len/3)
# points (keeping the last two thirds) before testing.
project_trend_df = project_period_df.assign(**{
    col: project_period_df[col].apply(
        lambda x: cos_stuart_project_result(x[int(len(x)/3):])
    )
    for col in project_period_df.columns[1:]
})
project_trend_df

Unnamed: 0,project,pattern1_count,pattern1_ratio,pattern2_count,pattern2_ratio,pattern3_count,pattern3_ratio,pattern4_count,pattern4_ratio,pattern5_count,pattern5_ratio
0,tensorflow,(**)↘,(**)↘,(*)↘,(*)↘,-,(**)↗,-,-,-,(*)↘
1,pytorch,-,-,-,-,-,-,-,-,(**)↗,(**)↗
2,keras,-,-,(*)↘,-,-,-,(**)↘,-,-,-
3,mxnet,(*)↘,-,(**)↘,-,(**)↘,(*)↘,-,(*)↗,-,(**)↗
4,theano,(***)↘,(***)↘,(***)↘,-,(***)↘,-,(***)↘,-,-,-
5,onnx,-,-,-,-,-,-,-,-,(*)↘,-


### workload composition pattern trend

In [74]:
def cos_stuart_developer_result(ts):
    """Label the trend direction of one developer-level series.

    Runs ``cos_stuart`` with both one-sided alternatives and reports a trend
    when the corresponding p-value is at most 0.05.

    Parameters
    ----------
    ts : sequence of numbers
        The series to test; converted to a NumPy array before testing.

    Returns
    -------
    str
        ``'↗'`` when the 'right.sided' test is significant, ``'↘'`` when the
        'left.sided' test is significant, and ``'-'`` when neither is
        significant or the series is too short to be tested.
    """
    values = np.array(ts)
    down = cos_stuart(values, 'left.sided')
    up = cos_stuart(values, 'right.sided')
    # cos_stuart returns None for series with fewer than two points; comparing
    # None with a float would raise TypeError, so report "no trend" instead.
    if up is None or down is None:
        return '-'
    if up <= 0.05:
        return '↗'
    if down <= 0.05:
        return '↘'
    # Neither test is significant (this also covers NaN p-values).
    return '-'

In [114]:
# Three separate, empty result tables (overall / early / late), each with one
# column per developer profile.
developer_overall_trend_df, developer_early_trend_df, developer_late_trend_df = (
    pd.DataFrame(columns=['ca', 'cw', 'pa', 'pw']) for _ in range(3)
)

In [115]:
# overall trend
# Keep developers with at least 2 periods (the minimum cos_stuart can test).
# .copy() makes the selection an independent frame, so the column assignment
# below does not raise pandas' SettingWithCopyWarning.
has_enough_periods = developer_sequence['workload_sequence'].apply(len) >= 2
wcp_trend_df = developer_sequence.loc[has_enough_periods, ['profile', 'workload_sequence']].copy()
wcp_trend_df['workload_sequence'] = wcp_trend_df['workload_sequence'].apply(cos_stuart_developer_result)

# Per profile: share of developers with a significant upward / downward trend.
rslt = []
for profile in developer_overall_trend_df.columns:
    df = wcp_trend_df.loc[wcp_trend_df['profile'] == profile]
    n_developers = len(df)
    if n_developers:
        up_pct = round(100 * len(df.loc[df["workload_sequence"] == "↗"]) / n_developers, 1)
        down_pct = round(100 * len(df.loc[df["workload_sequence"] == "↘"]) / n_developers, 1)
    else:
        # No eligible developer for this profile: the share is undefined, so
        # report NaN instead of raising ZeroDivisionError.
        up_pct = down_pct = float('nan')
    rslt.append(f'↗({up_pct}%) ↘({down_pct}%)')
developer_overall_trend_df.loc['workload_sequence'] = rslt
developer_overall_trend_df

Unnamed: 0,ca,cw,pa,pw
workload_sequence,↗(1.8%) ↘(0.4%),↗(1.6%) ↘(0.4%),↗(0.2%) ↘(0.0%),↗(0.2%) ↘(0.0%)


In [116]:
# early to middle trend
# Keep developers with at least 3 periods so the first two thirds of the
# sequence still has 2 points. .copy() makes the selection an independent
# frame, so the column assignment below does not raise SettingWithCopyWarning.
has_enough_periods = developer_sequence['workload_sequence'].apply(len) >= 3
wcp_trend_df = developer_sequence.loc[has_enough_periods, ['profile', 'workload_sequence']].copy()
wcp_trend_df['workload_sequence'] = wcp_trend_df['workload_sequence'].apply(
    lambda x: cos_stuart_developer_result(x[:int(len(x)/1.5)])
)

# Per profile: share of developers with a significant upward / downward trend.
# Iterate the columns of the frame this cell fills (same profile columns).
rslt = []
for profile in developer_early_trend_df.columns:
    df = wcp_trend_df.loc[wcp_trend_df['profile'] == profile]
    n_developers = len(df)
    if n_developers:
        up_pct = round(100 * len(df.loc[df["workload_sequence"] == "↗"]) / n_developers, 1)
        down_pct = round(100 * len(df.loc[df["workload_sequence"] == "↘"]) / n_developers, 1)
    else:
        # No eligible developer for this profile: the share is undefined, so
        # report NaN instead of raising ZeroDivisionError.
        up_pct = down_pct = float('nan')
    rslt.append(f'↗({up_pct}%) ↘({down_pct}%)')
developer_early_trend_df.loc['workload_sequence'] = rslt
developer_early_trend_df

Unnamed: 0,ca,cw,pa,pw
workload_sequence,↗(0.9%) ↘(0.1%),↗(0.7%) ↘(0.1%),↗(0.0%) ↘(0.0%),↗(0.0%) ↘(0.0%)


In [105]:
# middle to late trend
# Keep developers with at least 3 periods so the last two thirds of the
# sequence still has 2 points. .copy() makes the selection an independent
# frame, so the column assignment below does not raise SettingWithCopyWarning.
has_enough_periods = developer_sequence['workload_sequence'].apply(len) >= 3
wcp_trend_df = developer_sequence.loc[has_enough_periods, ['profile', 'workload_sequence']].copy()
wcp_trend_df['workload_sequence'] = wcp_trend_df['workload_sequence'].apply(
    lambda x: cos_stuart_developer_result(x[int(len(x)/3):])
)

# Per profile: share of developers with a significant upward / downward trend.
# Iterate the columns of the frame this cell fills (same profile columns).
rslt = []
for profile in developer_late_trend_df.columns:
    df = wcp_trend_df.loc[wcp_trend_df['profile'] == profile]
    n_developers = len(df)
    if n_developers:
        up_pct = round(100 * len(df.loc[df["workload_sequence"] == "↗"]) / n_developers, 1)
        down_pct = round(100 * len(df.loc[df["workload_sequence"] == "↘"]) / n_developers, 1)
    else:
        # No eligible developer for this profile: the share is undefined, so
        # report NaN instead of raising ZeroDivisionError.
        up_pct = down_pct = float('nan')
    rslt.append(f'↗({up_pct}%) ↘({down_pct}%)')
developer_late_trend_df.loc['workload_sequence'] = rslt
developer_late_trend_df

Unnamed: 0,ca,cw,pa,pw
workload_sequence,↗(0.2%) ↘(0.0%),↗(0.3%) ↘(0.2%),↗(0.0%) ↘(0.0%),↗(0.1%) ↘(0.0%)


## Evolution of work preference

In [124]:
# Features whose per-developer sequences are tested for a trend.
work_preference_features = [
    'binned_entropy',
    'c3(1)',
    'c3(2)',
    'c3(3)',
    'number_cwt_peaks',
    'longest_strike_above_mean',
    'longest_strike_below_mean',
    'diverse',
    'balance',
    'commit',
    'issue',
    'issue comment',
    'pr comment',
    'review',
]

# Fresh, empty result tables (overall / early / late), one column per profile.
developer_overall_trend_df, developer_early_trend_df, developer_late_trend_df = (
    pd.DataFrame(columns=['ca', 'cw', 'pa', 'pw']) for _ in range(3)
)

In [130]:
# overall trend
for fea in work_preference_features:
    # Use the feature name itself when it is a column of developer_sequence,
    # otherwise the "<feature>_sequence" column.
    temp_fea = fea if fea in developer_sequence else fea + "_sequence"
    # Remove developers with fewer than 2 periods. .copy() makes the selection
    # an independent frame, so the assignment below does not raise
    # SettingWithCopyWarning.
    has_enough_periods = developer_sequence[temp_fea].apply(len) >= 2
    trend_df = developer_sequence.loc[has_enough_periods, ['profile', temp_fea]].copy()
    trend_df[temp_fea] = trend_df[temp_fea].apply(cos_stuart_developer_result)
    # Per profile: share of developers with a significant up / down trend.
    rslt = []
    for profile in developer_overall_trend_df.columns:
        df = trend_df.loc[trend_df['profile'] == profile]
        n_developers = len(df)
        if n_developers:
            up_pct = round(100 * len(df.loc[df[temp_fea] == "↗"]) / n_developers, 1)
            down_pct = round(100 * len(df.loc[df[temp_fea] == "↘"]) / n_developers, 1)
        else:
            # No eligible developer for this profile: report NaN instead of
            # raising ZeroDivisionError.
            up_pct = down_pct = float('nan')
        rslt.append(f'↗({up_pct}%) ↘({down_pct}%)')
    developer_overall_trend_df.loc[fea] = rslt
developer_overall_trend_df

Unnamed: 0,ca,cw,pa,pw
binned_entropy,↗(1.0%) ↘(4.2%),↗(0.8%) ↘(2.7%),↗(0.0%) ↘(0.2%),↗(0.1%) ↘(0.3%)
c3(1),↗(0.5%) ↘(3.9%),↗(1.3%) ↘(2.3%),↗(0.0%) ↘(0.1%),↗(0.0%) ↘(0.2%)
c3(2),↗(0.6%) ↘(3.2%),↗(0.8%) ↘(2.0%),↗(0.1%) ↘(0.1%),↗(0.0%) ↘(0.1%)
c3(3),↗(0.8%) ↘(2.7%),↗(1.0%) ↘(2.0%),↗(0.0%) ↘(0.1%),↗(0.0%) ↘(0.1%)
number_cwt_peaks,↗(0.6%) ↘(3.9%),↗(0.7%) ↘(3.0%),↗(0.0%) ↘(0.1%),↗(0.1%) ↘(0.4%)
longest_strike_above_mean,↗(0.6%) ↘(3.5%),↗(1.0%) ↘(2.4%),↗(0.0%) ↘(0.1%),↗(0.0%) ↘(0.4%)
longest_strike_below_mean,↗(2.5%) ↘(1.0%),↗(2.3%) ↘(1.1%),↗(0.2%) ↘(0.1%),↗(0.2%) ↘(0.2%)
diverse,↗(0.4%) ↘(2.3%),↗(0.9%) ↘(2.4%),↗(0.1%) ↘(0.2%),↗(0.0%) ↘(0.2%)
balance,↗(4.1%) ↘(1.1%),↗(2.7%) ↘(1.2%),↗(0.1%) ↘(0.1%),↗(0.2%) ↘(0.0%)
commit,↗(1.0%) ↘(4.6%),↗(1.2%) ↘(4.1%),↗(0.0%) ↘(0.0%),↗(0.0%) ↘(0.0%)


In [131]:
# early to middle trend
for fea in work_preference_features:
    # Use the feature name itself when it is a column of developer_sequence,
    # otherwise the "<feature>_sequence" column.
    temp_fea = fea if fea in developer_sequence else fea + "_sequence"
    # Remove developers with fewer than 3 periods so the first two thirds of
    # the sequence still has 2 points. .copy() makes the selection an
    # independent frame, so the assignment below does not raise
    # SettingWithCopyWarning.
    has_enough_periods = developer_sequence[temp_fea].apply(len) >= 3
    trend_df = developer_sequence.loc[has_enough_periods, ['profile', temp_fea]].copy()
    trend_df[temp_fea] = trend_df[temp_fea].apply(
        lambda x: cos_stuart_developer_result(x[:int(len(x)/1.5)])
    )
    # Per profile: share of developers with a significant up / down trend.
    rslt = []
    for profile in developer_early_trend_df.columns:
        df = trend_df.loc[trend_df['profile'] == profile]
        n_developers = len(df)
        if n_developers:
            up_pct = round(100 * len(df.loc[df[temp_fea] == "↗"]) / n_developers, 1)
            down_pct = round(100 * len(df.loc[df[temp_fea] == "↘"]) / n_developers, 1)
        else:
            # No eligible developer for this profile: report NaN instead of
            # raising ZeroDivisionError.
            up_pct = down_pct = float('nan')
        rslt.append(f'↗({up_pct}%) ↘({down_pct}%)')
    developer_early_trend_df.loc[fea] = rslt
developer_early_trend_df

Unnamed: 0,ca,cw,pa,pw
binned_entropy,↗(1.6%) ↘(1.0%),↗(0.9%) ↘(0.5%),↗(0.2%) ↘(0.0%),↗(0.1%) ↘(0.0%)
c3(1),↗(0.9%) ↘(0.9%),↗(0.8%) ↘(0.6%),↗(0.2%) ↘(0.0%),↗(0.0%) ↘(0.0%)
c3(2),↗(0.9%) ↘(0.7%),↗(0.8%) ↘(0.6%),↗(0.0%) ↘(0.0%),↗(0.0%) ↘(0.0%)
c3(3),↗(1.0%) ↘(0.4%),↗(1.0%) ↘(0.3%),↗(0.0%) ↘(0.0%),↗(0.0%) ↘(0.0%)
number_cwt_peaks,↗(0.7%) ↘(0.7%),↗(0.9%) ↘(0.5%),↗(0.0%) ↘(0.0%),↗(0.1%) ↘(0.0%)
longest_strike_above_mean,↗(0.7%) ↘(0.3%),↗(0.5%) ↘(0.6%),↗(0.0%) ↘(0.0%),↗(0.1%) ↘(0.0%)
longest_strike_below_mean,↗(0.6%) ↘(1.8%),↗(0.1%) ↘(0.7%),↗(0.0%) ↘(0.0%),↗(0.0%) ↘(0.1%)
diverse,↗(0.6%) ↘(0.4%),↗(0.8%) ↘(0.1%),↗(0.0%) ↘(0.0%),↗(0.1%) ↘(0.0%)
balance,↗(1.1%) ↘(1.0%),↗(0.7%) ↘(1.1%),↗(0.0%) ↘(0.0%),↗(0.0%) ↘(0.0%)
commit,↗(1.0%) ↘(1.1%),↗(0.3%) ↘(1.3%),↗(0.0%) ↘(0.0%),↗(0.0%) ↘(0.0%)


In [132]:
# middle to late trend
for fea in work_preference_features:
    # Use the feature name itself when it is a column of developer_sequence,
    # otherwise the "<feature>_sequence" column.
    temp_fea = fea if fea in developer_sequence else fea + "_sequence"
    # Remove developers with fewer than 3 periods so the last two thirds of
    # the sequence still has 2 points. .copy() makes the selection an
    # independent frame, so the assignment below does not raise
    # SettingWithCopyWarning.
    has_enough_periods = developer_sequence[temp_fea].apply(len) >= 3
    trend_df = developer_sequence.loc[has_enough_periods, ['profile', temp_fea]].copy()
    trend_df[temp_fea] = trend_df[temp_fea].apply(
        lambda x: cos_stuart_developer_result(x[int(len(x)/3):])
    )
    # Per profile: share of developers with a significant up / down trend.
    rslt = []
    for profile in developer_late_trend_df.columns:
        df = trend_df.loc[trend_df['profile'] == profile]
        n_developers = len(df)
        if n_developers:
            up_pct = round(100 * len(df.loc[df[temp_fea] == "↗"]) / n_developers, 1)
            down_pct = round(100 * len(df.loc[df[temp_fea] == "↘"]) / n_developers, 1)
        else:
            # No eligible developer for this profile: report NaN instead of
            # raising ZeroDivisionError.
            up_pct = down_pct = float('nan')
        rslt.append(f'↗({up_pct}%) ↘({down_pct}%)')
    developer_late_trend_df.loc[fea] = rslt
developer_late_trend_df

Unnamed: 0,ca,cw,pa,pw
binned_entropy,↗(0.1%) ↘(4.2%),↗(0.2%) ↘(3.3%),↗(0.0%) ↘(0.0%),↗(0.0%) ↘(0.1%)
c3(1),↗(0.6%) ↘(3.5%),↗(0.3%) ↘(3.3%),↗(0.0%) ↘(0.2%),↗(0.0%) ↘(0.0%)
c3(2),↗(0.3%) ↘(2.9%),↗(0.2%) ↘(2.6%),↗(0.0%) ↘(0.2%),↗(0.0%) ↘(0.0%)
c3(3),↗(0.2%) ↘(3.0%),↗(0.2%) ↘(2.6%),↗(0.0%) ↘(0.2%),↗(0.0%) ↘(0.0%)
number_cwt_peaks,↗(0.1%) ↘(3.3%),↗(0.1%) ↘(2.6%),↗(0.0%) ↘(0.0%),↗(0.0%) ↘(0.0%)
longest_strike_above_mean,↗(0.4%) ↘(2.1%),↗(0.2%) ↘(1.8%),↗(0.0%) ↘(0.0%),↗(0.0%) ↘(0.1%)
longest_strike_below_mean,↗(2.4%) ↘(0.1%),↗(2.1%) ↘(0.0%),↗(0.5%) ↘(0.0%),↗(0.4%) ↘(0.0%)
diverse,↗(0.0%) ↘(2.8%),↗(0.5%) ↘(1.5%),↗(0.0%) ↘(0.2%),↗(0.0%) ↘(0.0%)
balance,↗(4.3%) ↘(0.2%),↗(2.7%) ↘(1.0%),↗(0.2%) ↘(0.2%),↗(0.3%) ↘(0.0%)
commit,↗(0.2%) ↘(3.5%),↗(0.2%) ↘(2.5%),↗(0.0%) ↘(0.0%),↗(0.0%) ↘(0.0%)


### technical importance

In [142]:
# Centrality features whose per-developer sequences are tested for a trend.
centrality_sequence_features = [
    'per commit centrality',
    'period commit centrality',
]

# Fresh, empty result tables (overall / early / late), one column per profile.
developer_overall_trend_df, developer_early_trend_df, developer_late_trend_df = (
    pd.DataFrame(columns=['ca', 'cw', 'pa', 'pw']) for _ in range(3)
)

In [148]:
# overall trend
for fea in centrality_sequence_features:
    # Use the feature name itself when it is a column of developer_sequence,
    # otherwise the "<feature>_sequence" column.
    temp_fea = fea if fea in developer_sequence else fea + "_sequence"
    # Remove developers with fewer than 2 periods. .copy() makes the selection
    # an independent frame, so the assignment below does not raise
    # SettingWithCopyWarning.
    has_enough_periods = developer_sequence[temp_fea].apply(len) >= 2
    trend_df = developer_sequence.loc[has_enough_periods, ['profile', temp_fea]].copy()
    trend_df[temp_fea] = trend_df[temp_fea].apply(cos_stuart_developer_result)
    # Per profile: share of developers with a significant up / down trend.
    rslt = []
    for profile in developer_overall_trend_df.columns:
        df = trend_df.loc[trend_df['profile'] == profile]
        n_developers = len(df)
        if n_developers:
            up_pct = round(100 * len(df.loc[df[temp_fea] == "↗"]) / n_developers, 1)
            down_pct = round(100 * len(df.loc[df[temp_fea] == "↘"]) / n_developers, 1)
        else:
            # No eligible developer for this profile: report NaN instead of
            # raising ZeroDivisionError.
            up_pct = down_pct = float('nan')
        rslt.append(f'↗({up_pct}%) ↘({down_pct}%)')
    developer_overall_trend_df.loc[fea] = rslt
developer_overall_trend_df

Unnamed: 0,ca,cw,pa,pw
per commit centrality,↗(8.1%) ↘(10.6%),↗(6.1%) ↘(9.6%),↗(0.0%) ↘(0.1%),↗(0.1%) ↘(0.4%)
period commit centrality,↗(1.5%) ↘(4.7%),↗(0.8%) ↘(4.8%),↗(0.0%) ↘(0.0%),↗(0.0%) ↘(0.1%)


In [149]:
# early to middle trend
for fea in centrality_sequence_features:
    # Use the feature name itself when it is a column of developer_sequence,
    # otherwise the "<feature>_sequence" column.
    temp_fea = fea if fea in developer_sequence else fea + "_sequence"
    # Remove developers with fewer than 3 periods so the first two thirds of
    # the sequence still has 2 points. .copy() makes the selection an
    # independent frame, so the assignment below does not raise
    # SettingWithCopyWarning.
    has_enough_periods = developer_sequence[temp_fea].apply(len) >= 3
    trend_df = developer_sequence.loc[has_enough_periods, ['profile', temp_fea]].copy()
    trend_df[temp_fea] = trend_df[temp_fea].apply(
        lambda x: cos_stuart_developer_result(x[:int(len(x)/1.5)])
    )
    # Per profile: share of developers with a significant up / down trend.
    rslt = []
    for profile in developer_early_trend_df.columns:
        df = trend_df.loc[trend_df['profile'] == profile]
        n_developers = len(df)
        if n_developers:
            up_pct = round(100 * len(df.loc[df[temp_fea] == "↗"]) / n_developers, 1)
            down_pct = round(100 * len(df.loc[df[temp_fea] == "↘"]) / n_developers, 1)
        else:
            # No eligible developer for this profile: report NaN instead of
            # raising ZeroDivisionError.
            up_pct = down_pct = float('nan')
        rslt.append(f'↗({up_pct}%) ↘({down_pct}%)')
    developer_early_trend_df.loc[fea] = rslt
developer_early_trend_df

Unnamed: 0,ca,cw,pa,pw
per commit centrality,↗(6.7%) ↘(8.9%),↗(3.5%) ↘(8.1%),↗(0.0%) ↘(0.0%),↗(0.0%) ↘(0.0%)
period commit centrality,↗(0.8%) ↘(1.0%),↗(0.1%) ↘(0.9%),↗(0.0%) ↘(0.0%),↗(0.0%) ↘(0.0%)


In [150]:
# middle to late trend
for fea in centrality_sequence_features:
    # Use the feature name itself when it is a column of developer_sequence,
    # otherwise the "<feature>_sequence" column.
    temp_fea = fea if fea in developer_sequence else fea + "_sequence"
    # Remove developers with fewer than 3 periods so the last two thirds of
    # the sequence still has 2 points. .copy() makes the selection an
    # independent frame, so the assignment below does not raise
    # SettingWithCopyWarning.
    has_enough_periods = developer_sequence[temp_fea].apply(len) >= 3
    trend_df = developer_sequence.loc[has_enough_periods, ['profile', temp_fea]].copy()
    trend_df[temp_fea] = trend_df[temp_fea].apply(
        lambda x: cos_stuart_developer_result(x[int(len(x)/3):])
    )
    # Per profile: share of developers with a significant up / down trend.
    rslt = []
    for profile in developer_late_trend_df.columns:
        df = trend_df.loc[trend_df['profile'] == profile]
        n_developers = len(df)
        if n_developers:
            up_pct = round(100 * len(df.loc[df[temp_fea] == "↗"]) / n_developers, 1)
            down_pct = round(100 * len(df.loc[df[temp_fea] == "↘"]) / n_developers, 1)
        else:
            # No eligible developer for this profile: report NaN instead of
            # raising ZeroDivisionError.
            up_pct = down_pct = float('nan')
        rslt.append(f'↗({up_pct}%) ↘({down_pct}%)')
    developer_late_trend_df.loc[fea] = rslt
developer_late_trend_df

Unnamed: 0,ca,cw,pa,pw
per commit centrality,↗(5.8%) ↘(7.6%),↗(5.4%) ↘(7.6%),↗(0.2%) ↘(0.0%),↗(0.0%) ↘(0.2%)
period commit centrality,↗(0.4%) ↘(2.1%),↗(0.1%) ↘(2.1%),↗(0.0%) ↘(0.0%),↗(0.0%) ↘(0.0%)
