In [None]:
import numpy as np
import pandas as pd
import os
import rpy2.robjects as ro
from rpy2.robjects import numpy2ri
from rpy2.robjects.packages import importr

In [None]:
# Install the R package 'randtests' only when it is not installed yet, so this cell can be
# re-run (e.g. via "Restart & Run All") without triggering a fresh download every time.
utils = importr('utils')
base = importr('base')
if not ro.packages.isinstalled('randtests'):
    # Mirror index 1 is the first entry of R's CRAN mirror list.
    utils.chooseCRANmirror(ind=1)
    utils.install_packages('randtests')

In [None]:
# Enable rpy2's numpy conversion before any R call is made, then load the R package
# 'randtests'; its cox_stuart_test function is called by cos_stuart() further down.
numpy2ri.activate()
randtests = importr("randtests")

## Read data

In [None]:
def string_to_list(col):
    """Parse a stringified list such as ``'[1, 2, 3]'`` back into a list of numbers.

    Parameters
    ----------
    col : str
        Text of the form ``'[a, b, c]'`` (comma-separated numbers in square brackets).

    Returns
    -------
    list
        A list of ``int`` when every element is an integer literal, otherwise a list of
        ``float`` (so a single non-integer element turns the whole list into floats).
        An empty list literal (``'[]'``) yields ``[]``.

    Raises
    ------
    ValueError
        If an element is neither a valid int nor a valid float literal.
    """
    # Drop surrounding whitespace and brackets, then ignore empty fragments so that
    # '[]' (and a trailing comma) no longer end up as int('').
    items = [item.strip() for item in col.strip().strip('[]').split(',')]
    items = [item for item in items if item]
    if not items:
        return []
    try:
        return [int(item) for item in items]
    except ValueError:
        # Not all-integer: covers '1.5' as well as forms without a dot such as '1e-05' or 'nan'.
        return [float(item) for item in items]

In [None]:
# Location of the input tables, relative to this notebook.
basedir = os.path.join("..", "data")
projects = [
    'tensorflow', 'pytorch', 'scikit-learn', 'keras',
    'mxnet', 'theano_aesara', 'onnx', 'deeplearning4j',
]

# All three tables use their first CSV column as the row index.
developer_data = pd.read_csv(os.path.join(basedir, 'contributor_features.csv'), index_col=0)
filtered_developer_period_df = pd.read_csv(os.path.join(basedir, 'contributor_period_activity.csv'), index_col=0)
developer_sequence = pd.read_csv(os.path.join(basedir, 'contributor_activity_sequence.csv'), index_col=0)

# Every column whose name contains 'sequence' is read as text by read_csv;
# convert each cell back into a list of numbers.
for col_name in developer_sequence.columns:
    if 'sequence' not in col_name:
        continue
    developer_sequence[col_name] = developer_sequence[col_name].apply(string_to_list)

## Cox-Stuart trend test

In [None]:
def cos_stuart(ts, alternative, verbose=False):
    """Run the Cox-Stuart trend test of the R package ``randtests`` and return its p-value.

    Parameters
    ----------
    ts : sequence of numbers
        The time series to test (callers in this notebook pass a numpy array).
    alternative : str
        ``'left.sided'`` or ``'right.sided'``; forwarded unchanged to the R function.
        This notebook uses ``'left.sided'`` for downward and ``'right.sided'`` for
        upward trends.
    verbose : bool, default False
        When True, print the full R test result. Off by default because this function
        is called once per developer and feature, which would flood the cell output.

    Returns
    -------
    float or None
        The test's p-value, or ``None`` when ``ts`` has fewer than two observations.
    """
    if len(ts) < 2:
        return None
    result = randtests.cox_stuart_test(ts, alternative)
    if verbose:
        print(result)
    # rx2 extracts the named element of the R result list; [0] unwraps the length-1 vector.
    return result.rx2("p.value")[0]

In [None]:
# Sanity check on a strictly increasing toy series (with an extreme last value),
# using the 'right.sided' alternative that the rest of the notebook uses for upward trends.
ts = [1,2,3,4,5,6,7,8,9,10, 10000]
cos_stuart(np.array(ts), 'right.sided')

In [None]:
def signf_code(chisq):
    """Map a p-value to a significance marker.

    Returns '***' for values <= 0.001, '**' for <= 0.01, '*' for <= 0.05,
    '.' for <= 0.1 and a single space otherwise.
    """
    # Thresholds are checked from strictest to loosest; the first match wins.
    markers = ((0.001, '***'), (0.01, '**'), (0.05, '*'), (0.1, '.'))
    for cutoff, marker in markers:
        if chisq <= cutoff:
            return marker
    return ' '

### Evolution of workload composition

#### project-level trend

In [None]:
# For every project, build one record holding, per workload-composition pattern code (1-5),
# the per-period number of rows with that code and its share of the period's rows.
project_records = []
for project in projects:
    project_df = filtered_developer_period_df.loc[filtered_developer_period_df['project'] == project]
    project_sequence_dict = {'project': project}
    for pattern in range(1, 6):
        project_sequence_dict[f'pattern{pattern}_count'] = []
        project_sequence_dict[f'pattern{pattern}_ratio'] = []

    # Periods are visited in ascending order so the resulting lists form time series.
    for p in sorted(project_df['period'].unique()):
        period_df = project_df.loc[project_df['period'] == p]
        period_size = len(period_df)
        for pattern in range(1, 6):
            pattern_size = len(period_df.loc[period_df['wcp_code'] == pattern])
            # An empty period contributes a count and a ratio of 0.
            pattern_share = pattern_size / period_size if period_size > 0 else 0
            project_sequence_dict[f'pattern{pattern}_count'].append(pattern_size)
            project_sequence_dict[f'pattern{pattern}_ratio'].append(pattern_share)

    project_records.append(project_sequence_dict)

# One row per project; each pattern column holds a list with one entry per period.
project_period_df = pd.DataFrame(project_records)

In [None]:
def cos_stuart_project_result(ts):
    """Summarise the Cox-Stuart test of a project-level series as a trend label.

    Parameters
    ----------
    ts : sequence of numbers
        The per-period series to test.

    Returns
    -------
    str
        ``'(<code>)↗'`` when the 'right.sided' test has p <= 0.05, ``'(<code>)↘'`` when
        the 'left.sided' test has p <= 0.05 (``<code>`` comes from ``signf_code``), and
        ``'-'`` otherwise, including when the series is too short to be tested or no
        usable p-value is available.
    """
    series = np.array(ts)
    # cos_stuart returns None for fewer than two observations; comparing None with 0.05
    # would raise a TypeError, so report "no trend" directly.
    if len(series) < 2:
        return '-'
    down = cos_stuart(series, 'left.sided')
    up = cos_stuart(series, 'right.sided')
    if up is None or down is None:
        return '-'
    if up <= 0.05:
        return f'({signf_code(up)})↗'
    if down <= 0.05:
        return f'({signf_code(down)})↘'
    # Neither direction is significant (this also covers NaN p-values, which previously
    # fell through and returned None implicitly).
    return '-'

In [None]:
## overall trend
# Replace every per-period list (all columns after the leading 'project' column)
# with the trend label of its full series.
project_trend_df = project_period_df.assign(**{
    col: project_period_df[col].apply(cos_stuart_project_result)
    for col in project_period_df.columns[1:]
})
project_trend_df

In [None]:
## early to middle stage trend
# Same as the overall trend, but each series is cut to its first int(len/1.5) entries
# (roughly the first two thirds) before testing.
project_trend_df = project_period_df.assign(**{
    col: project_period_df[col].apply(lambda x: cos_stuart_project_result(x[:int(len(x)/1.5)]))
    for col in project_period_df.columns[1:]
})
project_trend_df

In [None]:
## middle to late stage trend
# Same as the overall trend, but the first int(len/3) entries of each series
# (roughly the first third) are dropped before testing.
project_trend_df = project_period_df.assign(**{
    col: project_period_df[col].apply(lambda x: cos_stuart_project_result(x[int(len(x)/3):]))
    for col in project_period_df.columns[1:]
})
project_trend_df

### workload composition pattern trend

In [None]:
def cos_stuart_developer_result(ts):
    """Summarise the Cox-Stuart test of a developer-level series as a trend arrow.

    Parameters
    ----------
    ts : sequence of numbers
        The series to test.

    Returns
    -------
    str
        ``'↗'`` when the 'right.sided' test has p <= 0.05, ``'↘'`` when the 'left.sided'
        test has p <= 0.05, and ``'-'`` otherwise, including when the series is too
        short to be tested or no usable p-value is available.
    """
    series = np.array(ts)
    # cos_stuart returns None for fewer than two observations; comparing None with 0.05
    # would raise a TypeError, so report "no trend" directly.
    if len(series) < 2:
        return '-'
    down = cos_stuart(series, 'left.sided')
    up = cos_stuart(series, 'right.sided')
    if up is None or down is None:
        return '-'
    if up <= 0.05:
        return '↗'
    if down <= 0.05:
        return '↘'
    # Neither direction is significant (this also covers NaN p-values, which previously
    # fell through and returned None implicitly).
    return '-'

In [None]:
# Empty result tables with one column per developer profile; the following cells add
# one row per analysed sequence.
profile_columns = ['ca', 'cw', 'pa', 'pw']
developer_overall_trend_df, developer_early_trend_df, developer_late_trend_df = (
    pd.DataFrame(columns=profile_columns) for _ in range(3)
)

In [None]:
# overall trend
# Keep developers with at least 2 periods (the minimum cos_stuart accepts). The explicit
# .copy() makes the column assignment below write to an independent frame instead of a
# slice of developer_sequence (avoids SettingWithCopyWarning).
wcp_trend_df = developer_sequence.loc[
    developer_sequence['workload_sequence'].apply(lambda x: len(x) >= 2),
    ['profile', 'workload_sequence'],
].copy()
wcp_trend_df['workload_sequence'] = wcp_trend_df['workload_sequence'].apply(cos_stuart_developer_result)
rslt = []
for profile in developer_overall_trend_df:
    df = wcp_trend_df.loc[wcp_trend_df['profile'] == profile]
    n_dev = len(df)
    n_up = len(df.loc[df['workload_sequence'] == '↗'])
    n_down = len(df.loc[df['workload_sequence'] == '↘'])
    # Report NaN instead of raising ZeroDivisionError when a profile has no eligible developers.
    pct_up = round(100 * n_up / n_dev, 1) if n_dev else float('nan')
    pct_down = round(100 * n_down / n_dev, 1) if n_dev else float('nan')
    rslt.append(f'↗({pct_up}%) ↘({pct_down}%)')
developer_overall_trend_df.loc['workload_sequence'] = rslt
developer_overall_trend_df

In [None]:
# early to middle trend
# Denominator (wcp_trend_df1): developers with at least 3 periods.
# Numerator (wcp_trend_df): developers with at least 10 periods, tested on their first 10 only.
# The explicit .copy() avoids assigning into a slice of developer_sequence.
wcp_trend_df1 = developer_sequence.loc[
    developer_sequence['workload_sequence'].apply(lambda x: len(x) >= 3),
    ['profile', 'workload_sequence'],
]
wcp_trend_df = developer_sequence.loc[
    developer_sequence['workload_sequence'].apply(lambda x: len(x) >= 10),
    ['profile', 'workload_sequence'],
].copy()
wcp_trend_df['workload_sequence'] = wcp_trend_df['workload_sequence'].apply(lambda x: cos_stuart_developer_result(x[:10]))
rslt = []
# Iterate the columns of the table this cell fills (same profile columns as the overall table).
for profile in developer_early_trend_df:
    df1 = wcp_trend_df1.loc[wcp_trend_df1['profile'] == profile]
    df = wcp_trend_df.loc[wcp_trend_df['profile'] == profile]
    n_dev = len(df1)
    n_up = len(df.loc[df['workload_sequence'] == '↗'])
    n_down = len(df.loc[df['workload_sequence'] == '↘'])
    # Report NaN instead of raising ZeroDivisionError when a profile has no eligible developers.
    pct_up = round(100 * n_up / n_dev, 1) if n_dev else float('nan')
    pct_down = round(100 * n_down / n_dev, 1) if n_dev else float('nan')
    rslt.append(f'↗({pct_up}%) ↘({pct_down}%)')
developer_early_trend_df.loc['workload_sequence'] = rslt
developer_early_trend_df

In [None]:
# middle to late trend
# Denominator (wcp_trend_df1): developers with at least 3 periods.
# Numerator (wcp_trend_df): developers with at least 12 periods, tested on everything after
# their first 10 periods (so at least 2 observations remain).
# The explicit .copy() avoids assigning into a slice of developer_sequence.
wcp_trend_df1 = developer_sequence.loc[
    developer_sequence['workload_sequence'].apply(lambda x: len(x) >= 3),
    ['profile', 'workload_sequence'],
]
wcp_trend_df = developer_sequence.loc[
    developer_sequence['workload_sequence'].apply(lambda x: len(x) >= 12),
    ['profile', 'workload_sequence'],
].copy()
wcp_trend_df['workload_sequence'] = wcp_trend_df['workload_sequence'].apply(lambda x: cos_stuart_developer_result(x[10:]))
rslt = []
# Iterate the columns of the table this cell fills (same profile columns as the overall table).
for profile in developer_late_trend_df:
    df1 = wcp_trend_df1.loc[wcp_trend_df1['profile'] == profile]
    df = wcp_trend_df.loc[wcp_trend_df['profile'] == profile]
    n_dev = len(df1)
    n_up = len(df.loc[df['workload_sequence'] == '↗'])
    n_down = len(df.loc[df['workload_sequence'] == '↘'])
    # Report NaN instead of raising ZeroDivisionError when a profile has no eligible developers.
    pct_up = round(100 * n_up / n_dev, 1) if n_dev else float('nan')
    pct_down = round(100 * n_down / n_dev, 1) if n_dev else float('nan')
    rslt.append(f'↗({pct_up}%) ↘({pct_down}%)')
developer_late_trend_df.loc['workload_sequence'] = rslt
developer_late_trend_df

## Evolution of work preference

In [None]:
# Features analysed in this section; the trend cells below look each one up in
# developer_sequence either under this name or under '<name>_sequence'.
work_preference_features = [
    'binned_entropy',
    'c3(1)', 'c3(2)', 'c3(3)',
    'number_cwt_peaks',
    'longest_strike_above_mean', 'longest_strike_below_mean',
    'diverse', 'balance',
    'commit', 'issue', 'issue comment', 'pr comment', 'review',
]

# Fresh, empty result tables (one column per developer profile) for this section.
profile_columns = ['ca', 'cw', 'pa', 'pw']
developer_overall_trend_df, developer_early_trend_df, developer_late_trend_df = (
    pd.DataFrame(columns=profile_columns) for _ in range(3)
)

In [None]:
# overall trend
for fea in work_preference_features:
    # Use the feature name itself when it is a column, otherwise its '<name>_sequence' column.
    temp_fea = fea if fea in developer_sequence else fea + "_sequence"
    # Keep developers with at least 2 periods; .copy() so the assignment below does not
    # write into a slice of developer_sequence (avoids SettingWithCopyWarning).
    trend_df = developer_sequence.loc[
        developer_sequence[temp_fea].apply(lambda x: len(x) >= 2),
        ['profile', temp_fea],
    ].copy()
    trend_df[temp_fea] = trend_df[temp_fea].apply(cos_stuart_developer_result)
    rslt = []
    for profile in developer_overall_trend_df:
        df = trend_df.loc[trend_df['profile'] == profile]
        n_dev = len(df)
        n_up = len(df.loc[df[temp_fea] == "↗"])
        n_down = len(df.loc[df[temp_fea] == "↘"])
        # Report NaN instead of raising ZeroDivisionError when a profile has no eligible developers.
        pct_up = round(100 * n_up / n_dev, 1) if n_dev else float('nan')
        pct_down = round(100 * n_down / n_dev, 1) if n_dev else float('nan')
        rslt.append(f'↗({pct_up}%) ↘({pct_down}%)')
    developer_overall_trend_df.loc[fea] = rslt
developer_overall_trend_df

In [None]:
# early to middle trend
for fea in work_preference_features:
    # Use the feature name itself when it is a column, otherwise its '<name>_sequence' column.
    temp_fea = fea if fea in developer_sequence else fea + "_sequence"
    # Denominator (trend_df1): developers with at least 3 periods.
    trend_df1 = developer_sequence.loc[
        developer_sequence[temp_fea].apply(lambda x: len(x) >= 3),
        ['profile', temp_fea],
    ]
    # Numerator (trend_df): developers with at least 10 periods, tested on their first 10 only.
    # .copy() so the assignment below does not write into a slice of developer_sequence.
    trend_df = developer_sequence.loc[
        developer_sequence[temp_fea].apply(lambda x: len(x) >= 10),
        ['profile', temp_fea],
    ].copy()
    trend_df[temp_fea] = trend_df[temp_fea].apply(lambda x: cos_stuart_developer_result(x[:10]))
    rslt = []
    for profile in developer_early_trend_df:
        df1 = trend_df1.loc[trend_df1['profile'] == profile]
        df = trend_df.loc[trend_df['profile'] == profile]
        n_dev = len(df1)
        n_up = len(df.loc[df[temp_fea] == "↗"])
        n_down = len(df.loc[df[temp_fea] == "↘"])
        # Report NaN instead of raising ZeroDivisionError when a profile has no eligible developers.
        pct_up = round(100 * n_up / n_dev, 1) if n_dev else float('nan')
        pct_down = round(100 * n_down / n_dev, 1) if n_dev else float('nan')
        rslt.append(f'↗({pct_up}%) ↘({pct_down}%)')
    developer_early_trend_df.loc[fea] = rslt
developer_early_trend_df

In [None]:
# middle to late trend
for fea in work_preference_features:
    # Use the feature name itself when it is a column, otherwise its '<name>_sequence' column.
    temp_fea = fea if fea in developer_sequence else fea + "_sequence"
    # Denominator (trend_df1): developers with at least 3 periods.
    trend_df1 = developer_sequence.loc[
        developer_sequence[temp_fea].apply(lambda x: len(x) >= 3),
        ['profile', temp_fea],
    ]
    # Numerator (trend_df): developers with at least 12 periods, tested on everything after
    # their first 10 periods (so at least 2 observations remain).
    # .copy() so the assignment below does not write into a slice of developer_sequence.
    trend_df = developer_sequence.loc[
        developer_sequence[temp_fea].apply(lambda x: len(x) >= 12),
        ['profile', temp_fea],
    ].copy()
    trend_df[temp_fea] = trend_df[temp_fea].apply(lambda x: cos_stuart_developer_result(x[10:]))
    rslt = []
    for profile in developer_late_trend_df:
        df1 = trend_df1.loc[trend_df1['profile'] == profile]
        df = trend_df.loc[trend_df['profile'] == profile]
        n_dev = len(df1)
        n_up = len(df.loc[df[temp_fea] == "↗"])
        n_down = len(df.loc[df[temp_fea] == "↘"])
        # Report NaN instead of raising ZeroDivisionError when a profile has no eligible developers.
        pct_up = round(100 * n_up / n_dev, 1) if n_dev else float('nan')
        pct_down = round(100 * n_down / n_dev, 1) if n_dev else float('nan')
        rslt.append(f'↗({pct_up}%) ↘({pct_down}%)')
    developer_late_trend_df.loc[fea] = rslt
developer_late_trend_df

### technical importance

In [None]:
# Centrality features analysed in this section; the cells below read them from the
# '<name>_sequence' columns of developer_sequence.
centrality_sequence_features = [
    'per commit centrality',
    'period commit centrality',
]

# Fresh, empty result tables (one column per developer profile) for this section.
profile_columns = ['ca', 'cw', 'pa', 'pw']
developer_overall_trend_df, developer_early_trend_df, developer_late_trend_df = (
    pd.DataFrame(columns=profile_columns) for _ in range(3)
)

In [None]:
# overall trend
for fea in centrality_sequence_features:
    # Use the feature name itself when it is a column, otherwise its '<name>_sequence' column.
    temp_fea = fea if fea in developer_sequence else fea + "_sequence"
    # Keep developers whose sequence has at least 2 entries; .copy() so the assignment below
    # does not write into a slice of developer_sequence (avoids SettingWithCopyWarning).
    trend_df = developer_sequence.loc[
        developer_sequence[temp_fea].apply(lambda x: len(x) >= 2),
        ['profile', temp_fea],
    ].copy()
    trend_df[temp_fea] = trend_df[temp_fea].apply(cos_stuart_developer_result)
    rslt = []
    for profile in developer_overall_trend_df:
        df = trend_df.loc[trend_df['profile'] == profile]
        n_dev = len(df)
        n_up = len(df.loc[df[temp_fea] == "↗"])
        n_down = len(df.loc[df[temp_fea] == "↘"])
        # Report NaN instead of raising ZeroDivisionError when a profile has no eligible developers.
        pct_up = round(100 * n_up / n_dev, 1) if n_dev else float('nan')
        pct_down = round(100 * n_down / n_dev, 1) if n_dev else float('nan')
        rslt.append(f'↗({pct_up}%) ↘({pct_down}%)')
    developer_overall_trend_df.loc[fea] = rslt
developer_overall_trend_df

In [None]:
# early to middle trend (per-commit centrality)
temp_fea = "per commit centrality_sequence"
# Denominator (trend_df1): developers with at least 3 commits.
trend_df1 = developer_sequence.loc[
    developer_sequence[temp_fea].apply(lambda x: len(x) >= 3),
    ['profile', temp_fea],
]
# Numerator (trend_df): developers with at least 40 commits, tested on their first 40 only.
# .copy() so the assignment below does not write into a slice of developer_sequence.
trend_df = developer_sequence.loc[
    developer_sequence[temp_fea].apply(lambda x: len(x) >= 40),
    ['profile', temp_fea],
].copy()
trend_df[temp_fea] = trend_df[temp_fea].apply(lambda x: cos_stuart_developer_result(x[:40]))
rslt = []
for profile in developer_early_trend_df:
    df1 = trend_df1.loc[trend_df1['profile'] == profile]
    df = trend_df.loc[trend_df['profile'] == profile]
    n_dev = len(df1)
    n_up = len(df.loc[df[temp_fea] == "↗"])
    n_down = len(df.loc[df[temp_fea] == "↘"])
    # Report NaN instead of raising ZeroDivisionError when a profile has no eligible developers.
    pct_up = round(100 * n_up / n_dev, 1) if n_dev else float('nan')
    pct_down = round(100 * n_down / n_dev, 1) if n_dev else float('nan')
    rslt.append(f'↗({pct_up}%) ↘({pct_down}%)')
developer_early_trend_df.loc[temp_fea] = rslt
developer_early_trend_df

In [None]:
# early to middle trend (per-period commit centrality)
temp_fea = "period commit centrality_sequence"
# Denominator (trend_df1): developers with at least 3 periods.
trend_df1 = developer_sequence.loc[
    developer_sequence[temp_fea].apply(lambda x: len(x) >= 3),
    ['profile', temp_fea],
]
# Numerator (trend_df): developers with at least 10 periods, tested on their first 10 only.
# .copy() so the assignment below does not write into a slice of developer_sequence.
trend_df = developer_sequence.loc[
    developer_sequence[temp_fea].apply(lambda x: len(x) >= 10),
    ['profile', temp_fea],
].copy()
trend_df[temp_fea] = trend_df[temp_fea].apply(lambda x: cos_stuart_developer_result(x[:10]))
rslt = []
for profile in developer_early_trend_df:
    df1 = trend_df1.loc[trend_df1['profile'] == profile]
    df = trend_df.loc[trend_df['profile'] == profile]
    n_dev = len(df1)
    n_up = len(df.loc[df[temp_fea] == "↗"])
    n_down = len(df.loc[df[temp_fea] == "↘"])
    # Report NaN instead of raising ZeroDivisionError when a profile has no eligible developers.
    pct_up = round(100 * n_up / n_dev, 1) if n_dev else float('nan')
    pct_down = round(100 * n_down / n_dev, 1) if n_dev else float('nan')
    rslt.append(f'↗({pct_up}%) ↘({pct_down}%)')
developer_early_trend_df.loc[temp_fea] = rslt
developer_early_trend_df

In [None]:
# middle to late trend (per-commit centrality)
temp_fea = "per commit centrality_sequence"
# Denominator (trend_df1): developers with at least 3 commits.
trend_df1 = developer_sequence.loc[
    developer_sequence[temp_fea].apply(lambda x: len(x) >= 3),
    ['profile', temp_fea],
]
# Numerator (trend_df): developers with at least 42 commits, tested on everything after
# their first 40 commits (so at least 2 observations remain).
# .copy() so the assignment below does not write into a slice of developer_sequence.
trend_df = developer_sequence.loc[
    developer_sequence[temp_fea].apply(lambda x: len(x) >= 42),
    ['profile', temp_fea],
].copy()
trend_df[temp_fea] = trend_df[temp_fea].apply(lambda x: cos_stuart_developer_result(x[40:]))
rslt = []
for profile in developer_late_trend_df:
    df1 = trend_df1.loc[trend_df1['profile'] == profile]
    df = trend_df.loc[trend_df['profile'] == profile]
    n_dev = len(df1)
    n_up = len(df.loc[df[temp_fea] == "↗"])
    n_down = len(df.loc[df[temp_fea] == "↘"])
    # Report NaN instead of raising ZeroDivisionError when a profile has no eligible developers.
    pct_up = round(100 * n_up / n_dev, 1) if n_dev else float('nan')
    pct_down = round(100 * n_down / n_dev, 1) if n_dev else float('nan')
    rslt.append(f'↗({pct_up}%) ↘({pct_down}%)')
developer_late_trend_df.loc[temp_fea] = rslt
developer_late_trend_df

In [None]:
# middle to late trend (per-period commit centrality)
temp_fea = "period commit centrality_sequence"
# Denominator (trend_df1): developers with at least 3 periods.
trend_df1 = developer_sequence.loc[
    developer_sequence[temp_fea].apply(lambda x: len(x) >= 3),
    ['profile', temp_fea],
]
# Numerator (trend_df): developers with at least 12 periods, tested on everything after
# their first 10 periods (so at least 2 observations remain).
# .copy() so the assignment below does not write into a slice of developer_sequence.
trend_df = developer_sequence.loc[
    developer_sequence[temp_fea].apply(lambda x: len(x) >= 12),
    ['profile', temp_fea],
].copy()
trend_df[temp_fea] = trend_df[temp_fea].apply(lambda x: cos_stuart_developer_result(x[10:]))
rslt = []
for profile in developer_late_trend_df:
    df1 = trend_df1.loc[trend_df1['profile'] == profile]
    df = trend_df.loc[trend_df['profile'] == profile]
    n_dev = len(df1)
    n_up = len(df.loc[df[temp_fea] == "↗"])
    n_down = len(df.loc[df[temp_fea] == "↘"])
    # Report NaN instead of raising ZeroDivisionError when a profile has no eligible developers.
    pct_up = round(100 * n_up / n_dev, 1) if n_dev else float('nan')
    pct_down = round(100 * n_down / n_dev, 1) if n_dev else float('nan')
    rslt.append(f'↗({pct_up}%) ↘({pct_down}%)')
developer_late_trend_df.loc[temp_fea] = rslt
developer_late_trend_df