In [2]:
import datetime
import json
import math
import os
import re
import time
from itertools import chain

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity
from tabulate import tabulate
from tsfresh import extract_features
from tsfresh.feature_extraction import feature_calculators as fc

## Read data

In [3]:
def date_handler(date_str):
    """Parse the date part of a timestamp string.

    Only the first whitespace-separated token of ``date_str`` is used, so any
    trailing time component is discarded.

    Returns:
        datetime.datetime built by ``fromisoformat`` from that first token.
    """
    date_part = date_str.split()[0]
    return datetime.datetime.fromisoformat(date_part)
def get_timezone(date_time_str):
    """Return the local machine's UTC offset at the given wall-clock time.

    Args:
        date_time_str: timestamp formatted as '%Y-%m-%d %H:%M:%S', interpreted
            in the local timezone of the machine running the notebook.

    Returns:
        datetime.timedelta equal to (local time - UTC time) at that instant.

    Raises:
        ValueError: if ``date_time_str`` does not match the expected format.
    """
    local_struct = datetime.datetime.strptime(date_time_str, '%Y-%m-%d %H:%M:%S').timetuple()
    ts = time.mktime(local_struct)
    local_naive = datetime.datetime.fromtimestamp(ts)
    # Timezone-aware conversion replaces the deprecated utcfromtimestamp();
    # tzinfo is dropped so both operands are naive and can be subtracted.
    utc_naive = datetime.datetime.fromtimestamp(ts, datetime.timezone.utc).replace(tzinfo=None)
    return local_naive - utc_naive
def date_time_handler(date_time_str):
    """Parse a 'YYYY-MM-DD HH:MM:SS' string into a naive datetime.

    Raises:
        ValueError: if the string does not match that format.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    parsed = datetime.datetime.strptime(date_time_str, timestamp_format)
    return parsed

In [11]:
# Data lives in a 'data' directory next to the current working directory.
basedir = os.path.join(os.getcwd(), os.pardir, 'data')

# Per-project sub-directory names inside `basedir`.
proj_names = [
    'tensorflow_tensorflow',
    'pytorch_pytorch',
    'scikit-learn_scikit-learn',
    'keras-team_keras',
    'apache_mxnet',
    'theano_aesara',
    'onnx_onnx',
    'deeplearning4j_deeplearning4j',
]

repo_dirs = list(map(lambda name: os.path.join(basedir, name), proj_names))

# Project labels, index-aligned with `proj_names` / `repo_dirs`; these are the
# values compared against the 'project' column of the contributor table.
projects = [
    'tensorflow',
    'pytorch',
    'scikit-learn',
    'keras',
    'mxnet',
    'theano_aesara',
    'onnx',
    'deeplearning4j',
]

### read contributor features

In [5]:
# Per-contributor feature table; the CSV's first column is used as the index.
# Later cells read its 'name', 'project', 'profile', 'join date' and 'last active' columns.
developer_data = pd.read_csv(os.path.join(basedir, "contributor_features.csv"), index_col=0)

In [7]:
# studied period of the projects
# NOTE(review): `start` is not referenced again in the visible cells, and `end`
# is reassigned to 2024-01-01 in a later cell before it is first used —
# confirm which study window is intended.
start = datetime.datetime(2008,1,1)
end = datetime.datetime(2022,4,30)

### read contributor features

In [8]:
# NOTE(review): this repeats the contributor_features.csv load performed in an
# earlier cell; re-running it discards any columns parsed or added to
# `developer_data` in between.
developer_data = pd.read_csv(os.path.join(basedir, "contributor_features.csv"), index_col=0)

### read commits

In [None]:
# One commit table per project (same order as `repo_dirs`).
project_commit_df = []
for proj_dir in repo_dirs:
    commit_df = pd.read_csv(os.path.join(proj_dir, 'commit_main.csv'), index_col=0)
    commit_df['Time'] = pd.to_datetime(commit_df['Time'])
    # Whitespace-separated string cells become lists; anything else (e.g. NaN) becomes [].
    for list_col in ('Changed Files', 'Parents'):
        commit_df[list_col] = commit_df[list_col].apply(
            lambda cell: cell.split() if isinstance(cell, str) else [])
    project_commit_df.append(commit_df)

# Re-key every table by its 'Commit#' column.
for df in project_commit_df:
    df.set_index('Commit#', inplace=True)

# Same data as nested dicts: {commit id: {column: value}} per project.
project_commit_dict = [frame.to_dict(orient='index') for frame in project_commit_df]

### read pr and issue

In [9]:
# One pull-request table per project (same order as `repo_dirs`).
project_pr_df = []
for proj_dir in repo_dirs:
    pr_df = pd.read_csv(os.path.join(proj_dir, 'pull_request_main.csv'), index_col=0)
    for time_col in ('Opened time', 'Closed time'):
        pr_df[time_col] = pd.to_datetime(pr_df[time_col])
    # Whitespace-separated string cells become lists; anything else (e.g. NaN) becomes [].
    for list_col in ('Labels', 'Assignees', 'Reviewers', 'Participants', 'Commits'):
        pr_df[list_col] = pr_df[list_col].apply(
            lambda cell: cell.split() if isinstance(cell, str) else [])
    project_pr_df.append(pr_df)

# Same data as nested dicts: {row index: {column: value}} per project.
project_pr_dict = [frame.to_dict(orient='index') for frame in project_pr_df]

In [None]:
# One issue table per project (same order as `repo_dirs`).
project_issue_df = []
for proj_dir in repo_dirs:
    issue_df = pd.read_csv(os.path.join(proj_dir, 'issue_main.csv'), index_col=0)
    for time_col in ('Opened time', 'Closed time'):
        issue_df[time_col] = pd.to_datetime(issue_df[time_col])
    # Whitespace-separated string cells become lists; anything else (e.g. NaN) becomes [].
    for list_col in ('Labels', 'Assignees', 'Participants'):
        issue_df[list_col] = issue_df[list_col].apply(
            lambda cell: cell.split() if isinstance(cell, str) else [])
    project_issue_df.append(issue_df)

# Same data as nested dicts: {row index: {column: value}} per project.
project_issue_dict = [frame.to_dict(orient='index') for frame in project_issue_df]

### read pr and issue comments

In [10]:
# Per project: {key: [comment datetimes]} loaded from pr_comment_time.json.
# Later cells look contributors up by name in these dicts.
pr_comments_dict = []
for proj_dir in repo_dirs:
    comment_path = os.path.join(proj_dir, 'pr_comment_time.json')
    with open(comment_path, 'r') as json_file:
        raw_times = json.load(json_file)
    pr_comment_dict = {
        key: [date_time_handler(stamp) for stamp in stamps]
        for key, stamps in raw_times.items()
    }
    pr_comments_dict.append(pr_comment_dict)

In [11]:
# Per project: {key: [comment datetimes]} loaded from issue_comment_time.json.
# Later cells look contributors up by name in these dicts.
issue_comments_dict = []
for proj_dir in repo_dirs:
    comment_path = os.path.join(proj_dir, 'issue_comment_time.json')
    with open(comment_path, 'r') as json_file:
        raw_times = json.load(json_file)
    issue_comment_dict = {
        key: [date_time_handler(stamp) for stamp in stamps]
        for key, stamps in raw_times.items()
    }
    issue_comments_dict.append(issue_comment_dict)

## Generate contributor period OSS activities (build workload composition vector space)

In [16]:
# Per project: {reviewer: [opened time of every PR that lists them as reviewer]}.
review_dict = []
for i in range(len(projects)):
    proj_dict = {}
    for pr in project_pr_dict[i].values():
        for reviewer in pr['Reviewers']:
            proj_dict.setdefault(reviewer, []).append(pr['Opened time'])
    review_dict.append(proj_dict)
    
# Per project: {author: [commit times]}.
commit_dict = [
    project_commit_df[i].groupby(['Author'])['Time'].apply(list).to_dict()
    for i in range(len(projects))
]

# Per project: {issue owner: [issue opened times]}.
issue_dict = [
    project_issue_df[i].groupby(['Owner'])['Opened time'].apply(list).to_dict()
    for i in range(len(projects))
]

# Per project: {contributor: [(activity kind, time), ...] sorted by time}.
activity_dict = []
for i in range(len(projects)):
    # Source order matters only for events with identical times (the sort is stable).
    sources = (
        ('issue', issue_dict[i]),
        ('review', review_dict[i]),
        ('issue comment', issue_comments_dict[i]),
        ('pr comment', pr_comments_dict[i]),
        ('commit', commit_dict[i]),
    )
    proj_dict = {}
    for dev in developer_data.loc[developer_data['project']==projects[i]]['name'].to_list():
        events = []
        for kind, times_by_dev in sources:
            if dev in times_by_dev:
                events.extend((kind, when) for when in times_by_dev[dev])
        proj_dict[dev] = sorted(events, key=lambda event: event[1])
    activity_dict.append(proj_dict)

### Get 90-day periods

In [17]:
# Observation window per project: from its earliest commit / issue / PR to a
# shared cut-off date.
project_duration = []
end = datetime.datetime(2024,1,1)
for i in range(len(projects)):
    first_activity = min(
        project_commit_df[i]['Time'].min(),
        project_issue_df[i]['Opened time'].min(),
        project_pr_df[i]['Opened time'].min(),
    )
    project_duration.append({'start': first_activity, 'end': end})

In [18]:
# Split each project's window into consecutive, non-overlapping periods of
# `num_days` days. A trailing partial period is dropped, and so is a period
# that would end exactly at the window end (strict '<').
num_days = 90
project_stage_df = []
for i in range(len(projects)):
    delta = datetime.timedelta(days=num_days)
    window_end = project_duration[i]['end']
    period_starts = []
    cur = project_duration[i]['start']
    while cur + delta < window_end:
        period_starts.append(cur)
        cur += delta
    df = pd.DataFrame()
    df['start'] = period_starts
    df['end'] = [period_start + delta for period_start in period_starts]
    project_stage_df.append(df)

In [None]:
for i in range(len(projects)):
    star_df = pd.read_csv(os.path.join(repo_dirs[i],'star_history.csv'))
    star_df['starredAt'] = pd.to_datetime(star_df['starredAt'])
    fork_df = pd.read_csv(os.path.join(repo_dirs[i],'fork_history.csv'))
    fork_df['time'] = pd.to_datetime(fork_df['time'])

    star_times = star_df['starredAt']
    fork_times = fork_df['time']
    stages = project_stage_df[i]  # same object, so the new columns land in project_stage_df[i]

    # For every period, count the stars / forks whose timestamp falls in [start, end).
    stages['star'] = stages.apply(
        lambda row: ((star_times >= row['start']) & (star_times < row['end'])).sum(), axis=1)
    stages['fork'] = stages.apply(
        lambda row: ((fork_times >= row['start']) & (fork_times < row['end'])).sum(), axis=1)

### Get developer OSS activities per period

In [19]:
def get_developer_period_commit(i,x):
    """Count commits per contributor of project ``i`` inside period ``x``.

    Args:
        i: position of the project in ``projects`` / ``project_commit_df``.
        x: mapping with 'start' and 'end'; a commit counts when
            start <= Time < end.

    Returns:
        dict {contributor name: commit count}, with 0 for contributors of the
        project that authored no commit in the period.
    """
    commits = project_commit_df[i]
    in_period = (commits['Time']>= x['start'])&(commits['Time']< x['end'])
    per_author = commits.loc[in_period].groupby(['Author']).size().to_dict()
    names = developer_data.loc[developer_data['project']==projects[i]]['name'].to_list()
    return {name: per_author.get(name, 0) for name in names}
def get_developer_period_issue(i,x):
    """Count issues opened per contributor of project ``i`` inside period ``x``.

    Args:
        i: position of the project in ``projects`` / ``project_issue_df``.
        x: mapping with 'start' and 'end'; an issue counts when
            start <= 'Opened time' < end.

    Returns:
        dict {contributor name: issue count}, with 0 for contributors of the
        project that own no issue opened in the period.
    """
    issues = project_issue_df[i]
    in_period = (issues['Opened time']>= x['start'])&(issues['Opened time']< x['end'])
    per_owner = issues.loc[in_period].groupby(['Owner'])['Opened time'].count().to_dict()
    names = developer_data.loc[developer_data['project']==projects[i]]['name'].to_list()
    return {name: per_owner.get(name, 0) for name in names}

def get_developer_period_issue_comment(i,x):
    """Count issue comments per contributor of project ``i`` inside period ``x``.

    A comment counts when start <= time < end. Contributors absent from
    ``issue_comments_dict[i]`` get 0.

    Returns:
        dict {contributor name: comment count}.
    """
    names = developer_data.loc[developer_data['project']==projects[i]]['name'].to_list()
    times_by_dev = issue_comments_dict[i]
    counts = {}
    for name in names:
        if name not in times_by_dev:
            counts[name] = 0
            continue
        counts[name] = sum(1 for day in times_by_dev[name] if day>= x['start'] and day<x['end'])
    return counts

def get_developer_period_pr_comment(i,x):
    """Count PR comments per contributor of project ``i`` inside period ``x``.

    A comment counts when start <= time < end. Contributors absent from
    ``pr_comments_dict[i]`` get 0.

    Returns:
        dict {contributor name: comment count}.
    """
    names = developer_data.loc[developer_data['project']==projects[i]]['name'].to_list()
    times_by_dev = pr_comments_dict[i]
    counts = {}
    for name in names:
        if name not in times_by_dev:
            counts[name] = 0
            continue
        counts[name] = sum(1 for day in times_by_dev[name] if day>= x['start'] and day<x['end'])
    return counts
def get_developer_period_review(i,x):
    """Count reviews per contributor of project ``i`` inside period ``x``.

    Uses the times stored in ``review_dict[i]``; an entry counts when
    start <= time < end. Contributors absent from that dict get 0.

    Returns:
        dict {contributor name: review count}.
    """
    names = developer_data.loc[developer_data['project']==projects[i]]['name'].to_list()
    times_by_dev = review_dict[i]
    counts = {}
    for name in names:
        if name not in times_by_dev:
            counts[name] = 0
            continue
        counts[name] = sum(1 for day in times_by_dev[name] if day>= x['start'] and day<x['end'])
    return counts

In [20]:
# Parse the two timestamp columns. pd.to_datetime is used instead of applying
# a strptime-based parser element-wise so that re-running this cell on an
# already-parsed column is a no-op rather than a TypeError. Missing values
# become NaT instead of raising.
for date_col in ('join date', 'last active'):
    developer_data[date_col] = pd.to_datetime(developer_data[date_col], format='%Y-%m-%d %H:%M:%S')

In [21]:
# Names of the raw per-period activity-count columns.
features = ['commit','issue', 'issue comment','pr comment',  'review']

In [22]:
# This might take around 5 min
# Build, per project, one row per (contributor, period) holding the
# contributor's activity counts in that period.
developer_period_df = []
for i in range(len(projects)):
    project_developers = developer_data.loc[developer_data['project']==projects[i]]
    developers = project_developers['name'].to_list()
    profiles = project_developers['profile'].to_list()
    proj_period_df = []
    # Pair each name with its own profile positionally. Looking the profile up
    # with developers.index(d) was a linear scan per contributor and always
    # returned the first match's profile.
    for d, profile in zip(developers, profiles):
        for index, row in project_stage_df[i].iterrows():
            proj_period_df.append([d,projects[i],profile,index,row['start'], row['end']])
    proj_period_df = pd.DataFrame(proj_period_df, columns=['name', 'project','profile','period', 'start','end'])
    # One {contributor: count} dict per period for every activity type.
    temp_commit = project_stage_df[i].apply(lambda x:get_developer_period_commit(i,x), axis=1)
    temp_issue = project_stage_df[i].apply(lambda x:get_developer_period_issue(i,x), axis=1)
    temp_issue_comment = project_stage_df[i].apply(lambda x:get_developer_period_issue_comment(i,x), axis=1)
    temp_pr_comment = project_stage_df[i].apply(lambda x:get_developer_period_pr_comment(i,x), axis=1)
    temp_review = project_stage_df[i].apply(lambda x:get_developer_period_review(i,x), axis=1)
    for p in project_stage_df[i].index:
        # Positional assignment: the rows of one period follow `developers`
        # order, which is also the insertion order of the count dicts.
        # NOTE(review): this assumes contributor names are unique within a
        # project; a repeated name makes the dicts shorter than the row selection.
        proj_period_df.loc[proj_period_df['period'] == p, 'commit'] = list(temp_commit[p].values())
        proj_period_df.loc[proj_period_df['period'] == p, 'issue'] = list(temp_issue[p].values())
        proj_period_df.loc[proj_period_df['period'] == p, 'issue comment'] = list(temp_issue_comment[p].values())
        proj_period_df.loc[proj_period_df['period'] == p, 'pr comment'] = list(temp_pr_comment[p].values())
        proj_period_df.loc[proj_period_df['period'] == p, 'review'] = list(temp_review[p].values())
    developer_period_df.append(proj_period_df)

In [23]:
# Merge the contributor activity tables of all projects and keep only the
# (contributor, period) rows with at least one recorded activity.
merged_developer_period_df = pd.concat(developer_period_df, ignore_index=True)
# .copy() makes the filtered frame independent of the merged one, so the
# in-place period renumbering below does not write into a slice.
filtered_developer_period_df = merged_developer_period_df.loc[merged_developer_period_df[features].sum(axis=1) > 0].copy()

# Number of leading periods to discard per project, after which the remaining
# periods are renumbered from 0. The original note attributes this to initial
# periods where the project might not be public yet (fork=0).
# NOTE(review): that note only mentioned pytorch and theano; scikit-learn and
# mxnet are trimmed as well — confirm these offsets against the fork counts.
leading_periods_to_drop = {'pytorch': 20, 'scikit-learn': 2, 'mxnet': 1, 'theano_aesara': 14}
for proj_name, n_dropped in leading_periods_to_drop.items():
    is_proj = filtered_developer_period_df['project'] == proj_name
    is_leading = filtered_developer_period_df['period'].between(0, n_dropped - 1)
    filtered_developer_period_df = filtered_developer_period_df.loc[~(is_proj & is_leading)].copy()
    is_proj = filtered_developer_period_df['project'] == proj_name
    filtered_developer_period_df.loc[is_proj, 'period'] = filtered_developer_period_df.loc[is_proj, 'period'] - n_dropped

### Normalize the OSS activities

In [24]:
# Names of the min-max normalized activity columns, listed in the same order
# as the raw activity columns in `features`.
normalized_feature = ['commit_norm', 'issue_norm', 'issue_comment_norm', 'pr_comment_norm', 'review_norm']

In [None]:
def min_max_normalize(column):
    """Min-max scale ``column`` to the [0, 1] range.

    Constant columns cannot be scaled by their range: an all-zero column
    yields the scalar 0, any other constant column yields ``column / value``
    (all ones).

    Returns:
        The scaled column, or the scalar 0 for an all-zero column.
    """
    lowest, highest = column.min(), column.max()
    if lowest != highest:
        return (column - lowest) / (highest - lowest)
    if lowest == 0:
        return 0
    return column / lowest

# Normalize every activity count within its (project, period) group so that
# contributors are compared against peers active in the same period.
column_pairs = (
    ('commit', 'commit_norm'),
    ('issue', 'issue_norm'),
    ('issue comment', 'issue_comment_norm'),
    ('pr comment', 'pr_comment_norm'),
    ('review', 'review_norm'),
)
for proj in projects:
    in_project = filtered_developer_period_df['project']==proj
    periods = filtered_developer_period_df.loc[in_project]['period'].value_counts().index
    for p in periods:
        idxs = filtered_developer_period_df.loc[in_project&(filtered_developer_period_df['period']==p)].index
        for raw_col, norm_col in column_pairs:
            filtered_developer_period_df.loc[idxs, norm_col] = min_max_normalize(filtered_developer_period_df.loc[idxs, raw_col])

In [None]:
# Report normalized columns that are constant across all rows.
feature_std = filtered_developer_period_df[normalized_feature].std()
zero_variance = feature_std == 0
constant_columns = zero_variance.index[zero_variance].tolist()
print(f"Columns with zero variance: {constant_columns}")

In [None]:
# Rows whose normalized activities sum to zero are reported and then dropped.
row_totals = filtered_developer_period_df[normalized_feature].sum(axis=1)
print('number of zero vectors: ', int((row_totals <= 0).sum()))
filtered_developer_period_df = filtered_developer_period_df.loc[row_totals > 0]

## Identify workload composition pattern

In [None]:
# ## read from existing filtered_developer_period_df
# filtered_developer_period_df = pd.read_csv(os.path.join(basedir,'contributor_period_activity.csv'), index_col = 0)

### Compute cosine similarity matrix

In [None]:
# Pairwise cosine similarity between all contributor-period activity vectors,
# and the matching distance (1 - similarity).
f = normalized_feature
activity_vectors = filtered_developer_period_df[f]
cosine_sim_matrix = cosine_similarity(activity_vectors)
cosine_dist_matrix = 1 - cosine_sim_matrix
cosine_dist_matrix

### Hierarchical Clustering - Gradient Search

In [None]:
# Cut the complete-linkage tree at a range of cosine-distance thresholds and
# record the silhouette score and number of clusters for each cut.
threshold = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.75, 0.8,0.85,0.9,0.95,0.96,0.97,0.98,0.99]
sil = []
num_clusters = []
distance = linkage(filtered_developer_period_df[f], "complete", metric="cosine")
for th in threshold:
    labels = fcluster(distance, th, criterion="distance")
    # Score the activity vectors with the cosine metric, i.e. the metric the
    # tree was built on. Passing the distance matrix without
    # metric="precomputed" made scikit-learn treat each matrix row as a
    # Euclidean feature vector.
    sil.append(silhouette_score(filtered_developer_period_df[f], labels, metric="cosine"))
    num_clusters.append(len(np.unique(labels)))

In [None]:
# Silhouette score (y) against the number of clusters (x) produced by each
# distance threshold of the search above.
sns.lineplot(x=num_clusters,y=sil)

### Hierarchical Clustering

In [29]:
# Build the complete-linkage tree on cosine distance and cut it at 0.9999.
# NOTE(review): the name `clst5` and the five pattern names assigned later
# suggest this cut is expected to yield exactly five clusters — confirm with
# the cluster counts printed in a following cell.
distance =linkage(filtered_developer_period_df[f],"complete", metric="cosine")
clst5 = fcluster(distance, 0.9999, criterion="distance")

In [None]:
# Silhouette of the final clustering, scored on the activity vectors with the
# cosine metric. Passing the distance matrix without metric="precomputed"
# made scikit-learn treat its rows as Euclidean feature vectors.
silhouette_score(filtered_developer_period_df[f], clst5, metric="cosine")

### Summarize workload composition patterns

In [None]:
# Cluster labels present in the cut and the number of contributor-period rows
# assigned to each label.
unique, counts = np.unique(clst5, return_counts=True)
print(unique, counts)

In [None]:
# Cluster label k (1-based) is named pattern_names[k-1] and given the code
# pattern_code[k-1].
# NOTE(review): this mapping is hard-coded against one particular clustering
# result — re-check it whenever the input data or the cut threshold changes.
pattern_names = ['PR Discussant', 'Issue Reporter', 'Code Reviewer', 'Committer', 'Issue Discussant']
pattern_code = [4,1,5,3,2]
filtered_developer_period_df['wcp'] = [pattern_names[label - 1] for label in clst5]
filtered_developer_period_df['wcp_code'] = filtered_developer_period_df['wcp'].apply(
    lambda name: pattern_code[pattern_names.index(name)])

In [33]:
# Representative row of each pattern: the member with the largest total cosine
# similarity to the other members of its cluster.
centers = []
for cluster_label in np.unique(clst5):
    member_positions = np.where(clst5 == cluster_label)[0]
    within_similarity = cosine_sim_matrix[np.ix_(member_positions, member_positions)]
    total_similarity = within_similarity.sum(axis=1)
    best_member = member_positions[np.argmax(total_similarity)]
    centers.append(filtered_developer_period_df.iloc[best_member])
centers = pd.DataFrame(centers)

In [None]:
# One radar (spider) chart per pattern center, saved as a PNG and shown inline.
# Five evenly spaced spokes; the first angle is repeated to close the polygon.
angles = [n / float(5) * 2 * math.pi for n in range(5)]
angles += angles[:1]
radial_ticks = [0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]
radial_tick_labels = ['0','0.1','0.2','0.3','0.4','0.5','0.6','0.7','0.8','0.9','1']

for i in range(5):
    center = centers.iloc[i]
    print('pattern:', center['wcp'],  center['wcp_code'])

    # Repeat the first value so the outline returns to its starting point.
    values = list(center[normalized_feature].values)
    values += values[:1]

    # Initialise the spider plot
    ax = plt.subplot(111, polar=True)

    # One axis per activity type, labelled with the raw feature names
    plt.xticks(angles[:-1], features, color='black', size=22)

    # Radial tick labels
    ax.set_rlabel_position(0)
    plt.yticks(radial_ticks, radial_tick_labels, color="grey", size=7)

    # Outline and translucent fill
    ax.plot(angles, values, linewidth=1, linestyle='solid')
    ax.fill(angles, values, 'b', alpha=0.1)

    plt.savefig(os.path.join(basedir,f"workload_pattern_{pattern_code[i]}.png"),dpi=300,bbox_inches='tight',facecolor='none')
    # Show the graph
    plt.show()

## Extract work preference features

### Get OSS activity time series per period

In [None]:
# This might take 5 mins
def get_period_activity_time_series(row, activity_name=None):
    """Build a daily activity-count series for one contributor-period row.

    Args:
        row: mapping with 'project', 'name', 'start' and 'end'.
        activity_name: when given (and truthy), only activities of that kind
            are counted; otherwise all kinds are counted.

    Returns:
        list with one integer per whole day between start and end; entry d is
        the number of activities that happened d whole days after start.
    """
    n_days = int((row['end'] - row['start']).days)
    daily_counts = [0] * n_days
    events = activity_dict[projects.index(row['project'])][row['name']]
    for kind, when in events:
        if not (when>=row['start'] and when <row['end']):
            continue
        if activity_name and kind != activity_name:
            continue
        daily_counts[int((when-row['start']).days)] += 1
    return daily_counts
# One daily time-series column per activity kind, plus one over all activities.
activity_kinds = ('commit', 'issue', 'issue comment', 'pr comment', 'review')
filtered_developer_period_df[[f'{kind} ts' for kind in activity_kinds]] = None
for kind in activity_kinds:
    filtered_developer_period_df[f'{kind} ts'] = filtered_developer_period_df.apply(
        lambda x, kind=kind: get_period_activity_time_series(x, kind), axis=1)
filtered_developer_period_df['all activity ts'] = filtered_developer_period_df.apply(
    get_period_activity_time_series, axis=1)

### tsfresh time series feature extraction

In [None]:
# Work on an explicit copy: the columns below are added to `ts_features`, and
# assigning onto a column slice of another frame triggers
# SettingWithCopyWarning.
ts_features = filtered_developer_period_df[['name', 'project', 'profile', 'period', 'all activity ts', 'wcp_code']].copy()
# Subset of the computed features that is kept as work-preference features.
tsfresh_features = ['number_cwt_peaks','binned_entropy','longest_strike_above_mean', 'longest_strike_below_mean', 'c3(1)', 'c3(2)', 'c3(3)']
ts_features['number_cwt_peaks']  = ts_features['all activity ts'].apply(lambda x:fc.number_cwt_peaks(np.array(x), 5 ))
ts_features['approximate_entropy']  = ts_features['all activity ts'].apply(lambda x:fc.approximate_entropy(np.array(x), 2,0.1 ))
ts_features['binned_entropy'] = ts_features['all activity ts'].apply(lambda x:fc.binned_entropy(np.array(x), 10)) # level of disorder, unpredictability, or randomness
ts_features['cid_ce'] = ts_features['all activity ts'].apply(lambda x:fc.cid_ce(np.array(x), False)) #complexity 
ts_features['sample_entropy'] = ts_features['all activity ts'].apply(lambda x:fc.sample_entropy(np.array(x))) #complexity 

ts_features['value_count'] = ts_features['all activity ts'].apply(lambda x:fc.value_count(np.array(x),0))
ts_features['count_below'] = ts_features['all activity ts'].apply(lambda x:fc.count_below(np.array(x),0))
# Longest run above / below the series mean, as a fraction of the series length.
ts_features['longest_strike_above_mean']  = ts_features['all activity ts'].apply(lambda x:fc.longest_strike_above_mean(np.array(x))/len(x)) # ratio
ts_features['longest_strike_below_mean']  = ts_features['all activity ts'].apply(lambda x:fc.longest_strike_below_mean(np.array(x))/len(x)) # ratio


ts_features['c3(1)'] = ts_features['all activity ts'].apply(lambda x:fc.c3(np.array(x),1))
ts_features['c3(2)'] = ts_features['all activity ts'].apply(lambda x:fc.c3(np.array(x),2))
ts_features['c3(3)'] = ts_features['all activity ts'].apply(lambda x:fc.c3(np.array(x),3))
ts_features['mean_change'] = ts_features['all activity ts'].apply(lambda x:fc.mean_change(np.array(x)))

In [None]:
# Copy only the columns named in `tsfresh_features` back onto the main frame;
# any other column of `ts_features` is not carried over.
filtered_developer_period_df[tsfresh_features] = ts_features[tsfresh_features]

### diverse and balance

In [None]:
work_preference_features = tsfresh_features + ['balance','diverse']
# diverse: number of activity types with a non-zero count in the period.
filtered_developer_period_df['diverse'] = (filtered_developer_period_df[features] != 0).sum(axis=1)
# balance: inverse of the spread across the normalized activities. A zero
# spread gives inf, which is replaced with a big number to prevent future
# calculation problems. The result is assigned back explicitly because a
# chained `df[col].replace(..., inplace=True)` is not guaranteed to modify
# the frame.
filtered_developer_period_df['balance'] = (
    1/filtered_developer_period_df[normalized_feature].std(axis=1)
).replace([np.inf], 6000)

## Technical Importance

In [63]:
# For every project, build an undirected graph of its file tree from the paths
# touched by commits ('root' -> top-level entry -> ... -> file) and attach to
# each commit the eigenvector centrality of the files it changed.
for i in range(len(projects)):
    proj_commit = project_commit_dict[i]
    file_graph = nx.Graph()
    file_graph.add_node('root')
    commit_sorted = dict(sorted(proj_commit.items(), key=lambda item: item[1]['Time']))
    for commit_id in commit_sorted:
        for file in commit_sorted[commit_id]['Changed Files']:
            file_parts = file.split('/')
            # Node ids are '/'-joined path prefixes, so the last node of each
            # chain is exactly the string stored in 'Changed Files'. Building
            # them with os.path.join broke the lookup below on platforms whose
            # separator is not '/' and for paths with empty components.
            current_node = file_parts[0]
            file_graph.add_edge('root', current_node)
            for part in file_parts[1:]:
                child_node = current_node + '/' + part
                file_graph.add_edge(current_node, child_node)
                current_node = child_node

    eigenvector_centrality = nx.eigenvector_centrality(file_graph,max_iter=500)
    project_commit_df[i]['centrality'] = project_commit_df[i]['Changed Files'].apply(lambda x:[eigenvector_centrality[item] for item in x])

In [64]:
# Collapse each commit's list of file centralities into four summary columns;
# commits that changed no files get 0 for all of them.
centrality_reducers = (
    ('sum', np.sum),
    ('max', np.max),
    ('avg', np.mean),
    ('med', np.median),
)
for commit_frame in project_commit_df:
    for label, reducer in centrality_reducers:
        commit_frame[f'{label} centrality'] = commit_frame['centrality'].apply(
            lambda values, reducer=reducer: reducer(values) if len(values) > 0 else 0)
# Nested-dict view ({commit id: {column: value}}) including the new columns.
project_commits = [item.to_dict('index') for item in project_commit_df]

In [65]:
def period_centrality(row, commit_ids, commit_dict, commit_metric = 'avg', period_metric = 'sum'):
    """Aggregate the centrality of a contributor's commits within one period.

    Args:
        row: mapping with 'start' and 'end'; a commit is in the period when
            start <= Time < end.
        commit_ids: ids of the contributor's commits (keys of ``commit_dict``).
        commit_dict: {commit id: {'Time': ..., '<metric> centrality': ...}}.
        commit_metric: which per-commit column to read ('<metric> centrality').
        period_metric: how to combine the per-commit values:
            'avg', 'max', 'median' or 'sum'.

    Returns:
        The aggregated value; 0 when there are no commits (at all or in the
        period); None for an unknown ``period_metric``.
    """
    if len(commit_ids) <= 0:
        return 0
    column = commit_metric +' centrality'
    chronological = sorted(commit_ids, key=lambda cid: commit_dict[cid]['Time'])
    in_period = [
        cid for cid in chronological
        if commit_dict[cid]['Time'] >= row['start'] and commit_dict[cid]['Time'] < row['end']
    ]
    if len(in_period) <= 0:
        return 0
    values = [commit_dict[cid][column] for cid in in_period]
    reducers = {'avg': np.mean, 'max': np.max, 'median': np.median, 'sum': np.sum}
    if period_metric not in reducers:
        return None
    return reducers[period_metric](values)

In [66]:
# Per (contributor, period): sum of the average file centrality of the
# contributor's commits in that period.
for i, proj in enumerate(projects):
    dev_commits_dict = project_commit_df[i].groupby(['Author']).apply(lambda x: list(x.index)).to_dict()
    in_project = filtered_developer_period_df['project']==proj
    filtered_developer_period_df.loc[in_project, 'period commit centrality'] = filtered_developer_period_df.loc[in_project].apply(
        lambda x:period_centrality(x, dev_commits_dict[x['name']], project_commits[i], 'avg', 'sum'), axis=1)

## Store extracted features

### Sort each developer's activities in periods into a sequence

In [80]:
def bin_periods_to_sequence(proj, name, feature_name):
    """Collect one contributor's per-period values of a feature, ordered by period.

    Args:
        proj: project label to match against the 'project' column.
        name: contributor name to match against the 'name' column.
        feature_name: column of ``filtered_developer_period_df`` to read.

    Returns:
        list of values in ascending period order, or None when the
        contributor has no row in the project.
    """
    is_match = (filtered_developer_period_df['project']==proj)&(filtered_developer_period_df['name']==name)
    rows = filtered_developer_period_df.loc[is_match]
    if len(rows) == 0:
        return None
    if len(rows) == 1:
        return rows[feature_name].to_list()
    value_by_period = pd.Series(rows[feature_name].values, index=rows['period']).to_dict()
    return [value_by_period[period] for period in sorted(value_by_period)]

In [None]:
# One row per contributor with the ordered sequence of workload pattern codes.
# Explicit copies are taken because columns are added to `developer_sequence`
# here and in later cells; assigning onto a slice of `developer_data` triggers
# SettingWithCopyWarning.
developer_sequence = developer_data[['name', 'project','profile', 'join date']].copy()
developer_sequence['workload_sequence'] = developer_sequence.apply(lambda x:bin_periods_to_sequence(x['project'],x['name'], 'wcp_code'),axis=1)
# Drop contributors without any retained period.
developer_sequence = developer_sequence.loc[~developer_sequence['workload_sequence'].isna()].copy()

In [82]:
# bin period work preference into sequence
# Adds one '<feature>_sequence' column per entry of `work_preference_features`.
for fea in work_preference_features:
    developer_sequence[f'{fea}_sequence'] = developer_sequence.apply(lambda x:bin_periods_to_sequence(x['project'],x['name'], fea),axis=1)

In [84]:
# bin period raw activity counts into sequence
# Adds one '<feature>_sequence' column per entry of `features`.
for fea in features:
    developer_sequence[f'{fea}_sequence'] = developer_sequence.apply(lambda x:bin_periods_to_sequence(x['project'],x['name'], fea),axis=1)

In [86]:
# bin period technical importance into sequence
# Adds the 'period commit centrality_sequence' column.
fea = 'period commit centrality'
developer_sequence[f'{fea}_sequence'] = developer_sequence.apply(lambda x:bin_periods_to_sequence(x['project'],x['name'], fea),axis=1)

In [87]:
# bin per commit importance into sequence
def get_per_commit_centrality_sequence(commit_ids, commit_dict, commit_metric = 'avg'):
    """List a contributor's per-commit centrality values in chronological order.

    Args:
        commit_ids: ids of the contributor's commits (keys of ``commit_dict``).
        commit_dict: {commit id: {'Time': ..., '<metric> centrality': ...}}.
        commit_metric: which per-commit column to read ('<metric> centrality').

    Returns:
        list of centrality values ordered by commit time, or the scalar 0 when
        ``commit_ids`` is empty.
    """
    if not len(commit_ids) > 0:
        return 0
    column = commit_metric +' centrality'
    chronological = list(commit_ids)
    chronological.sort(key=lambda cid: commit_dict[cid]['Time'])
    return [commit_dict[cid][column] for cid in chronological]

# Attach to every contributor the chronological list of their commits'
# average file centrality.
for i, proj in enumerate(projects):
    dev_commits_dict = project_commit_df[i].groupby(['Author']).apply(lambda x: list(x.index)).to_dict()
    in_project = developer_sequence['project']==proj
    developer_sequence.loc[in_project, 'per commit centrality_sequence'] = developer_sequence.loc[in_project, 'name'].apply(
        lambda dev: get_per_commit_centrality_sequence(dev_commits_dict[dev], project_commits[i], 'avg'))

### calculate technical importance metrics

In [88]:
def get_max_centrality_day(row, commit_ids, commit_dict, commit_metric = 'avg'):
    """Days between joining and the contributor's highest-centrality commit.

    Args:
        row: mapping with 'join date'.
        commit_ids: ids of the contributor's commits (keys of ``commit_dict``).
        commit_dict: {commit id: {'Time': ..., '<metric> centrality': ...}}.
        commit_metric: which per-commit column to rank by ('<metric> centrality').

    Returns:
        Whole days from row['join date'] to the time of the commit with the
        largest centrality (the first such commit on ties).

    Raises:
        ValueError: if ``commit_ids`` is empty.
    """
    column = commit_metric +' centrality'

    def centrality_of(commit_id):
        return commit_dict[commit_id][column]

    top_commit = max(commit_ids, key=centrality_of)
    elapsed = commit_dict[top_commit]['Time'] - row['join date']
    return elapsed.days


# Peak importance per contributor: the largest per-commit and per-period
# centrality, the 1-based position of the peak period within the sequence,
# and the number of days from joining to the peak commit.
per_commit_sequences = developer_sequence['per commit centrality_sequence']
per_period_sequences = developer_sequence['period commit centrality_sequence']
developer_sequence['max_commit_centrality'] = per_commit_sequences.apply(max)
developer_sequence['max_period_centrality'] = per_period_sequences.apply(max)
developer_sequence['max_centrality_period'] = per_period_sequences.apply(lambda seq: np.argmax(seq)+1)
for i, proj in enumerate(projects):
    dev_commits_dict = project_commit_df[i].groupby(['Author']).apply(lambda x: list(x.index)).to_dict()
    in_project = developer_sequence['project']==proj
    developer_sequence.loc[in_project, 'max_centrality_day'] = developer_sequence.loc[in_project].apply(
        lambda x:get_max_centrality_day(x, dev_commits_dict[x['name']], project_commits[i], 'avg'), axis=1)

### store data

In [79]:
# Persist the contributor-period table (this is the file the commented-out
# reload cell under "Identify workload composition pattern" reads back).
# NOTE(review): list-valued columns such as the '... ts' series are written as
# their string representation — confirm the reload path parses them back.
filtered_developer_period_df.to_csv(os.path.join(basedir,'contributor_period_activity.csv'))

In [89]:
# Persist the per-contributor sequence table next to the input data.
developer_sequence.to_csv(os.path.join(basedir,'contributor_activity_sequence.csv'))