In [2]:
import numpy as np
import pandas as pd
import os
import datetime
import pandas as pd
import statsmodels
import statsmodels.api as sm
import statsmodels.formula.api as smf
from rpy2.robjects.packages import importr

## Read data

In [3]:
def date_time_handler(date_time_str):
    """Parse a ``'YYYY-MM-DD HH:MM:SS'`` string into a ``datetime.datetime``.

    Raises ``ValueError`` (from ``strptime``) when the string does not match
    that layout.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    parsed = datetime.datetime.strptime(date_time_str, timestamp_format)
    return parsed

In [11]:
# Contributor-level inputs are read from ../data (resolved against the current
# working directory); the first CSV column becomes the index of each frame.
basedir = os.path.join("..", "data")
developer_data = pd.read_csv(
    os.path.join(basedir, 'contributor_features.csv'),
    index_col=0,
)
filtered_developer_period_df = pd.read_csv(
    os.path.join(basedir, 'contributor_period_activity.csv'),
    index_col=0,
)

In [12]:
# Root of the per-project data folders, resolved from the current working directory.
basedir = os.path.join(os.getcwd(), '..', 'data')

# Folder name of every studied project under `basedir`.
proj_names = [
    'tensorflow_tensorflow',
    'pytorch_pytorch',
    'scikit-learn_scikit-learn',
    'keras-team_keras',
    'apache_mxnet',
    'theano_aesara',
    'onnx_onnx',
    'deeplearning4j_deeplearning4j',
]

# One data directory per project, in the same order as proj_names.
repo_dirs = [os.path.join(basedir, folder) for folder in proj_names]

# Short project labels; position k here corresponds to proj_names[k] / repo_dirs[k].
projects = [
    'tensorflow',
    'pytorch',
    'scikit-learn',
    'keras',
    'mxnet',
    'theano_aesara',
    'onnx',
    'deeplearning4j',
]

In [13]:
# Studied period: 2008-01-01 (lower bound) to 2024-01-01 (upper bound), as naive datetimes.
start = datetime.datetime(year=2008, month=1, day=1)
end = datetime.datetime(year=2024, month=1, day=1)

In [82]:
# Load each project's fork events and keep those with start <= time < end.
project_fork_df = []
for repo_path in repo_dirs:
    forks = pd.read_csv(os.path.join(repo_path, 'fork_history.csv'), index_col=0)
    # Any '+00:00' offset suffix is stripped so the value parses as a naive datetime.
    forks['time'] = forks['time'].apply(lambda raw: date_time_handler(raw.replace('+00:00', '')))
    in_window = (forks['time'] >= start) & (forks['time'] < end)
    project_fork_df.append(forks.loc[in_window])

In [83]:
# Load each project's star events and keep those with start <= starredAt < end.
project_star_df = []
# Layout of the 'starredAt' strings (an earlier variant used '%Y-%m-%dT%H:%M:%SZ').
star_time_format = '%Y-%m-%d %H:%M:%S'
for repo_path in repo_dirs:
    stars = pd.read_csv(os.path.join(repo_path, 'star_history.csv'), index_col=0)
    stars['starredAt'] = stars['starredAt'].apply(
        lambda raw: datetime.datetime.strptime(raw, star_time_format)
    )
    in_window = (stars['starredAt'] >= start) & (stars['starredAt'] < end)
    project_star_df.append(stars.loc[in_window])

In [None]:
# Load every project's commit table, parse its columns, and index it by 'Commit#'.
project_commit_df = []
for proj_dir in repo_dirs:
    commit_df = pd.read_csv(os.path.join(proj_dir, 'commit_main.csv'), index_col=0)
    commit_df['Time'] = pd.to_datetime(commit_df['Time'])
    # Whitespace-separated strings become lists; non-string cells (e.g. NaN) become [].
    for list_column in ('Changed Files', 'Parents'):
        commit_df[list_column] = commit_df[list_column].apply(
            lambda cell: cell.split() if isinstance(cell, str) else []
        )
    commit_df.set_index('Commit#', inplace=True)
    project_commit_df.append(commit_df)

# Per project: {commit id -> {column -> value}}.
project_commit_dict = [frame.to_dict(orient='index') for frame in project_commit_df]

In [None]:
# Load every project's pull-request and issue tables.
# Timestamp columns are parsed with pd.to_datetime; whitespace-separated string
# columns are split into lists (non-string cells such as NaN become []).
time_columns = ['Opened time', 'Closed time']
pr_list_columns = ['Labels', 'Assignees', 'Reviewers', 'Participants', 'Commits']
issue_list_columns = ['Labels', 'Assignees', 'Participants']

project_pr_df = []
for proj_dir in repo_dirs:
    pr_df = pd.read_csv(os.path.join(proj_dir, 'pull_request_main.csv'), index_col=0)
    for time_col in time_columns:
        pr_df[time_col] = pd.to_datetime(pr_df[time_col])
    for list_col in pr_list_columns:
        pr_df[list_col] = pr_df[list_col].apply(
            lambda cell: cell.split() if isinstance(cell, str) else []
        )
    project_pr_df.append(pr_df)
project_pr_dict = [frame.to_dict(orient='index') for frame in project_pr_df]

project_issue_df = []
for proj_dir in repo_dirs:
    issue_df = pd.read_csv(os.path.join(proj_dir, 'issue_main.csv'), index_col=0)
    for time_col in time_columns:
        issue_df[time_col] = pd.to_datetime(issue_df[time_col])
    for list_col in issue_list_columns:
        issue_df[list_col] = issue_df[list_col].apply(
            lambda cell: cell.split() if isinstance(cell, str) else []
        )
    project_issue_df.append(issue_df)
project_issue_dict = [frame.to_dict(orient='index') for frame in project_issue_df]

## Prepare dependent variable

### Get 90-day periods for each project

In [101]:
def get_period_stars(i,x):
    """Count the stars of project ``i`` with ``x['start'] <= starredAt < x['end']``.

    ``i`` indexes ``project_star_df``; ``x`` is a row (or mapping) carrying the
    period's 'start' and 'end' bounds.
    """
    stars = project_star_df[i]
    starred_at = stars['starredAt']
    within_period = (starred_at >= x['start']) & (starred_at < x['end'])
    return len(stars.loc[within_period])
    
def get_period_forks(i,x):
    """Count the forks of project ``i`` with ``x['start'] <= time < x['end']``.

    ``i`` indexes ``project_fork_df``; ``x`` is a row (or mapping) carrying the
    period's 'start' and 'end' bounds.
    """
    forks = project_fork_df[i]
    forked_at = forks['time']
    within_period = (forked_at >= x['start']) & (forked_at < x['end'])
    return len(forks.loc[within_period])

In [69]:
# Lifespan of each project: from its earliest commit / issue / pull request
# to the end of the studied period.
project_duration = [
    {
        'start': min(
            project_commit_df[idx]['Time'].min(),
            project_issue_df[idx]['Opened time'].min(),
            project_pr_df[idx]['Opened time'].min(),
        ),
        'end': end,
    }
    for idx in range(len(projects))
]

In [102]:
# Split each project's lifespan into consecutive periods of `num_days` days.
num_days = 90
delta = datetime.timedelta(days=num_days)
project_metric = ['start', 'end', 'star', 'fork']
project_stage_df = []
for idx in range(len(projects)):
    lifespan = project_duration[idx]
    period_starts, period_ends = [], []
    cur = lifespan['start']
    # A period is kept only when it ends strictly before the project's end date,
    # so an incomplete trailing period is not included.
    while cur + delta < lifespan['end']:
        period_starts.append(cur)
        period_ends.append(cur + delta)
        cur += delta
    stage = pd.DataFrame(columns=project_metric)
    stage['start'] = period_starts
    stage['end'] = period_ends
    project_stage_df.append(stage)

### Bin the number of forks and stars into 90-day periods

In [103]:
# Label every period row with its project and period number, then count the
# stars and forks that fall inside the period.
for idx in range(len(projects)):
    stage = project_stage_df[idx]
    stage['project'] = projects[idx]
    stage['period'] = stage.index
    stage['star'] = stage.apply(lambda row: get_period_stars(idx, row), axis=1)
    stage['fork'] = stage.apply(lambda row: get_period_forks(idx, row), axis=1)

In [None]:
# Remove the initial periods in which a project might not be public yet (fork=0),
# then renumber the remaining periods so that each project starts again at 0.
# NOTE: this cell rewrites entries of project_stage_df, so it is not idempotent:
# re-running it without rebuilding project_stage_df drops further periods.
initial_periods_to_drop = {
    'pytorch': 20,        # periods 0-19
    'theano_aesara': 14,  # periods 0-13
    'scikit-learn': 2,    # periods 0-1
    'mxnet': 1,           # period 0
}
for project_name, n_dropped in initial_periods_to_drop.items():
    stage_idx = projects.index(project_name)
    df = project_stage_df[stage_idx]
    # The mxnet filter used to be `~df['period']==0`. Unary `~` binds tighter than
    # `==`, so it evaluated `(~df['period']) == 0`, which is true only for
    # period == -1 and therefore removed every mxnet row. An explicit comparison
    # keeps the periods from n_dropped onwards (period numbers are non-negative).
    # .copy() makes the slice an independent frame so the assignment below does
    # not trigger SettingWithCopyWarning.
    df = df.loc[df['period'] >= n_dropped].copy()
    df['period'] = df['period'] - n_dropped
    project_stage_df[stage_idx] = df

In [None]:
# Stack the per-project period tables into one frame with a fresh 0..n-1 index;
# its 'star' and 'fork' columns are the dependent variables.
lemr_data = pd.concat(project_stage_df, axis=0, ignore_index=True)
lemr_data

## Prepare independent variables

In [106]:
# Integer code of each row's project: its position in `projects`.
lemr_data['project_code'] = lemr_data['project'].apply(projects.index)

### workload composition related variables

In [None]:
# Workload-composition features per (project, period).
# (column in the activity table, suffix used in the lemr_data column name)
activity_columns = [('commit', 'commit'), ('issue', 'issue'), ('issue comment', 'issue_comment'),
                    ('pr comment', 'pr_comment'), ('review', 'review')]
pattern_codes = [1, 2, 3, 4, 5]

for i in range(len(projects)):
    periods = lemr_data.loc[lemr_data['project'] == projects[i]]['period'].unique()
    for p in periods:
        try:
            pavtivity = filtered_developer_period_df.loc[
                (filtered_developer_period_df['project'] == projects[i])
                & (filtered_developer_period_df['period'] == p)
            ]
            target_rows = (lemr_data['project'] == projects[i]) & (lemr_data['period'] == p)
            n_records = len(pavtivity)

            # patternK_ratio: share of the period's activity records whose wcp_code is K
            # (0 when the period has no records).
            for code in pattern_codes:
                in_pattern = pavtivity.loc[pavtivity['wcp_code'] == code]
                lemr_data.loc[target_rows, 'pattern{}_ratio'.format(code)] = (
                    len(in_pattern) / n_records if n_records > 0 else 0
                )

            # patternK_<activity>_ratio: share of the period's total for that activity
            # contributed by records whose wcp_code is K. NaN results (e.g. from a
            # zero total) are replaced by the fillna(0) after the loop.
            for code in pattern_codes:
                in_pattern = pavtivity.loc[pavtivity['wcp_code'] == code]
                for source_col, suffix in activity_columns:
                    lemr_data.loc[target_rows, 'pattern{}_{}_ratio'.format(code, suffix)] = (
                        in_pattern[source_col].sum() / pavtivity[source_col].sum()
                    )
        except Exception as e:
            # Report which (project, period) failed, then let the error propagate.
            print(projects[i],p)
            print(e)
            raise
lemr_data.fillna(0, inplace=True)

In [None]:
# Screen the workload-composition variables for pairwise correlation.
pattern_ids = range(1, 6)
activity_suffixes = ['commit', 'issue', 'issue_comment', 'pr_comment', 'review']
# pattern1_ratio .. pattern5_ratio, followed by patternK_<activity>_ratio for every pattern.
workload_variables = (
    ['pattern{}_ratio'.format(k) for k in pattern_ids]
    + ['pattern{}_{}_ratio'.format(k, suffix) for k in pattern_ids for suffix in activity_suffixes]
)

# Variables excluded from the models because they are correlated with others.
correlated_features = [
    'pattern1_commit_ratio', 'pattern1_issue_comment_ratio', 'pattern1_pr_comment_ratio', 'pattern1_review_ratio',
    'pattern2_pr_comment_ratio', 'pattern2_commit_ratio', 'pattern2_issue_ratio',
    'pattern3_commit_ratio', 'pattern3_issue_comment_ratio', 'pattern3_pr_comment_ratio',
    'pattern4_ratio', 'pattern4_issue_comment_ratio', 'pattern4_pr_comment_ratio',
    'pattern5_ratio', 'pattern5_commit_ratio', 'pattern5_issue_comment_ratio', 'pattern5_pr_comment_ratio',
]
# Exclusion list used in the old paper, kept for reference:
# correlated_features = [ 'pattern1_commit_ratio', 'pattern1_issue_comment_ratio','pattern1_pr_comment_ratio',
#                        'pattern2_pr_comment_ratio', 'pattern2_commit_ratio',
#                        'pattern3_commit_ratio', 'pattern3_issue_comment_ratio', 'pattern3_pr_comment_ratio',
#                        'pattern4_ratio', 'pattern4_issue_comment_ratio', 'pattern4_pr_comment_ratio',
#                        'pattern5_ratio','pattern5_commit_ratio','pattern5_issue_ratio', 'pattern5_issue_comment_ratio', 'pattern5_pr_comment_ratio'
#     ]

workload_variables_noncor = [name for name in workload_variables if name not in correlated_features]

# Remaining pairs whose absolute Spearman correlation still exceeds 0.5;
# an empty result means the retained variables pass the screen.
correlation_matrix = lemr_data[workload_variables_noncor].corr(method='spearman')
cutoff = correlation_matrix.abs() > 0.5
correlated = [
    (col, row)
    for col in correlation_matrix.columns
    for row in correlation_matrix.index
    if row != col and cutoff.at[row, col]
]
correlated

### work preference related variables

In [109]:
# Work-preference features per (project, period).
# Columns summarised by their median over the period's activity records.
median_features = ['binned_entropy', 'c3(1)', 'c3(2)', 'c3(3)', 'number_cwt_peaks',
                   'longest_strike_above_mean', 'longest_strike_below_mean', 'balance', 'diverse']
# (column in the activity table, lemr_data column) pairs summarised by their mean.
mean_features = [('commit', 'commit'), ('issue', 'issue'), ('issue comment', 'issue_comment'),
                 ('pr comment', 'pr_comment'), ('review', 'review')]

for i in range(len(projects)):
    periods = lemr_data.loc[lemr_data['project'] == projects[i]]['period'].unique()
    for p in periods:
        try:
            pavtivity = filtered_developer_period_df.loc[
                (filtered_developer_period_df['project'] == projects[i])
                & (filtered_developer_period_df['period'] == p)
            ]
            # Periods without any activity record are skipped; their cells are
            # filled with 0 by the fillna after the loop.
            if not len(pavtivity):
                continue
            target_rows = (lemr_data['project'] == projects[i]) & (lemr_data['period'] == p)
            for feature in median_features:
                lemr_data.loc[target_rows, feature] = pavtivity[feature].median()
            for source_col, target_col in mean_features:
                lemr_data.loc[target_rows, target_col] = pavtivity[source_col].mean()
        except Exception as e:
            # Report which (project, period) failed, then let the error propagate.
            print(projects[i],p)
            print(e)
            raise
lemr_data.fillna(0, inplace=True)

In [None]:
# Screen the work-preference variables for pairwise correlation.
work_preference_variables = [
    'period', 'binned_entropy', 'c3(1)', 'c3(2)', 'c3(3)', 'number_cwt_peaks',
    'longest_strike_above_mean', 'longest_strike_below_mean', 'diverse', 'balance',
    'commit', 'issue', 'issue_comment', 'pr_comment', 'review',
]
# Variables excluded from the models because they are correlated with others.
# Earlier alternatives, kept for reference:
#correlated_variables = ['binned_entropy', 'longest_strike_below_mean', 'c3(2)', 'c3(3)']
# In old paper
# correlated_variables = ['number_cwt_peaks','diverse', 'longest_strike_above_mean', 'c3(1)', 'c3(2)', 'c3(3)', 'pr_comment']
correlated_variables = ['longest_strike_above_mean', 'longest_strike_below_mean', 'number_cwt_peaks', 'c3(2)', 'c3(3)']

work_preference_variables_noncor = [
    name for name in work_preference_variables if name not in correlated_variables
]

# Remaining pairs whose absolute Spearman correlation still exceeds 0.7;
# an empty result means the retained variables pass the screen.
correlation_matrix = lemr_data[work_preference_variables_noncor].corr(method='spearman')
cutoff = correlation_matrix.abs() > 0.7
correlated = [
    (col, row)
    for col in correlation_matrix.columns
    for row in correlation_matrix.index
    if row != col and cutoff.at[row, col]
]
correlated

In [111]:
# Names such as 'c3(1)' contain parentheses and would be read as a function call
# inside a model formula, so each retained variable gets a formula-safe name
# ('c3(1)' -> 'c3_1') and the data is exposed under that name as well.
lemr_data['c3_1'] = lemr_data['c3(1)']
formula_safe_names = []
for original_name in work_preference_variables_noncor:
    safe_name = original_name.replace('(', '_').replace(')', '')
    # Previously only 'c3(1)' received an alias column; any other parenthesised
    # variable kept in the list was renamed here but missing from lemr_data.
    if safe_name != original_name and safe_name not in lemr_data.columns:
        lemr_data[safe_name] = lemr_data[original_name]
    formula_safe_names.append(safe_name)
work_preference_variables_noncor = formula_safe_names

## Mixed effect model

In [None]:
# Install the R package 'lme4' through rpy2.
# Run this cell only if lme4 is not installed yet; otherwise do not run it.
utils = importr('utils')
base = importr('base')
# Pick CRAN mirror number 1 non-interactively, then install from it.
utils.chooseCRANmirror(ind=1)
utils.install_packages('lme4')

In [113]:
def arrow(num):
    """Return '↗' for a strictly positive number and '↘' for anything else."""
    if num > 0:
        return '↗'
    return '↘'
def signf_code(chisq):
    """Translate a p-value into a significance marker.

    ``<= 0.001`` -> '***', ``<= 0.01`` -> '**', ``<= 0.05`` -> '*',
    ``<= 0.1`` -> '.', anything else (including NaN) -> ' '.
    """
    # Thresholds are checked from the strictest to the loosest.
    levels = ((0.001, '***'), (0.01, '**'), (0.05, '*'), (0.1, '.'))
    for threshold, marker in levels:
        if chisq <= threshold:
            return marker
    return ' '

In [35]:
def min_max_normalize(column):
    """Rescale ``column`` linearly so that its minimum maps to 0 and its maximum to 1.

    For a constant column the result is the scalar 0 when the constant is 0,
    and ``column / constant`` (all ones) otherwise.
    """
    lowest, highest = column.min(), column.max()
    if lowest != highest:
        return (column - lowest) / (highest - lowest)
    if lowest == 0:
        return 0
    return column / lowest

In [62]:
def fit_mixedlm(formula, df, groups, reformula=None):
    """Fit a linear mixed-effects model and compute marginal / conditional R².

    Parameters
    ----------
    formula : str
        Model formula passed to ``smf.mixedlm``.
    df : pandas.DataFrame
        Data the formula is evaluated against.
    groups : array-like
        Grouping variable passed as ``groups``.
    reformula : str, optional
        Random-effects formula passed as ``re_formula``.

    Returns
    -------
    tuple
        ``(marginal_r2, conditional_r2, rslt)`` where ``rslt`` is the fitted
        result object.

    Notes
    -----
    Only the first diagonal entry of ``rslt.cov_re`` is used as the
    random-effect variance, so with a multi-term ``reformula`` the remaining
    random-effect variances are not reflected in the R² values.
    """
    model = smf.mixedlm(formula, df, groups=groups, re_formula=reformula)
    rslt = model.fit(method=["lbfgs"])
    # predict() without arguments: predictions on the data used for fitting.
    pred = rslt.predict()
    var_resid = rslt.scale
    # .iloc[0, 0] addresses the cell positionally in one step. The former
    # .iloc[0][0] indexed a label-indexed Series with an integer key, which
    # depends on pandas' deprecated positional fallback.
    var_random_effect = float(rslt.cov_re.iloc[0, 0])
    var_fixed_effect = pred.var()
    total_var = var_fixed_effect + var_random_effect + var_resid
    marginal_r2 = var_fixed_effect / total_var
    conditional_r2 = (var_fixed_effect + var_random_effect) / total_var
    return marginal_r2, conditional_r2, rslt

### Model: work preference - star

In [None]:
formula = "star ~ " + ' + '.join(work_preference_variables_noncor)
print(formula)
# Remove the initial period. .copy() gives an independent frame, so the
# per-project assignments below do not raise SettingWithCopyWarning and
# lemr_data itself is left untouched.
df = lemr_data.loc[lemr_data['period'] != 0].copy()
# The normalized values are floats; cast the count columns first so that the
# .loc assignments do not write floats into integer columns.
df[['star', 'fork']] = df[['star', 'fork']].astype(float)
# Min-max normalize star and fork within each project.
for i in range(len(projects)):
    in_project = df['project'] == projects[i]
    df.loc[in_project, 'star'] = min_max_normalize(df.loc[in_project, 'star'])
    df.loc[in_project, 'fork'] = min_max_normalize(df.loc[in_project, 'fork'])
mr2, cr2, rslt = fit_mixedlm(formula, df, df['project_code'])
print('conditional r2', cr2)
print('marginal r2', mr2)
print(rslt.summary())

In [None]:
# One row per model term: significance marker of its p-value ('Signif.') and
# direction of its coefficient ('Rel.').
signif_column = rslt.pvalues.apply(signf_code)
direction_column = rslt.params.apply(arrow)
rslt_summary = signif_column.to_frame(name='Signif.').assign(**{'Rel.': direction_column})
rslt_summary.head(50)

### Model: work preference - fork

In [None]:
formula = "fork ~ " + ' + '.join(work_preference_variables_noncor)
print(formula)
# Remove the initial period. .copy() gives an independent frame, so the
# per-project assignments below do not raise SettingWithCopyWarning and
# lemr_data itself is left untouched.
df = lemr_data.loc[lemr_data['period'] != 0].copy()
# The normalized values are floats; cast the count columns first so that the
# .loc assignments do not write floats into integer columns.
df[['star', 'fork']] = df[['star', 'fork']].astype(float)
# Min-max normalize star and fork within each project.
for i in range(len(projects)):
    in_project = df['project'] == projects[i]
    df.loc[in_project, 'star'] = min_max_normalize(df.loc[in_project, 'star'])
    df.loc[in_project, 'fork'] = min_max_normalize(df.loc[in_project, 'fork'])
mr2, cr2, rslt = fit_mixedlm(formula, df, df['project_code'])
print('conditional r2', cr2)
print('marginal r2', mr2)
print(rslt.summary())

In [None]:
# One row per model term: significance marker of its p-value ('Signif.') and
# direction of its coefficient ('Rel.').
signif_column = rslt.pvalues.apply(signf_code)
direction_column = rslt.params.apply(arrow)
rslt_summary = signif_column.to_frame(name='Signif.').assign(**{'Rel.': direction_column})
rslt_summary.head(50)

### Model: workload composition - star

In [None]:
formula = "star ~ " + ' + '.join(workload_variables_noncor)
print(formula)
# Remove the initial period. .copy() gives an independent frame, so the
# per-project assignments below do not raise SettingWithCopyWarning and
# lemr_data itself is left untouched.
df = lemr_data.loc[lemr_data['period'] != 0].copy()
# The normalized values are floats; cast the count columns first so that the
# .loc assignments do not write floats into integer columns.
df[['star', 'fork']] = df[['star', 'fork']].astype(float)
# Min-max normalize star and fork within each project.
for i in range(len(projects)):
    in_project = df['project'] == projects[i]
    df.loc[in_project, 'star'] = min_max_normalize(df.loc[in_project, 'star'])
    df.loc[in_project, 'fork'] = min_max_normalize(df.loc[in_project, 'fork'])
mr2, cr2, rslt = fit_mixedlm(formula, df, df['project_code'])
print('conditional r2', cr2)
print('marginal r2', mr2)
print(rslt.summary())

In [None]:
# One row per model term: significance marker of its p-value ('Signif.') and
# direction of its coefficient ('Rel.').
signif_column = rslt.pvalues.apply(signf_code)
direction_column = rslt.params.apply(arrow)
rslt_summary = signif_column.to_frame(name='Signif.').assign(**{'Rel.': direction_column})
rslt_summary.head(50)

### Model: workload composition - fork

In [None]:
formula = "fork ~ " + ' + '.join(workload_variables_noncor)
print(formula)
# Remove the initial period. .copy() gives an independent frame, so the
# per-project assignments below do not raise SettingWithCopyWarning and
# lemr_data itself is left untouched.
df = lemr_data.loc[lemr_data['period'] != 0].copy()
# The normalized values are floats; cast the count columns first so that the
# .loc assignments do not write floats into integer columns.
df[['star', 'fork']] = df[['star', 'fork']].astype(float)
# Min-max normalize star and fork within each project.
for i in range(len(projects)):
    in_project = df['project'] == projects[i]
    df.loc[in_project, 'star'] = min_max_normalize(df.loc[in_project, 'star'])
    df.loc[in_project, 'fork'] = min_max_normalize(df.loc[in_project, 'fork'])
mr2, cr2, rslt = fit_mixedlm(formula, df, df['project_code'])
print('conditional r2', cr2)
print('marginal r2', mr2)
print(rslt.summary())

In [None]:
# One row per model term: significance marker of its p-value ('Signif.') and
# direction of its coefficient ('Rel.').
signif_column = rslt.pvalues.apply(signf_code)
direction_column = rslt.params.apply(arrow)
rslt_summary = signif_column.to_frame(name='Signif.').assign(**{'Rel.': direction_column})
rslt_summary.head(50)