In [2]:
import json
import pandas as pd
import numpy as np

In [3]:
# Specify the covariates file to use
covariates_file = 'covariates.json'

In [5]:
df_raw = pd.read_json(covariates_file)

In [9]:
# print(df_raw.columns)

Index(['id', 'name', 'isFork', 'commits', 'branches', 'releases', 'forks',
       'mainLanguage', 'defaultBranch', 'license', 'homepage', 'watchers',
       'stargazers', 'contributors', 'size', 'createdAt', 'pushedAt',
       'updatedAt', 'totalIssues', 'openIssues', 'totalPullRequests',
       'openPullRequests', 'blankLines', 'codeLines', 'commentLines',
       'metrics', 'lastCommit', 'lastCommitSHA', 'hasWiki', 'isArchived',
       'isDisabled', 'isLocked', 'languages', 'labels', 'topics'],
      dtype='object')
0      2009-12-10T12:51:14
1      2010-05-17T09:27:02
2      2010-09-06T09:39:43
3      2010-11-09T09:22:21
4      2011-02-04T02:40:00
              ...         
389    2015-08-04T07:01:06
390    2014-02-06T12:18:47
391    2014-06-06T10:56:04
392    2012-06-04T02:49:46
393    2013-12-14T09:47:37
Name: createdAt, Length: 394, dtype: object


In [30]:
def filter_language(langs):
    # langs is a dictionary of languages and their bytes
    # we want to return the most used language
    if not langs:
        return 'None'
    else:
        return max(langs, key=langs.get)

def get_age_seconds(age):
    # returns the age of a repo in seconds
    return (pd.to_datetime('today') - pd.to_datetime(age)).total_seconds()


In [223]:
# opvs are the covariates we are interested in
opvs = ['name', 'size', 'contributors', 'watchers',
        'forks', 'stargazers', 'createdAt',
        'license', 'languages']

df_filtered = df_raw[opvs]

# get the most used language
df_filtered['language'] = df_raw['languages'].apply(filter_language)
# get the age of the repo in days
df_filtered['age'] = df_raw['createdAt'].apply(get_age_seconds)
# convert the age to days
df_filtered['age_days'] = df_filtered['age'] / (60*60*24)
df_final = df_filtered.drop(['languages'], axis=1)


# print(df_final.head())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['language'] = df_raw['languages'].apply(filter_language)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['age'] = df_raw['createdAt'].apply(get_age_seconds)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['age_days'] = df_filtered['age'] / (60*60*24)


In [225]:
# get sample summary statistics, and convert to latex

summary = df_final.describe()

print(summary.to_latex())
transposed = summary.T
transposed = transposed.drop(['count'], axis=1)


print(transposed.to_latex(index=False, float_format='%.2f'))


\begin{tabular}{lrrrrrrr}
\toprule
 & size & contributors & watchers & forks & stargazers & age & age_days \\
\midrule
count & 394.000000 & 393.000000 & 394.000000 & 394.000000 & 394.000000 & 394.000000 & 394.000000 \\
mean & 256293.167513 & 204.608142 & 564.251269 & 4101.934010 & 17251.862944 & 337126370.057169 & 3901.925579 \\
std & 525664.305803 & 146.514327 & 1005.098236 & 8527.580280 & 34180.806563 & 54610608.144787 & 632.067224 \\
min & 15.000000 & 1.000000 & 0.000000 & 2.000000 & 10.000000 & 56447613.326101 & 653.328858 \\
25% & 31868.000000 & 76.000000 & 86.250000 & 426.250000 & 1450.000000 & 294301686.079039 & 3406.269515 \\
50% & 91607.500000 & 181.000000 & 253.500000 & 1432.000000 & 6520.500000 & 329480577.286973 & 3813.432607 \\
75% & 238346.250000 & 341.000000 & 601.000000 & 3748.500000 & 18226.000000 & 373068917.018838 & 4317.927280 \\
max & 5882661.000000 & 1230.000000 & 8503.000000 & 78315.000000 & 384563.000000 & 502034098.328888 & 5810.579842 \\
\bottomrule
\end{tabul

In [226]:
# specify file that contains treatment data
treated_repos_data = '../data/treatment-new.json'


In [227]:
treated = []
ser = pd.read_json(treated_repos_data, typ='series')

for k, v in ser.items():
    # skip if v is NaT
    if pd.isnull(v):
        continue
    
    treated.append('/'.join(k.split('--')))

print(len(treated))

449


In [228]:
df_treatment = []
df_control = []
for _, row in df_final.iterrows():
    if row['name'] in treated:
        df_treatment.append(row)
    else:
        df_control.append(row)


df_treatment = pd.DataFrame(df_treatment)
df_control = pd.DataFrame(df_control)
print(len(df_treatment))
# print(df_treatment.head())
print(len(df_control))
# print(df_control.head())


306
88


In [229]:
# get summary statistics of each group

covariates = ['size', 'contributors', 'watchers', 'forks', 'stargazers', 'age_days']

t_describe = df_treatment.drop(['age'], axis=1).describe().T
c_describe = df_control.drop(['age'], axis=1).describe().T

t_describe = t_describe[['mean', 'std']]
t_describe.insert(0, 'Covar', covariates)
c_describe = c_describe[['mean', 'std']]

c_describe.insert(0, 'Covar', covariates)


print(t_describe.to_latex(index=False, float_format='%.2f'))
print(c_describe.to_latex(index=False, float_format='%.2f'))


\begin{tabular}{lrr}
\toprule
Covar & mean & std \\
\midrule
size & 289841.24 & 545256.81 \\
contributors & 237.63 & 141.30 \\
watchers & 679.57 & 1106.43 \\
forks & 5028.98 & 9430.21 \\
stargazers & 21250.56 & 37731.74 \\
age_days & 3960.84 & 610.90 \\
\bottomrule
\end{tabular}

\begin{tabular}{lrr}
\toprule
Covar & mean & std \\
\midrule
size & 139637.36 & 433840.79 \\
contributors & 88.47 & 97.55 \\
watchers & 163.25 & 249.55 \\
forks & 878.33 & 1787.74 \\
stargazers & 3347.32 & 5894.43 \\
age_days & 3697.06 & 664.59 \\
\bottomrule
\end{tabular}



In [220]:
# run t test for covariate balance between treatment and control
from scipy.stats import ttest_ind

test_results = []
for covar in covariates:
    t, p = ttest_ind(df_treatment[covar], df_control[covar], nan_policy='omit')
    test_results.append((covar, t, p))

print(pd.DataFrame(test_results, columns=['Covariate', 't', 'p']).to_latex(index=False, float_format='%.2f'))


\begin{tabular}{lrr}
\toprule
Covariate & t & p \\
\midrule
size & 2.38 & 0.02 \\
contributors & 9.24 & 0.00 \\
watchers & 4.34 & 0.00 \\
forks & 4.10 & 0.00 \\
stargazers & 4.43 & 0.00 \\
age_days & 3.50 & 0.00 \\
\bottomrule
\end{tabular}

