# Impact analysis of initial commits

## Read dataframe

In [210]:
from impact_analysis.prepare_data import prepare_data

df = prepare_data('./data/final/api_result.json', './data/final/Repo_Commits_JSON.json')

FileNotFoundError: [Errno 2] No such file or directory: '../data/result.json'

In [189]:
print(df.describe())

              stars      releases     forkCount    commitCount      watchers  \
count  12716.000000  12716.000000  12716.000000   12716.000000  12716.000000   
mean      37.833517      0.634869      9.310396    1693.293960      4.818968   
std      557.800379      4.974379    159.876539   22711.029629     41.620977   
min        0.000000      0.000000      0.000000       1.000000      0.000000   
25%        0.000000      0.000000      0.000000       6.000000      1.000000   
50%        0.000000      0.000000      0.000000      17.000000      1.000000   
75%        2.000000      0.000000      1.000000      76.000000      2.000000   
max    33611.000000    245.000000  13395.000000  721223.000000   2351.000000   

          diskUsage  
count  1.271600e+04  
mean   1.412237e+04  
std    9.187170e+04  
min    0.000000e+00  
25%    1.120000e+02  
50%    2.440000e+02  
75%    2.287750e+03  
max    3.705652e+06  


## Delete forks

In [190]:
df_total: int = len(df.index)
df = df[df['isFork']]
df_no_fork: int = len(df.index)
print('Removed ' + str(df_total - df_no_fork) + ' fork repositories.')
print('Repository is now of size: ' + str(df_no_fork))

Removed 11169 fork repositories.
Repository is now of size: 1547


## Apply Formula

In [191]:
from datetime import datetime


def success_formula(series):
    active_timedelta = series.updatedAt - series.createdAt
    commit_value = series.commitCount / 8
    positive = series.stars + series.forkCount + series.watchers + active_timedelta.days + commit_value
    timedelta = datetime.now() - series.updatedAt
    value = positive / (timedelta.days + 1)
    return value

In [192]:
from pandas import Series


def apply_formula(formula, name, dataframe):
    new_row = {}
    for index, row in dataframe.iterrows():
        new_row[index] = formula(row)

    dataframe[name] = Series(new_row)

In [194]:
apply_formula(success_formula, 'successIndex', df)

In [195]:
print(df.head())

          nameWithOwner           createdAt           updatedAt  stars  \
PhE_dask       PhE/dask 2015-09-18 21:43:19 2016-02-11 18:22:48      0   
ccryx_i3       ccryx/i3 2015-12-07 10:58:23 2015-12-07 10:58:25      0   
ATamm_rmp     ATamm/rmp 2016-10-04 21:54:24 2016-11-17 00:22:53      0   
Nify_tidb     Nify/tidb 2017-03-14 11:45:36 2018-09-20 01:13:14      0   
ai5_apery     ai5/apery 2015-03-16 04:51:04 2017-11-13 07:08:12      1   

           releases  isFork  forkCount  commitCount  \
PhE_dask          0    True          0         2693   
ccryx_i3          0    True          0         5296   
ATamm_rmp         0    True          0           92   
Nify_tidb         0    True          0         7297   
ai5_apery         6    True          0          403   

                                                 description  watchers  \
PhE_dask   Task scheduling and blocked algorithms for par...         1   
ccryx_i3          A better tiling and dynamic window manager         1   
AT

## Combine Dataframe with Commit Messages

In [196]:
from json import load

with open('./data/14000Result.json') as f:
    json_commit_msgs = load(f)

In [197]:
commitDict = {}
counter = 0

for entry in json_commit_msgs:
    repo_descriptor = entry['repoName'].partition('/')
    owner = repo_descriptor[0].replace(r'\W', '').replace(r'^\d+', '')
    name = repo_descriptor[2].replace(r'\W', '')
    owner_with_name = owner + '_' + name
    commitDict[owner_with_name] = entry['message']
    
    if owner_with_name not in df.index:
        counter += 1
        
print(counter)

14861


In [198]:
newColumn = Series(commitDict)
df['initialCommitMessage'] = newColumn

In [199]:
print(df.head())

          nameWithOwner           createdAt           updatedAt  stars  \
PhE_dask       PhE/dask 2015-09-18 21:43:19 2016-02-11 18:22:48      0   
ccryx_i3       ccryx/i3 2015-12-07 10:58:23 2015-12-07 10:58:25      0   
ATamm_rmp     ATamm/rmp 2016-10-04 21:54:24 2016-11-17 00:22:53      0   
Nify_tidb     Nify/tidb 2017-03-14 11:45:36 2018-09-20 01:13:14      0   
ai5_apery     ai5/apery 2015-03-16 04:51:04 2017-11-13 07:08:12      1   

           releases  isFork  forkCount  commitCount  \
PhE_dask          0    True          0         2693   
ccryx_i3          0    True          0         5296   
ATamm_rmp         0    True          0           92   
Nify_tidb         0    True          0         7297   
ai5_apery         6    True          0          403   

                                                 description  watchers  \
PhE_dask   Task scheduling and blocked algorithms for par...         1   
ccryx_i3          A better tiling and dynamic window manager         1   
AT

## Analyze Dataframe

In [200]:
# df = df.dropna(how='any', axis=0)
print(len(df[df.initialCommitMessage.notna()].index))
print(len(df.index))

758
1547


In [201]:
conform_df = df[(df['initialCommitMessage'] == 'Initial commit')]

In [202]:
print('Dataframe size: ' + str(len(df.index)))
print('Conform df size: ' + str(len(conform_df.index)))