# Development Consistency of Popular Repos

Notes
- Main Dataset https://www.kaggle.com/datasets/johntukey/github-dataset
- Related Paper https://www.diva-portal.org/smash/get/diva2:1114676/FULLTEXT01.pdf

1. Import modules

In [None]:
from datetime import datetime, timedelta, timezone
import json
import pandas as pd
import matplotlib.pyplot as plt
import statistics

2. Load JSON dataset

In [None]:
# Path to the dataset file; edit this to point at your local copy
path = "dataset.json"

# lines = True tells pandas to parse the file as one JSON object per line
df = pd.read_json( path, lines = True )

3. Add commit list to repos

In [None]:
# function to create a list of repos with their commits
def addCommitListToRepos(df):
    """Group every commit found in the dataset by the repo it belongs to.

    Args:
        df: DataFrame with a "commit_list" column; each cell is an iterable
            of commit records (dicts) that carry a "repo_id" key.

    Returns:
        dict mapping each repo ID to the list of its commit records, in the
        order the commits are encountered while walking the column.
    """
    reposWithCommits = {}

    # every cell of the column is one commit list
    for userCommits in df[ "commit_list" ]:
        for commitRecord in userCommits:
            # start an empty bucket the first time a repo is seen
            reposWithCommits.setdefault( commitRecord[ "repo_id" ], [] ).append( commitRecord )

    return reposWithCommits

In [None]:
def printReposWithCommits(reposWithCommits, max_repos=5, max_commits_per_repo=10):
    """Print a truncated preview of the repo -> commits mapping as JSON.

    Args:
        reposWithCommits: dict mapping repo IDs to lists of commit records.
        max_repos: number of leading repos to include in the preview.
        max_commits_per_repo: number of leading commits to keep per repo.

    The preview is built from slices, so the caller's dict and its commit
    lists are left unmodified.
    """
    # Keep only the first max_repos (repo ID, commit list) pairs
    firstRepos = list(reposWithCommits.items())[:max_repos]

    # Trim each commit list while building a brand-new dict
    preview = {
        repoId: commitList[:max_commits_per_repo]
        for repoId, commitList in firstRepos
    }

    # Indented JSON for readability
    print(json.dumps(preview, indent=4))

4. Add stars to repos

In [None]:
# function to create a list of repos with their stars
def addStarsToRepos(df):
    """Build a lookup from repo ID to that repo's star count.

    Args:
        df: DataFrame with a "repo_list" column; each cell is an iterable of
            repo records (dicts) with "id" and "stargazers_count" keys.

    Returns:
        dict mapping repo ID to star count. If a repo ID appears more than
        once, the last occurrence wins.
    """
    reposWithStars = {}

    # every cell of the column is one repo list
    for userRepos in df[ "repo_list" ]:
        reposWithStars.update(
            ( repoRecord[ "id" ], repoRecord[ "stargazers_count" ] )
            for repoRecord in userRepos
        )

    return reposWithStars

5. Initialize a dictionary for repos with commit lists and repos with stars

In [None]:
# repo ID -> list of that repo's commits, built from the "commit_list" column
reposWithCommits = addCommitListToRepos(df)
# repo ID -> star count, built from the "repo_list" column
reposWithStars = addStarsToRepos(df)

In [None]:
# Preview the grouped data: print at most 5 repos with at most 10 commits each
printReposWithCommits(reposWithCommits, max_repos=5, max_commits_per_repo=10)
# Spot-check the star count of one hard-coded repo ID; this raises KeyError
# if that repo ID is not present in reposWithStars
print( reposWithStars[98311519] )

6. Test for proper initialization of reposWithCommits

In [None]:
# reposWithCommits maps each repo ID to the list of that repo's commits

# Number of distinct repos
total_repos = len(reposWithCommits)

# Number of commits summed over every repo
numCommits = sum(map(len, reposWithCommits.values()))

# Hand-picked repo IDs to inspect; IDs missing from the data are skipped
repo_ids = [121300003, 88377551, 131508193, 132465788, 132464776]
repos_data = {
    repo_id: reposWithCommits[repo_id]
    for repo_id in repo_ids
    if repo_id in reposWithCommits
}

# Bundle the summary numbers and the sampled repos into one structure
data_to_print = {
    "Total number of repos": total_repos,
    "Total number of commits": numCommits,
    "repos": repos_data,
}

# Emit the bundle as indented JSON
print(json.dumps(data_to_print, indent=4))

7. Calculate time between each commit for every repo

In [None]:
def _parseCommitTimestamp( rawTimestamp ):
    """Convert one commit timestamp string into a timezone-aware UTC datetime.

    The string must start with ``YYYY-MM-DDTHH:MM:SS`` and end either with
    ``Z`` or with a ``+HH:MM`` / ``-HH:MM`` offset. Anything between the
    seconds and the zone designator (e.g. fractional seconds) is ignored.

    Raises:
        ValueError: if the string does not end with a recognizable zone
            designator.
    """
    # the first 19 characters are the date and time down to whole seconds
    localTime = datetime.fromisoformat( rawTimestamp[:19] )

    # Z represents no offset, meaning committed in the UTC time zone
    if rawTimestamp[-1] == 'Z':
        return localTime.replace( tzinfo = timezone.utc )

    # otherwise the offset is the last 6 characters, e.g. "+05:30"
    offsetText = rawTimestamp[-6:]
    sign = offsetText[0]
    if sign not in "+-":
        raise ValueError( "timestamp has no UTC offset: " + repr( rawTimestamp ) )

    hours, minutes = map( int, offsetText[1:].split(":") )
    offset = timedelta( hours = hours, minutes = minutes )

    # Take the sign from the sign character, not from the parsed hour value:
    # int("-00") is 0, so an offset such as "-00:30" would otherwise be
    # applied as +00:30.
    if sign == '-':
        offset = -offset

    return localTime.replace( tzinfo = timezone( offset ) ).astimezone( timezone.utc )


def timeBetweenCommits( reposWithCommits ):
    """Compute the gaps between consecutive commits for every repo.

    Args:
        reposWithCommits: dict mapping repo ID to a list of commit records;
            each record is a dict whose "commit_at" value is a timestamp
            string ending in ``Z`` or a ``+HH:MM`` / ``-HH:MM`` offset.

    Returns:
        pandas DataFrame with one row per repo (in dict order) and columns
        "repo_id", "avg_time_between_commits", "median_time_between_commits"
        and "time_between_commits_list". Repos with 0 or 1 commits get a
        zero average, a zero median and an empty gap list.

    Raises:
        ValueError: if a timestamp cannot be parsed.
    """
    timeBetweenList = []
    avgTimeList = []
    medianTimeList = []

    # iterate through all commit lists in reposWithCommits
    for currCommitList in reposWithCommits.values():
        # repos with only 1 or 0 commits have no gaps; record zeros for them
        if len( currCommitList ) <= 1:
            avgTimeList.append( timedelta() )
            medianTimeList.append( timedelta() )
            timeBetweenList.append( [] )
            continue

        # all commit timestamps of this repo, converted to UTC, ascending
        currTimeList = sorted(
            _parseCommitTimestamp( commit[ "commit_at" ] ) for commit in currCommitList
        )

        # time between each pair of consecutive commits
        currTimeBetweenList = [
            later - earlier
            for earlier, later in zip( currTimeList, currTimeList[1:] )
        ]

        # n commits produce n - 1 gaps
        avgTimeList.append( sum( currTimeBetweenList, timedelta() ) / len( currTimeBetweenList ) )
        medianTimeList.append( statistics.median( currTimeBetweenList ) )
        timeBetweenList.append( currTimeBetweenList )

    # represent the results as a Pandas DataFrame
    data = { "repo_id": list( reposWithCommits ),
             "avg_time_between_commits": avgTimeList,
             "median_time_between_commits": medianTimeList,
             "time_between_commits_list": timeBetweenList }

    return pd.DataFrame.from_dict( data )

8. Initialize Pandas DataFrame of time between commits for every repo

In [None]:
# One row per repo: average, median and full list of gaps between its commits
reposWithTime = timeBetweenCommits( reposWithCommits )

9. Test for proper initialization of reposWithTime

In [None]:
#           repo_id avg_time_between_commits median_time_between_commits  \
# 0        98311519   0 days 00:01:51.500000      0 days 00:01:51.500000   
# 1       134804722          0 days 00:00:00             0 days 00:00:00   
# 2       134804877          0 days 00:00:00             0 days 00:00:00   
# 3       134804933          0 days 00:00:00             0 days 00:00:00   
# 4        62891737          0 days 00:00:00             0 days 00:00:00   
# ...           ...                      ...                         ...   
# 447106   29204775   3 days 07:44:25.241379             0 days 00:25:41   
# 447107   60782700   0 days 00:23:15.555556             0 days 00:08:38   
# 447108   55134604   1 days 13:25:59.135135             0 days 00:16:37   
# 447109   96882560   9 days 07:47:14.730769             0 days 00:48:55   
# 447110   74912660   1 days 11:31:33.979592             0 days 00:35:57   

#                                 time_between_commits_list  
# 0                                      [0:02:11, 0:01:32]  
# 1                                                      []  
# 2                                                      []  
# 3                                                      []  
# 4                                                      []  
# ...                                                   ...  
# 447106  [42 days, 1:18:49, 0:25:41, 0:00:11, 0:08:02, ...  
# 447107  [0:12:51, 0:15:29, 0:02:35, 0:08:38, 2:23:51, ...  
# 447108  [0:08:10, 19 days, 22:49:41, 0:01:52, 19 days,...  
# 447109  [20 days, 3:38:06, 1 day, 1:08:10, 161 days, 6...  
# 447110  [2:45:10, 3 days, 9:35:36, 0:07:56, 0:02:04, 1...  

# [447111 rows x 4 columns]

# Pull out the two summary columns; later cells reuse avgTime and midTime
avgTime = reposWithTime[ "avg_time_between_commits" ]
midTime = reposWithTime[ "median_time_between_commits" ]

# Show the table itself
print( reposWithTime )

# Count the repos whose average and median gap are exactly equal
print(
    "Repos where the difference between avg and median is perfectly 0:",
    len( [ diff for diff in avgTime - midTime if diff == timedelta(0) ] ),
)

10. Plot difference between average and median of time between commits for every repo

In [None]:
def plotTimeBetweenCommits( avgTime, midTime ):
    """Scatter-plot the whole-day gap between mean and median commit spacing.

    Args:
        avgTime: pandas Series of average time between commits, one per repo.
        midTime: pandas Series of median time between commits, aligned with
            avgTime.

    Prints three summary counts, then shows a scatter plot of every repo
    whose (average - median) difference is at least one whole day in either
    direction. The x coordinate is just the point's position in that
    filtered list.
    """
    oneDay = timedelta( days = 1 )
    zero = timedelta( 0 )
    diffs = list( avgTime - midTime )

    # Whole days, truncated toward zero. The .days attribute floors instead,
    # so a difference of -5 minutes reports -1 and would wrongly be counted
    # and plotted as a difference of more than a day.
    wholeDays = [ int( diff / oneDay ) for diff in diffs ]
    ypoints = [ days for days in wholeDays if days != 0 ]
    xpoints = list( range( len(ypoints) ) )

    print( "Total number of repos:", len(avgTime) )
    print( "Repos with greater than a 1 day difference between mean and median time between commits:", len(ypoints) )
    # any negative difference counts here, however small
    print( "Repos negative difference suggesting a development hiatus", len( [ diff for diff in diffs if diff < zero ] ) )

    plt.title( "Difference between mean and median time between commits" )
    plt.xlabel( "Meaningless x points" )
    plt.ylabel( "Average minus median time between commits (days)" )

    plt.scatter(xpoints, ypoints)
    plt.show()

# Plot using the average and median columns extracted from reposWithTime
plotTimeBetweenCommits( avgTime, midTime )

11. Initialize a dictionary of key repo and value difference

In [None]:
def addDiffToRepos( reposWithTime ):
    """Map each repo ID to its (average - median) time between commits.

    Args:
        reposWithTime: DataFrame with "repo_id", "avg_time_between_commits"
            and "median_time_between_commits" columns.

    Returns:
        dict mapping repo ID to the difference between that row's average
        and median time between commits. If a repo ID occurs in several
        rows, the last row wins.
    """
    # Subtract the whole columns at once and pair the result with the IDs by
    # position. This does not depend on the frame's index labels, so it also
    # works on frames whose index is not 0..n-1 (e.g. after filtering).
    diffs = reposWithTime['avg_time_between_commits'] - reposWithTime['median_time_between_commits']

    return dict( zip( reposWithTime['repo_id'], diffs ) )

In [None]:
# repo ID -> (average - median) time between commits
reposWithDiff = addDiffToRepos( reposWithTime )

12. Initialize a dictionary of key stars and value difference

In [None]:
def addStarsToDiff( reposWithStars, reposWithDiff ):
    """Re-key the per-repo differences by star count.

    Args:
        reposWithStars: dict mapping repo ID to star count.
        reposWithDiff: dict mapping repo ID to a difference value.

    Returns:
        dict mapping star count to the difference of a repo with that many
        stars. Repos that have no entry in reposWithDiff are skipped.

    NOTE(review): the result is keyed by star count, so repos that share a
    star count overwrite one another and only the last one's difference
    survives.
    """
    return {
        starCount: reposWithDiff[repoId]
        for repoId, starCount in reposWithStars.items()
        if repoId in reposWithDiff
    }

In [None]:
# star count -> (average - median) time between commits
starsWithDiff = addStarsToDiff( reposWithStars, reposWithDiff )

13. Plot correlation between stars and difference

In [None]:
def plotCorrelation( starsWithDiff ):
    """Scatter-plot star count against the mean-minus-median commit spacing.

    Args:
        starsWithDiff: dict mapping a star count to a time difference
            (average minus median time between commits).

    Shows one point per dict entry: x is the star count, y is the
    difference in whole days.
    """
    oneDay = timedelta( days = 1 )

    # Whole days, truncated toward zero. The .days attribute floors instead,
    # so a difference of -5 minutes would be plotted as -1 day while
    # +5 minutes is plotted as 0.
    ypoints = [ int( diff / oneDay ) for diff in starsWithDiff.values() ]
    xpoints = list( starsWithDiff.keys() )

    plt.title( "Correlation between stars and development consistency" )
    plt.xlabel( "Stars for that repo" )
    plt.ylabel( "Average minus median time between commits (days)" )

    plt.scatter(xpoints, ypoints)
    plt.show()

# Scatter star count against the average-minus-median difference in days
plotCorrelation( starsWithDiff )