# Development Consistency of Popular Repos

Notes
- Main Dataset https://www.kaggle.com/datasets/johntukey/github-dataset
- Related Paper https://www.diva-portal.org/smash/get/diva2:1114676/FULLTEXT01.pdf

1. Import modules

In [None]:
from datetime import datetime, timedelta, timezone
import math
import matplotlib.pyplot as plt
import pandas as pd
import statistics

2. Load JSON dataset

In [None]:
# Location of the dataset file; change this to wherever your copy lives.
path = "split1.json"

# lines=True makes pandas read the file as one JSON object per line.
df = pd.read_json(path, lines=True)

3. Add commit list to repos

In [None]:
def addCommitListToRepos(df):
    """Group every commit found in the dataset by the repo it belongs to.

    Iterates the "commit_list" column of ``df`` (each cell is an iterable of
    commit records) and files each commit under its "repo_id".

    Returns:
        dict mapping repo id -> list of commit records, in encounter order.
    """
    reposWithCommits = {}

    for commitsInRow in df["commit_list"]:
        for commit in commitsInRow:
            # setdefault creates the bucket the first time a repo id is seen
            reposWithCommits.setdefault(commit["repo_id"], []).append(commit)

    return reposWithCommits

4. Add stars to repos

In [None]:
def addStarsToRepos(df):
    """Collect the star count of every repo that has a non-zero count.

    Iterates the "repo_list" column of ``df`` (each cell is an iterable of
    repo records). Repos whose "stargazers_count" equals 0 are left out. If
    the same repo id shows up more than once, the last non-zero count wins.

    Returns:
        dict mapping repo "id" -> "stargazers_count".
    """
    return {
        repo["id"]: repo["stargazers_count"]
        for reposInRow in df["repo_list"]
        for repo in reposInRow
        if repo["stargazers_count"] != 0
    }

5. Initialize a dictionary for repos with commit lists and repos with stars

In [None]:
# Build the two lookup tables used by the rest of the notebook:
# repo id -> star count, and repo id -> list of commits.
reposWithStars = addStarsToRepos(df)
reposWithCommits = addCommitListToRepos(df)

6. Test for proper initialization of reposWithCommits and reposWithStars

In [None]:
# reposWithCommits maps repo id -> list of commits,
# reposWithStars maps repo id -> non-zero star count.

# One dictionary entry per repo
total_repos = len(reposWithCommits)
starred_repos = len(reposWithStars)

# Add up the length of every repo's commit list
numCommits = sum(map(len, reposWithCommits.values()))

print("total repos:", total_repos)
print("total number of commits:", numCommits)
print("total number of repos with >= 1 star:", starred_repos)

7. Calculate time between each commit for every repo

In [None]:
def _parseCommitTimestamp(rawTimestamp):
    """Turn one "commit_at" string into a timezone-aware datetime in UTC.

    Expects "YYYY-MM-DDTHH:MM:SS" followed (possibly after extra characters
    such as fractional seconds, which are ignored) by either "Z" or a
    "+HH:MM" / "-HH:MM" offset.

    Raises:
        ValueError: if the string ends in neither "Z" nor a signed offset.
    """
    # the first 19 characters hold the date and time down to whole seconds
    localTime = datetime.fromisoformat(rawTimestamp[:19])

    # Z represents no offset, meaning the commit was made in UTC
    if rawTimestamp[-1] == "Z":
        return localTime.replace(tzinfo=timezone.utc)

    offsetText = rawTimestamp[-6:]
    sign = offsetText[0]
    if sign not in ("+", "-"):
        raise ValueError(f"unrecognized UTC offset in timestamp: {rawTimestamp!r}")

    hours, minutes = map(int, offsetText[1:].split(":"))
    offset = timedelta(hours=hours, minutes=minutes)

    # take the sign from the text itself: int("-00") is 0, so testing the
    # parsed hour would apply "-00:30" as +00:30
    if sign == "-":
        offset = -offset

    return localTime.replace(tzinfo=timezone(offset)).astimezone(timezone.utc)


def timeBetweenCommits(reposWithCommits):
    """Compute, per repo, the standard deviation of the time between commits.

    Args:
        reposWithCommits: dict mapping repo id -> list of commit records,
            each carrying a "commit_at" timestamp string.

    Returns:
        dict mapping repo id -> pandas Timedelta holding the sample standard
        deviation of the gaps between consecutive commits. Repos with 2 or
        fewer commits, or with a standard deviation of exactly 0, are omitted.

    Raises:
        ValueError: if a timestamp cannot be parsed.
    """
    reposWithStdev = {}

    for repoId, commits in reposWithCommits.items():
        # stdev needs at least two gaps, i.e. at least three commits
        if len(commits) <= 2:
            continue

        # normalise every timestamp to UTC, then order them chronologically
        commitTimes = sorted(
            _parseCommitTimestamp(commit["commit_at"]) for commit in commits
        )

        # seconds elapsed between each pair of neighbouring commits
        gaps = [
            (later - earlier).total_seconds()
            for earlier, later in zip(commitTimes, commitTimes[1:])
        ]

        spread = statistics.stdev(gaps)

        if spread != 0:
            # unit="s" is required: a bare number is otherwise read as
            # nanoseconds, shrinking every value by a factor of 1e9
            reposWithStdev[repoId] = pd.to_timedelta(spread, unit="s")

    return reposWithStdev

8. Initialize a dictionary of the standard deviation of time between commits for every repo

In [None]:
# repo id -> standard deviation of the time between that repo's commits
reposWithStdev = timeBetweenCommits(reposWithCommits)

10. Plot standard deviation of time between commits for every repo

In [None]:
def plotTimeBetweenCommits(reposWithStdev):
    """Scatter-plot each repo's standard deviation of time between commits.

    Args:
        reposWithStdev: dict whose values expose ``total_seconds()``.

    Prints how many repos are in the dict, then shows the plot. The x axis is
    only the position of each repo in the dict and carries no meaning.
    """
    stdevSeconds = []
    for spread in reposWithStdev.values():
        stdevSeconds.append(spread.total_seconds())

    # one arbitrary x position per repo, just to spread the points out
    positions = list(range(len(stdevSeconds)))

    print("Total number of repos with standard deviation > 0:", len(reposWithStdev))

    plt.title("Standard deviation of time between commits")
    plt.xlabel("Meaningless x points")
    plt.ylabel("Standard deviation of time between commits (total seconds)")

    plt.scatter(positions, stdevSeconds)
    plt.show()

# Draw the per-repo standard deviations computed above
plotTimeBetweenCommits(reposWithStdev)

12. Initialize a dictionary with star count as key and standard deviation of time between commits as value

In [None]:
def addStarsToStdev(reposWithStars, reposWithStdev):
    """Pair each repo's star count with its commit-gap standard deviation.

    Args:
        reposWithStars: dict mapping repo id -> star count.
        reposWithStdev: dict mapping repo id -> standard deviation value.

    Returns:
        dict mapping star count -> standard deviation, covering only repo ids
        present in both inputs.

    NOTE(review): the result is keyed by star count, so repos that share the
    same number of stars collapse into a single entry (the last one iterated
    wins). Confirm whether that loss of data points is intended.
    """
    return {
        starCount: reposWithStdev[repoId]
        for repoId, starCount in reposWithStars.items()
        if repoId in reposWithStdev
    }

In [None]:
# star count -> standard deviation of time between commits
starsWithStdev = addStarsToStdev(reposWithStars, reposWithStdev)

13. Plot correlation between stars and standard deviation of time between commits

In [None]:
def plotCorrelation(starsWithStdev):
    """Scatter-plot log(stars) against log(standard deviation in seconds).

    Args:
        starsWithStdev: dict mapping star count -> value exposing
            ``total_seconds()``.

    Entries with a star count of 0 or a standard deviation of 0 seconds are
    dropped first, since log(0) is undefined.
    """
    plottable = {}
    for starCount, spread in starsWithStdev.items():
        if starCount != 0 and spread.total_seconds() != 0:
            plottable[starCount] = spread

    logStdev = [math.log(spread.total_seconds()) for spread in plottable.values()]
    logStars = [math.log(starCount) for starCount in plottable]

    plt.title("Correlation between stars and development consistency")
    plt.xlabel("Log( Stars for that repo )")
    plt.ylabel("Log( Standard deviation of total seconds between commits )")

    plt.scatter(logStars, logStdev)
    plt.show()

# Draw the log-log scatter of stars against commit-gap standard deviation
plotCorrelation(starsWithStdev)