# Replicating Cross-Repo Paper

Unorganized Notes
- Main Dataset https://www.kaggle.com/datasets/johntukey/github-dataset
- Cross-Repo Paper https://ieeexplore.ieee.org/document/8947641
- LSTM Paper https://deeplearning.cs.cmu.edu/S23/document/readings/LSTM.pdf
- LSTM on TensorFlow https://www.tensorflow.org/api_docs/python/tf/keras/layers/LSTM

1. Import modules

In [None]:
from collections import Counter
import pandas as pd

2. Load JSON dataset

In [None]:
# Path to the dataset file; replace with the location of your local copy.
path = "dataset.json"

# lines=True reads the file as one JSON object per line.
df = pd.read_json(path, lines=True)

3. Add commit list to repos

In [None]:
def addCommitListToRepos(df):
    """Group every commit in the dataset by the repo it was made to.

    Args:
        df: DataFrame with a "commit_list" column. Each cell holds an
            iterable of commit records (mappings with a "repo_id" key).
            Cells that are missing (None/NaN) are skipped.

    Returns:
        A plain dict mapping each repo ID to the list of commit records
        made to that repo, in the order the records are encountered. The
        records are the same objects stored in df, not copies.

    Raises:
        KeyError: if df has no "commit_list" column, or a commit record
            has no "repo_id" key.
    """
    # Only the commit lists are read, so take that column directly instead
    # of projecting and copying the whole frame first. dropna() discards
    # rows with no commit list, which would otherwise fail to iterate.
    commitLists = df["commit_list"].dropna()

    reposWithCommits = {}
    for userCommits in commitLists:
        for commit in userCommits:
            # setdefault keeps the result a plain dict, so looking up an
            # unknown repo ID afterwards still raises KeyError.
            reposWithCommits.setdefault(commit["repo_id"], []).append(commit)

    return reposWithCommits

4. Initialize a dictionary with keys of repo IDs and values of a list of commits to that repo

In [None]:
# Build the repo ID -> commits mapping once from the loaded dataset.
reposWithCommits = addCommitListToRepos(df)

4. Test for proper initialization of reposWithCommits

In [None]:
# Sanity checks: compare the printed figures with the expected values noted
# in the comments below.

# check for expected number of repos (447111)
print("Total number of repos:", len(reposWithCommits))

# check for expected number of commits (4544737)
numCommits = sum(len(commits) for commits in reposWithCommits.values())
print("Total number of commits:", numCommits)

# check known repo IDs with ample commits ( 121300003, 88377551, 131508193, 132465788, 132464776 )
repo_id = 121300003
print(reposWithCommits[repo_id])

5. Initialize a list of the top k most starred and forked repos

In [None]:
def topKFrequent(input, k):
    """Return the k most frequent elements of input, most frequent first.

    Args:
        input: iterable of hashable items to count. (The name shadows the
            builtin but is kept so existing keyword callers keep working.)
        k: how many elements to return; passed to Counter.most_common, so
            None returns every distinct element.

    Returns:
        A list of the elements only (counts are dropped), ordered from most
        to least frequent. Elements with equal counts keep the order in
        which they were first encountered.
    """
    counts = Counter(input)
    return [item for item, _ in counts.most_common(k)]