In [1]:
import pandas as pd
import numpy as np
from github import Auth
from github import Github
import json
import pickle
import os
import time
import shutil
from tqdm import tqdm

In [2]:
# Load the GitHub access token from the local JSON file "./config", which must
# contain an "access_token" key. The file is opened in a `with` block so the
# handle is closed immediately instead of being left to the garbage collector.
with open("./config") as config_file:
    ACCESS_TOKEN = json.load(config_file)["access_token"]

# Input CSV listing the repositories to crawl (its "url" column is read below).
REPO_METADATA_PATH = "./Data/Repositories_0.csv"

# Outputs: pickled list of Repository objects and the accumulated issues table.
REPOSITORIES_BIN_DATA_PATH = "./Data/Repositories.bin"
ISSUE_DATA_PATH = "./Data/Issues.csv"

# Directory that receives timestamped copies of the output files.
BACKUP_PATH = "./Data/Backup"

In [3]:
# Wrap the access token in a PyGithub token-auth object and build the API
# client; this client is handed to every Repository created further down.
auth = Auth.Token(ACCESS_TOKEN)
github = Github(auth=auth)

In [4]:
def rate_limited(interval=600):
    """Build a decorator that lets the wrapped callable run at most once per interval.

    A call made less than ``interval`` seconds after the previous accepted call
    is silently dropped: the wrapped function is not invoked and ``None`` is
    returned. The very first call is always accepted.

    The "last accepted" timestamp lives in this factory's closure, so every
    function decorated with the same decorator object shares one budget.

    Args:
        interval: Minimum number of seconds between two accepted calls.

    Returns:
        A decorator. The decorated function returns the wrapped function's
        result when the call is accepted, otherwise ``None``.
    """
    # Local import so this block does not depend on the notebook's import cell.
    import functools

    # Monotonic timestamp of the last accepted call; None until the first one.
    # A sentinel is used instead of 0 because a monotonic clock has an
    # arbitrary origin and may itself be smaller than `interval`.
    last_executed = None

    def decorator(func):
        @functools.wraps(func)  # keep the wrapped function's name and docstring
        def wrapper(*args, **kwargs):
            nonlocal last_executed
            # The monotonic clock is immune to wall-clock adjustments (NTP, DST).
            now = time.monotonic()
            if last_executed is not None and now - last_executed < interval:
                return None
            # Recorded before the call, so a call that raises still uses up
            # the slot for this interval.
            last_executed = now
            return func(*args, **kwargs)
        return wrapper
    return decorator

@rate_limited(120)
def backup(paths):
    """Copy each existing file in ``paths`` into BACKUP_PATH with a timestamp suffix.

    The copy is named ``<basename><YYYYmmdd-HHMMSS>`` (the stamp is appended
    directly after the original file name). Paths that do not exist are
    skipped. Because of the ``rate_limited(120)`` decorator, a call made less
    than 120 seconds after the previous accepted call does nothing.

    Args:
        paths: Iterable of file paths to back up.
    """
    # makedirs also creates missing parent directories and, with exist_ok,
    # avoids the check-then-create race of isdir() followed by mkdir().
    os.makedirs(BACKUP_PATH, exist_ok=True)

    # One stamp per backup run so all files of a snapshot share the same suffix.
    stamp = time.strftime("%Y%m%d-%H%M%S")
    for path in paths:
        if os.path.exists(path):
            target = os.path.join(BACKUP_PATH, os.path.basename(path) + stamp)
            shutil.copyfile(path, target)

class Repository:
    """Collects the "bug"-labelled issues (and their comments) of one GitHub repository.

    Attributes are filled in two steps: the constructor only derives the
    ``owner/name`` identifier from the URL, and ``collect_info`` performs the
    API calls through the supplied client.
    """

    # Fixed column layouts so that empty results still produce a frame (and a
    # CSV header) with the expected schema.
    ISSUE_COLUMNS = [
        "Repository", "IssueId", "Title", "Body",
        "State", "Label", "CreatedAt", "ClosedAt",
    ]
    COMMENT_COLUMNS = ["Repository", "IssueId", "CommentId", "Comment"]

    def __init__(self, github_client, repo_url):
        """Store the client and derive the repository identifier.

        Args:
            github_client: Client object exposing ``get_repo(identifier)``.
            repo_url: Repository URL whose last two path segments are the
                owner and the repository name.
        """
        self.github_client = github_client
        self.identifier = self.parse_repo_identifier(repo_url)
        self.repo = None  # set by collect_info()
        self.issues = []
        self.issue_comments = {}

    def parse_repo_identifier(self, repo_url):
        """Return ``"owner/name"`` built from the last two path segments of ``repo_url``.

        Surrounding whitespace, trailing slashes and a trailing ``.git`` are
        ignored, so ``https://host/owner/name/`` and
        ``https://host/owner/name.git`` both yield ``owner/name``.

        Raises:
            IndexError: If the URL has fewer than two ``/``-separated segments.
        """
        segments = repo_url.strip().rstrip("/").split("/")
        owner = segments[-2]
        repo_name = segments[-1]
        if repo_name.endswith(".git"):
            repo_name = repo_name[:-len(".git")]

        return owner + "/" + repo_name

    def get_identifier(self):
        """Return the ``owner/name`` identifier of this repository."""
        return self.identifier

    def collect_info(self):
        """Fetch the repository and its "bug"-labelled issues (any state) via the client."""
        self.repo = self.github_client.get_repo(self.identifier)
        self.set_issues(self.repo.get_issues(state="all", labels=["bug"]))

    def set_issues(self, issues):
        """Store ``issues`` and register each issue's comments under its number.

        Args:
            issues: Iterable of issue objects providing ``number`` and
                ``get_comments()``.
        """
        self.issues = issues
        for issue in self.issues:
            self.set_comments(issue.number, issue.get_comments())

    def set_comments(self, issue_id, comments):
        """Associate ``comments`` with the issue number ``issue_id``."""
        self.issue_comments[issue_id] = comments

    def get_issues(self):
        """Return the stored issues as a DataFrame with the ISSUE_COLUMNS layout.

        One row per issue; the "Label" column is the comma-joined label names.
        When there are no issues the frame is empty but still has all columns,
        so writing it to CSV produces a valid header.
        """
        rows = [
            {
                "Repository": self.identifier,
                "IssueId": issue.number,
                "Title": issue.title,
                "Body": issue.body,
                "State": issue.state,
                "Label": ",".join(label.name for label in issue.labels),
                "CreatedAt": issue.created_at,
                "ClosedAt": issue.closed_at,
            }
            for issue in self.issues
        ]
        return pd.DataFrame(rows, columns=self.ISSUE_COLUMNS)

    def get_comments(self):
        """Return all stored comments as a DataFrame with the COMMENT_COLUMNS layout.

        Original author's note: not necessary and takes too much time.
        """
        rows = [
            {
                "Repository": self.identifier,
                "IssueId": issue_id,
                "CommentId": comment.id,
                "Comment": comment.body,
            }
            for issue_id, issue_comments in self.issue_comments.items()
            for comment in issue_comments
        ]
        return pd.DataFrame(rows, columns=self.COMMENT_COLUMNS)

In [5]:
# Repository objects collected during this run.
# NOTE(review): the pickle written below contains only this run's objects, so a
# resumed run replaces the previous Repositories.bin (older versions survive
# only in the backup directory). Confirm this is intended.
repositories = []

# Identifiers that already have rows in the issues CSV; used to resume an
# interrupted crawl without re-fetching those repositories.
processed_repos = []
if os.path.exists(ISSUE_DATA_PATH):
    processed_repos = pd.read_csv(ISSUE_DATA_PATH)["Repository"].unique().tolist()

df = pd.read_csv(REPO_METADATA_PATH)

for url in tqdm(df["url"].to_list()):
    try:
        repo = Repository(github, url)
        identifier = repo.get_identifier()
        if identifier in processed_repos:
            continue

        repo.collect_info()
        repositories.append(repo)

        # The CSV on disk is the source of truth: re-read it and append the
        # new rows so progress survives a crash between iterations.
        new_issues = repo.get_issues()
        if os.path.exists(ISSUE_DATA_PATH):
            issues = pd.concat(
                [pd.read_csv(ISSUE_DATA_PATH), new_issues],
                ignore_index=True,
                sort=False,
            )
        else:
            issues = new_issues

        # Snapshot the previous outputs before they are overwritten
        # (the backup itself is throttled by its decorator).
        backup([ISSUE_DATA_PATH, REPOSITORIES_BIN_DATA_PATH])

        issues.to_csv(ISSUE_DATA_PATH, index=False)
        # `with` guarantees the pickle file is flushed and closed each time.
        with open(REPOSITORIES_BIN_DATA_PATH, "wb") as repositories_file:
            pickle.dump(repositories, repositories_file)

        # Remember the repository so a duplicate URL later in the metadata
        # file is not fetched and appended a second time in this run.
        processed_repos.append(identifier)
    except Exception as e:
        # Best-effort crawl: report which repository failed and move on.
        print("Error: ", url, e)

FileNotFoundError: [Errno 2] No such file or directory: './Data/Repositories_0.csv'