In [42]:
import pandas as pd
import numpy as np
from github import Auth
from github import Github
import logging
import json
import pickle
import os
import time
import shutil
from tqdm import tqdm

In [43]:
# --- Configuration -----------------------------------------------------------
# The GitHub access token is read from the JSON file "./config" (key
# "access_token"). The file is opened in a `with` block so the handle is
# closed as soon as the token has been read instead of being left open for
# the lifetime of the kernel.
with open("./config") as config_file:
    ACCESS_TOKEN = json.load(config_file)["access_token"]

# CSV file that accumulates the scraped repository metadata.
REPOSITORY_PATH = "./Data/repository.csv"

# Authenticated PyGithub client shared by the cells below.
github = Github(auth=Auth.Token(ACCESS_TOKEN))

# Root logger configured at DEBUG level.
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger()


In [44]:
import os
import pandas as pd


# Helper that loads the repository CSV (or starts an empty table) and appends newly fetched repositories to it

def save_repositories(rawRepositoryList, path=None):
    """Append repositories that are not stored yet to the repository CSV.

    The CSV at ``path`` is loaded if it exists (indexed by ``full_name``);
    otherwise an empty table with the expected columns is used. Every item of
    ``rawRepositoryList`` whose ``full_name`` is not already in the table (or
    earlier in the same iterable) is converted to one row, and the combined
    table is written back to ``path``.

    Parameters
    ----------
    rawRepositoryList : iterable
        Repository objects exposing the attributes read below (``id``,
        ``name``, ``full_name``, ``fork``, ``owner.login``, ``owner.html_url``,
        ``html_url``, ``stargazers_count``, ``forks_count``,
        ``watchers_count``, ``license``, ``default_branch``, ``pushed_at``,
        ``created_at``, ``updated_at``) and a ``get_topics()`` method.
        ``language``, ``description`` and ``open_issues_count`` are optional.
    path : str, optional
        Destination CSV file. Defaults to the module-level
        ``REPOSITORY_PATH`` when omitted, which matches the previous
        single-argument behaviour.

    Returns
    -------
    None
        The result is written to ``path``; progress is printed to stdout.
    """
    if path is None:
        path = REPOSITORY_PATH

    columns = ['full_name', 'id', 'name', 'is_fork', 'owner', 'owner_url',
               'repo_url', 'stars', 'forks', 'watchers', 'language', 'description',
               'open_issues', 'license_name', 'topics', 'default_branch',
               'pushed_at', 'created_at', 'updated_at']

    if os.path.exists(path):
        df = pd.read_csv(path, index_col='full_name')
    else:
        df = pd.DataFrame(columns=columns).set_index('full_name')

    repo_count = 0
    new_rows = []  # plain dict records; turned into a DataFrame once, after the loop
    full_name_set = set(df.index)

    for repo in rawRepositoryList:
        print(f'Parsing repo {repo_count}')
        repo_count += 1

        # Check for duplicates first so no work is done for repositories
        # that are already stored.
        if repo.full_name in full_name_set:
            continue

        new_rows.append({
            'full_name': repo.full_name,
            'id': int(repo.id),
            'name': repo.name,
            'is_fork': repo.fork,
            'owner': repo.owner.login,
            'owner_url': repo.owner.html_url,
            'repo_url': repo.html_url,
            'stars': int(repo.stargazers_count),
            'forks': int(repo.forks_count),
            'watchers': int(repo.watchers_count),
            'language': getattr(repo, "language", None),
            'description': getattr(repo, "description", None),
            # `or 0` covers the attribute being present but None, which
            # int() would reject.
            'open_issues': int(getattr(repo, "open_issues_count", 0) or 0),
            'license_name': repo.license.name if repo.license else None,
            # get_topics() is only called for repositories that are added.
            'topics': ", ".join(repo.get_topics()),
            'default_branch': repo.default_branch,
            'pushed_at': repo.pushed_at,
            'created_at': repo.created_at,
            'updated_at': repo.updated_at,
        })
        full_name_set.add(repo.full_name)

    if new_rows:
        new_df = pd.DataFrame(new_rows, columns=columns).set_index('full_name')
        # Concatenating with an empty frame is deprecated in recent pandas
        # and degrades dtypes, so the empty placeholder is simply replaced.
        df = new_df if df.empty else pd.concat([df, new_df])

    print(f'Adding.. {len(new_rows)}/{repo_count}')

    # Make sure the target directory exists before writing the CSV.
    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)
    df.to_csv(path)


In [None]:
# Search dimensions: every (sort, topic, language) combination is queried once.
topics = ["blockchain", "ethereum", "solidity", "web3", "cryptocurrency"]
languages = [
    "Solidity", "Rust", "Go", "JavaScript", "TypeScript",
    "Python", "C++", "Java", "C#", "Kotlin",
]
sorts = ["stars", "forks", "updated"]

# Lazily enumerate the combinations in the same order as three nested loops
# would: sort is the slowest-changing value, language the fastest.
search_plan = (
    (sort_key, topic_name, language_name)
    for sort_key in sorts
    for topic_name in topics
    for language_name in languages
)

for sort, topic, language in search_plan:
    print(f"Fetching Repositories.. topic: {topic} and language: {language} and order by {sort}")
    # Only public repositories with more than 4 stars and more than 4 forks.
    query = f"topic:{topic} language:{language} stars:>4 forks:>4 is:public"
    repositories = github.search_repositories(query=query, sort=sort, order="desc")
    print(f"Storing Repositories.. topic: {topic} and language: {language} and order by {sort}")
    # Persist whatever is new from this result set into the repository CSV.
    save_repositories(repositories)
