In [1]:
import pandas as pd
import numpy as np
from github import Auth
from github import Github
import logging
import json
import pickle
import os
import time
import shutil
from tqdm import tqdm

In [2]:
# Read the GitHub access token from the local JSON file "./config".
# The context manager closes the file handle as soon as the token is parsed
# (the bare open() call used previously left it open).
with open("./config") as config_file:
    ACCESS_TOKEN = json.load(config_file)["access_token"]

# CSV file that accumulates the collected repository metadata.
REPOSITORY_PATH = "./Data/repository.csv"

# PyGithub client authenticated with the token above.
github = Github(auth=Auth.Token(ACCESS_TOKEN))

# Configure the root logger to emit DEBUG and above, and keep a handle to it.
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger()


In [None]:
import os
import pandas as pd


# Helper that loads the repository CSV (or starts an empty table) and appends repositories not seen before.

def save_repositories(rawRepositoryList):
    """Append repositories not yet stored to the CSV at ``REPOSITORY_PATH``.

    The CSV is loaded when it exists; otherwise an empty table indexed by
    ``full_name`` is created. Repositories whose ``full_name`` is already in
    the table (or appeared earlier in the same call) are skipped before any of
    their other attributes are read. The resulting table is written back to
    ``REPOSITORY_PATH``; the parent directory is created when missing.

    Args:
        rawRepositoryList: iterable of repository objects exposing ``id``,
            ``name``, ``full_name``, ``fork``, ``owner.login``,
            ``owner.html_url``, ``html_url``, ``stargazers_count``,
            ``forks_count``, ``watchers_count``, ``license``,
            ``default_branch``, ``pushed_at``, ``created_at``, ``updated_at``
            and ``get_topics()``. ``language``, ``description`` and
            ``open_issues_count`` are optional.

    Returns:
        None. Prints ``Adding.. <new>/<total>`` as a progress summary.
    """
    if os.path.exists(REPOSITORY_PATH):
        df = pd.read_csv(REPOSITORY_PATH, index_col='full_name')
    else:
        df = pd.DataFrame(columns=['full_name', 'id', 'name', 'is_fork', 'owner', 'owner_url',
                                   'repo_url', 'stars', 'forks', 'watchers', 'language', 'description',
                                   'open_issues', 'license_name', 'topics', 'default_branch',
                                   'pushed_at', 'created_at', 'updated_at', 'closed_issues']
                          ).set_index('full_name')

    known_names = set(df.index)
    new_records = []  # one plain dict per new repository; turned into a frame once
    repo_count = 0

    for repo in rawRepositoryList:
        repo_count += 1
        full_name = repo.full_name
        # Check membership first so known repositories cost nothing beyond
        # reading their name (no attribute access, no get_topics() call).
        if full_name in known_names:
            continue
        known_names.add(full_name)

        new_records.append({
            'id': int(repo.id),
            'name': repo.name,
            'full_name': full_name,
            'is_fork': repo.fork,
            'owner': repo.owner.login,
            'owner_url': repo.owner.html_url,
            'repo_url': repo.html_url,
            'stars': int(repo.stargazers_count),
            'forks': int(repo.forks_count),
            'watchers': int(repo.watchers_count),
            'language': getattr(repo, "language", None),
            'description': getattr(repo, "description", None),
            'open_issues': int(getattr(repo, "open_issues_count", 0)),
            'license_name': repo.license.name if repo.license else None,
            'topics': ", ".join(repo.get_topics()),
            'default_branch': repo.default_branch,
            'pushed_at': repo.pushed_at,
            'created_at': repo.created_at,
            'updated_at': repo.updated_at,
        })

    if new_records:
        # Build a single frame from all records instead of concatenating one
        # single-row frame per repository.
        new_df = pd.DataFrame(new_records).set_index('full_name')
        df = pd.concat([df, new_df])

    print(f'Adding.. {len(new_records)}/{repo_count}')

    # to_csv does not create missing directories, so make sure the target exists.
    os.makedirs(os.path.dirname(REPOSITORY_PATH) or '.', exist_ok=True)
    df.to_csv(REPOSITORY_PATH)


In [None]:

# key_words = [   # Crypto Coin
#     "bitcoin",
#     "ethereum",
#     "xrp",
#     "tether",
#     "bnb",
#     "solana",
#     "usdc",
#     "dogecoin",
#     "cardano",
#     "tron",
#     "chainlink",
#     "sui",
#     "stellar",
#     "litecoin",
#     "unus-sed-leo",
#     "toncoin",
#     "hedera",
#     "hyperliquid",
#     "polkadot",
#     "bitcoin-cash",
#     "ethena-usde",
#     "bitget-token",
#     "dai",
#     "uniswap",
#     "monero",
#     "near-protocol",
#     "pepe",
#     "bittensor",
#     "aave",
#     "aptos",
#     "ethereum-classic",
#     "okb",
#     "kaspa",
#     "vechain",
#     "pol-prev-matic",  
#     "sonic-prev-ftm",
#     "algorand",
#     "filecoin",
#     "first-digital-usd",
#     "gatetoken",
#     "kucoin-token",
#     "lido-dao",
#     "ethena",
#     "xdc-network",
#     "worldcoin",
#     "sei",
#     "jasmycoin",
#     "ethereum-name-service",
#     "jito",
#     "floki",
#     "tezos",
#     "nexo",
#     "berachain",
#     "iota",
#     "neo",
#     "tether-gold",
#     "bitcoin-sv",
#     "spx6900",
#     "dogwifhat",

#     #permissioned Blockchain
#     "hyperledger-fabric",
#     "hyperledger-sawtooth",
#     "hyperledger-iroha",
#     "hyperledger-besu",
#     "corda",
#     "quorum",
#     "multichain",
#     "enterprise-blockchain",
#     "consortium-blockchain",
#     "ibm-blockchain",

#     #Popular Blockchain keywords
#     "blockchain",
#     "smart-contracts",
#     "solidity",
#     "dapp",
#     "ethereum",
#     "web3",
#     "cryptocurrency",
#     "defi",
#     "nft",
#     "distributed-ledger",
#     "decentralized-network"
#   ]

# Search keywords for the repository search below: a list of dapp/project names.
# The original note here was just "dappradar" -- presumably the names were taken
# from DappRadar's listings; TODO confirm the source and the snapshot date.
# Each entry is interpolated into the GitHub search query built by the loop
# that follows this list.
key_words = [
    "KAI-CHING",
    "World of Dypians",
    "UXUY",
    "Jupiter Exchange",
    "Pixudi",
    "Tevi",
    "Age of Dino",
    "SERAPH: In The Darkness",
    "Chingari",
    "Revox | ReadON",
    "KGeN",
    "Alliance Games",
    "Alaya AI",
    "Dmail Network",
    "Moonveil",
    "Alien Worlds",
    "PlayEmber",
    "Sweat Economy",
    "ZetaHub",
    "ChainArena - PentagonGames EXP",
    "Axie Infinity",
    "BoomLand",
    "GombleGames",
    "Evermoon",
    "Nine Chronicles",
    "Slime Revolution",
    "The Lost Glitches",
    "Immortal Rising 2",
    "Yuliverse",
    "Apeiron",
    "AtomicAssets",
    "Growfitter",
    "Redbrick",
    "KeitoKun",
    "PancakeSwap V2",
    "Jumper Exchange",
    "Super Champs HQ",
    "Karat Galaxy",
    "FishWar",
    "Harvest Moon ~ Meteor Wallet",
    "OpenPad AI",
    "IN - match3",
    "Yomi Block Puzzle",
    "SendingMe",
    "ERAGON",
    "1inch Network",
    "QORPO WORLD",
    "PancakeSwap V3",
    "motoDEX",
    "Sol Incinerator",
    "QuickSwap",
    "Dragon Slither",
    "UneMeta",
    "Galxe",
    "SuperWalk",
    "GOAT Gaming",
    "Taco Studios",
    "OpenChat",
    "Don't FOMO",
    "MomoAI(MetaOasis)",
    "SpinCity",
    "IceCreamSwap",
    "Sunflower Land",
    "NFPrompt",
    "Piratopia",
    "Flappy Core",
    "Stargate",
    "HunnyPlay",
    "xPortal",
    "WORLD3",
    "Unipoly Lottery & Web3 Games",
    "Wild West Shooting",
    "Core Tetris",
    "NFTMining - By Pentagon Games",
    "Archer Hunter",
    "PLAYZAP GAMES"
]

# languages = [
#     "Solidity", "Rust", "Go", "JavaScript", "TypeScript", 
#     "Python", "C++", "Java", "C#", "Kotlin", ""
# ]
# An empty string disables the language filter; list language names here to
# run one search per (keyword, language) pair.
languages = [""]


def _quote_search_term(term):
    """Return ``term`` in a form that is safe to embed in a GitHub search query.

    Plain tokens (letters, digits and ``-_+#.``) are returned unchanged, so
    simple keywords produce exactly the same query as before. Anything else is
    wrapped in double quotes: GitHub's search syntax requires quotes around
    terms containing whitespace, and quoting also keeps punctuation such as
    parentheses inside a keyword from mixing with the query's own grouping.
    """
    if all(ch.isalnum() or ch in "-_+#." for ch in term):
        return term
    return f'"{term}"'


# dict.fromkeys drops duplicate keywords while keeping the list order, so every
# run visits the keywords in the same sequence (iterating a set does not
# guarantee a stable order between runs).
for key in dict.fromkeys(key_words):
    term = _quote_search_term(key)
    for language in languages:
        query_language = f'language:{_quote_search_term(language)}' if language else ''
        # Alternative, topic-only query kept for reference:
        #query = f"topic:{key} archived:false created:>2021-03-06 pushed:>2024-02-21 size:>1 stars:>4 forks:>4 is:public"
        query = f"(topic:{term} OR {term} in:name OR {term} in:description OR {term} in:readme) {query_language} archived:false created:>2021-03-06 pushed:>2024-02-21 size:>0 stars:>4 forks:>4 is:public"
        print(f"Fetching Repositories..  {query}")
        repositories = github.search_repositories(query=query, sort='stars', order="desc")
        print(f"Storing Repositories..  {query}")
        save_repositories(repositories)


In [None]:
# import os
# df = pd.read_csv(REPOSITORY_PATH, index_col='full_name')
# i = 0
# for index, row in df.iterrows():
#     if i % 10 == 0:
#             print(f'fetching issues for {index} : {i+1}')
#     if 'closed_issues' not in df.columns or  pd.isna(df.at[index, 'closed_issues']):
#         repo = github.get_repo(row['id'])
#         closed_issues = repo.get_issues(state='closed')
#         closed_issue_count = closed_issues.totalCount
#         #closed_issue_count = sum(1 for issue in closed_issues if not issue.pull_request)
#         df.at[index, 'closed_issues'] = int(closed_issue_count)
#         if i % 10 == 0:
#             df.to_csv(REPOSITORY_PATH)
#     i += 1
# df.to_csv(REPOSITORY_PATH)

In [None]:
# df = pd.read_csv(REPOSITORY_PATH, index_col='full_name')
# df['closed_issues'] = df['closed_issues'].astype(int)
# df.to_csv(REPOSITORY_PATH)

In [None]:

# df = pd.read_csv(REPOSITORY_PATH, index_col='full_name')


# df = df.sample(frac=1, random_state=42)

# split_ratio = 0.5
# split_index = int(len(df) * split_ratio)

# df1 = df[:split_index]
# df2 = df[split_index:]

# # Save to new CSV files
# df1.to_csv("./Data/repository-part1.csv")
# df2.to_csv("./Data/repository-part2.csv")



In [None]:
# import os
# READ_ME_PATH = "../bc-readme"
# os.makedirs(READ_ME_PATH, exist_ok=True)
# df = pd.read_csv(REPOSITORY_PATH, index_col='full_name')
# for index, row in df.iterrows():
#     repo = github.get_repo(row['id'])
#     readme = repo.get_readme()
#     readme_content = readme.decoded_content.decode("utf-8")
#     closed_issues = repo.get_issues(state='closed')
#     closed_issue_count = sum(1 for issue in closed_issues if not issue.pull_request)
#     df.at[index, 'closed_issues'] = int(closed_issue_count)
#     df.at[index, 'readme'] = readme_content
#     with open(READ_ME_PATH + '/' + index.replace('/', '--') + '--' +  readme.name, "w") as readme_file:
#         readme_file.write(readme_content)
# df.to_csv('./Data/repositoryv2.csv')