In [15]:
import os
import re
import time
import requests
from dotenv import load_dotenv

# Load environment variables
def load_env():
    load_dotenv()

# Extract API Key from .env file
def get_api_key():
    load_env()
    return os.getenv("MARKET_CAP_API_KEY")

# Function to split a list into chunks
def chunk_array(lst, size):
    return [lst[i:i + size] for i in range(0, len(lst), size)]

# Delay function
def delay(seconds):
    time.sleep(seconds)

# Regex pattern to extract GitHub repository owner and name
GITHUB_REPO_REGEX = r'https://github.com/([^/]+)/([^/]+)'

# Function to fetch cryptocurrencies with repositories
def fetch_cryptocurrencies_with_repositories(symbols):
    chunks = chunk_array(symbols, 100)
    all_cryptos_with_repos = []
    api_key = get_api_key()
    max_retries = 3
    delay_duration = 60

    for chunk in chunks:
        print(f"Fetching metadata for: {', '.join(chunk)}")
        attempts = 0

        while attempts < max_retries:
            try:
                response = requests.get(
                    'https://pro-api.coinmarketcap.com/v2/cryptocurrency/info',
                    headers={'X-CMC_PRO_API_KEY': api_key},
                    params={'symbol': ','.join(chunk), 'aux': 'urls'}
                )

                if response.status_code == 200:
                    # Check if the 'data' key contains the expected structure
                    cryptos_info = []
                    data = response.json().get('data', {})
                    for crypto in data.values():  # Iterate over the values in the 'data' dictionary
                        for crypto_info in crypto:
                            # Extract GitHub repository info from the source_code URLs
                            source_code = [
                                url for url in crypto_info.get('urls', {}).get('source_code', [])
                                if url and 'github.com' in url
                            ]
                            
                            # Extract owner and repo name from the GitHub URLs using regex
                            repos = []
                            for url in source_code:
                                match = re.search(GITHUB_REPO_REGEX, url)
                                if match:
                                    owner, repo = match.groups()
                                    repos.append({'owner': owner, 'repo': repo})
                            
                            # Store the cryptocurrency information along with GitHub repositories
                            cryptos_info.append({
                                'name': crypto_info['name'],
                                'symbol': crypto_info['symbol'],
                                'repositories': repos,  # Add the extracted repos here
                            })

                    all_cryptos_with_repos.extend(cryptos_info)
                    break
                else:
                    attempts += 1
                    if response.status_code == 429:
                        print(f"Rate limit exceeded. Attempt {attempts} of {max_retries}. Waiting before retrying...")
                        delay(delay_duration)
                    else:
                        break
            except requests.exceptions.RequestException as error:
                print(f"Error message: {error}")
                break

    print(f"Total cryptocurrencies with repositories: {len(all_cryptos_with_repos)}")
    return all_cryptos_with_repos

# Function to fetch all cryptocurrency symbols
def fetch_all_cryptocurrency_symbols():
    all_symbols = []
    start_index = 1
    limit = 5000
    api_key = get_api_key()

    while True:
        try:
            response = requests.get(
                'https://pro-api.coinmarketcap.com/v1/cryptocurrency/listings/latest',
                headers={'X-CMC_PRO_API_KEY': api_key},
                params={'limit': limit, 'start': start_index}
            )

            if response.status_code == 200 and response.json().get('data'):
                symbols = [crypto['symbol'] for crypto in response.json()['data']]
                all_symbols.extend(symbols)
                print(f"Fetched symbols: {len(symbols)}")
                start_index += limit
            else:
                break
        except requests.exceptions.RequestException as error:
            print(f"Error fetching cryptocurrency listings: {error}")
            if response.status_code == 429:
                print("Rate limit exceeded. Waiting before retrying...")
                delay(10)
            break

    print(f"Total cryptocurrency symbols: {len(all_symbols)}")
    print(f"Sample symbols: {all_symbols[:5]}")
    return all_symbols

In [2]:
# Fetch all symbols first
symbols = fetch_all_cryptocurrency_symbols()

# Fetch cryptocurrencies with their repositories
cryptos_with_repos = fetch_cryptocurrencies_with_repositories(symbols)

# Optionally, print some of the results
print(cryptos_with_repos[:5])  

Fetched symbols: 5000
Fetched symbols: 5000
Fetched symbols: 156
Total cryptocurrency symbols: 10156
Sample symbols: ['BTC', 'ETH', 'USDT', 'SOL', 'BNB']
Fetching metadata for: BTC, ETH, USDT, SOL, BNB, XRP, DOGE, USDC, ADA, TRX, AVAX, SHIB, TON, SUI, LINK, DOT, BCH, XLM, PEPE, LEO, NEAR, LTC, APT, UNI, DAI, HBAR, CRO, ICP, ETC, POL, KAS, RENDER, BONK, TAO, OM, ARB, WIF, FET, STX, XMR, MNT, ATOM, FIL, OKB, VET, OP, AAVE, INJ, FLOKI, TIA, IMX, FDUSD, GRT, BGB, ALGO, SEI, FTM, RUNE, RAY, ENA, THETA, BRETT, JUP, WLD, MKR, PYTH, POPCAT, ONDO, KCS, BSV, AR, FLR, BTT, XTZ, EOS, PNUT, FLOW, MATIC, LDO, QNT, STRK, BEAM, GALA, MOG, JASMY, KAIA, NEO, AXS, AKT, HNT, GT, AERO, CORE, XEC, GOAT, MEW, APE, SAND, NEXO, MANA
Fetching metadata for: EGLD, PENDLE, DYDX, AIOZ, MINA, FTT, ORDI, USDD, ZEC, NEIRO, NOT, CFX, XDC, CHZ, GNO, XAUt, IOTA, ENS, W, BOME, LUNC, ROSE, AXL, SUPER, SNX, CKB, BTG, CAKE, ZK, CRV, GMT, PYUSD, PAXG, RON, BLUR, TUSD, 1000SATS, KAVA, ASTR, TURBO, SAFE, NFT, COMP, DEXE, JTO, E

In [3]:
# Save the results to a csv file
with open('cryptos_with_repos.csv', 'w') as file:
    file.write('name,symbol,owner,repo\n')
    
    # Usar um conjunto para rastrear entradas únicas
    seen_entries = set()
    
    for crypto in cryptos_with_repos:
        for repo in crypto['repositories']:
            # Criar uma entrada única como tupla
            entry = (crypto['name'], crypto['symbol'], repo['owner'], repo['repo'])
            
            # Verificar se a entrada já foi registrada
            if entry not in seen_entries:
                seen_entries.add(entry)  # Adicionar ao conjunto
                file.write(f"{entry[0]},{entry[1]},{entry[2]},{entry[3]}\n")


In [None]:
import os
import requests
import pandas as pd
from dotenv import load_dotenv
import time
import json

# Load environment variables (GitHub token should be set in the .env file)
def load_env():
    load_dotenv()

def get_github_token():
    load_env()
    return os.getenv("GH_TOKEN")

def fetch_issues_count(owner, repo):
    """
    Fetch the total number of issues (open and closed) for a given repository.

    Args:
        owner (str): Repository owner.
        repo (str): Repository name.

    Returns:
        int: Total number of issues, or -1 if an error occurs.
    """
    github_token = get_github_token()
    headers = {'Authorization': f'token {github_token}'}
    
    def get_issue_count(state):
        url = f"https://api.github.com/search/issues?q=repo:{owner}/{repo}+type:issue+state:{state}"
        response = requests.get(url, headers=headers)
        
        # Handle rate limit error (403)
        if response.status_code == 403 and 'X-RateLimit-Remaining' in response.headers:
            remaining = int(response.headers.get('X-RateLimit-Remaining', 0))
            if remaining == 0:
                reset_time = int(response.headers.get('X-RateLimit-Reset', time.time()))
                current_time = int(time.time())
                wait_time = max(0, reset_time - current_time)  # Ensure non-negative wait time
                if wait_time > 0:
                    print(f"Rate limit exceeded. Waiting {wait_time} seconds...")
                    time.sleep(wait_time + 1)  # Wait for the rate limit to reset
                else:
                    print("Rate limit reset time already passed. Retrying immediately...")
                return get_issue_count(state)  # Retry after waiting

        response.raise_for_status()
        return response.json().get('total_count', 0)
    
    try:
        open_issues_count = get_issue_count('open')
        closed_issues_count = get_issue_count('closed')
        total_issues = open_issues_count + closed_issues_count

        print(f"Repository: {owner}/{repo}, Open Issues: {open_issues_count}, Closed Issues: {closed_issues_count}, Total Issues: {total_issues}")
        return total_issues
    except requests.exceptions.HTTPError as http_err:
        if http_err.response.status_code == 422:
            print(f"Error: Repository {owner}/{repo} might be invalid or private.")
        elif http_err.response.status_code == 403:
            print(f"Error: Access denied for repository {owner}/{repo}. Check permissions.")
        else:
            print(f"HTTP error fetching issues for {owner}/{repo}: {http_err}")
    except requests.exceptions.RequestException as req_err:
        print(f"Request error fetching issues for {owner}/{repo}: {req_err}")
    return -1

def filter_repositories_by_issues(csv_file_path, issue_threshold=500):
    """
    Filter repositories from a CSV file that have more than a specified number of issues.

    Args:
        csv_file_path (str): Path to the CSV file containing repository data.
        issue_threshold (int): Minimum number of issues to filter repositories.

    Returns:
        pd.DataFrame: DataFrame containing filtered repositories.
    """
    try:
        if not os.path.exists(csv_file_path):
            raise FileNotFoundError(f"File '{csv_file_path}' not found.")
        
        try:
            df = pd.read_csv(csv_file_path, encoding='utf-8', on_bad_lines='skip')
        except UnicodeDecodeError:
            print("UTF-8 decoding failed. Retrying with 'latin1' encoding.")
            df = pd.read_csv(csv_file_path, encoding='latin1', on_bad_lines='skip')

        if 'owner' not in df.columns or 'repo' not in df.columns:
            raise ValueError("CSV does not have the required columns: 'owner', 'repo'")
        
        filtered_repositories = []

        for _, row in df.iterrows():
            owner, repo = row['owner'], row['repo']
            total_issues = fetch_issues_count(owner, repo)
            print(f"Processing coin: {row.get('name', 'N/A')}, {row.get('symbol', 'N/A')}.")
            if total_issues > issue_threshold:
                filtered_repositories.append({
                    'name': row.get('name', 'N/A'),
                    'symbol': row.get('symbol', 'N/A'),
                    'owner': owner,
                    'repo': repo,
                    'total_issues': total_issues
                })
        
        filtered_df = pd.DataFrame(filtered_repositories)
        filtered_df.to_csv('filtered_repositories_over_200_issues.csv', index=False)
        print("Filtered repositories saved to 'filtered_repositories_over_200_issues.csv'.")
        return filtered_df
    except Exception as e:
        print(f"Error processing the file: {e}")
        return pd.DataFrame()

if __name__ == "__main__":
    # Example: Fetch issues count for a single repository
    #print(fetch_issues_count('safe-global', 'safe-token'))

    # Example: Filter repositories from a CSV file
    # Uncomment and provide the path to your CSV file
    csv_file_path = 'cryptos_with_repos.csv'
    filtered_repositories = filter_repositories_by_issues(csv_file_path)
    if not filtered_repositories.empty:
        print(f"Total repositories with more than 200 issues: {len(filtered_repositories)}")
    else:
        print("No repositories processed or filtered.")



UTF-8 decoding failed. Retrying with 'latin1' encoding.
Repository: ArweaveTeam/arweave, Open Issues: 67, Closed Issues: 150, Total Issues: 217
Processing coin: Arweave, AR.
Repository: ARbitcoin/ARbit, Open Issues: 0, Closed Issues: 0, Total Issues: 0
Processing coin: ARbit, ARB.
Repository: ava-labs/avalanchego, Open Issues: 143, Closed Issues: 652, Total Issues: 795
Processing coin: Avalanche, AVAX.
Repository: BeamMW/beam, Open Issues: 234, Closed Issues: 1519, Total Issues: 1753
Processing coin: Beam, BEAM.
Repository: bitcoin-sv/bitcoin-sv, Open Issues: 4, Closed Issues: 247, Total Issues: 251
Processing coin: Bitcoin SV, BSV.
Repository: bitcoin/bitcoin, Open Issues: 374, Closed Issues: 7904, Total Issues: 8278
Processing coin: Bitcoin, BTC.
Repository: ContractChecker/audits, Open Issues: 2, Closed Issues: 1, Total Issues: 3
Processing coin: Battle In Verse, BTT.
Repository: dogecoin/dogecoin, Open Issues: 160, Closed Issues: 1126, Total Issues: 1286
Processing coin: Dogecoin, 

In [None]:
# read the csv and group by owner/repo to see if the same repo is listed multiple times
import pandas as pd
df = pd.read_csv('filtered_repositories_over_200_issues.csv')
grouped = df.groupby(['owner', 'repo']).size().reset_index(name='count')
duplicates = grouped[grouped['count'] > 1]
print(duplicates)

             owner           repo  count
26        celo-org  celo-monorepo      2
30        ethereum    go-ethereum      2
47   oasisprotocol     oasis-core      2
53      paritytech   polkadot-sdk      2
63         steemit          steem      2
67  ton-blockchain           TEPs      2
69     trustwallet         assets      2


In [None]:
import pandas as pd
import requests
import os
import time

# Read your CSV data
df = pd.read_csv('filtered_repositories_over_200_issues.csv')

# Set your CoinMarketCap API key
api_key = os.getenv('MARKET_CAP_API_KEY')

# Function to get CoinMarketCap data (market cap or market rank) with retry mechanism
def get_market_cap_data(coin_symbol):
    url = 'https://pro-api.coinmarketcap.com/v1/cryptocurrency/map'
    params = {
        'symbol': coin_symbol,  # Use the coin symbol to search for it
        'limit': 1,  # Limit to 1 result (we only care about the top result)
    }
    headers = {
        'X-CMC_PRO_API_KEY': api_key,
        'Accept': 'application/json'
    }
    
    max_retries = 5
    backoff_time = 10  # Initial backoff time in seconds

    for attempt in range(max_retries):
        response = requests.get(url, params=params, headers=headers)

        if response.status_code == 200:
            data = response.json()
            if data.get('data'):
                print(f"Coin: {coin_symbol}, Market Cap: {data['data'][0]['rank']}")
                return data['data'][0]['rank']  # Or 'cmc_rank' for market rank
            else:
                print(f"Coin: {coin_symbol}, Market Cap: Not found.")
                return None
        elif response.status_code == 429:
            print(f"Rate limit hit. Retrying in {backoff_time} seconds...")
            time.sleep(backoff_time)
            backoff_time *= 2  # Exponential backoff
        else:
            print(f"Error for coin {coin_symbol}: {response.status_code}")
            break  # Exit the loop for non-retryable errors
    
    return None

# Adding the market cap data to the DataFrame
df['market_cap'] = df['symbol'].apply(get_market_cap_data)

# Filter out rows where market_cap is None (in case the coin wasn't found on CoinMarketCap)
df = df[df['market_cap'].notna()]

# Sort by market cap (descending) and total_issues (descending)
df_sorted = df.sort_values(by=['market_cap', 'total_issues'], ascending=[False, False])

# Drop duplicates and keep the best-ranked coin for each 'owner/repo'
unique_repositories = df_sorted.drop_duplicates(['owner', 'repo'])

# Save the filtered repositories to a new CSV file
unique_repositories.to_csv('filtered_repositories_over_200_issues_unique.csv', index=False)

# Print confirmation message and the count of unique repositories
print(f"Unique repositories saved to 'filtered_repositories_over_200_issues_unique.csv'.")
print(f"Total unique repositories with more than 200 issues: {len(unique_repositories)}")

Coin: AR, Market Cap: 70
Coin: AVAX, Market Cap: 10
Coin: BEAM, Market Cap: 83
Coin: BSV, Market Cap: 72
Coin: BTC, Market Cap: 1
Coin: DOGE, Market Cap: 7
Coin: DOT, Market Cap: 15
Coin: EOS, Market Cap: 81
Coin: ETC, Market Cap: 28
Coin: ETH, Market Cap: 2
Coin: LINK, Market Cap: 16
Coin: LTC, Market Cap: 23
Coin: NEO, Market Cap: 91
Coin: TAO, Market Cap: 32
Coin: TRX, Market Cap: 11
Coin: XLM, Market Cap: 14
Coin: XMR, Market Cap: 51
Coin: ASTR, Market Cap: 136
Coin: BAT, Market Cap: 168
Coin: BTG, Market Cap: 132
Coin: CELO, Market Cap: 162
Coin: DASH, Market Cap: 169
Coin: HOT, Market Cap: 153
Coin: IOTX, Market Cap: 170
Coin: KSM, Market Cap: 138
Coin: MASK, Market Cap: 196
Coin: MASK, Market Cap: 196
Coin: MINA, Market Cap: 101
Coin: OSMO, Market Cap: 180
Coin: QTUM, Market Cap: 182
Rate limit hit. Retrying in 10 seconds...
Rate limit hit. Retrying in 20 seconds...
Rate limit hit. Retrying in 40 seconds...
Coin: ROSE, Market Cap: 123
Coin: RVN, Market Cap: 193
Coin: TRAC, Marke

In [22]:
if(len(df)-len(unique_repositories) == len(duplicates)):
    print("The number of duplicates matches the difference between the original and unique repositories.")
else:
    print("The number of duplicates does not match the difference between the original and unique repositories.")

The number of duplicates matches the difference between the original and unique repositories.
