In [47]:
import requests
from bs4 import BeautifulSoup
from requests.exceptions import HTTPError, ConnectionError, Timeout, RequestException
import numpy as np
import pandas as pd
from tqdm import tqdm
import time

In [48]:
def check_url(url):
    """Classify a currency's GitHub link by its textual shape.

    The check is purely string based: it compares ``url`` with "?" and
    counts the "/" characters it contains.

    Args:
        url: Link string. "?" marks a currency without a GitHub link.

    Returns:
        -1 if ``url`` is "?" (no GitHub link);
        1 if ``url`` contains exactly four "/" and does not end with "/"
        (repository link, e.g. "https://github.com/owner/repo");
        2 if ``url`` contains exactly three "/", or four with a trailing "/"
        (project link, e.g. "https://github.com/owner");
        0 for any other link.
    """
    if url == "?":
        return -1
    slash_count = url.count("/")
    # Short-circuiting and/or plus endswith() keep this safe for an empty
    # string, where indexing url[-1] would raise IndexError.
    if slash_count == 4 and not url.endswith("/"):
        return 1
    if slash_count == 3 or (slash_count == 4 and url.endswith("/")):
        # Explicit code for project links; callers that only test for
        # -1, 1 and 0 treat it as "none of those", exactly like None.
        return 2
    return 0

In [49]:
def get_page(url, timeout=30):
    """Download ``url`` and return it parsed with BeautifulSoup's "html.parser".

    Args:
        url: Address to request.
        timeout: Seconds handed to ``requests.get`` as its ``timeout`` so a
            stalled server cannot block the notebook forever.

    Returns:
        The parsed page as a ``BeautifulSoup`` object, or ``None`` when the
        request failed (the reason is printed).
    """
    try:
        page = requests.get(url, timeout=timeout)
        # requests.get does not raise on 4xx/5xx responses by itself; without
        # this call the HTTPError handler below could never run and error
        # pages would be parsed as if they were the requested content.
        page.raise_for_status()
    except HTTPError as http_err:
        print("HTTP error occurred: {}".format(http_err))
    except ConnectionError as conn_err:
        print("Connection error occurred: {}".format(conn_err))
    except Timeout as timeout_err:
        print("Timeout error occurred: {}".format(timeout_err))
    except RequestException as req_err:
        print("Request error occurred: {}".format(req_err))
    else:
        return BeautifulSoup(page.content, "html.parser")
    return None

In [50]:
def _digits_to_int(text):
    """Return the integer spelled by the digit characters of ``text``."""
    return int("".join(filter(str.isdigit, text)))


def scrape_repo(url, id):
    """Scrape language shares and activity counters from a repository page.

    Args:
        url: Repository page address, fetched through ``get_page``.
        id: Identifier of the coin; stored as ``coin_id`` in every record.

    Returns:
        ``[languages, data]`` on success, where ``languages`` is a list of
        dicts with keys ``coin_id``, ``name`` and ``percentage`` and ``data``
        is a dict with ``coin_id``, ``commits_count``, ``forks_count``,
        ``stars_count`` and, when an element mentioning "Contributors" is
        found, ``contributors_count``. Returns ``None`` (after printing a
        message) when the page cannot be fetched or parsed.
    """
    failure_message = "error in scraping {}th coin.".format(id)
    try:
        page = get_page(url)
        if page is None:
            # The download failed; get_page has already printed the reason.
            print(failure_message)
            return None

        languages = []
        for box in page.select(".text-small.mr-3"):
            elements = box.select("span")
            languages.append(
                {
                    "coin_id": id,
                    "name": elements[0].text,
                    # Drops the last character before converting
                    # (presumably a trailing "%" sign -- confirm).
                    "percentage": float(elements[1].text[:-1]),
                }
            )

        data = {"coin_id": id}
        commit_element = page.select(".ml-md-3 strong")
        data["commits_count"] = _digits_to_int(commit_element[0].text)
        for element in page.select(".Link.flex-items-center"):
            if "Contributors" in element.text:
                data["contributors_count"] = _digits_to_int(element.text)
        fork_count = page.find(id="repo-network-counter").get("title")
        data["forks_count"] = _digits_to_int(fork_count)
        star_count = page.find(id="repo-stars-counter-star").get("title")
        data["stars_count"] = _digits_to_int(star_count)
        return [languages, data]
    except Exception:
        # Best-effort scraping: any missing or unexpected element skips this
        # coin. Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit through so a long run can still be stopped.
        print(failure_message)
    return None

In [51]:
def scrape_prjct(url, id):
    """Scrape a project link by locating a repository on it and scraping that.

    The project page is fetched with ``get_page``; the ``href`` of the first
    element matching ``.col-lg-6:nth-child(1) a`` is appended to
    "https://github.com" and the resulting address is handed to
    ``scrape_repo``.

    Args:
        url: Project page address.
        id: Identifier of the coin, forwarded to ``scrape_repo``.

    Returns:
        Whatever ``scrape_repo`` returns for the repository that was found,
        or ``None`` (after printing a message) when no repository link could
        be extracted.
    """
    failure_message = "error in finding {}th coin link.".format(id)
    try:
        page = get_page(url)
        if page is None:
            # The download failed; get_page has already printed the reason.
            print(failure_message)
            return None
        first_link = page.select(".col-lg-6:nth-child(1) a")[0]
        repo_url = "https://github.com" + first_link.get("href")
        return scrape_repo(repo_url, id)
    except Exception:
        # Exception (not a bare except) keeps the best-effort behaviour while
        # letting KeyboardInterrupt and SystemExit stop a long run.
        print(failure_message)
    return None

In [52]:
def scrape(url, id):
    """Route a currency link to the scraper that matches its link type.

    The link type comes from ``check_url``:

    * -1 -- no GitHub link: nothing is scraped and ``None`` is returned;
    * 1 -- repository link: scraped directly with ``scrape_repo``;
    * 0 -- other link: cut down to its first four "/"-separated parts
      (the project address) and scraped with ``scrape_prjct``;
    * anything else -- project link: scraped as is with ``scrape_prjct``.
    """
    link_kind = check_url(url)
    if link_kind == -1:
        return None
    if link_kind == 1:
        return scrape_repo(url, id)
    # Everything that is left goes through the project page; only links of
    # kind 0 have to be shortened to the project address first.
    project_url = "/".join(url.split("/")[:4]) if link_kind == 0 else url
    return scrape_prjct(project_url, id)

In [55]:
# Load the coin table, mark missing values with "?", and repair some links.
# Each key is a link to look for in the "github_link" column and its value is
# the link used instead; "?" is the same marker used for missing values.
value_mapping = {
    "https://github.com/solana-labs": "https://github.com/solana-labs/solana",
    "https://github.com/maticnetwork/whitepaper/": "https://github.com/maticnetwork/bor",
    "https://github.com/filecoin-project/": "https://github.com/filecoin-project/lotus",
    "https://github.com/quantnetwork/": "?",
    "https://github.com/chainsulting/Smart-Contract-Security-Audits/tree/master/ApeCoin": "?",
    "https://github.com/thorchain/Resources/tree/master/Whitepapers/THORChain/whitepaper-en.md": "https://github.com/thorchain/THORChain-v1",
    "https://github.com/iotaledger": "https://github.com/iotaledger/wasp",
    "https://github.com/curvefi/curve-contract": "https://github.com/curvefi/curve-stablecoin",
    "https://github.com/gatechain": "https://github.com/gatechain/crypto",
    "https://github.com/trustwallet": "https://github.com/trustwallet/wallet-core",
    "https://github.com/dydxfoundation/": "?",
    "https://github.com/BTCGPU/BTCGPU/wiki/Technical-Spec": "https://github.com/BTCGPU/BTCGPU",
    "https://github.com/singnet/": "https://github.com/singnet/snet-daemon",
    "https://github.com/balancer-labs": "?",
    "https://github.com/kusamanetwork": "?",
    "https://github.com/worldcoin": "?",
    "https://github.com/Solar-network": "https://github.com/Solar-network/core",
    "https://github.com/axelarnetwork": "https://github.com/axelarnetwork/axelar-core",
    "https://github.com/terra-project": "https://github.com/terra-money/core",
    "https://github.com/nervosnetwork/rfcs/blob/master/rfcs/0002-ckb/0002-ckb.md": "https://github.com/nervosnetwork/ckb",
    "https://github.com/UMAprotocol/whitepaper": "?",
    "https://github.com/reserve-protocol": "https://github.com/reserve-protocol/rsv-v2",
    "https://github.com/kybernetwork": "?",
}
coin_data = pd.read_csv("coins_extras.csv").replace(pd.NA, "?")
coin_data["github_link"] = coin_data["github_link"].replace(value_mapping)

In [54]:
MAX_COINS = 200  # upper bound on how many rows of coin_data are scraped
BATCH_SIZE = 10  # number of coins handled between two pauses
PAUSE_SECONDS = 10  # length of the pause (presumably to avoid rate limiting -- confirm)

languages_data = []
github_data = []
# Cap the loop at the table length so a table with fewer than MAX_COINS rows
# no longer raises a KeyError; .iloc makes the access positional.
n_coins = min(MAX_COINS, len(coin_data))
for i in tqdm(range(n_coins)):  # Scrape the data of every selected currency
    extra_data = scrape(coin_data["github_link"].iloc[i], coin_data["rank"].iloc[i])
    if extra_data is not None:
        # extra_data is [languages, data]: many language rows, one repo row.
        languages_data.extend(extra_data[0])
        github_data.append(extra_data[1])
    if (i + 1) % BATCH_SIZE == 0:
        time.sleep(PAUSE_SECONDS)

lang_df = pd.DataFrame(languages_data)
git_df = pd.DataFrame(github_data)
lang_df.to_csv("languages_data.csv", index=False)
git_df.to_csv("github_data.csv", index=False)

100%|██████████| 200/200 [08:41<00:00,  2.61s/it]
