# Libraries

In [1]:
import ast
import json
import logging
import os
import time
from random import randint

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import (
    ElementClickInterceptedException,
    ElementNotSelectableException,
    NoSuchAttributeException,
    NoSuchDriverException,
    NoSuchElementException,
)
from selenium.webdriver.common.by import By
from tqdm import tqdm

# Scraping currencies information on 2023/08/25 


In [5]:
def scrape_coins_data():
    """Scrape the CoinMarketCap historical snapshot page for 2023-08-25.

    Opens the page in Firefox, scrolls it with ``scroll_page``, extracts one
    record per table row with ``get_coin_data`` and writes the records to
    ``currency_data.json`` and ``currency_data.csv`` in the working directory.

    Returns:
        pandas.DataFrame: the records as re-read from ``currency_data.json``.

    Raises:
        NoSuchDriverException: if the Firefox driver cannot be started (the
            error is printed first, then re-raised).
    """
    # The driver-lookup failure happens when the browser is created, not when
    # a page is requested, so that is the call that has to be guarded.
    try:
        driver = webdriver.Firefox()
    except NoSuchDriverException as err:
        print("Error occurred: {}".format(err))
        raise

    try:
        driver.get("https://coinmarketcap.com/historical/20230825/")
        time.sleep(1)
        scroll_page(driver)
        coins = get_coin_data(driver)
    finally:
        # Always close the browser, even when scraping fails half-way.
        driver.quit()

    with open("currency_data.json", "w") as file:
        json.dump(coins, file)
    df = pd.read_json("currency_data.json")
    df.to_csv("currency_data.csv", index=False)
    return df

#### Scrolling Main Page

In [2]:
def scroll_page(driver, step=500, steps=20, pause=1):
    """Scroll the current page down in fixed increments.

    Issues ``window.scrollTo(0, y)`` for ``y = step, 2*step, ..., steps*step``
    and sleeps ``pause`` seconds after each call.

    Args:
        driver: object exposing ``execute_script(script)``.
        step: number of pixels added to the scroll position on each iteration.
        steps: how many scroll calls to issue.
        pause: seconds to wait after each scroll call.
    """
    for index in range(1, steps + 1):
        driver.execute_script("window.scrollTo(0, " + str(index * step) + ")")
        time.sleep(pause)

#### Get Currencies data function

In [3]:
def get_coin_data(driver):
    """Extract one record per ``.cmc-table-row`` element of the loaded page.

    Each record has the keys ``rank``, ``name``, ``symbol``,
    ``circulating_supply`` (the digits of the cell text as an int),
    ``main_link`` (the name cell's ``href``) and ``historical_link``
    (``main_link`` with ``"historical-data/"`` appended).

    Rows that lack an expected element, or whose values cannot be parsed, are
    reported on stdout and skipped.

    Args:
        driver: a selenium WebDriver with the snapshot page loaded.

    Returns:
        list[dict]: the extracted records, in page order.
    """
    currencies = []
    for row in driver.find_elements(By.CSS_SELECTOR, ".cmc-table-row"):
        try:
            rank = row.find_element(
                By.CSS_SELECTOR, ".cmc-table__cell--sort-by__rank div"
            ).text
            name_cell = row.find_element(
                By.CSS_SELECTOR, ".cmc-table__column-name--name"
            )
            symbol = row.find_element(
                By.CSS_SELECTOR, ".cmc-table__column-name--symbol"
            ).get_attribute("textContent")
            supply_text = row.find_element(
                By.CSS_SELECTOR, ".cmc-table__cell--sort-by__circulating-supply div"
            ).text
            main_link = name_cell.get_attribute("href")
            currencies.append(
                {
                    "rank": rank,
                    "name": name_cell.get_attribute("title"),
                    "symbol": symbol,
                    # Keep only the digits (drops separators and unit text).
                    "circulating_supply": int(
                        "".join(filter(str.isdigit, supply_text))
                    ),
                    "main_link": main_link,
                    "historical_link": main_link + "historical-data/",
                }
            )
        except (
            NoSuchElementException,
            NoSuchAttributeException,
            ElementNotSelectableException,
            ValueError,  # supply cell without any digit -> int("")
            TypeError,  # missing href -> None + str
        ) as err:
            print("Error occurred: {}".format(err))
    return currencies

# Scraping Currencies Extra Information From Main Pages

In [6]:
def scrape_coins_extras(df):
    """Collect the GitHub link and tags for every coin in ``df``.

    Visits each row's ``main_link`` in a single Firefox session via
    ``get_coin_extras``.

    Args:
        df: DataFrame with at least the columns ``main_link`` and ``rank``.

    Returns:
        pandas.DataFrame: columns ``currency_id`` (the row's ``rank``),
        ``github_link`` and ``tags``, one row per input row.
    """
    coins = df.to_dict("records")
    driver = webdriver.Firefox()
    try:
        extras_list = [get_coin_extras(driver, coin["main_link"]) for coin in coins]
    finally:
        # Close the browser even if one of the coin pages raises.
        driver.quit()

    # Naming the columns keeps the frame well-formed when there are no coins.
    df_extra = pd.DataFrame(extras_list, columns=["github_link", "tags"])
    df_extra["currency_id"] = [coin["rank"] for coin in coins]
    return df_extra[["currency_id", "github_link", "tags"]]

# Run the two scraping stages: the snapshot table first, then the per-coin
# pages reached through each row's "main_link".
# NOTE(review): in top-to-bottom file order get_coin_extras is defined below
# this point; the cell numbers suggest it was executed earlier in the
# notebook — confirm before running this file as a plain script.
df_main = scrape_coins_data()
df_extra = scrape_coins_extras(df_main)

# Get Currencies Extra data function

In [4]:
def get_coin_extras(driver, currency_url):
    """Load a coin page and return its GitHub link and tag names.

    The GitHub link is the ``href`` of the first anchor whose ``href``
    contains ``github.com``, or ``""`` when there is none.  For the tags the
    function first tries to click the ``.sc-b7faf77f-1.kpTZfr`` element and
    read the links inside ``.cmc-modal``; when that element is absent it reads
    the links under ``.jaPKUl`` instead.  Empty tag texts are dropped in both
    cases.

    Args:
        driver: a selenium WebDriver.
        currency_url: URL of the coin page to load.

    Returns:
        dict: ``{"github_link": str, "tags": list[str]}``.

    Raises:
        Exception: if a ``NoSuchElementException`` escapes the handlers
            above (chained to the original error).
    """
    try:
        driver.get(currency_url)

        try:
            github_link = driver.find_element(
                By.XPATH, "//a[contains(@href, 'github.com')]"
            ).get_attribute('href')
        except NoSuchElementException:
            github_link = ""

        try:
            driver.find_element(By.CSS_SELECTOR, ".sc-b7faf77f-1.kpTZfr").click()
            tags_selector = ".cmc-modal .ctYAzo .cmc-link"
        except NoSuchElementException:
            tags_selector = ".jaPKUl .ctYAzo .cmc-link"

        tag_texts = (
            tag_element.text
            for tag_element in driver.find_elements(By.CSS_SELECTOR, tags_selector)
        )
        # Filter blanks for both sources, not only the fallback one.
        tags = [text for text in tag_texts if text != '']
    except NoSuchElementException as err:
        raise Exception("Failed to find elements") from err
    return {"github_link": github_link, "tags": tags}

# Scraping GitHub Pages

In [None]:
def check_url(url):
    """Classify a currency's GitHub link by its number of ``/`` characters.

    Args:
        url: the link to classify; ``"?"``, an empty/blank string or a
            non-string value (e.g. a missing value) all mean "no link".

    Returns:
        ``-1`` when there is no link,
        ``1`` for a repository link (exactly four ``/`` and no trailing ``/``),
        ``None`` for a project link (three ``/``, or four with a trailing
        ``/``),
        ``0`` for any other link.
    """
    # Missing values must be rejected before any string inspection; an empty
    # string used to raise IndexError on url[-1].
    if not isinstance(url, str) or not url.strip() or url == "?":
        return -1

    slash_count = url.count("/")
    has_trailing_slash = url.endswith("/")

    if slash_count == 4 and not has_trailing_slash:
        return 1
    if slash_count == 3 or (slash_count == 4 and has_trailing_slash):
        # Callers treat "none of -1 / 1 / 0" as the project-link case.
        return None
    return 0

In [None]:
def get_page(url, timeout=30):
    """Fetch ``url`` and return the response body parsed with BeautifulSoup.

    NOTE(review): ``requests``, ``BeautifulSoup`` and the exception classes
    used below are not imported in the visible import cell; they are
    presumably meant to come from ``requests`` / ``requests.exceptions`` /
    ``bs4`` — confirm.  Without that import, ``ConnectionError`` resolves to
    the Python builtin of the same name.

    Args:
        url: address to request.
        timeout: seconds to wait for the server before giving up.

    Returns:
        The parsed page, or ``None`` when the request fails or the server
        answers with an error status (the error is printed).
    """
    try:
        # Without a timeout a stalled connection would block forever.
        page = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx answers into HTTPError so error pages are not parsed
        # as if they were the requested content.
        page.raise_for_status()
        return BeautifulSoup(page.content, "html.parser")
    except HTTPError as http_err:
        print("HTTP error occurred: {}".format(http_err))
    except ConnectionError as conn_err:
        print("Connection error occurred: {}".format(conn_err))
    except Timeout as timeout_err:
        print("Timeout error occurred: {}".format(timeout_err))
    except RequestException as req_err:
        print("Request error occurred: {}".format(req_err))
    return None

In [None]:
def _digits_to_int(text):
    """Return the digits contained in ``text`` as a single int."""
    return int("".join(filter(str.isdigit, text)))


def scrape_repo(url, id):
    """Scrape language shares and activity counters from a repository page.

    Args:
        url: repository page URL, fetched with ``get_page``.
        id: identifier of the coin; copied into every record as ``coin_id``.

    Returns:
        ``[languages, data]`` on success, where ``languages`` is a list of
        dicts with keys ``coin_id``, ``name`` and ``percentage`` and ``data``
        is a dict with ``coin_id``, ``commits_count``, ``forks_count``,
        ``stars_count`` and — only when a "Contributors" link is found —
        ``contributors_count``.  ``None`` when the page could not be fetched
        or parsed (a message is printed).
    """
    try:
        page = get_page(url)

        languages = []
        for box in page.select(".text-small.mr-3"):
            spans = box.select("span")
            languages.append(
                {
                    "coin_id": id,
                    "name": spans[0].text,
                    # Drop the last character of the share text before parsing.
                    "percentage": float(spans[1].text[:-1]),
                }
            )

        data = {"coin_id": id}
        data["commits_count"] = _digits_to_int(
            page.select(".ml-md-3 strong")[0].text
        )
        for element in page.select(".Link.flex-items-center"):
            if "Contributors" in element.text:
                data["contributors_count"] = _digits_to_int(element.text)
        data["forks_count"] = _digits_to_int(
            page.find(id="repo-network-counter").get("title")
        )
        data["stars_count"] = _digits_to_int(
            page.find(id="repo-stars-counter-star").get("title")
        )
        return [languages, data]
    except Exception:
        # Best-effort scraping: any page that does not match the expected
        # layout is skipped, but KeyboardInterrupt/SystemExit still propagate.
        print("error in scraping {}th coin.".format(id))
    return None

In [None]:
def scrape(url, id):
    """Dispatch a currency link to the scraper matching its ``check_url`` code.

    * ``-1`` (no GitHub link): nothing is scraped and ``None`` is returned.
    * ``1`` (repository link): handled by ``scrape_repo``.
    * ``0`` (other link): the URL is cut down to its first four
      ``/``-separated parts and then handled as a project link.
    * anything else (project link): handled by ``scrape_prjct``.
    """
    link_kind = check_url(url)

    if link_kind == -1:
        return None
    if link_kind == 1:
        return scrape_repo(url, id)

    # Remaining cases all end in the project scraper; only "other" links need
    # trimming first.
    project_url = "/".join(url.split("/")[:4]) if link_kind == 0 else url
    return scrape_prjct(project_url, id)

In [None]:
def scrape_prjct(url, id):
    """Scrape a project page by following its first listed repository.

    Fetches ``url`` with ``get_page``, takes the ``href`` of the first element
    matching ``.col-lg-6:nth-child(1) a``, prefixes it with
    ``https://github.com`` and passes the result to ``scrape_repo``.

    Args:
        url: project page URL.
        id: identifier of the coin, forwarded to ``scrape_repo``.

    Returns:
        Whatever ``scrape_repo`` returns, or ``None`` when no repository link
        could be found (a message is printed).
    """
    try:
        page = get_page(url)
        first_repo_link = page.select(".col-lg-6:nth-child(1) a")[0]
        repo_url = "https://github.com" + first_repo_link.get("href")
        return scrape_repo(repo_url, id)
    except Exception:
        # Best-effort: skip this coin, but let KeyboardInterrupt/SystemExit
        # through so a long run can still be stopped.
        print("error in finding {}th coin link.".format(id))
    return None

In [None]:
# Reading data and fixing some links.
# NOTE: currency_data is another name for df_extra (not a copy), so the link
# fixes below are visible through df_extra as well.
currency_data = df_extra
# Normalise every "no link" spelling (missing value or empty string) to "?".
# Assigning the result back avoids relying on an in-place replace through a
# column selection.
currency_data["github_link"] = (
    currency_data["github_link"].fillna("?").replace("", "?")
)
# Manual corrections: links that point at an organisation, a document or a
# sub-page are mapped to a repository URL, or to "?" to skip the coin.
value_mapping = {
    "https://github.com/solana-labs": "https://github.com/solana-labs/solana",
    "https://github.com/maticnetwork/whitepaper/": "https://github.com/maticnetwork/bor",
    "https://github.com/filecoin-project/": "https://github.com/filecoin-project/lotus",
    "https://github.com/quantnetwork/": "?",
    "https://github.com/chainsulting/Smart-Contract-Security-Audits/tree/master/ApeCoin": "?",
    "https://github.com/thorchain/Resources/tree/master/Whitepapers/THORChain/whitepaper-en.md": "https://github.com/thorchain/THORChain-v1",
    "https://github.com/iotaledger": "https://github.com/iotaledger/wasp",
    "https://github.com/curvefi/curve-contract": "https://github.com/curvefi/curve-stablecoin",
    "https://github.com/gatechain": "https://github.com/gatechain/crypto",
    "https://github.com/trustwallet": "https://github.com/trustwallet/wallet-core",
    "https://github.com/dydxfoundation/": "?",
    "https://github.com/BTCGPU/BTCGPU/wiki/Technical-Spec": "https://github.com/BTCGPU/BTCGPU",
    "https://github.com/singnet/": "https://github.com/singnet/snet-daemon",
    "https://github.com/balancer-labs": "?",
    "https://github.com/kusamanetwork": "?",
    "https://github.com/worldcoin": "?",
    "https://github.com/Solar-network": "https://github.com/Solar-network/core",
    "https://github.com/axelarnetwork": "https://github.com/axelarnetwork/axelar-core",
    "https://github.com/terra-project": "https://github.com/terra-money/core",
    "https://github.com/nervosnetwork/rfcs/blob/master/rfcs/0002-ckb/0002-ckb.md": "https://github.com/nervosnetwork/ckb",
    "https://github.com/UMAprotocol/whitepaper": "?",
    "https://github.com/reserve-protocol": "https://github.com/reserve-protocol/rsv-v2",
    "https://github.com/kybernetwork": "?",
}
currency_data["github_link"] = currency_data["github_link"].replace(value_mapping)

In [None]:
languages_data = []
github_data = []
# Scrape at most the first 200 currencies; taking the pairs positionally means
# a shorter table no longer fails with a KeyError.
coin_links = list(
    zip(currency_data["github_link"], currency_data["currency_id"])
)[:200]
for position, (coin_link, coin_id) in enumerate(tqdm(coin_links), start=1):
    extra_data = scrape(coin_link, coin_id)
    if extra_data is not None:
        languages_data.extend(extra_data[0])
        github_data.append(extra_data[1])
    # Pause after every tenth currency to space out the requests.
    if position % 10 == 0:
        time.sleep(10)

# Naming the columns keeps the frame usable when no language was scraped.
lang_coin_df = pd.DataFrame(languages_data, columns=["coin_id", "name", "percentage"])
git_df = pd.DataFrame(github_data)

# Give every distinct language a 1-based id and replace the language name in
# the per-coin table with that id.
languages_df = pd.DataFrame({'name': lang_coin_df['name'].unique()})
languages_df['id'] = range(1, len(languages_df) + 1)
lang_coin_df['language_id'] = lang_coin_df['name'].map(languages_df.set_index('name')['id'])
lang_coin_df = lang_coin_df[['coin_id', 'language_id', 'percentage']]
languages_df = languages_df[['id', 'name']]

lang_coin_df.to_csv("currency_lang_data.csv", index=False)
languages_df.to_csv("languages.csv", index=False)


In [None]:
# NOTE: new_df_extra is another name for df_extra (not a copy); the in-place
# changes below therefore also affect df_extra.
new_df_extra = df_extra
# Tags are real lists when df_extra comes straight from the scraper and list
# literals (strings) when it was reloaded from CSV; only strings need parsing.
new_df_extra['tags'] = new_df_extra['tags'].apply(
    lambda tags: ast.literal_eval(tags) if isinstance(tags, str) else tags
)
# Distinct tag names in first-seen order; ids are 1-based positions.
distinct_tags = list(
    dict.fromkeys(tag for sublist in new_df_extra['tags'] for tag in sublist)
)
tags_df = pd.DataFrame({'id': range(1, len(distinct_tags) + 1),
                    'name': distinct_tags})

# One (currency, tag) pair per distinct tag of a currency, ordered by tag id.
tag_ids = {tag: tag_id for tag_id, tag in enumerate(distinct_tags, start=1)}
# NOTE(review): the 'corrency_id' header looks like a typo for 'currency_id';
# it is left unchanged because consumers of currency_tag_data.csv may rely on
# it — confirm and rename there and here together.
corrency_tag_df = pd.DataFrame(
    [(currency_id, tag_id)
     for currency_id, tags in zip(new_df_extra['currency_id'], new_df_extra['tags'])
     for tag_id in sorted({tag_ids[tag] for tag in tags})],
    columns=['corrency_id', 'tag_id'])

new_df_extra.drop('tags', axis=1, inplace=True)
corrency_tag_df.to_csv('currency_tag_data.csv', index=False)
tags_df.to_csv('tags.csv', index=False)

github_df = git_df
# The GitHub records are keyed by 'coin_id' while the extras table uses
# 'currency_id', so the join has to name both key columns.  A 'github_link'
# column left over from an earlier run is ignored to avoid suffixed duplicates.
merged_df = pd.merge(
    new_df_extra,
    github_df.drop(columns=['github_link'], errors='ignore'),
    left_on='currency_id',
    right_on='coin_id',
    how='inner',
)

# Look the link up by key instead of relying on both frames sharing row order.
link_by_currency = (
    new_df_extra.drop_duplicates('currency_id')
    .set_index('currency_id')['github_link']
)
github_df['github_link'] = github_df['coin_id'].map(link_by_currency)
github_df.to_csv('complete_github_data.csv', index=False)

# Get Driver

In [None]:
def get_driver(url, logger):
    """Start Chrome with a fixed download folder and open ``url``.

    Downloads are directed to a ``Scraped_data`` folder next to the current
    working directory (i.e. inside its parent); the folder is created when
    missing.

    Args:
        url: page to open once the browser is running.
        logger: logger used to report a missing driver.

    Returns:
        The running Chrome WebDriver, or ``None`` when a
        ``NoSuchDriverException`` occurred (it is logged).  Any other error is
        re-raised after the browser, if already started, has been closed.
    """
    driver = None
    try:
        # Make download_directory
        download_directory = os.path.join(
            os.path.dirname(os.getcwd()), "Scraped_data"
        )
        os.makedirs(download_directory, exist_ok=True)

        # Set download_directory to options
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_experimental_option("prefs", {
            "download.default_directory": download_directory,
            "download.prompt_for_download": False,
            "download.directory_upgrade": True,
            "safebrowsing.enabled": True
        })

        # Create driver
        driver = webdriver.Chrome(options=chrome_options)
        driver.get(url)
        return driver
    except NoSuchDriverException as err:
        logger.error("No Such Driver occurred: {}".format(err))
        if driver is not None:
            driver.quit()
    except Exception:
        # Do not leave a browser process behind when loading the page fails.
        if driver is not None:
            driver.quit()
        raise
    return None

# Scraping Historical Data

In [None]:
def extract_data(driver, url):
    """Drive the already-loaded page through its CSV download sequence.

    Clicks, in order: the first ``.htGqtu button.dalfmx`` button (date
    picker), the last ``.heoICr li`` entry, the ``.bcCCXI`` button, and the
    second ``.htGqtu button.dalfmx`` button (download), sleeping between the
    steps.

    ``NoSuchElementException`` and ``ElementClickInterceptedException`` are
    logged on the root logger and swallowed; other errors (for instance an
    ``IndexError`` when fewer buttons than expected are present) propagate to
    the caller.

    Args:
        driver: a selenium WebDriver with the target page loaded.
        url: address of the page; not used by this function.
    """
    # Use the root logger directly instead of depending on a module-level
    # ``logger`` variable that only exists when the file runs as a script.
    log = logging.getLogger()
    try:
        # Find and click on date button
        date_button = driver.find_elements(By.CSS_SELECTOR, '.htGqtu button.dalfmx')[0]
        date_button.click()
        time.sleep(1)

        # Find and click on 'Last 365 days' button
        li_element = driver.find_element(By.CSS_SELECTOR, '.heoICr li:last-child')
        driver.execute_script("arguments[0].click();", li_element)
        time.sleep(1)

        # Find and click on 'Continue' button
        continue_button = driver.find_element(By.CSS_SELECTOR, '.bcCCXI')
        driver.execute_script("arguments[0].click();", continue_button)
        time.sleep(5)

        # Find and click on 'Download CSV' button
        download_csv_button = driver.find_elements(By.CSS_SELECTOR, '.htGqtu button.dalfmx')[1]
        download_csv_button.click()
        time.sleep(3)

    except NoSuchElementException as err:
        log.error("No Such Element occurred: {}".format(err))
    except ElementClickInterceptedException as err:
        log.error("Element Click Intercepted occurred: {}".format(err))



# Test

In [None]:
if __name__ == "__main__":
    # Download the historical CSV of every coin listed in coins_data.csv,
    # using a fresh browser per coin and a random pause between coins.

    # Set up logging configuration
    logging.basicConfig(filename='get_coins_csv_files.log', filemode='w', format='%(asctime)s %(levelname)s: %(message)s')
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Read urls
    df = pd.read_csv('../Scraped_data/coins_data.csv')
    urls = df['historical_link'].tolist()
    # urls = urls[0:10]

    for url in tqdm(urls):
        driver = get_driver(url, logger)

        if driver is not None:
            try:
                extract_data(driver, url)
            except Exception:
                logger.warning('Failed to extract data from the page [{}]'.format(url))
            finally:
                # Close this coin's browser; otherwise one Chrome process per
                # URL stays open for the whole run.  extract_data already
                # waits after clicking the download button.
                driver.quit()

        # Sleep for a random time to avoid being blocked
        time_milliseconds = randint(500, 2000)
        time_sec = 0.001 * time_milliseconds
        logger.info('Sleeping for {} seconds'.format(time_sec))
        time.sleep(time_sec)
        logger.info('Woke up')
