# Project 1 - BeautifulSoup vs APIs for Website Scraping
### 27/8/2021



In [59]:
# --- Learning how to Scrape CoinGecko with beautifulsoup

import requests
from bs4 import BeautifulSoup
import pandas as pd

COINGECKO_URL = 'https://www.coingecko.com/en'


def _find_required(parsed_html, tag, attrs):
    """Return the first `tag` element matching `attrs`, or fail loudly.

    `BeautifulSoup.find` returns None when nothing matches; dereferencing
    that later produces an unhelpful AttributeError. Raising here names the
    selector that stopped matching (e.g. after a site redesign).

    Raises:
        LookupError: if no element matches `tag` / `attrs`.
    """
    element = parsed_html.find(tag, attrs=attrs)
    if element is None:
        raise LookupError(f'No <{tag}> element matching {attrs!r} in {COINGECKO_URL}')
    return element


# Fetch the landing page. The timeout stops the cell from hanging forever,
# and raise_for_status() surfaces HTTP errors (403/429/5xx) instead of
# letting an error page be parsed as if it were the coin table.
page = requests.get(COINGECKO_URL, timeout=10)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')  # parse html and store in 'soup'

# BTC
# The title is the <a> link with class 'd-lg-none font-bold' pointing at the coin page.
BTC_title_box = _find_required(soup, 'a', {'class': 'd-lg-none font-bold', 'href': '/en/coins/bitcoin'})
BTC_title = BTC_title_box.text.strip()  # .strip() with no args trims surrounding whitespace

BTC_price_box = _find_required(soup, 'span', {'class': 'no-wrap', 'data-price-btc': '1.0'})
BTC_price = BTC_price_box.text.strip()

# ETH
ETH_title_box = _find_required(soup, 'a', {'class': 'd-lg-none font-bold', 'href': '/en/coins/ethereum'})
ETH_title = ETH_title_box.text.strip()

ETH_price_box = _find_required(soup, 'span', {'class': 'no-wrap', 'data-coin-symbol': 'eth'})
ETH_price = ETH_price_box.text.strip()

# Doge
DOGE_title_box = _find_required(soup, 'a', {'class': 'd-lg-none font-bold', 'href': '/en/coins/dogecoin'})
DOGE_title = DOGE_title_box.text.strip()

DOGE_price_box = _find_required(soup, 'span', {'class': 'no-wrap', 'data-coin-symbol': 'doge'})
DOGE_price = DOGE_price_box.text.strip()

# --- Using Pandas for data visualisation
dataset = {
    'Coin': [BTC_title, ETH_title, DOGE_title],
    '     Price (USD)': [BTC_price, ETH_price, DOGE_price]
}

woop = pd.DataFrame(dataset)
print(woop)

### BeautifulSoup can take some time to get used to, especially if you aren't used to HTML and
### CSS, since you have to navigate and find specific tags to get the data you need. Even once
### you get used to it, a lot of time is spent going back to the source code of the chosen
### website, which is annoying and time-consuming. There are also a few steps needed before you
### can start actually writing the code for what you want: requesting the URL, parsing the
### returned HTML, and storing the parsed result (for my code). The code isn't very pretty
### either. That being said, once you get the hang of it, BeautifulSoup is a fairly simple
### module to use.

   Coin      Price (USD)
0   BTC          $48,303
1   ETH        $3,181.50
2  DOGE        $0.280632


In [62]:
# --- Using APIs instead of beautifulsoup to scrape CoinGecko

import requests
import pandas as pd
from pycoingecko import CoinGeckoAPI # importing the CoinGecko API
cg = CoinGeckoAPI()

# get_price returns a nested dict of the form {coin_id: {currency: price}},
# so one request covers all three coins and the price is read by key.
# (The earlier str()/.strip() approach only worked by accident: strip()
# removes a *set of characters* from both ends, not a substring.)
prices_usd = cg.get_price(ids=['bitcoin', 'ethereum', 'dogecoin'], vs_currencies='usd')

# Indexing raises KeyError if a coin is missing from the response, rather
# than silently printing an empty price.
# BTC
BTC_price = str(prices_usd['bitcoin']['usd'])

# ETH
ETH_price = str(prices_usd['ethereum']['usd'])

# DOGE
DOGE_price = str(prices_usd['dogecoin']['usd'])

# Data visualisation
dataset = {
    'Coin': ['BTC', 'ETH', 'DOGE'],
    '     Price (USD)': ['$' + BTC_price, '$' + ETH_price, '$' + DOGE_price]
}

woopwoop = pd.DataFrame(dataset)
print(woopwoop)

### For this code, I used the CoinGecko API. As APIs are created by each specific
### provider, their functions, utilities, limitations, etc. can vary significantly. For example,
### a limitation of the CoinGecko API was how annoying and time-consuming it was to isolate
### the price of coins (extracting BTC_price from BTC_USD with str() and .strip()). An
### advantage of using the CoinGecko API was that I could get all the data I needed for a
### single coin in one fairly short line of code (cg.get_price()). Also, there were no steps
### required before I could start actually writing useful code - I personally chose to add one
### extra step (cg = CoinGeckoAPI()) because it helped shorten later lines of code. Note
### that not all websites have public APIs, which means you would have to fall back to using
### BeautifulSoup in that case.

   Coin      Price (USD)
0   BTC           $48202
1   ETH         $3182.82
2  DOGE        $0.281025
