In [37]:
# Libraries
import pandas as pd
import requests
import time
import csv
import os

from bs4 import BeautifulSoup
from pymongo import MongoClient
from IPython.display import display

from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


## Set up the BASE URL and Test it

In [38]:
# Any variables defined in CAPITAL belong to the GLOBAL variables
BASE_URL = 'https://finance.yahoo.com'

response = requests.get(BASE_URL)

if response: print("response.ok : {} , response.status_code : {}".format(response.ok , response.status_code))

response.ok : True , response.status_code : 200


In [39]:
mongoClient = MongoClient()
db = mongoClient.cryptocurrencies
# db.segment.drop()

## Set Up the Web Driver
* Ensure the driver is downloaded and within the path

In [40]:

def get_driver(url):
    """Return web driver"""
    chrome_options = Options()
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--start-maximized') #
    chrome_options.add_argument('--start-fullscreen')#
    chrome_options.add_argument('--single-process')#
    serv = Service(os.getcwd()+'/chromedriver')
    driver = webdriver.Chrome(options=chrome_options, service=serv)
    driver.get(url)
    return driver

## Data Extraction

In [41]:
def get_table_header(driver):
    """Return Table columns in list form """
    header = driver.find_elements(By.TAG_NAME, value= 'th')
    header_list = [item.text for index, item in enumerate(header) if index < 10]
    return header_list

def get_table_rows(driver):
    """Get number of rows available on the page """
    tablerows = len(driver.find_elements(By.XPATH, value='//*[@id="scr-res-table"]/div[1]/table/tbody/tr'))
    return tablerows  

## Data Cleaning

In [42]:
def parse_table_rows(rownum, driver, header_list):
    """get the data for one row at a time and return column value in the form of dictionary"""
    row_dictionary = {}
    #time.sleep(1/3)
    for index , item in enumerate(header_list):
        time.sleep(1/20)
        column_xpath = '//*[@id="scr-res-table"]/div[1]/table/tbody/tr[{}]/td[{}]'.format(rownum, index+1)
        row_dictionary[item] = driver.find_element(By.XPATH, value=column_xpath).text
    return row_dictionary

def parse_multiple_pages(driver, total_crypto):
    """Loop through each row, perform Next button click at the end of page 
    return total_crypto numbers of rows 
    """
    table_data = []
    page_num = 1
    is_scraping = True
    header_list = get_table_header(driver)

    while is_scraping:
        table_rows = get_table_rows(driver)
        print('Found {} rows on Page : {}'.format(table_rows, page_num))
        print('Parsing Page : {}'.format(page_num))
        table_data += [parse_table_rows(i, driver, header_list) for i in range (1, table_rows + 1)]
        total_count = len(table_data)
        print('Total rows scraped : {}'.format(total_count))
        if total_count >= total_crypto:
            print('Done Parsing..')
            is_scraping = False
        else:    
            print('Clicking Next Button')
            element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, '//*[@id="scr-res-table"]/div[2]/button[3]')))
            element.click() 
            page_num += 1
    return table_data


## Ingesting Data to Storage, i.e MongoDB

In [43]:
def save_data_tomongo(csv_file, header):
    # Reads data from csv and saves to mongodb
    header = header
    csvfile = open(csv_file, r)
    reader = csv.DictReader(csvfile)


    for each_item in reader:
        row = {}
        for field in header:
            row[field] = each_item[field]

        print(row)
        db.segment.insert(row)

## The Factory Code

In [44]:



def scrape_yahoo_crypto(url, total_crypto, path=None):
    """Get the list of yahoo finance crypto-currencies and write them to CSV file """
    if path is None:
        path = 'crypto-currencies.csv'
    print('Creating driver')
    driver = get_driver(url)
    try:
        table_data = parse_multiple_pages(driver, total_crypto)
    except Exceptions:
        continue
    driver.close()
    driver.quit()
    print('Save the data to a CSV')
    table_df = pd.DataFrame(table_data)
    table_df.to_csv(path, index=None)
    #This return statement is optional, we are doing this just analyze the final output 
    display(table_df.head())
    return table_df 

if __name__ == "__main__" :
    
    YAHOO_FINANCE_URL = BASE_URL+'/crypto'
    TOTAL_CRYPTO = 50
    crypto_df = scrape_yahoo_crypto(YAHOO_FINANCE_URL, TOTAL_CRYPTO,'crypto-currencies.csv')

print("Processing Done")

SyntaxError: 'continue' not properly in loop (1671877557.py, line 10)