# Web Scraping Yahoo Finance Stock Information
This is a collaboration between Ruiz Lorenzo Chavez and Kennmar Pacificar, submitted as part of the requirements for their DAT100M class.

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from time import sleep
import pandas as pd
import json

In [15]:
def initialize_driver():
    """Start a Chrome WebDriver session and return it.

    The fresh session is refreshed once and configured with a 15-second
    implicit wait before it is handed back to the caller.
    """
    browser = webdriver.Chrome()
    browser.refresh()
    # Element lookups on this session will wait up to 15 seconds.
    browser.implicitly_wait(15)
    return browser

def extract_content(driver):
    """Return ``driver.page_source``, printing progress before and after."""
    print("Extracting page")
    html = driver.page_source
    print("Extraction completed")
    return html

def next_page(driver):
    """Click the "Goto next page" button and return the resulting page source.

    Raises whatever ``driver.find_element`` / ``click`` raise when the button
    cannot be located or clicked.
    """
    driver.find_element(By.XPATH, "//button[@aria-label='Goto next page']").click()
    print("Next page initialized")

    # NOTE(review): the source is read immediately after the click, with no
    # explicit wait for the next page to render.
    page_after_click = extract_content(driver)
    return page_after_click

def extract_kpi(page_content):
    """Parse one screener page and return its KPI columns.

    Args:
        page_content: HTML source of the page.

    Returns:
        An 11-tuple of lists, in this order: symbol, name, price, abs_change,
        perc_change, volume, avg_vol_3m, market_cap, pe_ratio, wk52_chg,
        key_link. All entries are stripped text except ``key_link``, which
        holds the raw ``href`` of each ticker anchor.
    """
    print("Extracting KPIs")

    soup = BeautifulSoup(page_content, "html.parser")

    def texts(tags):
        # Stripped text of every tag in the result set.
        return [tag.text.strip() for tag in tags]

    def streamer_texts(attrs):
        # Text of all <fin-streamer> elements matching the attribute filter.
        return texts(soup.find_all("fin-streamer", attrs))

    # Cells of every table row, parsed once and shared by the positional columns.
    row_cells = [row.find_all("td") for row in soup.find_all("tr", class_="row false yf-paf8n5")]

    def cell_texts(position):
        # Text of the <td> at the given index in each row.
        return [cells[position].text.strip() for cells in row_cells]

    symbol = texts(soup.find_all(class_="name yf-1m808gl stacked"))
    name = texts(soup.find_all(class_="yf-h8l7j7"))
    price = streamer_texts({"data-test": "change", "data-field": "regularMarketPrice"})
    abs_change = streamer_texts(
        {"data-test": "colorChange", "data-field": "regularMarketChange", "data-tstyle": "default"}
    )
    perc_change = streamer_texts(
        {"data-test": "colorChange", "data-field": "regularMarketChangePercent", "data-tstyle": "default"}
    )
    volume = streamer_texts({"data-test": "change", "data-field": "regularMarketVolume"})
    avg_vol_3m = cell_texts(7)
    market_cap = streamer_texts({"data-test": "change", "data-field": "marketCap"})
    pe_ratio = cell_texts(9)
    wk52_chg = cell_texts(10)
    key_link = [
        anchor["href"]
        for anchor in soup.find_all("a", class_="ticker small tw-text-md hover stacked yf-1m808gl")
    ]

    print("Extraction completed")

    return (
        symbol,
        name,
        price,
        abs_change,
        perc_change,
        volume,
        avg_vol_3m,
        market_cap,
        pe_ratio,
        wk52_chg,
        key_link,
    )

def extract_statistics(page_content):
    """Return the statistic values found in the first two stat columns.

    Args:
        page_content: HTML source of a statistics page.

    Returns:
        A flat list with the stripped text of every ``value yf-vaowmx``
        element, first column first.

    Raises:
        IndexError: if the page has fewer than two ``column yf-14j5zka``
            containers.
    """
    print("Extracting statistics")

    soup = BeautifulSoup(page_content, "html.parser")
    containers = soup.find_all(class_="column yf-14j5zka")

    stats_list = []
    for position in range(2):
        cells = containers[position].find_all(class_="value yf-vaowmx")
        stats_list.extend(cell.text.strip() for cell in cells)

    print("Extraction completed")

    return stats_list

def extract_stats_header(page_content):
    """Return the statistic labels found in the first two stat columns.

    Args:
        page_content: HTML source of a statistics page.

    Returns:
        A flat list with the stripped text of every ``label yf-vaowmx``
        element, first column first.

    Raises:
        IndexError: if the page has fewer than two ``column yf-14j5zka``
            containers.
    """
    print("Extracting headers")

    soup = BeautifulSoup(page_content, "html.parser")
    containers = soup.find_all(class_="column yf-14j5zka")

    header_list = [
        label.text.strip()
        for position in range(2)
        for label in containers[position].find_all(class_="label yf-vaowmx")
    ]

    print("Extraction completed")

    return header_list

def append_kpi(extracted_kpi, kpi_dict):
    """Append one page of KPI columns to ``kpi_dict`` in place.

    Args:
        extracted_kpi: Iterable of 11 lists, ordered as symbol, name, price,
            abs_change, perc_change, volume, avg_vol_3m, market_cap,
            pe_ratio, wk52_chg, key_link.
        kpi_dict: Accumulator mapping column name to a list of values. It is
            mutated; nothing is returned.

    Raises:
        ValueError: if ``extracted_kpi`` does not contain exactly 11 columns.
    """
    columns = ["symbol", "name", "price", "abs_change", "perc_change", "volume", "avg_vol_3m", "market_cap", "pe_ratio", "wk52_chg", "key_link"]
    values = list(extracted_kpi)

    if len(values) != len(columns):
        raise ValueError(f"expected {len(columns)} KPI columns, got {len(values)}")

    first_batch = not kpi_dict
    print("Populating dictionary" if first_batch else "Pushing more data")

    for column, column_values in zip(columns, values):
        # Always extend a list owned by kpi_dict. Storing the caller's list
        # directly would let later appends silently mutate the caller's data.
        kpi_dict.setdefault(column, []).extend(column_values)

    print("New data pushed" if first_batch else "Data pushed")
    print(f"Company stock data collected: {len(kpi_dict['symbol'])} companies\n")
        
def append_statistics(extracted_statistics, stats_dict, current_symbol, header):
    """Append one stock's statistics to ``stats_dict`` as a new row.

    Values are paired with ``header`` positionally. Every column in
    ``stats_dict`` grows by exactly one entry per call, so the columns stay
    the same length even when a page yields fewer values than there are
    headers; missing values are stored as ``None``.

    Args:
        extracted_statistics: Statistic values in header order.
        stats_dict: Accumulator mapping ``"symbol"`` and each header label to
            a list. It is mutated in place.
        current_symbol: Ticker symbol the values belong to.
        header: Statistic labels used as column names.

    Returns:
        The same ``stats_dict`` object that was passed in.
    """
    row = dict(zip(header, extracted_statistics))

    first_row = not stats_dict
    # Number of rows already stored; used to back-fill newly created columns.
    collected = len(stats_dict.get('symbol', []))

    print("Populating dictionary" if first_row else "Pushing more data")

    stats_dict.setdefault('symbol', []).append(current_symbol)

    # dict.fromkeys de-duplicates labels while keeping their order.
    for key in dict.fromkeys(header):
        column = stats_dict.setdefault(key, [None] * collected)
        column.append(row.get(key))

    # Columns that exist from earlier rows but were not touched this time
    # still need a placeholder so all columns stay aligned.
    for column in stats_dict.values():
        if len(column) == collected:
            column.append(None)

    print("New data pushed" if first_row else "Data pushed")
    print(f"{len(stats_dict['symbol'])} stock statistics collected\n")

    return stats_dict

def export_json(stock_dict, filename):
    """Serialize ``stock_dict`` as JSON into ``./<filename>``.

    The target file is opened in write mode, so existing content is replaced.
    """
    target = f"./{filename}"

    with open(target, 'w') as handle:
        handle.write(json.dumps(stock_dict))

    print("Exported as JSON")

In [3]:
# Collect KPI columns from every page of the 52-week-gainers screener.
stock_kpi = {}

main_url = "https://finance.yahoo.com/markets/stocks/52-week-gainers/?start=0&count=100"

driver = initialize_driver()

try:
    driver.get(main_url)
    print(f"Page initialized: {main_url}")

    first_content = extract_content(driver)
    first_kpi = extract_kpi(first_content)
    append_kpi(first_kpi, stock_kpi)

    while True:

        try:
            # next_page() reads the page source right after clicking, before
            # the new rows have had time to render. Its snapshot is therefore
            # discarded and the page is read again after the pause.
            next_page(driver)

            sleep(15)

            next_content = extract_content(driver)
            next_kpi = extract_kpi(next_content)
            append_kpi(next_kpi, stock_kpi)

        except Exception as error:
            # Any failure ends pagination (best effort), but the cause is
            # reported. Ctrl-C is no longer swallowed because
            # KeyboardInterrupt is not an Exception subclass.
            print("No more pages left")
            print(f"Pagination stopped by {type(error).__name__}: {error}")
            break

finally:
    # quit() ends the whole WebDriver session, even when scraping failed.
    driver.quit()

print("\n=================================")
for key, column in stock_kpi.items():
    print(f"{key}: {len(column)}")

export_json(stock_kpi, "kpi.json")

Page initialized: https://finance.yahoo.com/markets/stocks/52-week-gainers/?start=0&count=100
Extracting page
Extraction completed
Extracting KPIs
Extraction completed
Populating dictionary
New data pushed
Company stock data collected: 100 companies

Next page initialized
Extracting page
Extraction completed
Extracting KPIs
Extraction completed
Pushing more data
Data pushed
Company stock data collected: 200 companies

Next page initialized
Extracting page
Extraction completed
Extracting KPIs
Extraction completed
Pushing more data
Data pushed
Company stock data collected: 300 companies

Next page initialized
Extracting page
Extraction completed
Extracting KPIs
Extraction completed
Pushing more data
Data pushed
Company stock data collected: 400 companies

Next page initialized
Extracting page
Extraction completed
Extracting KPIs
Extraction completed
Pushing more data
Data pushed
Company stock data collected: 500 companies

Next page initialized
Extracting page
Extraction completed
Extrac

In [11]:
# Read the statistic labels once, from the first collected stock's
# key-statistics page; they are reused as column names for every stock.
key = stock_kpi["key_link"][0]

driver = initialize_driver()
stats_link = f"https://finance.yahoo.com{key}key-statistics/"

try:
    driver.get(stats_link)

    page = extract_content(driver)
    header = extract_stats_header(page)

finally:
    # Release the browser session even if loading or parsing fails.
    driver.quit()

Extracting page
Extraction completed
Extracting headers
Extraction completed


In [18]:
# Visit each collected stock's key-statistics page and accumulate its values.
stock_statistics = {}

symbols = list(stock_kpi['symbol'])
key_links = list(stock_kpi['key_link'])

driver = initialize_driver()

try:
    for symbol, key in zip(symbols, key_links):

        stats_link = f"https://finance.yahoo.com{key}key-statistics/"

        print(f"Accessing data from {symbol}")

        driver.get(stats_link)

        statistics_page = extract_content(driver)

        try:
            statistics_content = extract_statistics(statistics_page)
            append_statistics(statistics_content, stock_statistics, symbol, header)

        except Exception as error:
            # Best effort: a page that cannot be parsed is skipped, but the
            # skip is reported instead of disappearing silently.
            print(f"Skipping {symbol}: {type(error).__name__}: {error}\n")

        # Pause between requests.
        sleep(15)

finally:
    # Release the browser session even if navigation fails midway.
    driver.quit()

print("\n=================================")
for key, column in stock_statistics.items():
    print(f"{key}: {len(column)}")

export_json(stock_statistics, "statistics.json")

Accessing data from WGS
Extracting page
Extraction completed
Extracting statistics
Extraction completed
Populating dictionary
New data pushed
1 stock statistics collected

Accessing data from SEZL
Extracting page
Extraction completed
Extracting statistics
Extraction completed
Pushing more data
Data pushed
2 stock statistics collected

Accessing data from LBPH
Extracting page
Extraction completed
Extracting statistics
Extraction completed
Pushing more data
Data pushed
3 stock statistics collected

Accessing data from SMR
Extracting page
Extraction completed
Extracting statistics
Extraction completed
Pushing more data
Data pushed
4 stock statistics collected

Accessing data from APP
Extracting page
Extraction completed
Extracting statistics
Extraction completed
Pushing more data
Data pushed
5 stock statistics collected

Accessing data from CHYI
Extracting page
Extraction completed
Extracting statistics
Extraction completed
Pushing more data
Data pushed
6 stock statistics collected

Acces

In [37]:
# Build one frame per data set, keeping the first row seen for each symbol.
kpi_df = pd.DataFrame(stock_kpi)
kpi_df = kpi_df.drop_duplicates(subset=["symbol"])

statistics_df = pd.DataFrame(stock_statistics)
statistics_df = statistics_df.drop_duplicates(subset=["symbol"])

# Inner join: only symbols present in both data sets are kept.
stocks_df = kpi_df.merge(statistics_df, how="inner", on="symbol")

display(stocks_df)

stocks_df.to_json("stocks.json")
stocks_df.to_csv("stocks.csv")

Unnamed: 0,symbol,name,price,abs_change,perc_change,volume,avg_vol_3m,market_cap,pe_ratio,wk52_chg,...,Forward Annual Dividend Rate 4,Forward Annual Dividend Yield 4,Trailing Annual Dividend Rate 3,Trailing Annual Dividend Yield 3,5 Year Average Dividend Yield 4,Payout Ratio 4,Dividend Date 3,Ex-Dividend Date 4,Last Split Factor 2,Last Split Date 3
0,WGS,GeneDx Holdings Corp.,78.39,-1.04,-1.31%,185783,591117,2.182B,-,"4,405.17%",...,--,--,0.00,0.00%,--,0.00%,--,--,1:33,5/4/2023
1,SEZL,Sezzle Inc.,424.03,+9.08,+2.19%,52619,106923,2.378B,44.82,"4,098.32%",...,--,--,0.00,0.00%,--,0.00%,--,--,--,--
2,LBPH,"Longboard Pharmaceuticals, Inc.",59.98,+0.02,+0.03%,655798,1.737M,2.341B,-,"1,414.65%",...,--,--,0.00,0.00%,--,0.00%,--,--,--,--
3,SMR,NuScale Power Corporation,29.65,+1.59,+5.67%,11.514M,10.349M,2.998B,-,841.27%,...,--,--,0.00,0.00%,--,0.00%,--,--,--,--
4,APP,AppLovin Corporation,336.75,+8.19,+2.49%,2.715M,5.499M,110.262B,102.05,801.12%,...,--,--,0.00,0.00%,--,0.00%,--,--,--,--
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1590,AVDX,"AvidXchange Holdings, Inc.",11.44,0.00,0.00%,654477,1.955M,2.354B,-,3.62%,...,--,--,0.00,0.00%,--,0.00%,--,--,--,--
1591,CTRA,Coterra Energy Inc.,26.72,+0.04,+0.15%,2.485M,6.029M,19.682B,16.19,3.61%,...,0.84,3.14%,0.83,3.11%,4.29,50.30%,11/27/2024,11/14/2024,2:1,8/15/2013
1592,REXR-PB,"Rexford Industrial Realty, Inc.",23.10,+0.07,+0.31%,18879,7524,3.895B,26.10,3.59%,...,1.47,6.36%,1.63,7.09%,6.01,--,12/31/2024,12/16/2024,--,--
1593,IPAR,"Interparfums, Inc.",137.66,-0.43,-0.31%,77888,147522,4.423B,29.48,3.53%,...,3,2.18%,2.88,2.08%,1.68,61.56%,12/31/2024,12/16/2024,3:2,6/2/2008
