In [3]:
# scraper.py

from selenium import webdriver
import time
import pandas as pd
import numpy as np
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains


class YahooFinanceScraper:
    """Scrape the Yahoo Finance "Most Active" table and clean it with pandas.

    Typical use is ``scrape_data()`` followed by ``clean_data()``.  The
    browser is started in ``__init__`` and is always closed by
    ``scrape_data()``, so an instance can scrape only once.
    """

    # Column order of the frame returned by ``clean_data``.
    _CLEAN_COLUMNS = (
        "Name",
        "Symbol",
        "Price_usd",
        "Change",
        "Volume_in_M",
        "PE_ratio",
        "Market_cap_in_B",
    )

    # Absolute value of each magnitude suffix accepted in abbreviated cells.
    _SUFFIX_MULTIPLIERS = {"K": 1e3, "M": 1e6, "B": 1e9, "T": 1e12}

    def __init__(self):
        """Start a maximized Chrome session and prepare the shared helpers."""
        # Initialize the Chrome driver
        self.driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
        self.driver.maximize_window()
        # Explicit waits give up after 10 seconds.
        self.wait = WebDriverWait(self.driver, 10)
        self.actions = ActionChains(self.driver)
        # Raw rows (one dict of cell texts per stock) collected by scrape_data.
        self.data = []

    def wait_to_load(self):
        """Block until ``document.readyState`` is ``"complete"`` or the wait expires.

        Best effort: a failure is printed, not raised, so the caller carries on
        and relies on its own element waits.
        """
        try:
            self.wait.until(lambda d: d.execute_script("return document.readyState") == "complete")
        except Exception as e:
            print("Page didn't load fully within time limit. Error:", e)
        else:
            print("Page", self.driver.title, "loaded successfully!")

    def _collect_rows(self):
        """Append one raw record to ``self.data`` per table row with >= 10 cells."""
        rows = self.driver.find_elements(By.CSS_SELECTOR, "table tbody tr")
        for row in rows:
            values = row.find_elements(By.TAG_NAME, "td")
            # Shorter rows do not carry every column read below; skip them.
            if len(values) < 10:
                continue
            self.data.append(
                {
                    "Name": values[1].text,
                    "Symbol": values[0].text,
                    "Price": values[3].text,
                    "Change": values[4].text,
                    "Volume": values[6].text,
                    "Market cap": values[8].text,
                    "PE_ratio": values[9].text,
                }
            )

    def scrape_data(self):
        """Navigate to "Most Active" and collect the rows of every result page.

        Returns:
            list[dict]: ``self.data``, one dict of raw cell texts per row.

        Raises:
            selenium.common.exceptions.WebDriverException: if navigation to the
                table fails (for example a menu element never appears).  The
                browser is closed before the exception propagates.
        """
        # Local import: keeps this class self-contained without touching the
        # file's import section.  TimeoutException is a WebDriverException.
        from selenium.common.exceptions import WebDriverException

        next_locator = (By.XPATH, "//button[@aria-label='Goto next page']")

        try:
            url = "https://finance.yahoo.com/"
            self.driver.get(url)
            self.wait_to_load()

            # Hover over 'Market' menu
            market_menu = self.wait.until(
                EC.presence_of_element_located((By.XPATH,  '/html[1]/body[1]/div[2]/header[1]/div[1]/div[1]/div[1]/div[4]/div[1]/div[1]/ul[1]/li[3]/a[1]/span[1]'))
            )
            self.actions.move_to_element(market_menu).perform()

            # Click 'Trending Tickers'
            trending_tickers = self.wait.until(
                EC.element_to_be_clickable((By.XPATH, "//div[contains(text(),'Trending Tickers')]"))
            )
            trending_tickers.click()
            self.wait_to_load()

            # Click 'Most Active'
            most_active = self.wait.until(
                EC.element_to_be_clickable((By.XPATH, "//span[normalize-space()='Most Active']"))
            )
            most_active.click()
            self.wait_to_load()

            # Scrape table rows across all pages
            while True:
                self.wait.until(EC.presence_of_element_located((By.TAG_NAME, "table")))
                self._collect_rows()

                try:
                    # Locate by presence first: a clickability wait never
                    # returns a disabled button, which would make the
                    # last-page check below unreachable and cost a full
                    # timeout on the final page.
                    next_btn = self.wait.until(EC.presence_of_element_located(next_locator))
                    if next_btn.get_attribute("disabled") is not None or not next_btn.is_enabled():
                        print("Next button is disabled. Reached last page.")
                        break
                    next_btn = self.wait.until(EC.element_to_be_clickable(next_locator))
                    self.actions.move_to_element(next_btn).click().perform()
                    # Give the table time to render the next page.
                    time.sleep(2)
                except WebDriverException:
                    # Covers "no next button" (timeout) and click failures, but
                    # lets KeyboardInterrupt/SystemExit through.
                    print("No more pages or error while clicking next.")
                    break
        finally:
            # Always release the browser, even when navigation fails midway.
            self.driver.quit()

        return self.data

    @classmethod
    def _scale_abbreviated(cls, col, unit):
        """Convert cells such as ``"45.5M"`` or ``"1,234"`` to multiples of *unit*.

        Thousands separators are removed and a trailing K/M/B/T (any case) is
        applied as a multiplier; a cell without a suffix is taken as an
        absolute number.  Cells that are not numeric become NaN.
        """
        text = col.astype(str).str.strip().str.replace(",", "", regex=False)
        suffix = text.str[-1:].str.upper()
        has_suffix = suffix.isin(list(cls._SUFFIX_MULTIPLIERS))
        digits = text.where(~has_suffix, text.str[:-1])
        number = pd.to_numeric(digits, errors="coerce")
        multiplier = suffix.map(cls._SUFFIX_MULTIPLIERS).astype(float).fillna(1.0)
        return number * (multiplier / unit)

    def clean_data(self):
        """Return the scraped rows as a typed ``DataFrame``.

        Output columns, in order: ``Name``, ``Symbol``, ``Price_usd``,
        ``Change``, ``Volume_in_M`` (millions), ``PE_ratio`` and
        ``Market_cap_in_B`` (billions).

        * Price and change drop thousands separators (and a leading ``+``)
          and are parsed strictly.
        * Volume and market cap are rescaled from their K/M/B/T suffix;
          unparseable cells become NaN.
        * A P/E of ``"--"`` becomes NaN.

        Returns:
            pandas.DataFrame: cleaned table; empty (columns only) when nothing
            was scraped.

        Raises:
            ValueError: if a price, change or P/E cell is not numeric.
        """
        if not self.data:
            # Nothing scraped: return the expected schema rather than failing
            # on missing columns.
            return pd.DataFrame(columns=list(self._CLEAN_COLUMNS))

        raw = pd.DataFrame(self.data)
        text = raw.apply(lambda col: col.astype(str).str.strip())

        def without_commas(col):
            return col.str.replace(",", "", regex=False)

        # Mask the placeholder after the string work so no ``.str`` call ever
        # runs on a column that has already been turned into NaN.
        pe_text = without_commas(text["PE_ratio"])
        pe_text = pe_text.where(pe_text != "--")

        return pd.DataFrame(
            {
                "Name": text["Name"],
                "Symbol": text["Symbol"],
                "Price_usd": pd.to_numeric(without_commas(text["Price"])),
                "Change": pd.to_numeric(
                    without_commas(text["Change"]).str.replace("+", "", regex=False)
                ),
                "Volume_in_M": self._scale_abbreviated(text["Volume"], 1e6),
                "PE_ratio": pd.to_numeric(pe_text),
                "Market_cap_in_B": self._scale_abbreviated(text["Market cap"], 1e9),
            }
        )



In [4]:
# End-to-end run: scrape the table, clean it, and export the result.
scraper = YahooFinanceScraper()

# scrape_data() returns the raw rows and also keeps them on the scraper,
# which is where clean_data() reads them from.
raw_data = scraper.scrape_data()
cleaned_df = scraper.clean_data()

# Write the cleaned table without the DataFrame index column.
cleaned_df.to_csv(path_or_buf="Yahoo_Stocks_restruc.csv", index=False)
print("Data saved to Yahoo_Stocks_restruc.csv")

Page Yahoo Finance - Stock Market Live, Quotes, Business & Finance News loaded successfully!
Page Yahoo Finance - Stock Market Live, Quotes, Business & Finance News loaded successfully!
Page Top Trending Stocks: US stocks with the highest interest today - Yahoo Finance loaded successfully!
No more pages or error while clicking next.
Data saved to Yahoo_Stocks_restruc.csv
