In [12]:
import os
import time

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


In [21]:
# Start a Chrome session and maximise its window.
driver = webdriver.Chrome()
driver.maximize_window()

# Shared explicit wait: every `wait.until(...)` call gives up after 5 seconds.
wait = WebDriverWait(driver, timeout=5)

def wait_for_page_to_load(driver, wait):
    """Wait until the current document reports ``readyState == "complete"``.

    This is a best-effort check: a failed wait is reported with a printed
    message instead of an exception, so the caller keeps running.

    Args:
        driver: Object exposing ``title``; its value is used in the message.
        wait: Object exposing ``until(condition)``. ``condition`` is called
            with a driver-like object that provides ``execute_script``.

    Returns:
        None. The outcome is printed.
    """
    try:
        wait.until(
            lambda d: d.execute_script("return document.readyState") == "complete"
        )
    except Exception:
        # `Exception` (not a bare `except`) keeps the best-effort behaviour
        # while letting KeyboardInterrupt / SystemExit propagate.
        loaded = False
    else:
        loaded = True

    # Read the title only after waiting: before the wait it can still be the
    # title of the page that is being navigated away from.
    page_title = driver.title

    if loaded:
        print(f"the page title {page_title} is completely loaded")
    else:
        print(f"the page title {page_title} is not completely loaded")



# Absolute XPaths of the elements used to reach the "Most Active" table.
# NOTE(review): absolute paths break whenever the page layout changes;
# consider anchoring on link text or stable attributes instead.
MARKETS_MENU_XPATH = "/html/body/div[2]/header/div/div/div/div[4]/div/div/ul/li[3]/a"
TRENDING_TICKERS_XPATH = '/html/body/div[2]/header/div/div/div/div[4]/div/div/ul/li[3]/div/ul/li[4]/a'
MOST_ACTIVE_TAB_XPATH = "/html/body/div[2]/main/section/section/section/section/section[1]/div/nav/ul/li[1]/a"

driver.get("https://finance.yahoo.com/")
wait_for_page_to_load(driver, wait)


# Hover over the Markets menu so that its dropdown opens.
actions = ActionChains(driver)

# Wait for visibility rather than mere presence in the DOM: the pointer can
# only be moved onto an element that is actually rendered.
menu_button = wait.until(
    EC.visibility_of_element_located((By.XPATH, MARKETS_MENU_XPATH))
)
actions.move_to_element(menu_button).perform()

# Click the trending-tickers entry of the dropdown.
trending_ticket = wait.until(
    EC.element_to_be_clickable((By.XPATH, TRENDING_TICKERS_XPATH))
)
trending_ticket.click()
wait_for_page_to_load(driver, wait)


# Switch to the "most active" tab.
most_active = wait.until(
    EC.element_to_be_clickable((By.XPATH, MOST_ACTIVE_TAB_XPATH))
)
most_active.click()
wait_for_page_to_load(driver, wait)





# Scrape every page of the results table into `data` (one dict per row).

# Absolute XPath of the pagination "next" button.
NEXT_BUTTON_XPATH = "/html/body/div[2]/main/section/section/section/section/section[1]/div/div[3]/div[3]/button[3]"

# The highest cell index read from a row is 9, so a usable row needs 10 cells.
MIN_CELLS_PER_ROW = 10

data = []

try:
    while True:
        # Make sure a table is in the DOM before reading its rows.
        wait.until(
            EC.presence_of_element_located((By.TAG_NAME, "table"))
        )

        rows = driver.find_elements(By.CSS_SELECTOR, "table tbody tr")

        for row in rows:
            values = row.find_elements(By.TAG_NAME, "td")

            # Skip rows that do not have every expected cell instead of
            # aborting the whole scrape with an IndexError.
            if len(values) < MIN_CELLS_PER_ROW:
                print(f"Skipping row with {len(values)} cells (expected at least {MIN_CELLS_PER_ROW})")
                continue

            stock = {
                "symbol": values[0].text,
                "name": values[1].text,
                "price": values[3].text,
                "change": values[4].text,
                "change_%": values[5].text,
                "volume": values[6].text,
                "market_cap": values[8].text,
                "pe_ratio": values[9].text,
            }
            data.append(stock)

        try:
            next_button = wait.until(
                EC.element_to_be_clickable((By.XPATH, NEXT_BUTTON_XPATH))
            )
        except TimeoutException:
            # Only a timeout means "no further page"; any other error
            # (lost session, Ctrl-C, ...) propagates instead of being
            # reported as a successful end of the scrape.
            print("Next button is not clickable. All the pages have been scraped")
            break
        else:
            next_button.click()
            print("Next page")
            # Fixed pause to give the next page's rows time to render.
            time.sleep(2)
finally:
    # Always close the browser, even when scraping fails part-way.
    driver.quit()



the page title Yahoo Finance - Stock Market Live, Quotes, Business & Finance News is completely loaded
the page title Yahoo Finance - Stock Market Live, Quotes, Business & Finance News is completely loaded
the page title Top Trending Stocks: US stocks with the highest interest today - Yahoo Finance is completely loaded
Next page
Next page
Next page
Next page
Next page
Next page
Next page
Next page
Next page
Next page
Next page
Next page
Next page
Next button is not clickable. All the pages have been scraped


In [22]:
# Number of row dicts collected by the scraping loop.
len(data)

349

In [23]:
import pandas as pd
import numpy as np

In [24]:
# Raw scrape as a DataFrame: one row per scraped dict, one column per dict key.
df = pd.DataFrame(data)

In [25]:
# Preview up to the first 20 rows of the raw scrape.
df.head(20)

Unnamed: 0,symbol,name,price,change,change %,volume,market_cap,pe_ratio
0,NVDA,NVIDIA Corporation,173.72,-4.15,-2.33%,202.637M,4.237T,56.22
1,AMZN,"Amazon.com, Inc.",214.75,-19.36,-8.27%,119.616M,2.29T,32.79
2,LCID,"Lucid Group, Inc.",2.42,-0.04,-1.63%,91.283M,7.435B,--
3,VALE,Vale S.A.,9.71,0.18,+1.89%,89.435M,41.45B,7.96
4,TSLA,"Tesla, Inc.",302.63,-5.64,-1.83%,88.358M,976.118B,179.07
5,SOFI,"SoFi Technologies, Inc.",21.23,-1.35,-5.98%,89.015M,25.395B,42.46
6,AAPL,Apple Inc.,202.38,-5.19,-2.50%,97.204M,3.003T,30.66
7,NIO,NIO Inc.,5.01,0.14,+2.87%,79.291M,10.815B,--
8,HOOD,"Robinhood Markets, Inc.",99.9,-3.15,-3.06%,75.731M,88.779B,50.71
9,INTC,Intel Corporation,19.31,-0.49,-2.47%,86.146M,84.52B,--


In [26]:
# Column dtypes and non-null counts of the raw scrape.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 349 entries, 0 to 348
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   symbol      349 non-null    object
 1   name        349 non-null    object
 2   price       349 non-null    object
 3   change      349 non-null    object
 4   change %    349 non-null    object
 5   volume      349 non-null    object
 6   market_cap  349 non-null    object
 7   pe_ratio    349 non-null    object
dtypes: object(8)
memory usage: 21.9+ KB


In [25]:
# ## Strips surrounding whitespace from every string column (disabled; note the stripped result is never assigned back)
# for i in df.columns:
#     if df[i].dtype == "object":
#         df[i].str.strip()
    

In [29]:
# Save the raw scrape to the current user's Downloads folder.
downloads = os.path.join(os.path.expanduser("~"), "Downloads")
# The folder is not guaranteed to exist (e.g. on servers or in containers);
# create it so the export does not fail on a missing directory.
os.makedirs(downloads, exist_ok=True)
file_path = os.path.join(downloads, "yahoo-stocks-data(Uncleaned_data).xlsx")
df.to_excel(file_path, index=False)

In [26]:
# Column labels of the raw scrape.
df.columns

Index(['symbol', 'name', 'price', 'change', 'change %', 'volume', 'market_cap',
       'pe_ratio'],
      dtype='object')

In [32]:
# Multipliers for the magnitude suffixes found in scraped volume and
# market-cap text (e.g. "202.637M", "7.435B", "4.237T").
_SUFFIX_MULTIPLIER = {"K": 1e3, "M": 1e6, "B": 1e9, "T": 1e12}


def _drop_chars(series, chars):
    """Remove each character of `chars` from every string in `series`.

    Uses literal (non-regex) replacement so characters such as "+" are never
    interpreted as regular-expression syntax, whatever the pandas default is.
    """
    for ch in chars:
        series = series.str.replace(ch, "", regex=False)
    return series


def _scaled_to_unit(series, unit_suffix):
    """Convert text like "202.637M" or "4.237T" to floats in a single unit.

    Args:
        series: Strings holding a number with an optional K/M/B/T suffix.
        unit_suffix: Suffix of the unit to express the result in
            ("M" -> millions, "B" -> billions).

    Returns:
        Float Series. Values without a suffix are treated as plain counts;
        values that do not match the pattern (e.g. "--") become NaN.
    """
    parts = (
        _drop_chars(series.astype(str), ",")
        .str.strip()
        .str.extract(r"^([+-]?\d*\.?\d+)\s*([KMBT]?)$")
    )
    magnitude = pd.to_numeric(parts[0], errors="coerce")
    multiplier = parts[1].map(_SUFFIX_MULTIPLIER).fillna(1.0).astype(float)
    # Divide the multipliers first: values already in the target unit are
    # then scaled by exactly 1.0 and keep their scraped value.
    return magnitude * (multiplier / _SUFFIX_MULTIPLIER[unit_suffix])


stocks_df = (
    pd.DataFrame(data)
    .apply(lambda col: col.str.strip() if col.dtype == "object" else col)
    .assign(
        price=lambda df_: pd.to_numeric(_drop_chars(df_.price, ",")),
        change=lambda df_: pd.to_numeric(_drop_chars(df_.change, "+,")),
        # Any suffix is normalised to millions / billions respectively.
        volume=lambda df_: _scaled_to_unit(df_.volume, "M"),
        market_cap=lambda df_: _scaled_to_unit(df_.market_cap, "B"),
        # Placeholders such as "--" are coerced to NaN.
        pe_ratio=lambda df_: pd.to_numeric(_drop_chars(df_.pe_ratio, ","), errors="coerce"),
        # Convert "change_%" in place. Assigning a separate `change_pct`
        # column and then renaming "change_%" to the same label would leave
        # two columns called "change_pct" (one text, one numeric).
        **{"change_%": lambda df_: pd.to_numeric(_drop_chars(df_["change_%"], "%,"))},
    )
    .rename(columns={
        "price": "price_usd",
        "volume": "volume_M",
        "market_cap": "market_cap_B",
        "change_%": "change_pct"
    })
)


In [33]:
# Save the cleaned table to the current user's Downloads folder.
downloads = os.path.join(os.path.expanduser("~"), "Downloads")
# The folder is not guaranteed to exist (e.g. on servers or in containers);
# create it so the export does not fail on a missing directory.
os.makedirs(downloads, exist_ok=True)
file_path = os.path.join(downloads, "yahoo-stocks-data.xlsx")
stocks_df.to_excel(file_path, index=False)

