In [1]:
import pandas as pd
import numpy as np
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait # for explicit
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains # not click only hover on the elemwnt
from selenium.common.exceptions import TimeoutException
import time

In [2]:
driver = webdriver.Chrome()
driver.maximize_window()

# explicit wait
wait = WebDriverWait(driver,40)

# functin to check if webpage is loades
def wait_for_page_to_load(driver , wait):
    page_title = driver.title
    try:
        wait.until(
        lambda d: d.executr_script('return document.readyState') == 'complete'# d is for driver (executr_script('return document.readyState')) it will check the current State of the page
    )
    except:
        print(f'The page \"{page_title}\" did not get fully loaded within the given duration.')
    else:
        print(f'The page \"{page_title}\" Successfully_loades')
        
url = 'https://finance.yahoo.com/'
driver.get(url)
wait_for_page_to_load(driver , wait)

# hovering on market menues
actions = ActionChains(driver)
# we are firstly checking if the element is present where we want hover or not present
markets_menu = wait.until( 
    EC.presence_of_element_located((By.XPATH, '/html[1]/body[1]/div[2]/header[1]/div[1]/div[1]/div[1]/div[4]/div[1]/div[1]/ul[1]/li[3]/a[1]/span[1]'))
)
actions.move_to_element(markets_menu).perform()

# to click on tranding tickers
# to check if element is clickable or not
trending_tickers = wait.until(
    EC.element_to_be_clickable((By.XPATH, '/html[1]/body[1]/div[2]/header[1]/div[1]/div[1]/div[1]/div[4]/div[1]/div[1]/ul[1]/li[3]/div[1]/ul[1]/li[4]/a[1]/div[1]'))
)
trending_tickers.click()
wait_for_page_to_load(driver,wait)


#click on most active
most_active = wait.until(
    EC.element_to_be_clickable((By.XPATH,'/html[1]/body[1]/div[2]/main[1]/section[1]/section[1]/section[1]/article[1]/section[1]/div[1]/nav[1]/ul[1]/li[1]/a[1]'))
)
most_active.click()
wait_for_page_to_load(driver , wait)
time.sleep(3)

#scroll the page vertically
driver.execute_script('window.scrollBy(0,1200);') 
time.sleep(3)

#Scraping the data
data = []
#navigating the stocks pages
while True:
    # scraping
    #check table is present or not
    wait.until(
        EC.presence_of_element_located((By.TAG_NAME,"table"))
    )
    rows = driver.find_elements(By.CSS_SELECTOR,"table tbody tr")
    for row in rows:
        values = row.find_elements(By.TAG_NAME, 'td')
        stock = {
            'name': values[1].text,
            'symbol': values[0].text,
            'price': values[3].text,
            'change': values[4].text,
            'volume': values[6].text,
            'market_cap0': values[8].text,
            'pe_ratio': values[9].text,
        
        }
        data.append(stock)

        
    # click next
    try:
        next_button = wait.until(
            EC.element_to_be_clickable((By.XPATH, '/html/body/div[2]/main/section/section/section/article/section[1]/div/div[3]/div[3]/button[3]'))
    )
    except TimeoutException:
        print("The 'next' button is not clickable. We have navigated through all the pages.")
        break
    else:
        next_button.click()
        time.sleep(5)

driver.quit()

The page "Yahoo Finance - Stock Market Live, Quotes, Business & Finance News" did not get fully loaded within the given duration.
The page "Yahoo Finance - Stock Market Live, Quotes, Business & Finance News" did not get fully loaded within the given duration.
The page "Top Trending Stocks: US stocks with the highest interest today - Yahoo Finance" did not get fully loaded within the given duration.
The 'next' button is not clickable. We have navigated through all the pages.


In [3]:
data

[{'name': 'Intel Corporation',
  'symbol': 'INTC',
  'price': '20.70',
  'change': '-1.93',
  'volume': '245.078M',
  'market_cap0': '90.604B',
  'pe_ratio': '--'},
 {'name': 'Tesla, Inc.',
  'symbol': 'TSLA',
  'price': '316.06',
  'change': '+10.76',
  'volume': '148.227M',
  'market_cap0': '1.019T',
  'pe_ratio': '188.13'},
 {'name': 'NVIDIA Corporation',
  'symbol': 'NVDA',
  'price': '173.50',
  'change': '-0.24',
  'volume': '122.317M',
  'market_cap0': '4.231T',
  'pe_ratio': '55.79'},
 {'name': 'Lucid Group, Inc.',
  'symbol': 'LCID',
  'price': '2.9200',
  'change': '-0.0700',
  'volume': '75.189M',
  'market_cap0': '8.907B',
  'pe_ratio': '--'},
 {'name': 'Warner Bros. Discovery, Inc.',
  'symbol': 'WBD',
  'price': '13.49',
  'change': '-0.01',
  'volume': '74.365M',
  'market_cap0': '33.375B',
  'pe_ratio': '--'},
 {'name': 'Centene Corporation',
  'symbol': 'CNC',
  'price': '28.39',
  'change': '+1.63',
  'volume': '63.41M',
  'market_cap0': '13.943B',
  'pe_ratio': '6.89

In [4]:
len(data)

282

In [5]:
stocks_df = (
    pd
    .DataFrame(data)
    .apply(lambda col: col.str.strip() if col.dtype == 'object' else col)
    .assign(
        price=lambda df_: df_['price'].str.replace(r'[^\d.]', '', regex=True)
    )
    .assign(
        price=lambda df_: pd.to_numeric(df_['price']),
        change=lambda df_: pd.to_numeric(df_['change'].str.replace('+', '')),
        volume=lambda df_: pd.to_numeric(df_['volume'].str.replace('M', '')),
        market_cap0=lambda df_: df_['market_cap0'].apply(
            lambda val: float(val.replace('B', '')) if 'B' in val else
                        float(val.replace('T', '')) * 1000 if 'T' in val else
                        float(val.replace('M', '')) / 1000 if 'M' in val else np.nan
        ),
        pe_ratio=lambda df_: df_['pe_ratio']
            .replace(['-', '--'], np.nan)
            .str.replace(',', '')
            .pipe(pd.to_numeric, errors='coerce')
    )
    .rename(columns={
        'price': 'price_usd',
        'volume': 'Volume_M',
        'market_cap0': 'market_cap_B'
    })
)


In [6]:
stocks_df.change

0      -1.93
1      10.76
2      -0.24
3      -0.07
4      -0.01
       ...  
277     0.09
278     2.06
279     0.07
280     0.33
281     0.06
Name: change, Length: 282, dtype: float64

In [7]:
stocks_df.dtypes

name             object
symbol           object
price_usd       float64
change          float64
Volume_M        float64
market_cap_B    float64
pe_ratio        float64
dtype: object

In [8]:
# !pip install openpyxl

In [9]:
stocks_df.to_excel('yahoo-stocks-dataset.xlsx' , index=False)

In [10]:
stocks_df.to_csv('yahoo-stocks-data.csv', index=False)

In [11]:
import pandas as pd

# Re-save using openpyxl engine
with pd.ExcelWriter('yahoo-stocks-final-data.xlsx', engine='openpyxl') as writer:
    stocks_df.to_excel(writer, index=False)
