# Action plan
## Go to Yahoo Finance: https://finance.yahoo.com/
## Then go to Markets (hover) -> Trending Tickers (click) -> Most Active (click)
## Then scrape the data from the table on every page

In [2]:
#Initializing driver
from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

# Start a maximized Chrome session; the path returned by
# ChromeDriverManager().install() is handed to the driver service.
service = Service(executable_path=ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)
driver.maximize_window()

# Shared explicit wait, created with a 10-second timeout and reused by every
# wait.until(...) call further down.
wait = WebDriverWait(driver=driver, timeout=10)

#Function for Checking if document loaded fully
def waitToLoad(driver, wait):
    """Wait until the current document reports ``readyState == "complete"``.

    Parameters
    ----------
    driver : object
        Browser handle exposing ``title`` and ``execute_script``.
    wait : object
        Waiter exposing ``until(callable)``; it is called with a predicate
        that receives the driver and checks ``document.readyState``.

    Returns
    -------
    bool
        ``True`` when the predicate succeeded, ``False`` when ``wait.until``
        raised (the error is printed, not re-raised, so the notebook keeps
        running).
    """
    try:
        wait.until(
            lambda d: d.execute_script("return document.readyState") == "complete"
        )
    except Exception as e:
        print("Page didn't load fully within time limit. ")
        print("Error generated :", e)
        return False

    # Read the title only after the wait succeeded: reading it up front can
    # report the title of the page that is being navigated away from.
    print("Page ", driver.title, " loaded successfully!")
    return True



# --- Navigation: home page -> Markets (hover) -> Trending Tickers -> Most Active ---

# Load the landing page and wait for the document to finish loading.
url = "https://finance.yahoo.com/"
driver.get(url)
waitToLoad(driver, wait)

# Locate the "Markets" menu entry and move the pointer over it.
# NOTE(review): this is an absolute XPath, so it presumably stops matching as
# soon as the header layout changes - confirm it still resolves before a run.
actions = ActionChains(driver)
market_menu_locator = (
    By.XPATH,
    '/html[1]/body[1]/div[2]/header[1]/div[1]/div[1]/div[1]/div[4]/div[1]/div[1]/ul[1]/li[3]/a[1]/span[1]',
)
market_menu = wait.until(EC.presence_of_element_located(market_menu_locator))
actions.move_to_element(market_menu).perform()

# Click "Trending Tickers" once it is clickable, then wait for the new page.
trending_tickers_locator = (By.XPATH, "//div[contains(text(),'Trending Tickers')]")
trending_tickers = wait.until(EC.element_to_be_clickable(trending_tickers_locator))
trending_tickers.click()
waitToLoad(driver, wait)

# Same pattern for the "Most Active" entry.
most_active_locator = (By.XPATH, "//span[normalize-space()='Most Active']")
most_active = wait.until(EC.element_to_be_clickable(most_active_locator))
most_active.click()
waitToLoad(driver, wait)

# Scrape every page of the table into `data` (one dict per table row).
# Imported next to its only use so this step does not depend on edits elsewhere.
from selenium.common.exceptions import TimeoutException, WebDriverException

MIN_CELLS_PER_ROW = 10   # the highest cell index read below is 9
MAX_CLICK_ATTEMPTS = 3   # bounded retries when the "next" click fails

data = []

try:
    while True:
        # The table must be present before its rows are read.
        wait.until(
            EC.presence_of_element_located((By.TAG_NAME, "table"))
        )
        rows = driver.find_elements(by=By.CSS_SELECTOR, value="table tbody tr")
        for row in rows:
            values = row.find_elements(By.TAG_NAME, "td")
            if len(values) < MIN_CELLS_PER_ROW:
                # Skip rows that lack the expected cells instead of failing
                # with an IndexError half-way through the scrape.
                continue
            data.append({
                "Name": values[1].text,
                "Symbol": values[0].text,
                "Price": values[3].text,
                "Change": values[4].text,
                "Volume": values[6].text,
                "Market cap": values[8].text,
                "PE_ratio": values[9].text,
            })

        # Locate the "next page" button; a timeout means there is no clickable
        # button left, i.e. the last page has been scraped.
        try:
            next_btn = wait.until(
                EC.element_to_be_clickable((By.XPATH, "//button[@aria-label='Goto next page']"))
            )
        except TimeoutException:
            print("The 'next' button is not clickable anymore.End of all the pages.")
            break

        # Clicking can fail (e.g. because of an unwanted pop-up). Retry the
        # click itself a few times; never fall through to the scraping code,
        # which would append the current page's rows to `data` a second time.
        moved_to_next_page = False
        for _attempt in range(MAX_CLICK_ATTEMPTS):
            try:
                if next_btn.get_attribute("disabled") is not None:
                    print("Next button is disabled. Reached the last page.")
                    break
                # A fresh chain per click, so the click does not rely on the
                # state of a chain object created in an earlier step.
                ActionChains(driver).move_to_element(next_btn).click().perform()
            except WebDriverException as exc:
                print("Something went wrong!")
                print("Error generated :", exc)
                time.sleep(1)
            else:
                moved_to_next_page = True
                break

        if not moved_to_next_page:
            break

        time.sleep(2)  # Wait till the table loads completely.
finally:
    # Always close the browser, including when a wait or lookup above raises.
    driver.quit()



Page  Yahoo Finance - Stock Market Live, Quotes, Business & Finance News  loaded successfully!
Page  Top Trending Stocks: US stocks with the highest interest today - Yahoo Finance  loaded successfully!
Page  Most Active Stocks: US stocks with the highest trading volume today - Yahoo Finance  loaded successfully!
The 'next' button is not clickable anymore.End of all the pages.


In [4]:
# Display the scraped records (a list with one dict per table row) so they can
# be checked by eye before cleaning.
data



[{'Name': 'Lucid Group, Inc.',
  'Symbol': 'LCID',
  'Price': '2.1600',
  'Change': '+0.1100',
  'Volume': '200.074M',
  'Market cap': '6.589B',
  'PE_ratio': '--'},
 {'Name': 'NVIDIA Corporation',
  'Symbol': 'NVDA',
  'Price': '159.34',
  'Change': '+2.09',
  'Volume': '142.314M',
  'Market cap': '3.886T',
  'PE_ratio': '51.23'},
 {'Name': 'Intel Corporation',
  'Symbol': 'INTC',
  'Price': '22.49',
  'Change': '+0.61',
  'Volume': '60.704M',
  'Market cap': '98.101B',
  'PE_ratio': '--'},
 {'Name': 'Ford Motor Company',
  'Symbol': 'F',
  'Price': '11.81',
  'Change': '+0.04',
  'Volume': '60.311M',
  'Market cap': '46.963B',
  'PE_ratio': '9.45'},
 {'Name': 'Tesla, Inc.',
  'Symbol': 'TSLA',
  'Price': '315.35',
  'Change': '-0.30',
  'Volume': '58.042M',
  'Market cap': '1.016T',
  'PE_ratio': '182.28'},
 {'Name': 'Robinhood Markets, Inc.',
  'Symbol': 'HOOD',
  'Price': '94.40',
  'Change': '-3.58',
  'Volume': '56.374M',
  'Market cap': '83.301B',
  'PE_ratio': '53.94'},
 {'Name

## Data cleaning with pandas

In [47]:
#Importing dependencies
import pandas as pd
import numpy as np




In [71]:
# Build the working DataFrame from the scraped records: one row per dict in
# `data`, one column per dict key.
stocks_df = pd.DataFrame(data=data)
# Disabled exploratory check: extracts the first character of each PE_ratio
# value that is neither a digit nor a dot, then lists the distinct results.
#stocks_df['PE_ratio'].str.extract(r"([^0-9.])", expand = False).unique()

array(['-', nan], dtype=object)

In [72]:
# Cleaning, part 1: trim text columns, then turn Price / Change / Volume into
# floats and give the columns unit-bearing names.


def _without_thousands_sep(col):
    """Return ``col`` as text with "," thousands separators removed."""
    return col.astype(str).str.replace(",", "", regex=False)


def _volume_to_millions(col):
    """Convert volume text such as "200.074M" into a float count of millions.

    Suffixes K / M / B / T (any case) are scaled to millions. Text that does
    not look like ``<number><suffix>`` (for example "--") becomes NaN instead
    of raising.
    """
    text = _without_thousands_sep(col).str.strip().str.upper()
    parts = text.str.extract(r"^(\d+(?:\.\d*)?|\.\d+)\s*([KMBT]?)$")
    # NOTE(review): a value without any suffix is kept as-is, which matches
    # what the previous "strip the M" logic produced. If the site can emit raw
    # share counts they would need dividing by 1e6 - confirm against the page.
    scale = parts[1].map({"": 1, "K": 1 / 1000, "M": 1, "B": 1000, "T": 1_000_000})
    return pd.to_numeric(parts[0], errors="coerce") * scale


stocks_df = (
    stocks_df
    # Remove leading/trailing spaces from every text column.
    .apply(lambda col: col.str.strip() if col.dtype == "object" else col)
    .rename(columns={"Price": "Price_usd"})
    .assign(
        # Drop thousands separators first so a value like "1,234.56" parses.
        Price_usd=lambda df_: pd.to_numeric(_without_thousands_sep(df_["Price_usd"])),
        # Same for Change, which additionally carries an explicit "+" sign.
        Change=lambda df_: pd.to_numeric(
            _without_thousands_sep(df_["Change"]).str.replace("+", "", regex=False)
        ),
        # Overwrite Volume in place (keeps the column position), rename below.
        Volume=lambda df_: _volume_to_millions(df_["Volume"]),
    )
    .rename(columns={"Volume": "Volume_in_M", "Market cap": "Market_cap"})
)
#Now this is a function to convert all values to billions and in float 
def convert_market_cap_to_billion(col):
    """Convert market-cap text such as "6.589B" or "3.886T" to billions.

    Parameters
    ----------
    col : pandas.Series
        Market-cap values as text: a number followed by one of the suffixes
        K, M, B or T (any case). Surrounding whitespace and "," thousands
        separators are ignored.

    Returns
    -------
    pandas.Series
        Float values expressed in billions, aligned with ``col``'s index and
        carrying its name. Entries that do not match the expected
        ``<number><suffix>`` shape (for example "--") are NaN.
    """
    # astype(str) lets the .str accessor work even if the column is not text.
    text = (
        col.astype(str)
        .str.strip()
        .str.replace(",", "", regex=False)
        .str.upper()
    )

    # Split into numeric part and suffix in one pass; non-matching rows yield
    # NaN in both columns.
    parts = text.str.extract(r"^([+-]?(?:\d+(?:\.\d*)?|\.\d+))\s*([KMBT])$")
    number = pd.to_numeric(parts[0], errors="coerce")

    # Map suffix to the factor that converts the number into billions.
    scale_map = {"K": 1 / 1_000_000, "M": 1 / 1000, "B": 1, "T": 1000}
    scale = parts[1].map(scale_map)

    # Final conversion
    result = number * scale
    result.name = col.name
    return result

# Cleaning, part 2: market cap in billions (float); the raw text column is
# dropped once converted.
stocks_df = stocks_df.assign(
    Market_cap_in_B=lambda df_: convert_market_cap_to_billion(df_["Market_cap"])
).drop(columns=["Market_cap"])

# PE_ratio: remove thousands separators, then convert to float.
# errors="coerce" turns the "--" placeholder (and any other non-numeric text)
# into NaN in one step. Replacing "--" with NaN first and then calling .str
# can fail when every value is a placeholder, because the column may then no
# longer hold text.
stocks_df = stocks_df.assign(
    PE_ratio=lambda df_: pd.to_numeric(
        df_["PE_ratio"].astype(str).str.strip().str.replace(",", "", regex=False),
        errors="coerce",
    )
)

# Preview only the first rows instead of dumping the whole frame.
stocks_df.head(10)


Unnamed: 0,Name,Symbol,Price_usd,Change,Volume_in_M,PE_ratio,Market_cap_in_B
0,"Lucid Group, Inc.",LCID,2.16,0.11,200.074,,6.589
1,NVIDIA Corporation,NVDA,159.34,2.09,142.314,51.23,3886.0
2,Intel Corporation,INTC,22.49,0.61,60.704,,98.101
3,Ford Motor Company,F,11.81,0.04,60.311,9.45,46.963
4,"Tesla, Inc.",TSLA,315.35,-0.3,58.042,182.28,1016.0
5,"Robinhood Markets, Inc.",HOOD,94.4,-3.58,56.374,53.94,83.301
6,TeraWulf Inc.,WULF,5.26,0.22,53.226,,2.049
7,Cipher Mining Inc.,CIFR,6.05,0.37,47.639,,2.246
8,"SoFi Technologies, Inc.",SOFI,18.57,0.45,44.098,43.19,20.527
9,IREN Limited,IREN,16.82,1.16,42.478,,4.068


## Export as CSV

In [75]:
# Write the cleaned table to Yahoo_Stocks.csv; index=False leaves the row
# index out of the file.
stocks_df.to_csv(path_or_buf="Yahoo_Stocks.csv", index=False)