In [1]:
import csv
import time
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
import pandas as pd

In [6]:


# --- Function to extract data for 4th structure from a page ---
def extract_company_info(driver):
    data = {}
    
    def safe_find_text(by, selector):
        try:
            return driver.find_element(by, selector).text.strip()
        except NoSuchElementException:
            return ""
    
    # Extract all key-value blocks inside the container
    # Blocks with label and value classes
    
    blocks = driver.find_elements(By.CSS_SELECTOR, "div.block-QCJM7wcY")
    for block in blocks:
        try:
            label = block.find_element(By.CSS_SELECTOR, "div.label-QCJM7wcY").text.strip()
            # The value could be inside an <a> or <div> with class value-QCJM7wcY
            try:
                value = block.find_element(By.CSS_SELECTOR, "a .value-QCJM7wcY").text.strip()
            except NoSuchElementException:
                value = block.find_element(By.CSS_SELECTOR, "div.value-QCJM7wcY").text.strip()
            
            data[label] = value
        except NoSuchElementException:
            continue
    
    # Extract the about/description paragraph (last div with content-H16icEW0)
    try:
        description = driver.find_element(By.CSS_SELECTOR, "div.container-H16icEW0 div.content-H16icEW0 span").text.strip()
        data['Description'] = description
    except NoSuchElementException:
        data['Description'] = ""
    
    return data

# --- Main scraping logic ---
def main():
    # Chrome options to run headless (optional)
    options = Options()
    # options.add_argument("--headless")  # Uncomment if you want no browser UI
    
    driver = webdriver.Chrome(options=options)

    # Read links from CSV file (symbol_link.csv)
    links = []
    with open("symbol_link.csv", "r", newline="", encoding="utf-8") as file:
        reader = csv.reader(file)
        # Assuming the CSV has links in first column; skip header if any
        for row in reader:
            if row and "http" in row[0]:
                links.append(row[0])

    all_data = []
    
    for link in links:
        print(f"Processing: {link}")
        driver.get(link)
        time.sleep(3)  # Wait for page load; adjust as needed

        company_data = extract_company_info(driver)
        company_data['URL'] = link  # Save source link for reference
        all_data.append(company_data)

    driver.quit()

    # Save all extracted data to CSV
    df = pd.DataFrame(all_data)
    df.to_csv("extracted_company_info.csv", index=False, encoding="utf-8")
    print("Scraping done and data saved to extracted_company_info.csv")

if __name__ == "__main__":
    main()



Scraping done and data saved to extracted_company_info.csv


In [7]:
symbol_links_df = pd.read_csv("symbol_link.csv")
# print(symbol_links_df.columns)
exchange_list = symbol_links_df["Exchange "].tolist()
link_list = symbol_links_df[" Symbol"].tolist()

print(exchange_list)
print(link_list)
driver = webdriver.Chrome()


['NYSE ', 'NYSE ', 'NASDAQ ', 'NYSE ', 'NASDAQ ', 'NYSE ', 'NASDAQ ', 'NYSE ', 'NASDAQ ', 'NASDAQ ', 'NYSE ', 'NASDAQ ', 'NASDAQ ', 'NYSE ', 'NASDAQ ', 'NYSE ', 'NYSE ', 'NASDAQ ', 'NASDAQ ', 'NYSE ', 'NASDAQ ', 'NASDAQ ', 'NYSE ', 'NASDAQ ', 'NASDAQ ', 'NASDAQ ', 'NASDAQ ', 'NYSE ', 'NASDAQ ', 'NASDAQ ', 'NASDAQ ', 'NASDAQ ', 'NYSE ', 'NYSE ', 'NASDAQ ', 'NYSE ', 'AMEX ', 'NASDAQ ', 'NYSE ', 'NASDAQ ', 'NASDAQ ', 'NASDAQ ', 'NYSE ', 'NASDAQ ', 'NYSE ', 'NASDAQ ', 'NASDAQ ', 'NASDAQ ', 'NASDAQ ', 'NASDAQ ', 'NYSE ', 'NASDAQ ', 'NYSE ', 'NASDAQ ', 'NASDAQ ', 'NASDAQ ', 'NASDAQ ', 'NASDAQ ', 'NASDAQ ', 'NASDAQ ', 'NASDAQ ', 'NASDAQ ', 'AMEX ', 'NYSE ', 'NASDAQ ', 'NASDAQ ', 'NYSE ', 'NASDAQ ', 'NASDAQ ', 'NASDAQ ', 'NASDAQ ', 'NASDAQ ', 'NYSE ', 'NASDAQ ', 'NASDAQ ', 'NYSE ', 'NASDAQ ', 'NASDAQ ', 'NASDAQ ', 'NASDAQ ', 'NYSE ', 'NASDAQ ', 'NASDAQ ', 'NASDAQ ', 'NASDAQ ', 'NASDAQ ', 'NASDAQ ', 'NYSE ', 'NASDAQ ', 'NASDAQ ', 'NASDAQ ', 'NASDAQ ', 'NASDAQ ', 'NASDAQ ', 'NYSE ', 'AMEX ', 'NA

In [None]:


def sanitize_text(text):
    # Remove special unicode characters and extra whitespace
    cleaned = re.sub(r'[^\x00-\x7F]+', '', text)  # keep only ASCII
    return cleaned.strip()


company_info = []

for i in range(len(exchange_list)):
    url = f"https://www.tradingview.com/symbols/{exchange_list[i].strip()}-{link_list[i].strip()}/"
    driver.get(url)
    time.sleep(3)

    records = driver.find_elements(By.CSS_SELECTOR, ".wrapper-QCJM7wcY")

    sub_info = []
    for record in records:
        children = record.find_elements(By.CLASS_NAME, "apply-overflow-tooltip")
        for j in range(min(23, len(children))):
            raw_text = children[j].text
            cleaned_text = sanitize_text(raw_text)
            sub_info.append(cleaned_text)

    company_info.append(sub_info)


    with open('extract_company_info.csv', 'a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        row = [f"{exchange_list[i].strip()}-{link_list[i].strip()}"] + sub_info
        writer.writerow(row)

driver.quit()
print(company_info)


[['July 23', 'Q3 2025', '1.42USD', '88.81BUSD', '2.93TUSD', '0.52%', '31.09', '6.44USD', '93.74BUSD', '391.04BUSD', '14.92B', '1.09', '164K', '+3K +1.86%', '2.38MUSD', '571.56KUSD', 'Electronic Technology', 'Electronic Technology', 'Telecommunications Equipment', 'Telecommunications Equipment', 'Timothy Donald Cook', 'apple.com', 'apple.com', 'Cupertino', '1976', 'BBG000B9XRY4'], ['July 24', 'Q2 2025', '2.65USD', '16.59BUSD', '257.65BUSD', '2.39%', '48.34', '5.92USD', '6.02BUSD', '62.75BUSD', '928.43M', '0.71', '293.4K', '11.9K 3.90%', '213.88KUSD', '20.50KUSD', 'Technology Services', 'Technology Services', 'Packaged Software', 'Packaged Software', 'Arvind Krishna', 'ibm.com', 'ibm.com', 'Armonk', '1911', 'BBG000BLNNH6']]


In [10]:
firstList = company_info[0]
print(firstList)

['August 19', 'Q3 2025', '1.37USD', '1.66BUSD', '33.26BUSD', '0.83%', '29.53', '4.07USD', '1.29BUSD', '6.51BUSD', '282.92M', '0.84', '17.9K', '200 1.10%', '363.69KUSD', '72.01KUSD', 'Health Technology', 'Health Technology', 'Medical Specialties', 'Medical Specialties', 'Padraig Mcdonnell', 'agilent.com', 'agilent.com', 'Santa Clara', '1999', 'BBG000C2V3D6']


In [None]:







driver.quit()