In [1]:
import time
from datetime import datetime
from pathlib import Path

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# Resolve a chromedriver binary through webdriver_manager, wrap it in a
# Service, and start the Chrome session used by every cell below.
chromedriver_path = ChromeDriverManager().install()
chrome_service = Service(chromedriver_path)
driver = webdriver.Chrome(service=chrome_service)

In [3]:
# Address of the page to scrape (IBOV index, pt-br language parameter).
url = 'https://sistemaswebb3-listados.b3.com.br/indexPage/day/IBOV?language=pt-br'

# Point the browser session at that address.
driver.get(url=url)

In [4]:
# Block for up to 10 seconds until the element with id "divContainerIframeB3"
# is present in the DOM, then keep a handle to it for the cells below.
container_locator = (By.ID, "divContainerIframeB3")
container_wait = WebDriverWait(driver, 10)
container = container_wait.until(EC.presence_of_element_located(container_locator))
print("Found the container:", container)


Found the container: <selenium.webdriver.remote.webelement.WebElement (session="60472a3fde58da1dfec6ba3e2f4c8365", element="f.5B225C3BE1B2792367E941E9B605EA9B.d.192C483CE6567CB593729CBD2A7607A5.e.12")>


In [5]:
# Locate the caption that states the date the portfolio is valid for.
# An XPath that starts with "//" always searches the whole document, even when
# it is evaluated from an element, so the lookup is issued from the driver to
# make that scope explicit. It is wrapped in an explicit wait because the
# container being present does not guarantee the caption has been rendered yet.
date_element = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located(
        (By.XPATH, "//p[contains(text(), 'Carteira Teórica do IBovespa válida para')]")
    )
)

# Keep only what follows the word "para", e.g. '17/03/25'
date_text = date_element.text.split("para")[-1].strip()

# Parse the day/month/two-digit-year text and re-emit it as ISO 8601 (YYYY-MM-DD)
parsed_date = datetime.strptime(date_text, "%d/%m/%y")
formatted_date = parsed_date.strftime("%Y-%m-%d")

# formatted_date is reused below as a DataFrame column and in the output file names
print(formatted_date)


2025-03-17


### SCRAPE ONE PAGE

In [11]:
# Locate the table within the container
table = container.find_element(By.TAG_NAME, "table")
rows = table.find_elements(By.TAG_NAME, "tr")

# Column titles are the <th> cells of the first row
headers = [header.text for header in rows[0].find_elements(By.TAG_NAME, "th")]

# One list of cell texts per remaining row; rows that contain no <td> cells
# carry no data and are skipped instead of being stored as empty records
data = []
for row in rows[1:]:
    cols = row.find_elements(By.TAG_NAME, "td")
    if cols:
        data.append([col.text for col in cols])

# Convert to Pandas DataFrame
df = pd.DataFrame(data, columns=headers)

# Add the extracted date to the DataFrame as a new column
df["Date"] = formatted_date

# Build the output location with pathlib so the separator is correct on every
# operating system, and create the directory if it does not exist yet
output_dir = Path("data")
output_dir.mkdir(parents=True, exist_ok=True)

# Save the DataFrame to a Parquet file with the date in the name
parquet_file_name = f"ibovespa_{formatted_date}.parquet"
df.to_parquet(output_dir / parquet_file_name, engine="pyarrow", index=False)

# Same base name, CSV extension
csv_file_name = parquet_file_name.replace(".parquet", ".csv")
df.to_csv(output_dir / csv_file_name, index=False, encoding="utf-8")

print(f"DataFrame with date column saved as {parquet_file_name}")


DataFrame with date column saved as ibovespa_2025-03-17.parquet


In [12]:
# Preview the first five rows of the scraped table
df.head(n=5)

Unnamed: 0,Código,Ação,Tipo,Qtde. Teórica,Part. (%),Date
0,ALOS3,ALLOS,ON NM,476.976.044,439,2025-03-17
1,ABEV3,AMBEV S/A,ON ED,4.394.835.131,2867,2025-03-17
2,ASAI3,ASSAI,ON NM,1.345.832.968,491,2025-03-17
3,AURE3,AUREN,ON NM,323.738.747,119,2025-03-17
4,AMOB3,AUTOMOB,ON NM,533.959.816,7,2025-03-17


### TRYING TO SCRAPE ALL PAGES

In [10]:
# Function to get the total number of pages
def get_total_pages(driver):
    """Return the highest page number shown by the pagination control.

    Waits up to 10 seconds for a ``<ul>`` carrying the ``ngx-pagination`` class,
    then reads the text of each ``<li>`` inside it. Every item whose text
    contains digits is treated as a page link, and the largest number found is
    returned. This does not depend on the position of the "next" item in the
    list, nor on the list having no other CSS classes.

    Args:
        driver: The Selenium WebDriver whose current page holds the control.

    Returns:
        int: The largest page number found in the pagination items.

    Raises:
        selenium.common.exceptions.TimeoutException: If the pagination list
            does not appear within 10 seconds.
        ValueError: If no pagination item contains a number.
    """
    pagination_element = WebDriverWait(driver, 10).until(
        # A CSS class selector matches the class token even when the element
        # carries additional classes, unlike an exact @class comparison.
        EC.presence_of_element_located((By.CSS_SELECTOR, "ul.ngx-pagination"))
    )

    page_numbers = []
    for item in pagination_element.find_elements(By.TAG_NAME, "li"):
        # Item text can contain non-numeric parts; keep only the digits.
        digits = ''.join(filter(str.isdigit, item.text))
        if digits:
            page_numbers.append(int(digits))

    if not page_numbers:
        raise ValueError("No page numbers found in the pagination control")

    total_pages = max(page_numbers)
    return total_pages

# Accumulator that receives one list of cell texts per scraped table row
all_data = list()

# Read the page count from the pagination control of the page currently loaded
total_pages = get_total_pages(driver=driver)
print(f"Total pages to scrape: {total_pages}")

Total pages to scrape: 5


In [13]:
def _table_rows(container):
    """Return every <tr> of the first table found inside ``container``."""
    table = container.find_element(By.TAG_NAME, "table")
    return table.find_elements(By.TAG_NAME, "tr")


def _first_data_row_text(container):
    """Return the text of the first row after the header, or '' if there is none."""
    rows = _table_rows(container)
    return rows[1].text if len(rows) > 1 else ""


def _go_to_next_page(driver, container, timeout=60):
    """Click the pagination "next" link and wait until the table shows other rows.

    Waiting for the <table> element to go stale only works when the page
    replaces the whole table; if only its rows are re-rendered, that wait ends
    in a TimeoutException. Comparing the text of the first data row before and
    after the click works in both cases.
    """
    previous_first_row = _first_data_row_text(container)

    next_button = WebDriverWait(driver, timeout).until(
        EC.element_to_be_clickable((By.XPATH, "//li[contains(@class, 'pagination-next')]/a"))
    )
    next_button.click()  # Click the "Next" button

    # Rows may be replaced while they are being read, so stale references are
    # ignored and the check is simply retried until the timeout expires.
    WebDriverWait(
        driver, timeout, ignored_exceptions=(StaleElementReferenceException,)
    ).until(lambda _driver: _first_data_row_text(container) not in ("", previous_first_row))


# Start from an empty list so that re-running this cell never duplicates rows
all_data = []

# Loop through all pages
# NOTE(review): this assumes the browser is currently showing page 1 of the
# table; reload the page and re-run the cells above after an interrupted run.
for page_number in range(1, total_pages + 1):
    print(f"Scraping page {page_number}...")

    # Rows of the table as currently rendered
    rows = _table_rows(container)

    # Extract headers (only on the first iteration)
    if page_number == 1:
        headers = [header.text for header in rows[0].find_elements(By.TAG_NAME, "th")]

    # Extract row data; rows without <td> cells carry no data and are skipped
    for row in rows[1:]:
        cols = row.find_elements(By.TAG_NAME, "td")
        if cols:
            all_data.append([col.text for col in cols])

    # Go to the next page if not on the last one
    if page_number < total_pages:
        _go_to_next_page(driver, container)

# Convert collected data to a Pandas DataFrame
df = pd.DataFrame(all_data, columns=headers)

# Add the extracted date to the DataFrame
df["Date"] = formatted_date

# Build the output paths with pathlib so the separator is correct on every
# operating system, and create the directory if it does not exist yet
output_dir = Path("data")
output_dir.mkdir(parents=True, exist_ok=True)
parquet_path = output_dir / f"ibovespa_{formatted_date}.parquet"
csv_path = parquet_path.with_suffix(".csv")

# Keep the original string-typed names for any later cell that uses them
parquet_file_name = str(parquet_path)
csv_file_name = str(csv_path)

# Save the DataFrame to a Parquet file and a CSV file
df.to_parquet(parquet_path, engine="pyarrow", index=False)
df.to_csv(csv_path, index=False, encoding="utf-8")

print(f"Data saved: Parquet: {parquet_file_name}, CSV: {csv_file_name}")

Scraping page 1...


TimeoutException: Message: 
