In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
import time
import pandas as pd

# Column names are fixed by hand, in the order the cell texts appear in a row
column_headers = [
    "Id",
    "Ranked by Filter",
    "World Rank",
    "City",
    "Average Traveltime",
    "Change From 2022",
    "Congestion level",
    "Time lost per year",
    "Average speed",
]

# Start a Chrome session; all page interaction below goes through it
driver = webdriver.Chrome()

# Every distinct row collected so far, in first-seen order
# (one list of cell texts per row)
all_div_data = []

# Everything that uses the browser sits inside try/finally so the Chrome
# process is shut down even when loading, scrolling or a lookup raises.
try:
    # Load the webpage
    driver.get("https://www.tomtom.com/traffic-index/ranking/")

    # Implicit wait: element lookups keep polling for up to 10 s before
    # giving up. This does not by itself wait for the page to finish loading.
    driver.implicitly_wait(10)

    # Action chain used to send PAGE_DOWN key presses to the page
    actions = ActionChains(driver)

    # Rows already recorded, stored as tuples so membership checks are O(1)
    # instead of scanning the whole result list for every row
    seen_rows = set()

    # A single pass without new rows may only mean the next batch has not
    # loaded yet, so stop only after several consecutive idle passes.
    max_idle_scrolls = 3
    idle_scrolls = 0

    while idle_scrolls < max_idle_scrolls:
        # Scroll down
        actions.send_keys(Keys.PAGE_DOWN).perform()

        # Wait for new content to load
        time.sleep(1)  # Adjust the waiting time depending on the page load speed

        # Row containers carry both classes 'sc-e3cb6a64-6' and 'kfkObI'.
        # A compound class is expressed as a CSS selector; the class-name
        # locator is meant for a single class name.
        row_containers = driver.find_elements(By.CSS_SELECTOR, '.sc-e3cb6a64-6.kfkObI')

        # Rows seen for the first time during this pass
        new_div_data = []

        for row in row_containers:
            # First line of text of every span inside the row container
            span_elements = row.find_elements(By.TAG_NAME, 'span')
            span_texts = [span.text.split("\n")[0] for span in span_elements]

            # Skip containers without spans and rows that were already recorded
            row_key = tuple(span_texts)
            if span_texts and row_key not in seen_rows:
                seen_rows.add(row_key)
                new_div_data.append(span_texts)

        if new_div_data:
            # Progress was made: store the rows and reset the idle counter
            all_div_data.extend(new_div_data)
            idle_scrolls = 0
        else:
            idle_scrolls += 1
finally:
    # Close the browser
    driver.quit()

# Print every collected row so malformed rows can be spotted while debugging
for i, row in enumerate(all_div_data):
    print(f"Row {i}:", row)

# Keep only rows whose number of cells matches the number of headers;
# rows with any other length are dropped.
valid_data = [row for row in all_div_data if len(row) == len(column_headers)]

# Print the number of valid rows for debugging
print(f"Number of valid rows: {len(valid_data)}")

# Build the DataFrame; only the constructor is guarded by the try so the
# error message below cannot be triggered by an unrelated step.
try:
    df = pd.DataFrame(valid_data, columns=column_headers)
except ValueError as e:
    print(f"Error creating DataFrame: {e}")
else:
    # Runs only when the DataFrame exists, so `df` is never referenced
    # unbound after a failed construction.
    df.to_csv("formatted_scraped_data.csv", index=False)
    print("DataFrame saved successfully.")

    # Print out the DataFrame to verify the structure
    print(df.head())


Row 0: ['1', '1', '1', 'London', '37 min 20 s', '+ 1 min', '45', '148 hours', '14 km/h']
Row 1: ['2', '2', '2', 'Dublin', '29 min 30 s', '+ 1 min', '66', '158 hours', '16 km/h']
Row 2: ['3', '3', '3', 'Toronto', '29 min', '+ 50 s', '42', '98 hours', '18 km/h']
Row 3: ['4', '4', '4', 'Milan', '28 min 50 s', '+ 20 s', '45', '137 hours', '17 km/h']
Row 4: ['5', '5', '5', 'Lima', '28 min 30 s', '+ 1 min 20 s', '61', '157 hours', '17 km/h']
Row 5: ['6', '6', '6', 'Bengaluru', '28 min 10 s', '- 1 min', '63', '132 hours', '18 km/h']
Row 6: ['7', '7', '7', 'Pune', '27 min 50 s', '+ 30 s', '57', '128 hours', '19 km/h']
Row 7: ['8', '8', '8', 'Bucharest', '27 min 40 s', '+ 20 s', '55', '150 hours', '17 km/h']
Row 8: ['9', '9', '9', 'Manila', '27 min 20 s', '+ 20 s', '46', '105 hours', '19 km/h']
Row 9: ['10', '10', '10', 'Brussels', '27 min', '+ 20 s', '37', '104 hours', '18 km/h']
Row 10: ['11', '11', '11', 'Taichung', '26 min 50 s', '- 10 s', '35', '71 hours', '20 km/h']
Row 11: ['12', '12', '