In [23]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from datetime import datetime, timedelta
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from tqdm import tqdm
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor

# Function to scrape data for a single day
def scrape_data_for_date(date_str, station_id):
    
    export_file_name = station_id+"_"+date_str+".csv"
    
    if Path(export_file_name).exists():
        print("Day " + date_str + " exists")
        return None
    
    # Set up Chrome options
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in headless mode
    chrome_options.add_argument("--disable-gpu")  # Disable GPU acceleration
    chrome_options.add_argument("--window-size=1920x1080")  # Set window size
    
    # Initialize the WebDriver with the headless options
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    
    # Construct the URL with the current date
    url = f"https://www.wunderground.com/dashboard/pws/{station_id}/table/{date_str}/{date_str}/daily"
    
    # Open the URL
    driver.get(url)
    
    try:
        # Wait for the table body element to be present in the DOM
        tbody = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.XPATH, '//html/body/app-root/app-dashboard/one-column-layout/wu-header/sidenav/mat-sidenav-container/mat-sidenav-content/div[2]/section/section[1]/div[1]/div/section/div/div/div/lib-history/div[2]/lib-history-table/div/div/div/table'))
        )
        
        # Extract data from the table
        l = []
        table_rows = tbody.find_elements(By.XPATH, './/tr')

        for row in table_rows:
            row_data = [cell.text for cell in row.find_elements(By.XPATH, './/td')]
            l.append(row_data)
        
        # Convert the data into a pandas DataFrame
        df = pd.DataFrame(l)
        df['Date'] = date_str

    except Exception as e:
        print(f"An error occurred for date {date_str}: {e}")
        df = pd.DataFrame()  # Return an empty DataFrame in case of error

    finally:
        # Close the browser
        driver.quit()
    
    
    df.to_csv(export_file_name)
    
    return df

# Define the start and end dates
start_date = datetime.strptime("2016-01-01", "%Y-%m-%d")
end_date = datetime.strptime("2016-12-31", "%Y-%m-%d")

# Create a list of all dates to process
dates = [start_date + timedelta(days=x) for x in range((end_date - start_date).days + 1)]
date_strings = [date.strftime("%Y-%m-%d") for date in dates]

# Scrape data sequentially for each date
#results = [scrape_data_for_date(date_str, "KGAATLAN216") for date_str in tqdm(date_strings)]

# Scrape data sequentially for each date in parallel
with ThreadPoolExecutor() as executor:
    futures = {executor.submit(scrape_data_for_date, date_str, "KGAATLAN216"): date_str for date_str in tqdm(date_strings)}


# Concatenate all the DataFrames together
#all_data = pd.concat(results, ignore_index=True)

#print(all_data)


  0%|          | 0/366 [00:00<?, ?it/s]

100%|██████████| 366/366 [00:00<00:00, 2221.47it/s]

Day 2016-01-03 existsDay 2016-01-01 exists
Day 2016-01-02 exists

Day 2016-01-04 exists
Day 2016-01-05 exists
Day 2016-01-06 exists
Day 2016-01-07 exists
Day 2016-01-09 exists
Day 2016-01-14 exists
Day 2016-01-16 exists
Day 2016-01-18 exists
Day 2016-01-19 exists
Day 2016-01-20 exists
Day 2016-01-22 exists
Day 2016-01-24 exists
Day 2016-01-30 exists





An error occurred for date 2016-02-06: Message: stale element reference: stale element not found
  (Session info: headless chrome=117.0.5938.149); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#stale-element-reference-exception
Stacktrace:
0   chromedriver                        0x0000000104c32d68 chromedriver + 4337000
1   chromedriver                        0x0000000104c2ade4 chromedriver + 4304356
2   chromedriver                        0x0000000104857a5c chromedriver + 293468
3   chromedriver                        0x000000010485ccc8 chromedriver + 314568
4   chromedriver                        0x000000010485e8dc chromedriver + 321756
5   chromedriver                        0x000000010485ea08 chromedriver + 322056
6   chromedriver                        0x000000010489caf4 chromedriver + 576244
7   chromedriver                        0x0000000104892200 chromedriver + 532992
8   chromedriver                      