In [None]:
import time
import logging
import concurrent.futures
import pandas as pd
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from datetime import date, timedelta
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor


In [None]:
def fahrenheit_to_celsius(temp_fahrenheit):
    """Convert a temperature from degrees Fahrenheit to degrees Celsius.

    Args:
        temp_fahrenheit: Temperature in degrees Fahrenheit (int or float).

    Returns:
        The equivalent temperature in degrees Celsius, as a float.
    """
    # Shift so that the freezing point of water sits at zero, then rescale
    # from Fahrenheit-sized degrees to Celsius-sized degrees (ratio 5:9).
    degrees_above_freezing = temp_fahrenheit - 32
    return degrees_above_freezing * 5 / 9

def get_weather_data(year, month, day, browser):
    """Scrape one day's temperature observations for Haifa (station LLHA).

    The function opens its own Firefox session, picks the requested date in
    the Weather Underground history page, reads the observation table and
    always closes the session again, so concurrent calls never share a driver.

    Args:
        year: Year to select in the ``yearSelection`` drop-down.
        month: Month number; matched against the option values of the
            ``monthSelection`` drop-down.
        day: Day of the month to select in the ``daySelection`` drop-down.
        browser: Unused. Kept only so existing callers keep working; this
            function never touches (or quits) a driver it did not create.

    Returns:
        dict mapping each observation's time label (the text of the row's
        ``td.indent`` cell) to the temperature in degrees Celsius.

    Raises:
        selenium.common.exceptions.TimeoutException: the date picker or the
            observation table did not appear within 20 seconds.
        selenium.common.exceptions.NoSuchElementException: the requested
            year, month or day is not offered by the page.
        ValueError: a temperature cell does not contain an integer.
    """
    # Local import: the notebook's import cell does not bring this name in.
    from selenium.common.exceptions import NoSuchElementException

    # Create the driver *before* the try block. If Firefox fails to start
    # there is nothing to clean up, and the finally clause must not end up
    # calling quit() on whatever object the caller passed in.
    driver = webdriver.Firefox()
    try:
        driver.get('https://www.wunderground.com/history/daily/il/haifa/LLHA')
        WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.ID, 'yearSelection')))

        # The year options carry values of the form "<index>: <year>". The
        # index depends on which year the page lists first, so look the
        # option up by its year part instead of computing the index from a
        # hard-coded reference year.
        year_select = Select(driver.find_element(By.ID, 'yearSelection'))
        year_suffix = ': ' + str(year)
        year_value = None
        for option in year_select.options:
            candidate = option.get_attribute('value')
            if candidate and candidate.endswith(year_suffix):
                year_value = candidate
                break
        if year_value is None:
            raise NoSuchElementException('Cannot locate year option for ' + str(year))
        year_select.select_by_value(year_value)

        month_select = Select(driver.find_element(By.ID, 'monthSelection'))
        month_select.select_by_value(str(month))

        # Day options use the same "<index>: <day>" value format; here the
        # index is simply day - 1.
        day_select = Select(driver.find_element(By.ID, 'daySelection'))
        day_select.select_by_value(str(day - 1) + ': ' + str(day))

        driver.find_element(By.ID, 'dateSubmit').click()
        WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.ID, 'history-ob')))

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        date_data = {}
        for row in soup.find_all('tr', {'class': 'no-metars'}):
            # Named time_label (not `time`) so the imported `time` module is
            # not shadowed inside this function.
            time_label = row.find('td', {'class': 'indent'}).get_text()
            temp_fahrenheit = row.find_all('td')[1].get_text()
            date_data[time_label] = fahrenheit_to_celsius(int(temp_fahrenheit))
        return date_data
    finally:
        driver.quit()

In [None]:
def get_weather_data_for_dates7(current_date, end_date, browser, num_threads=2):
    """Scrape every day from ``current_date`` to ``end_date`` (both inclusive).

    Each day is fetched by ``get_weather_data`` on a thread pool. A day whose
    scrape raises is logged and recorded instead of aborting the whole range.

    Args:
        current_date: First day to scrape (``datetime`` or ``date``).
        end_date: Last day to scrape; nothing is scraped if it precedes
            ``current_date``.
        browser: Forwarded unchanged to ``get_weather_data``.
        num_threads: Maximum number of days scraped concurrently.

    Returns:
        A ``(weather_data, problematic_urls)`` tuple:

        * ``weather_data`` maps ``'YYYY-MM-DD'`` to a record holding
          ``'Year'``, ``'Month'`` and ``'Day'`` plus one entry per
          observation returned by ``get_weather_data``.
        * ``problematic_urls`` lists the ``'YYYY-MM-DD'`` keys of the days
          that could not be scraped (all days share one page URL, so the
          date is what identifies a failed request).
    """
    weather_data = {}
    problematic_urls = []

    # Expand the inclusive range into one entry per day.
    days = []
    day = current_date
    while day <= end_date:
        days.append(day)
        day += timedelta(days=1)

    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        # Keep each future paired with its day: the scraper returns only the
        # readings, so the date has to be tracked on this side.
        submitted = [
            (day, executor.submit(get_weather_data, day.year, day.month, day.day, browser))
            for day in days
        ]

        for day, future in submitted:
            date_key = day.strftime('%Y-%m-%d')
            try:
                readings = future.result()
            except Exception:
                logging.exception('Failed to collect weather data for %s', date_key)
                problematic_urls.append(date_key)
                continue
            # The notebook's final cell casts 'Year', 'Month' and 'Day'
            # columns, so store the date parts alongside the readings.
            record = {'Year': day.year, 'Month': day.month, 'Day': day.day}
            record.update(readings)
            weather_data[date_key] = record

    return weather_data, problematic_urls


  

In [None]:
def run_in_multiple_threads(current_date, end_date, num_workers=2):
    """Scrape an inclusive date range by splitting it across worker threads.

    The range is cut into ``num_workers`` contiguous, non-overlapping chunks
    of near-equal length. Each chunk is handed to
    ``get_weather_data_for_dates7`` on its own thread and the per-chunk
    results are merged.

    Threads are used rather than processes: the work is dominated by waiting
    on the browser, and functions defined in a notebook cell cannot reliably
    be shipped to worker processes.

    Args:
        current_date: First day to scrape (``datetime`` or ``date``).
        end_date: Last day to scrape (inclusive).
        num_workers: Number of chunks / worker threads. Values below 1 are
            treated as 1; it is also capped at the number of days.

    Returns:
        A ``(weather_data, problematic_urls)`` tuple: the merged dict of all
        chunk results and the concatenated list of failures, in chunk order.
        An empty range yields ``({}, [])``.
    """
    total_days = (end_date - current_date).days + 1
    if total_days <= 0:
        return {}, []

    worker_count = max(1, min(num_workers, total_days))

    # Spread the days as evenly as possible; the first `remainder` chunks
    # get one extra day.
    base_size, remainder = divmod(total_days, worker_count)
    chunks = []
    chunk_start = current_date
    for index in range(worker_count):
        chunk_length = base_size + (1 if index < remainder else 0)
        chunk_end = chunk_start + timedelta(days=chunk_length - 1)
        chunks.append((chunk_start, chunk_end))
        chunk_start = chunk_end + timedelta(days=1)

    weather_data = {}
    problematic_urls = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=worker_count) as executor:
        # No driver is created here: get_weather_data opens and closes its
        # own Firefox session, so None is passed for the browser argument.
        futures = [
            executor.submit(get_weather_data_for_dates7, chunk_first, chunk_last, None)
            for chunk_first, chunk_last in chunks
        ]
        # Collect in submission order so the merged output is deterministic.
        for future in futures:
            chunk_data, chunk_problems = future.result()
            weather_data.update(chunk_data)
            problematic_urls.extend(chunk_problems)

    return weather_data, problematic_urls


In [None]:
# Date range to collect, given as separate year / month / day parts.
start_year, start_month, start_day = 2020, 1, 1
end_year, end_month, end_day = 2020, 12, 31

current_date = datetime(year=start_year, month=start_month, day=start_day)
end_date = datetime(year=end_year, month=end_month, day=end_day)

# Run the scrape for the whole range.
# NOTE(review): `problemtaic_urls` is a misspelling of "problematic"; the
# name is kept as-is in case later cells refer to it.
weather_data, problemtaic_urls = run_in_multiple_threads(current_date, end_date)

# from_dict() places the top-level keys on the columns, so flip the frame to
# get one row per key, then force the date-part columns to integers.
weather_data_df = (
    pd.DataFrame.from_dict(weather_data)
    .T
    .astype({'Year': int, 'Month': int, 'Day': int})
)

# Save the collected data; the row index is not written to the file.
weather_data_df.to_csv("weather_data.csv", index=False)