### Setup 
*Requires Google Chrome together with ChromeDriver.*

In [None]:
import os
import requests
from urllib.parse import urljoin
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

In [None]:
# Helper that converts the month name in an EarthCam date string into its two-digit number
def convert_month_to_number(date_str):
    """Swap the first recognised month abbreviation in ``date_str`` for its two-digit number.

    Abbreviations are tried in calendar order (Jan .. Dec). Only the first
    abbreviation that occurs in the string is converted, and every occurrence
    of that abbreviation is replaced. A string without any month abbreviation
    is returned unchanged.
    """
    abbreviations = (
        'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
        'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
    )

    for position, abbreviation in enumerate(abbreviations, start=1):
        if abbreviation in date_str:
            # The calendar position, zero-padded, is the numeric month.
            return date_str.replace(abbreviation, f"{position:02d}")

    return date_str

### Download function 

In [None]:
# function to download the images from a list of URLs to a saving directory 
def download_images(images_info, save_dir, filter_dates=None, city_name='unkown'):
    """Download the given images into ``save_dir``.

    Args:
        images_info: Iterable of ``(img_url, date_text)`` pairs.
        save_dir: Target directory; created if it does not exist.
        filter_dates: Optional list of date strings. Spaces and slashes are
            rewritten to dashes and commas are dropped, then an image is kept
            only if one of these strings occurs inside its ``date_text``.
            When empty or ``None`` every image is downloaded.
        city_name: Prefix for the file names. NOTE: the default is spelled
            ``'unkown'`` (sic); it is left untouched so that files written
            with the default keep the same names.

    Files are written as ``{city_name}_{date_text}_{i}.jpg`` where ``i``
    counts the images that passed the date filter (whether or not their
    download succeeded). A failed download is reported and skipped; it does
    not abort the remaining downloads.
    """
    # Create a directory to save images if it doesn't exist
    os.makedirs(save_dir, exist_ok=True)

    # Bring the filter dates into the same dash-separated form as date_text
    if filter_dates:
        filter_dates = [
            date.replace(' ', '-').replace('/', '-').replace(',', '')
            for date in filter_dates
        ]

    i = 0
    for img_url, date_text in images_info:
        # Skip images whose date does not match any requested date
        if filter_dates and not any(date in date_text for date in filter_dates):
            continue

        # The running index keeps names unique when several images share a date
        image_name = os.path.join(save_dir, f"{city_name}_{date_text}_{i}.jpg")
        i += 1

        try:
            # A timeout keeps one stalled connection from hanging the whole batch
            response = requests.get(img_url, timeout=30)
        except requests.RequestException as exc:
            print(f"Failed to download image from {img_url}. Error: {exc}")
            continue

        if response.status_code == 200:
            with open(image_name, 'wb') as img_file:
                img_file.write(response.content)
            print(f"Successfully downloaded: {image_name}")
        else:
            print(f"Failed to download image from {img_url}. Status code: {response.status_code}")

### Scraping function 

In [22]:
def _normalize_date_text(date_text):
    """Return ``date_text`` with a numeric month and dashes instead of spaces/slashes, commas removed."""
    date_text = convert_month_to_number(date_text)
    return date_text.replace(' ', '-').replace('/', '-').replace(',', '')


def scrape_images(url, filter_dates, stop_date, end_date=None, city_name = 'unknown', save_dir = None):
    """Collect gallery image links from ``url`` with Chrome and download them.

    The page is driven through Selenium: an optional end date is typed into
    the date picker, then the page is scrolled and the "load more" button is
    clicked until an image dated ``stop_date`` shows up.

    Args:
        url: Page to open.
        filter_dates: Dates to keep; passed on to ``download_images``.
        stop_date: Date of the day before the first day wanted. Collection
            stops at the first new image whose cleaned caption contains it.
            It is cleaned the same way as the captions, so ``07/29/2022`` and
            ``07-29-2022`` are equivalent. A falsy value disables the check.
        end_date: Optional last day to scrape, written into the
            ``content_date_end`` field before searching.
        city_name: File-name prefix passed on to ``download_images``.
        save_dir: Target directory passed on to ``download_images``, which
            hands it to ``os.makedirs``; the default ``None`` is not usable
            there, so callers should always supply a directory.

    Returns:
        None. If the end date cannot be entered, nothing is downloaded.
    """
    # Local imports keep this cell self-contained.
    import time
    from selenium.common.exceptions import WebDriverException

    # Stop scrolling after this many consecutive rounds without a new image,
    # so an exhausted gallery (or a stop date that never appears) cannot loop forever.
    max_idle_rounds = 20

    # Captions are cleaned before the comparison, so clean the stop date the same way.
    normalized_stop_date = _normalize_date_text(stop_date) if stop_date else None

    # Set up the Selenium WebDriver - necessary for Scraping
    driver = webdriver.Chrome()
    try:
        driver.get(url)

        # Input the end date into the date picker field if provided - last day which should be scraped
        if end_date:
            try:
                date_field = WebDriverWait(driver, 20).until(
                    EC.presence_of_element_located((By.ID, 'content_date_end'))
                )
                driver.execute_script("arguments[0].scrollIntoView();", date_field)
                # Pass the value as a script argument instead of splicing it into JavaScript source.
                driver.execute_script("arguments[0].value = arguments[1];", date_field, end_date)
            except WebDriverException as e:
                print(f"Failed to input end date: {e}")
                return

            try:  # then click the search button
                button = driver.find_element(By.ID, 'hof_search_button')
                driver.execute_script("arguments[0].click();", button)
                print(f"End date {end_date} input into date picker field.")
            except WebDriverException:
                print("No search button found or failed to click.")

        # URLs already collected - every round re-parses the whole page, so links repeat
        seen_urls = set()
        images_info = []
        idle_rounds = 0

        while True:
            # Simulate a scroll event to load additional content
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

            # Wait for the dynamically loaded content to appear
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.CLASS_NAME, "pic"))
            )

            soup = BeautifulSoup(driver.page_source, "html.parser")

            found_new = False
            stop_reached = False
            for pic_div in soup.find_all('div', class_='pic'):
                a_tag = pic_div.find('a')
                if not (a_tag and 'href' in a_tag.attrs):
                    continue

                img_url = a_tag['href']
                if img_url in seen_urls:
                    continue

                date_div = pic_div.find('div', class_='thumbcaption hof_date hof_smalltext')
                date_text = date_div.get_text(strip=True) if date_div else "unknown_date"

                # Clean the date text to be a valid filename
                date_text = _normalize_date_text(date_text)

                images_info.append((img_url, date_text))
                seen_urls.add(img_url)
                found_new = True

                # The image carrying the stop date is still recorded before stopping.
                if normalized_stop_date and normalized_stop_date in date_text:
                    stop_reached = True
                    break

            if stop_reached:
                print(f"Stopping scroll as stop date {stop_date} is reached.")
                break

            # Ask the page for the next batch of images
            try:
                button = driver.find_element(By.ID, 'hof_load_more_button')
                driver.execute_script("arguments[0].click();", button)
                print("Download button clicked.")
            except WebDriverException:
                print("No download button found or failed to click.")

            if found_new:
                idle_rounds = 0
                continue

            idle_rounds += 1
            if idle_rounds >= max_idle_rounds:
                print(f"No new images after {max_idle_rounds} attempts; stopping scroll.")
                break
            # Give pending content a moment to load before checking again.
            time.sleep(1)
    finally:
        # Always close the browser, including when a wait times out.
        driver.quit()

    print(images_info)
    # Download images with date in filenames
    download_images(images_info, save_dir, filter_dates, city_name)





In [16]:
# Abbey Road (London) run: keep the images of 30-31 July 2022.
save_dir = 'downloaded_images'
filter_dates = ['07-30-2022', '07-31-2022']
# Use the dash-separated form: scraped captions are rewritten with dashes
# before they are compared with stop_date.
stop_date = '07-29-2022'
end_date = '08/01/2022'
city_name = 'London'
target_url = 'https://www.earthcam.com/world/england/london/abbeyroad/?cam=abbeyroad_uk'

if __name__ == "__main__":
    scrape_images(target_url, filter_dates, stop_date, end_date, city_name, save_dir)

End date 08/01/2022 input into date picker field.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.


In [24]:
# Times Square (New York) run: keep the images of 30-31 July 2022 and stop
# collecting once an image dated 28 July 2022 appears.
target_url = 'https://www.earthcam.com/usa/newyork/timessquare/?cam=tsrobo1'
city_name = 'New_York'
save_dir = 'downloaded_images'
end_date = '08/01/2022'
stop_date = '07-28-2022'
filter_dates = ['07-30-2022', '07-31-2022']

if __name__ == "__main__":
    scrape_images(
        url=target_url,
        filter_dates=filter_dates,
        stop_date=stop_date,
        end_date=end_date,
        city_name=city_name,
        save_dir=save_dir,
    )

End date 08/01/2022 input into date picker field.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Download button clicked.
Stopping scroll as stop date 07-28-2022 is reached.
[('https://static.earthcam.com//hof/newyork/timessquare/1722590809367_51.jpg', '1-hour-ago'), ('https://static.earthcam.com//hof/newyork/timessquare/1722583081777_33.jpg', '

In [7]:
# Per-city scrape configuration. Each city gets three date windows; the
# end/stop dates, the filter lists and two of the URLs are still placeholders.
cities = [
    {
        'city': name,
        'save_dir': directory,
        'url': link,
        # A fresh dict (and a fresh list) per window, so editing one window
        # never affects another.
        'dates': [
            {'end_date': 1, 'stop_date': 1, 'filter_dates': []}
            for _ in range(3)
        ],
    }
    for name, directory, link in (
        ('London', 'Earthcam_London',
         'https://www.earthcam.com/world/england/london/abbeyroad/?cam=abbeyroad_uk'),
        ('New York', 'Earthcam_NewYork', 'link'),
        ('Taiwan', 'Earthcam_Taiwan', 'a'),
    )
]

In [8]:
# Single-run settings (same values as the Times Square run); the loop below
# reads everything from `cities` and does not use them.
target_url = 'https://www.earthcam.com/usa/newyork/timessquare/?cam=tsrobo1'
city_name = 'New_York'
save_dir = 'downloaded_images'
end_date = '08/01/2022'
stop_date = '07-28-2022'
filter_dates = ['07-30-2022', '07-31-2022']

# Dry run: print, one line per date window, the six values that would be
# handed to scrape_images.
for city in cities:
    for date_dict in city['dates']:
        print(*(
            city['url'],
            date_dict['filter_dates'],
            date_dict['stop_date'],
            date_dict['end_date'],
            city['city'],
            city['save_dir'],
        ))
        #scrape_images(city['url'], date_dict['filter_dates'], date_dict['stop_date'], date_dict['end_date'], city['city'], city['save_dir'])
        

https://www.earthcam.com/world/england/london/abbeyroad/?cam=abbeyroad_uk [] 1 1 London Earthcam_London
https://www.earthcam.com/world/england/london/abbeyroad/?cam=abbeyroad_uk [] 1 1 London Earthcam_London
https://www.earthcam.com/world/england/london/abbeyroad/?cam=abbeyroad_uk [] 1 1 London Earthcam_London
link [] 1 1 New York Earthcam_NewYork
link [] 1 1 New York Earthcam_NewYork
link [] 1 1 New York Earthcam_NewYork
a [] 1 1 Taiwan Earthcam_Taiwan
a [] 1 1 Taiwan Earthcam_Taiwan
a [] 1 1 Taiwan Earthcam_Taiwan
