# Fetch Camera Equipment Data

Sadly, the metadata about the camera and lens used to take each photo isn't included in the dataset I downloaded from Macaulay. However, I can scrape that data from the Macaulay website. Since I'm only interested in ~3,000 photos, I shouldn't raise any red flags.

In [23]:
import pandas as pd
import os
import requests
from bs4 import BeautifulSoup
import time
from datetime import datetime

In [24]:
# Load the table of top-rated records; its catalog numbers drive the scrape below.
# NOTE(review): the preview shows an "Unnamed: 0" column, which suggests the CSV
# was written with its index -- consider index_col=0 if that column is unwanted.
top_records_path = "../results/top_records.csv"
top_records_df = pd.read_csv(top_records_path)
top_records_df.head()

Unnamed: 0,catalog_number,common_name,scientific_name,photographer,date,year,rating,num_ratings,location
0,427148171,Blue-rumped Parrot,Psittinus cyanurus,Saravanan Krishnamurthy,2022-03-19,2022,5.0,37,Asia
1,522492561,Bornean Banded-Pitta,Hydrornis schwaneri,Wai Loon Wong,2022-12-21,2022,5.0,21,Asia
2,519807861,Surf Scoter,Melanitta perspicillata,Frank Lin,2023-01-02,2023,5.0,54,North America
3,584587231,Black-throated Gray Warbler,Setophaga nigrescens,Joachim Bertrands | Ornis Birding Expeditions,2023-06-13,2023,5.0,54,North America
4,625632330,Scissor-tailed Flycatcher,Tyrannus forficatus,Matt Zuro,2024-10-30,2024,5.0,54,North America


In [25]:
def scrape_macaulay_technical_info(catalog_numbers, base_url="https://macaulaylibrary.org/asset/{}", delay=1.0, timeout=30.0):
    """
    Scrape technical information from Macaulay Library pages.

    Each asset page is fetched, the "Technical information" heading is
    located, and the <dt>/<dd> pairs of the definition list that follows it
    are turned into one record per catalog number. Pages that cannot be
    fetched or parsed are reported with a printed message and skipped, so a
    single bad page never aborts a long run.

    Parameters
    ----------
    catalog_numbers : list
        List of catalog numbers to scrape (e.g., ["123456", "789012"])
    base_url : str, optional
        Template URL with {} placeholder for catalog number
        Default: "https://macaulaylibrary.org/asset/{}"
    delay : float, optional
        Time in seconds to wait between consecutive requests. The wait is
        applied after every attempt, including skipped or failed ones.
        Default: 1.0
    timeout : float, optional
        Seconds to wait for the server before giving up on a request, so a
        stalled connection cannot hang the whole run.
        Default: 30.0

    Returns
    -------
    pandas.DataFrame
        DataFrame containing technical information: a ``catalog_number``
        column plus one column per term found (lower-cased, spaces replaced
        by underscores). Empty if nothing could be scraped.
    """
    all_data = []

    # The context manager releases pooled connections once scraping is done.
    with requests.Session() as session:
        session.headers.update({
            'User-Agent': 'Research Bot (Responsible scraping for academic research)'
        })

        for position, catalog_number in enumerate(catalog_numbers):
            # Throttle before every request except the first. Doing it here
            # (rather than at the end of the loop body) guarantees the pause
            # also happens after pages that were skipped or raised an error.
            if position:
                time.sleep(delay)

            try:
                # Make request
                url = base_url.format(catalog_number)
                response = session.get(url, timeout=timeout)
                response.raise_for_status()

                # Parse HTML
                soup = BeautifulSoup(response.text, 'html.parser')

                # Find the "Technical information" heading
                tech_heading = soup.find('h3', string='Technical information')
                if not tech_heading:
                    print(f"No technical information heading found for {catalog_number}")
                    continue

                # Get the definition list that follows the heading
                tech_section = tech_heading.find_next('dl', class_='defList')
                if not tech_section:
                    print(f"No technical information list found for {catalog_number}")
                    continue

                # Pair each term with its definition; terms become column names
                tech_info = {'catalog_number': catalog_number}
                terms = tech_section.find_all('dt')
                definitions = tech_section.find_all('dd')
                for term, definition in zip(terms, definitions):
                    column_name = term.text.strip().lower().replace(' ', '_')
                    tech_info[column_name] = definition.text.strip()

                all_data.append(tech_info)

            except Exception as e:
                # Deliberately broad: this is a best-effort scrape, so any
                # failure on one page is reported and the loop moves on.
                print(f"Error processing {catalog_number}: {str(e)}")
                continue

    return pd.DataFrame(all_data)

def scrape_in_batches(catalog_numbers, batch_size=500, delay=1.0, checkpoint_dir="checkpoints", batch_pause=30.0):
    """
    Scrape data in batches with checkpointing.

    After every batch two CSV files are written to ``checkpoint_dir``: one
    holding just that batch and one holding everything scraped so far, so an
    interrupted run loses at most the batch in progress.

    Parameters
    ----------
    catalog_numbers : list
        List of all catalog numbers to process
    batch_size : int, optional
        Number of items to process per batch (default: 500)
    delay : float, optional
        Delay between requests in seconds (default: 1.0)
    checkpoint_dir : str, optional
        Directory to save checkpoint files (default: "checkpoints")
    batch_pause : float, optional
        Seconds to rest between batches (default: 30.0)

    Returns
    -------
    pandas.DataFrame
        Combined results from all batches, with a fresh 0..n-1 index.
        An empty DataFrame if ``catalog_numbers`` is empty.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Create checkpoint directory if it doesn't exist
    os.makedirs(checkpoint_dir, exist_ok=True)

    # Timestamp for this run; shared by every file the run writes
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    results_list = []
    # Ceiling division: a trailing partial batch still counts as a batch
    total_batches = -(-len(catalog_numbers) // batch_size)

    # The combined file is overwritten after each batch, so its path is fixed
    combined_path = os.path.join(
        checkpoint_dir,
        f'macaulay_results_{timestamp}_combined.csv'
    )

    batch_starts = range(0, len(catalog_numbers), batch_size)
    for batch_num, start in enumerate(batch_starts, start=1):
        batch = catalog_numbers[start:start + batch_size]

        print(f"\nStarting batch {batch_num} of {total_batches}")
        print(f"Processing catalog numbers {start} to {start + len(batch)}")

        # Scrape batch
        batch_results = scrape_macaulay_technical_info(batch, delay=delay)
        results_list.append(batch_results)

        # Save checkpoint
        checkpoint_path = os.path.join(
            checkpoint_dir,
            f'macaulay_results_{timestamp}_batch_{batch_num}_of_{total_batches}.csv'
        )
        batch_results.to_csv(checkpoint_path, index=False)
        print(f"Saved checkpoint: {checkpoint_path}")

        # Save combined results so far
        pd.concat(results_list, ignore_index=True).to_csv(combined_path, index=False)
        print(f"Updated combined results: {combined_path}")

        if batch_num < total_batches:
            print(f"Taking a {batch_pause:g} second break before next batch...")
            time.sleep(batch_pause)

    # pd.concat raises on an empty list, so handle "nothing to scrape" here
    if not results_list:
        return pd.DataFrame()

    # ignore_index avoids repeating each batch's 0..n-1 labels in the result
    return pd.concat(results_list, ignore_index=True)

In [26]:
# Every catalog number in the top-records table gets looked up.
catalog_numbers = top_records_df.loc[:, 'catalog_number'].tolist()

# Log the size of the job and when it started.
print("Starting scraping process...")
print(f"Total catalog numbers: {len(catalog_numbers)}")
start_stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print(f"Current time: {start_stamp}")

# Batched, checkpointed scrape: 500 pages per batch, 1 second between requests.
scrape_settings = {
    "batch_size": 500,
    "delay": 1.0,
    "checkpoint_dir": "macaulay_checkpoints",
}
results = scrape_in_batches(catalog_numbers, **scrape_settings)

print("\nScraping completed!")

Starting scraping process...
Total catalog numbers: 3203
Current time: 2024-12-30 21:20:07

Starting batch 1 of 7
Processing catalog numbers 0 to 500
Saved checkpoint: macaulay_checkpoints/macaulay_results_20241230_212007_batch_1_of_7.csv
Updated combined results: macaulay_checkpoints/macaulay_results_20241230_212007_combined.csv
Taking a 30 second break before next batch...

Starting batch 2 of 7
Processing catalog numbers 500 to 1000
Saved checkpoint: macaulay_checkpoints/macaulay_results_20241230_212007_batch_2_of_7.csv
Updated combined results: macaulay_checkpoints/macaulay_results_20241230_212007_combined.csv
Taking a 30 second break before next batch...

Starting batch 3 of 7
Processing catalog numbers 1000 to 1500
Saved checkpoint: macaulay_checkpoints/macaulay_results_20241230_212007_batch_3_of_7.csv
Updated combined results: macaulay_checkpoints/macaulay_results_20241230_212007_combined.csv
Taking a 30 second break before next batch...

Starting batch 4 of 7
Processing catalog

This takes roughly 1.5 hours to run. I could make it run faster, but I want to be careful not to spam the Macaulay website.

Now, let's do a final bit of processing and save the results. We don't need to keep rows that are missing the camera information. We also want to join the results back to the original data.

In [32]:
# Keep only the rows where a camera model was scraped, and report the split.
has_model = results['model'].notnull()
filtered_results = results[has_model]
n_with_model = len(filtered_results)
n_missing = len(results) - n_with_model
print(f"There are {n_missing} records with missing camera model information and {n_with_model} records with camera model information.")

There are 487 records with missing camera model information and 2716 records with camera model information.


In [34]:
# Attach the original record fields to each scraped row; the left join keeps
# every row of filtered_results. Then write the final table to disk.
joined_results = pd.merge(
    filtered_results,
    top_records_df,
    how='left',
    on='catalog_number',
)
joined_results.to_csv("../results/camera_specs.csv", index=False)