In [44]:
# Script to download all ICAD papers and proceedings volumes held in the Georgia Tech corpus
# Either download EVERYTHING into one folder (not recommended), or download by conference year (recommended).
# To do the former, just comment out the lines under the comment "Base URL for handling conferences year by year"
#
# Paul Vickers, 21/02/2025, with much code auto-generated by Gemini.

import requests
from bs4 import BeautifulSoup
import os
import re

# Base URL for the ICAD corpus: the ICAD series page in the Georgia Tech repository.
# The per-year loop further down builds every listing URL from this name, so it has to
# be assigned here. With the assignment commented out the script only runs in a kernel
# that still holds an old `base_url`, and fails with NameError after a restart.
# NOTE(review): value restored from the previously commented-out line — confirm it is
# the URL that was live in the kernel when the recorded output was produced.
base_url = "https://repository.gatech.edu/entities/series/6cb90d00-3311-4767-954d-415c9341a358"

# Root folder
root_folder = "/content/drive/MyDrive/ICAD_Papers"

# Create a directory to store the downloaded PDFs.
# exist_ok=True keeps the cell re-runnable and avoids the check-then-create race.
os.makedirs(root_folder, exist_ok=True)

import requests
from bs4 import BeautifulSoup
import os

# Year range (both ends are processed: the loop uses range(start_year, end_year+1))
start_year = 1994
end_year = 2024

# Query string for all conferences combined: the page number goes between the
# prefix and the suffix.
query_string_prefix = "?spc.page="
query_string_suffix = "&tab=isSeriesOfPublication&spc.rpp=100"
page = 1

# Base URL for handling conferences year by year
# The year is inserted after prefix1 (dateIssued.min) and again after prefix2
# (dateIssued.max), which restricts the listing to a single year.
# NOTE: query_string_suffix is re-assigned here on purpose. While these lines are
# active, the "all conferences combined" suffix above is overridden and unused.
query_string_prefix1 = "?spc.page=1&spc.rpp=100&f.dateIssued.min="
query_string_prefix2 = "&f.dateIssued.max="
query_string_suffix = "&f.isSeriesOfPublicationTitle=International%20Conference%20on%20Auditory%20Display%20(ICAD),equals"

# Download destination. Derived from root_folder so the "Root folder" setting is the
# single place to change the output location (it used to be a second copy of the path).
fpath = root_folder
# Create a directory to store the downloaded PDFs (no-op when it already exists)
os.makedirs(fpath, exist_ok=True)

# Function to download a PDF
def download_pdf(pdf_url, filename, pdf_exist=True, dest_folder=None, timeout=60):
    """Save one paper's PDF, or a placeholder text file when the paper has no PDF.

    The file name is derived from ``filename``: backslash, slash, colon, asterisk,
    question mark, double quote, angle brackets and pipe are replaced by
    underscores, then the result is converted with ``str.title()``.

    Args:
        pdf_url: Absolute URL, or a path relative to https://repository.gatech.edu.
            Not read when ``pdf_exist`` is False, so ``None`` is acceptable there.
        filename: Human-readable name (the paper title) used to build the file name.
        pdf_exist: True to download ``pdf_url`` as ``<name>.pdf``; False to write
            ``<name>.txt`` containing ``<name>:PDF not found`` instead.
        dest_folder: Directory to write into. When None, the module-level
            ``year_folder`` set by the per-year loop is used.
        timeout: Seconds passed to ``requests.get`` so a stalled connection cannot
            hang the whole run.

    Raises:
        requests.HTTPError: from ``raise_for_status()`` on a 4xx/5xx response.
        requests.Timeout: when the server does not answer within ``timeout``.
    """
    # Sanitize the filename by replacing invalid characters with underscore
    filename = re.sub(r'[\\/:*?"<>|]', '_', filename)
    # Turn filename casing into Title Case
    filename = filename.title()

    # Fall back to the global only when no folder was passed, so existing
    # three-argument calls keep working unchanged.
    target_dir = year_folder if dest_folder is None else dest_folder

    if not pdf_exist:
        # Write text file with same filename, but contents are filename + ":PDF not found"
        with open(os.path.join(target_dir, filename + ".txt"), "w", encoding="utf-8") as f:
            f.write(filename + ":PDF not found")
        print(f"Written: {filename}")
        return

    # Add .pdf extension if not present. The comparison is case-insensitive because
    # title() has already turned a trailing ".pdf" into ".Pdf".
    if not filename.lower().endswith(".pdf"):
        filename += ".pdf"
    # Prepend the base URL to the pdf_url if it's a relative path
    if not pdf_url.startswith("http"):
        pdf_url = "https://repository.gatech.edu" + pdf_url
    response = requests.get(pdf_url, timeout=timeout)
    response.raise_for_status()
    # Only open the output file after a successful response, so a failed request
    # never leaves an empty .pdf behind.
    with open(os.path.join(target_dir, filename), "wb") as f:
        f.write(response.content)
    print(f"Downloaded: {filename}")


# Fetch the main page to get paper links by year
# Alternative (all conferences in one listing): iterate with
#   for page in range(1,2):
# and build the listing URL as
#   page_url = base_url + query_string_prefix + str(page) + query_string_suffix
# NOTE: only the first results page is requested per year; the page size is whatever
# spc.rpp in the query string specifies, so a year with more items would be truncated.

# Seconds to wait on each HTTP request, so one stalled connection cannot hang the run.
REQUEST_TIMEOUT = 60

for year in range(start_year, end_year + 1):
    print(f"Processing year: {year}")
    page_url = base_url + query_string_prefix1 + str(year) + query_string_prefix2 + str(year) + query_string_suffix

    # Create subfolder for year. download_pdf() reads this module-level name to
    # decide where to write, so it must be set before any download call.
    year_folder = os.path.join(fpath, str(year))
    os.makedirs(year_folder, exist_ok=True)

    response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    # Find all paper links on the page
    paper_links = soup.find_all("a", {"class": "lead item-list-title dont-break-out ng-star-inserted"})
    print (f"Found {len(paper_links)} papers.")

    # Iterate through paper links and download PDFs
    for link in paper_links:
        paper_url = link["href"]
        paper_title = link.text.strip()  # Get paper title for filename

        # Go to the individual paper page. Prepend the repository domain only when
        # the href is relative (same rule download_pdf applies to PDF links).
        if paper_url.startswith("http"):
            complete_paper_url = paper_url
        else:
            complete_paper_url = "https://repository.gatech.edu" + paper_url
        response = requests.get(complete_paper_url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        paper_soup = BeautifulSoup(response.content, "html.parser")

        # Find the download link for the PDF: the first anchor whose href ends in "download"
        download_link = paper_soup.find("a", {"href": lambda href: href and href.endswith("download")})
        if download_link:
            # Download the PDF
            download_pdf(download_link["href"], paper_title)
        else:
            print(f"No PDF found for: {paper_title}")
            # There is no URL in this branch. Passing None avoids reading a pdf_url
            # that is either unbound (first paper) or left over from the previous
            # paper; download_pdf ignores the URL when pdf_exist is False.
            download_pdf(None, paper_title, False)

print("All papers processed.")

Processing year: 1994
Found 35 papers.
Downloaded: Defining And Redefining Limits On Human Performance In Auditory Spatial Displays.pdf
Downloaded: Auditory Direct Manipulation Of Acoustical Objects By Blind Computer Users.pdf
Downloaded: Dynamical Resonances And Synchronization Of Auditory Stimuli And Evoked Reponses In Multi-Channel Eeg.pdf
Downloaded: Perception Of Virtual Auditory Shapes.pdf
Downloaded: Using Additive Sound Synthesis To Analyze Simplicial Complexes.pdf
Downloaded: Factors In The Design Of Effective Auditory Displays.pdf
Downloaded: Effect Of Event Variations And Sound Duration On Identification Of Everyday Sound.pdf
Downloaded: Auralization Of Document Structure.pdf
Downloaded: The Run-Time Components Of Sonnett.pdf
Downloaded: Using Virtual Environment Technology To Present A Digital Sound Library.pdf
Downloaded: A Perceptual Framework For The Auditory Display Of Scientific Data.pdf
Downloaded: Task-Oriented Quantitative Testing For Synthesized 3-D Auditory Displa