In [None]:
import os
from collections import deque
from urllib.parse import urljoin, urlsplit

import requests
from bs4 import BeautifulSoup


def download_file(url, local_filename, timeout=30):
    """Stream the resource at ``url`` into ``local_filename``.

    The response body is read in 8 KiB chunks so the whole file never has to
    be held in memory.

    Args:
        url: URL to fetch with an HTTP GET.
        local_filename: Destination path. It is opened in binary write mode,
            so an existing file at that path is overwritten.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout a stalled server would block the caller indefinitely.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        OSError: If the destination file cannot be opened or written.
    """
    with requests.get(url, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)

                
def is_absolute(url):
    """Return True when ``url`` starts with an ``http:`` or ``https:`` prefix.

    This is a plain, case-sensitive string-prefix test; every other value is
    reported as not absolute.
    """
    for scheme_prefix in ('http:', 'https:'):
        if url.startswith(scheme_prefix):
            return True
    return False


# Suffixes that exclude_file_type() matches against the end of a lower-cased
# href; links ending in one of these are not processed by the crawler.
banned_types = [
    ".ogg",
    ".mp3",
    ".jpg",
    ".jpeg",
    ".png",
]

# Substrings that exclude_path() looks for anywhere inside a lower-cased href;
# links containing one of these are not queued for crawling.
banned_urls = [
    "/people",
    "/tags",
    "/forum",
    "/license",
    "/cookies",
    "/attribution",
    "/blog",
    "blog.",
    "/help",
    "/web",
    "username:",
    "previews/",
    "displays/",
]


def exclude_file_type(href):
    """Tell whether ``href`` ends with one of the ``banned_types`` suffixes.

    The comparison is case-insensitive: ``href`` is lower-cased before it is
    tested against each suffix.
    """
    lowered = href.lower()
    return any(lowered.endswith(suffix) for suffix in banned_types)


def exclude_path(href):
    """Tell whether ``href`` contains any of the ``banned_urls`` substrings.

    The comparison is case-insensitive: ``href`` is lower-cased before the
    substring search.
    """
    lowered = href.lower()
    return any(fragment in lowered for fragment in banned_urls)


def scrape_and_download(base_url, start_path="/", *, timeout=30,
                        notes=('C2', 'C3', 'C4', 'C5', 'C6')):
    """Crawl a site breadth-first and download selected ``.wav`` files.

    Starting at ``start_path`` on ``base_url``, every fetched page is parsed
    for ``<a href>`` links. Links to ``.wav`` files whose href contains one of
    ``notes`` are downloaded into the current working directory; other links
    on the same host are queued for crawling unless ``exclude_file_type`` or
    ``exclude_path`` rejects them.

    Args:
        base_url: Root URL of the site; its host defines what counts as
            "same domain".
        start_path: Path (or URL) of the first page, resolved against
            ``base_url``.
        timeout: Seconds to wait for each page request before giving up.
        notes: Substrings; a ``.wav`` link is downloaded only when its href
            contains at least one of them.

    Failed page fetches and failed downloads are reported with ``print`` and
    skipped so that a single bad link does not end the crawl.
    """
    base_host = urlsplit(base_url).netloc.lower()
    start_url = urljoin(base_url, start_path)

    # `queued` holds every URL ever placed on the queue, so a link that appears
    # on many pages is enqueued (and therefore fetched) only once.
    queued = {start_url}
    to_visit = deque([start_url])  # deque gives O(1) pops from the left
    downloaded = set()

    while to_visit:
        url = to_visit.popleft()

        #print(f"Exploring: {url}")
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException:
            print(f"Failed to get {url}")
            continue

        # Relative links must be resolved against the page they were found on
        # (after any redirects), not against the site root.
        page_url = response.url or url
        queued.add(page_url)

        before = len(to_visit)

        soup = BeautifulSoup(response.content, 'html.parser')
        for link in soup.find_all('a', href=True):
            href = link['href']

            if exclude_file_type(href):
                continue

            try:
                parts = urlsplit(urljoin(page_url, href))
            except ValueError:
                # Malformed href (e.g. a broken IPv6 literal); ignore it.
                continue

            # Only consider links within the same domain. Comparing the
            # resolved host also rejects mailto:/javascript: links and
            # protocol-relative ("//other.host/...") URLs.
            if parts.netloc.lower() != base_host:
                #print(f"skipping href={href}")
                continue

            # The fragment never reaches the server, so drop it to avoid
            # treating "/page" and "/page#section" as different pages.
            target = parts._replace(fragment="").geturl()

            if href.endswith('.wav'):
                print(f"Found file link: {href}")
                if target in downloaded:
                    continue
                if not any(note in href for note in notes):
                    continue
                local_filename = os.path.basename(href)
                try:
                    download_file(target, local_filename)
                except (requests.RequestException, OSError) as exc:
                    print(f"Failed to download {target}: {exc}")
                    continue
                downloaded.add(target)
                print(f"Downloaded {local_filename}")
            elif target not in queued and not exclude_path(href):
                queued.add(target)
                to_visit.append(target)

        added = len(to_visit) - before
        print(f"Added {added} links from {url}, total to explore={len(to_visit)}")
                

                
# Script entry point: start the crawl as soon as this cell/module is executed.
# The commented-out call below is an alternative target site kept for reference.
#scrape_and_download("https://freewavesamples.com")
scrape_and_download("https://freesound.org/")

