In [1]:
# Original

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re
import timeit

In [2]:
# Original
def extract_urls(url, filter_artists=False):
    """Download ``url`` and collect the targets of every ``<a href=...>`` on the page.

    Each href is resolved against ``url`` with ``urljoin`` so relative links
    become absolute, and duplicates are dropped by collecting into a set.

    Args:
        url: Address of the page to fetch with an HTTP GET request.
        filter_artists: When truthy, keep only URLs that match the artist-page
            pattern on the mp3-2003.computer-legacy.com site.

    Returns:
        A list of unique absolute URLs in no particular order. The list is
        empty when the response status code is anything other than 200.
    """
    # Regex pattern for artist pages URLs only. ``re.match`` anchors at the
    # start of the string only, so any suffix after ".html" is still accepted.
    artist_page = r"http://mp3-2003\.computer-legacy\.com/artists/\d+/.+\.html"

    reply = requests.get(url)
    if reply.status_code != 200:
        # Nothing is parsed for unsuccessful responses.
        return []

    found = set()  # a set avoids duplicate links
    document = BeautifulSoup(reply.content, 'html.parser')
    for anchor in document.find_all('a', href=True):
        absolute = urljoin(url, anchor['href'])  # resolve relative links
        if not filter_artists or re.match(artist_page, absolute):
            found.add(absolute)

    return list(found)

In [3]:
# Optimized
def extract_urls_optimized(url, filter_artists=False):
    """Fetch ``url`` and return the unique absolute link targets found on the page.

    Every ``<a href=...>`` value is resolved against ``url`` with ``urljoin``
    and the results are de-duplicated through a set.

    Args:
        url: Address of the page to fetch with an HTTP GET request.
        filter_artists: When truthy, keep only URLs that match the artist-page
            pattern on the mp3-2003.computer-legacy.com site.

    Returns:
        A list of unique absolute URLs in no particular order. The list is
        empty when the response status code is anything other than 200.
    """
    # Compile once per call so the same pattern object is reused for every link.
    pattern = (
        re.compile(r"http://mp3-2003\.computer-legacy\.com/artists/\d+/.+\.html")
        if filter_artists
        else None
    )

    # The context manager closes the session when the block exits, so repeated
    # calls (e.g. under timeit) do not accumulate unclosed sessions.
    # Note: a session created per call cannot reuse connections between calls.
    with requests.Session() as session:
        response = session.get(url)
        if response.status_code != 200:
            return []
        soup = BeautifulSoup(response.content, 'html.parser')

    # Resolve each href against the page URL *before* filtering: the pattern
    # describes absolute URLs, so matching the raw href would silently drop
    # relative artist links.
    full_urls = {urljoin(url, a_tag['href']) for a_tag in soup.find_all('a', href=True)}
    if pattern is not None:
        full_urls = {full_url for full_url in full_urls if pattern.match(full_url)}

    return list(full_urls)

In [4]:
# Page used by both benchmarks; artist filtering is enabled in each statement.
website_url = "http://mp3-2003.computer-legacy.com/artists/browse-09.html"

# --- Original implementation -------------------------------------------------
# Each timed call issues a live HTTP request, so the totals below include
# network time, not just parsing/filtering work.
setup_original = "from __main__ import extract_urls, website_url"
stmt_original = "extract_urls(website_url, True)"

execution_time_original = timeit.timeit(
    stmt=stmt_original,
    setup=setup_original,
    number=100,
)

print(f"Execution time for original code: {execution_time_original} seconds")

# --- Optimized implementation ------------------------------------------------
setup_optimized = "from __main__ import extract_urls_optimized, website_url"
stmt_optimized = "extract_urls_optimized(website_url, True)"

execution_time_optimized = timeit.timeit(
    stmt=stmt_optimized,
    setup=setup_optimized,
    number=100,
)

print(f"Execution time for optimized code: {execution_time_optimized} seconds")

Execution time for original code: 242.51905584000002 seconds
Execution time for optimized code: 235.11716856099997 seconds
