In [1]:
import random
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed

# Site root; scraped hrefs are appended to it to build absolute links.
base_url = "https://www.metacritic.com"

# Pool of desktop browser User-Agent headers; a random entry is sent with each request.
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36 Edg/130.0.0.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:131.0) Gecko/20100101 Firefox/131.0",
    "Mozilla/5.0 (X11; Linux x86_64; rv:131.0) Gecko/20100101 Firefox/131.0",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
]

In [None]:
def get_total_pages(url, timeout=10):
    """Read the page count shown by the pagination widget of a listing page.

    Args:
        url: Address of the listing page to download.
        timeout: Seconds to wait for the server; without it a stalled
            connection would block the notebook indefinitely.

    Returns:
        The stripped text of the second-to-last pagination button (a string
        such as '131'), or None when the page cannot be fetched or the
        pagination widget is missing or too short to read.
    """
    try:
        response = requests.get(
            url,
            headers={'User-Agent': random.choice(user_agents)},
            timeout=timeout,
        )
    except requests.RequestException as exc:
        # Route network errors through the same "report and return None" path as bad statuses.
        print('Failed to fetch the page:', url, exc)
        return None
    if response.status_code != 200:
        print('Failed to fetch the page:', url)
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    pagination_div = soup.find('div', {'data-testid': 'navigation-pagination'})
    if pagination_div is None:
        print('Pagination not found on the page:', url)
        return None

    buttons = pagination_div.find_all('span', class_='c-navigationPagination_itemButtonContent')
    # The page count is taken from the second-to-last button (the last one is
    # presumably the "next" control - verify against the live markup).
    if len(buttons) < 2:
        print('Unexpected pagination layout on the page:', url)
        return None
    return buttons[-2].text.strip()

def fetch_links_from_page(url, page, timeout=10):
    """Collect the absolute item links found on one page of a listing.

    Args:
        url: Listing address, with or without an existing query string.
        page: Page number to request.
        timeout: Seconds to wait for the server; without it a stalled
            connection would block its worker thread indefinitely.

    Returns:
        A list of absolute URLs (base_url + href), one per result card that
        contains a link. Empty when the page cannot be fetched.
    """
    # Use '?' for the first query parameter and '&' for any later one.
    separator = '&' if '?' in url else '?'
    page_url = f'{url}{separator}page={page}'
    try:
        response = requests.get(
            page_url,
            headers={'User-Agent': random.choice(user_agents)},
            timeout=timeout,
        )
    except requests.RequestException as exc:
        # Route network errors through the same "report and return []" path as bad statuses.
        print('Failed to fetch the page:', page_url, exc)
        return []
    if response.status_code != 200:
        print('Failed to fetch the page:', page_url)
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    links = []
    for div in soup.find_all('div', {'data-testid': 'filter-results'}):
        # Skip cards without a usable anchor instead of failing the whole page.
        anchor = div.find('a', href=True)
        if anchor is not None:
            links.append(base_url + anchor['href'])
    return links

def get_show_links(url, total_pages):
    """Fetch every listing page concurrently and return all links in page order.

    Args:
        url: Listing address passed through to fetch_links_from_page.
        total_pages: Number of pages to fetch, as an int or a numeric string.
            None or a non-numeric value is reported and yields an empty list.

    Returns:
        The links of pages 1..total_pages concatenated in ascending page
        order. Pages whose fetch raised are reported and skipped.
    """
    try:
        page_count = int(total_pages)
    except (TypeError, ValueError):
        # get_total_pages returns None on failure; report it rather than crash in int().
        print('Invalid total page count:', total_pages)
        return []

    links_by_page = {}
    with ThreadPoolExecutor(max_workers = 10) as executor:
        future_to_page = {
            executor.submit(fetch_links_from_page, url, page): page
            for page in range(1, page_count + 1)
        }
        for future in as_completed(future_to_page):
            page = future_to_page[future]
            try:
                links_by_page[page] = future.result()
            except Exception as exc:
                print(f'Page {page} generated an exception: {exc}')

    # Futures complete in arbitrary order; reassemble by page number so the
    # output is the same on every run.
    return [link for page in sorted(links_by_page) for link in links_by_page[page]]

In [10]:
# Listing to scrape: TV shows restricted to the 1910-2024 release-year window.
url = f'{base_url}/browse/tv?releaseYearMin=1910&releaseYearMax=2024'

# Count pages on the same filtered listing that is scraped, so the page range
# matches the results actually requested.
total_pages = get_total_pages(url)
print('Total pages:', total_pages)
if total_pages is None:
    # Stop with a clear message instead of an opaque TypeError further down.
    raise RuntimeError(f'Could not determine the number of pages for {url}')

links = get_show_links(url, total_pages)
print("Total links:", len(links))

# One link per line; explicit encoding keeps the file identical across platforms.
filename = 'metacritic_links.txt'
with open(filename, 'w', encoding='utf-8') as f:
    f.write("\n".join(links))
print("Links saved to", filename)

Total pages: 131
Total links: 3138
Links saved to metacritic_links.txt
