In [10]:
import re
import csv
import random
import requests
import pandas as pd
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed

base_url = 'https://www.metacritic.com'

In [21]:
def load_user_agents(filename):
    with open(filename, 'r') as file:
        return [line.strip() for line in file.readlines()]

def get_total_pages(url):
    response = requests.get(url, headers={'User-Agent': random.choice(user_agents)})
    if response.status_code != 200:
        print('Failed to fetch the page:', url)
        return None
    
    soup = BeautifulSoup(response.text, 'html.parser')
    pagination_div = soup.find('div', {'data-testid': 'navigation-pagination'})
    
    return pagination_div.find_all('span', class_='c-navigationPagination_itemButtonContent')[-2].text.strip()

def clean_title(title):
    title = title.lower()
    title = re.sub(r"[.,'()&*]", '', title)
    title = title.replace(' ', '-')
    return title

def fetch_links_from_page(url, page):
    page_url = f'{url}&page={page}'
    response = requests.get(page_url, headers={'User-Agent': random.choice(user_agents)})
    if response.status_code != 200:
        print('Failed to fetch the page:', page_url)
        return []
    
    soup = BeautifulSoup(response.text, 'html.parser')
    
    _links = []
    for div in soup.find_all('div', {'data-testid': 'filter-results'}):
        link = div.find('a')['href']
        if not link.startswith('/tv/'):
            _title = div.find('h3', class_='c-finderProductCard_titleHeading')
            if _title:
                spans = _title.find_all('span')
                if len(spans) >= 2:
                    number = spans[0].get_text(strip=True)
                    title = spans[1].get_text(strip=True)
                title_text = number + ' ' + title
                print(f"Error link: {base_url + link} - Title: {title_text}")
                
            date_span = soup.find('span', class_='u-text-uppercase')
            if date_span:
                date_text = date_span.get_text(strip=True)
                year_match = re.search(r'\d{4}', date_text)
                if year_match:
                    year = year_match.group(0)

            alternative_link = {f'/tv/{clean_title(title)}', f'/tv/{clean_title(title)}-{year}'}
            for alt_link in alternative_link:
                print(f"Trying alternative link: {base_url + alt_link}")
                response = requests.get(base_url + alt_link, headers={'User-Agent': random.choice(user_agents)})
                if response.status_code == 200:
                    link = alt_link
                    print(f"Alternative link for {title_text}: {base_url + link}")
                else:
                    print(f"Alternative link failed: {base_url + alt_link}")
        if link.startswith('/tv/'):
            _links.append(link)
        
    return [base_url + link for link in _links]


def get_show_links(url, total_pages):
    links = []
    
    with ThreadPoolExecutor(max_workers=10) as executor:
        future_to_page = {executor.submit(fetch_links_from_page, url, page): page for page in range(1, int(total_pages)+1)}
        for future in as_completed(future_to_page):
            page = future_to_page[future]
            try:
                page_links = future.result()
                links.extend(page_links)
            except Exception as exc:
                print(f'Page {page} generated an exception: {exc}')
                
    return links

In [22]:
user_agents = load_user_agents('user_agents.txt')

url = f'{base_url}/browse/tv'
total_pages = get_total_pages(url)
print('Total pages:', total_pages)

url = f'{base_url}/browse/tv?releaseYearMin=1910&releaseYearMax=2024'
links = get_show_links(url, total_pages)
print("Total links:", len(links))

filename = 'metacritic_links.txt'
with open(filename, 'w') as f:
    f.write("\n".join(links))
print("Links saved to", filename)

Total pages: 131
Error link: https://www.metacritic.com/ - Title: 2,017. Noughts + Crosses
Trying alternative link: https://www.metacritic.com/tv/noughts-+-crosses
Alternative link failed: https://www.metacritic.com/tv/noughts-+-crosses
Trying alternative link: https://www.metacritic.com/tv/noughts-+-crosses-2020
Alternative link for 2,017. Noughts + Crosses: https://www.metacritic.com/tv/noughts-+-crosses-2020
Total links: 3138
Links saved to metacritic_links.txt


In [None]:
# import re

# url = 'https://www.metacritic.com/tv/living-undocumented/' # 'https://www.metacritic.com/tv/the-office-uk/'
# response = requests.get(url, headers={'User-Agent': random.choice(user_agents)})

# if response.status_code != 200:
#     print('Failed to fetch the page:', url)
# soup = BeautifulSoup(response.text, 'html.parser')

# title = soup.find('div', class_='c-productHero_title').find('h1').text.strip()
# must_watch = 1 if soup.find('img', class_='c-productScoreInfo_must') else 0
# initial_release_date = soup.find('span', string='Initial Release Date:').find_next('span').get_text(strip=True) if soup.find('span', string='Initial Release Date:') else None
# production_companies = [li.get_text(strip=True) for li in soup.find('span', string='Production Company:').find_next('ul').find_all('li')] if soup.find('span', string='Production Company:') else []
# rating = soup.find('span', string='Rating:').find_next('span').get_text(strip=True) if soup.find('span', string='Rating:') else None
# genres = list(set([genre.get_text(strip=True) for genre in soup.select('.c-genreList_item .c-globalButton_label')])) if soup.select('.c-genreList_item .c-globalButton_label') else None

# # scores = [div.get_text(strip=True) for div in soup.select('.c-productScoreInfo_scoreNumber')[:2]]
# # review_counts = [div.get_text(strip=True) for div in soup.select('.c-productScoreInfo_reviewsTotal span')[:2]]
# # metascore = scores[0] if len(scores) > 0 else None
# # user_score = scores[1] if len(scores) > 1 else None
# # metascore_reviews = re.search(r'\d+', review_counts[0]).group() if len(review_counts) > 0 else None
# # user_score_reviews = re.search(r'\d+', review_counts[1]).group() if len(review_counts) > 1 else None
# # score_list = []
# # if metascore and metascore_reviews:
# #     score_list.append(('Metascore', metascore, f'{metascore_reviews}rv'))
# # if user_score and user_score_reviews:
# #     score_list.append(('Userscore', user_score, f'{user_score_reviews}rv'))
# # if not score_list:
# #     score_list = [('No scores available', 'N/A', 'N/A')]

# score_divs = soup.select('.c-productScoreInfo_scoreContent')
# if score_divs:
#     metascore = score_divs[0].select_one('.c-productScoreInfo_scoreNumber span')
#     metascore = metascore.get_text(strip=True) if metascore else None
#     metascore_reviews = '4*rv' if metascore == 'tbd' else None
#     if metascore and metascore.isdigit() and 0 <= int(metascore) <= 100: metascore = int(metascore)
#     userscore = score_divs[1].select_one('.c-productScoreInfo_scoreNumber span')
#     userscore = float(userscore.get_text(strip=True)) if userscore else None
#     userscore_reviews = re.search(r'(\d+)', score_divs[1].select_one('.c-productScoreInfo_reviewsTotal span').get_text(strip=True)).group(1) if score_divs[1].select_one('.c-productScoreInfo_reviewsTotal span') else None
# score_list = [('Metascore', metascore, metascore_reviews) if metascore else None,
#               ('Userscore', str(userscore), f'{userscore_reviews}rv' if userscore_reviews else None)] 
# score_list = [score for score in score_list if score]
# if not score_list:
#     score_list = [('No scores available', 'N/A', 'N/A')]
    
# awards = []
# for award_card in soup.select('.c-productionAwardSummary_award'):
#     award_name = award_card.find('div', class_='g-text-bold').get_text(strip=True)
#     award_details = award_card.find_all('div')[1].get_text(strip=True).replace('•', '').strip()
#     awards.append((award_name, award_details))
    
# number_of_seasons = re.search(r'\d+', soup.find('span', string='Number of seasons:').find_next('span').get_text(strip=True)) if soup.find('span', string='Number of seasons:') else None
# number_of_seasons = number_of_seasons.group(0) if number_of_seasons else None
# seasons = []
# for season_card in soup.select('.c-seasonsModalCard'):
#     season_name = season_card.find('div', class_='g-text-xsmall g-text-bold').get_text(strip=True)
#     episodes_text = season_card.find('div', class_='g-text-xsmall g-text-normal').get_text(strip=True)
#     episodes_count = episodes_text.split()[0]
#     year = episodes_text.split('•')[-1].strip()
#     season_id = f"Ss{int(season_name.split()[-1])}"
#     seasons.append((season_id, f'{episodes_count}eps', year))

# print('Title:', title)
# print('Must watch:', must_watch)
# print("Initial Release Date:", initial_release_date)
# print("Production Companies:", production_companies)
# print("Rating:", rating)
# print("Genres:", genres)
# print("Score:", score_list)
# print("Awards:", awards)
# print("Number of Seasons:", number_of_seasons)
# print("Seasons:", seasons)

In [None]:
# url = 'https://www.metacritic.com/tv/living-undocumented/'
# response = requests.get(url, headers={'User-Agent': random.choice(user_agents)})
# if response.status_code != 200:
#     print('Failed to fetch the page:', url)
# soup = BeautifulSoup(response.text, 'html.parser')

# title = soup.find('div', class_='c-productHero_title').find('h1').text.strip()
# must_watch = 1 if soup.find('img', class_='c-productScoreInfo_must') else 0
# initial_release_date = soup.find('span', string='Initial Release Date:').find_next('span').get_text(strip=True) if soup.find('span', string='Initial Release Date:') else None
# production_companies = [li.get_text(strip=True) for li in soup.find('span', string='Production Company:').find_next('ul').find_all('li')] if soup.find('span', string='Production Company:') else []
# rating = soup.find('span', string='Rating:').find_next('span').get_text(strip=True) if soup.find('span', string='Rating:') else None
# genres = list(set([genre.get_text(strip=True) for genre in soup.select('.c-genreList_item .c-globalButton_label')])) if soup.select('.c-genreList_item .c-globalButton_label') else None

# score_divs = soup.select('.c-productScoreInfo_scoreContent')
# metascore, userscore, metascore_reviews, userscore_reviews = None, None, None, None
# if score_divs:
#     metascore = score_divs[0].select_one('.c-productScoreInfo_scoreNumber span')
#     metascore = metascore.get_text(strip=True) if metascore else None
#     metascore_reviews = re.search(r'\d+', score_divs[0].select_one('.c-productScoreInfo_reviewsTotal span').get_text(strip=True)).group(0) if score_divs[0].select_one('.c-productScoreInfo_reviewsTotal span') else None
#     userscore = score_divs[1].select_one('.c-productScoreInfo_scoreNumber span')
#     userscore = userscore.get_text(strip=True) if userscore else None
#     userscore_reviews = re.search(r'(\d+)', score_divs[1].select_one('.c-productScoreInfo_reviewsTotal span').get_text(strip=True)).group(1) if score_divs[1].select_one('.c-productScoreInfo_reviewsTotal span') else None

# score_list = [('Metascore', str(metascore), f'{metascore_reviews}rv' if metascore_reviews else None) if metascore else None,
#               ('Userscore', str(userscore), f'{userscore_reviews}rv' if userscore_reviews else None)] 
# score_list = [score for score in score_list if score]
# if not score_list:
#     score_list = [('No scores available', 'N/A', 'N/A')]

# awards = []
# for award_card in soup.select('.c-productionAwardSummary_award'):
#     award_name = award_card.find('div', class_='g-text-bold').get_text(strip=True)
#     award_details = award_card.find_all('div')[1].get_text(strip=True).replace('•', '').strip()
#     awards.append((award_name, award_details))

# number_of_seasons = re.search(r'\d+', soup.find('span', string='Number of seasons:').find_next('span').get_text(strip=True)) if soup.find('span', string='Number of seasons:') else None
# number_of_seasons = number_of_seasons.group(0) if number_of_seasons else None

# seasons = []
# for season_card in soup.select('.c-seasonsModalCard'):
#     season_name = season_card.find('div', class_='g-text-xsmall g-text-bold').get_text(strip=True)
#     episodes_text = season_card.find('div', class_='g-text-xsmall g-text-normal').get_text(strip=True)
#     episodes_count = episodes_text.split()[0]
#     year = episodes_text.split('•')[-1].strip()
#     season_id = f"Ss{int(season_name.split()[-1])}"
#     seasons.append((season_id, f'{episodes_count}eps', year))

# data = {
#     'Title': title,
#     'Must Watch': must_watch,
#     'Initial Release Date': initial_release_date,
#     'Production Companies': production_companies,
#     'Rating': rating,
#     'Genres': genres,
#     'Score': score_list,
#     'Awards': awards,
#     'Number of Seasons': number_of_seasons,
#     'Seasons': seasons
# }

# df = pd.DataFrame([data])
# df.to_csv('metacritic_data.csv', index=False)
# print('Data saved to metacritic_data.csv')


In [None]:
# def load_user_agents(filename):
#     """Load user agents from a file."""
#     with open(filename, 'r') as file:
#         user_agents = [line.strip() for line in file.readlines()]
#     return user_agents

# def fetch_metacritic_data(url, user_agents):
#     response = requests.get(url, headers={'User-Agent': random.choice(user_agents)})
#     if response.status_code != 200:
#         print('Failed to fetch the page:', url)
#         return None

#     soup = BeautifulSoup(response.text, 'html.parser')

#     title = soup.find('div', class_='c-productHero_title').find('h1').text.strip()
#     must_watch = 1 if soup.find('img', class_='c-productScoreInfo_must') else 0
#     scores = " - ".join([span.get_text(strip=True) for span in soup.find_all("span")])
#     production_companies = [li.get_text(strip=True) for li in soup.find('span', string='Production Company:').find_next('ul').find_all('li')] if soup.find('span', string='Production Company:') else []
#     initial_release_date = soup.find('span', string='Initial Release Date:').find_next('span').get_text(strip=True) if soup.find('span', string='Initial Release Date:') else None
#     number_of_seasons = soup.find('span', string='Number of seasons:').find_next('span').get_text(strip=True) if soup.find('span', string='Number of seasons:') else None
#     rating = soup.find('span', string='Rating:').find_next('span').get_text(strip=True) if soup.find('span', string='Rating:') else None
#     genres = list(set([genre.get_text(strip=True) for genre in soup.select('.c-genreList_item .c-globalButton_label')])) if soup.select('.c-genreList_item .c-globalButton_label') else None
#     awards = " - ".join([" ".join(award.stripped_strings).replace('•', ':') for award in soup.find_all("div", class_="c-productionAwardSummary_award")])
    
#     # score_divs = soup.select('.c-productScoreInfo_scoreContent')
#     # metascore, userscore, metascore_reviews, userscore_reviews = None, None, None, None
#     # # if score_divs:
#     # #     metascore = score_divs[0].select_one('.c-productScoreInfo_scoreNumber span')
#     # #     metascore = metascore.get_text(strip=True) if metascore else None
#     # #     metascore_reviews = re.search(r'\d+', score_divs[0].select_one('.c-productScoreInfo_reviewsTotal span').get_text(strip=True)).group(0) if score_divs[0].select_one('.c-productScoreInfo_reviewsTotal span') else None
#     # #     userscore = score_divs[1].select_one('.c-productScoreInfo_scoreNumber span')
#     # #     userscore = userscore.get_text(strip=True) if userscore else None
#     # #     userscore_reviews = re.search(r'\d+', score_divs[1].select_one('.c-productScoreInfo_reviewsTotal span').get_text(strip=True)).group(1) if score_divs[1].select_one('.c-productScoreInfo_reviewsTotal span') else None
#     # for score_div in score_divs:
#     #     metascore = score_div.select_one('.c-productScoreInfo_scoreNumber span')
#     #     if metascore:
#     #         metascore = metascore.get_text(strip=True)
#     #         metascore_reviews = re.search(r'\d+', score_div.select_one('.c-productScoreInfo_reviewsTotal span').get_text(strip=True)).group(0) if score_div.select_one('.c-productScoreInfo_reviewsTotal span') else None
#     #     userscore = score_div.select_one('.c-productScoreInfo_scoreNumber span')
#     #     if userscore:
#     #         userscore = userscore.get_text(strip=True)
#     #         userscore_reviews = re.search(r'\d+', score_div.select_one('.c-productScoreInfo_reviewsTotal span').get_text(strip=True)).group(1) if score_div.select_one('.c-productScoreInfo_reviewsTotal span') else None

#     # score_list = [('Metascore', str(metascore), f'{metascore_reviews}rv' if metascore_reviews else None) if metascore else None,
#     #               ('Userscore', str(userscore), f'{userscore_reviews}rv' if userscore_reviews else None)] 
#     # score_list = [score for score in score_list if score]
#     # if not score_list:
#     #     score_list = [('No scores available', 'N/A', 'N/A')]

#     # awards = []
#     # for award_card in soup.select('.c-productionAwardSummary_award'):
#     #     award_name = award_card.find('div', class_='g-text-bold').get_text(strip=True)
#     #     award_details = award_card.find_all('div')[1].get_text(strip=True).replace('•', '').strip()
#     #     awards.append((award_name, award_details))
    
    

#     # number_of_seasons = re.search(r'\d+', soup.find('span', string='Number of seasons:').find_next('span').get_text(strip=True)) if soup.find('span', string='Number of seasons:') else None
#     # number_of_seasons = number_of_seasons.group(0) if number_of_seasons else None

#     # seasons = []
#     # for season_card in soup.select('.c-seasonsModalCard'):
#     #     season_name = season_card.find('div', class_='g-text-xsmall g-text-bold').get_text(strip=True)
#     #     episodes_text = season_card.find('div', class_='g-text-xsmall g-text-normal').get_text(strip=True)
#     #     episodes_count = episodes_text.split()[0]
#     #     year = episodes_text.split('•')[-1].strip()
#     #     season_id = f"Ss{int(season_name.split()[-1])}"
#     #     seasons.append((season_id, f'{episodes_count}eps', year))

#     data = {
#         'Title': title,
#         'Must Watch': must_watch,
#         'Scores': scores,
#         'Production Companies': production_companies,
#         'Initial Release Date': initial_release_date,
#         'Number of Seasons': number_of_seasons,
#         'Rating': rating,
#         'Genres': genres,
#         'Awards': awards
#     }
    
#     print(url)
#     return data

# def scrape_links_from_file(filename):
#     """Read URLs from the file."""
#     with open(filename, 'r') as file:
#         links = [line.strip() for line in file.readlines()]
#     return links

# def main():
#     user_agents = load_user_agents('user_agents.txt')
#     links = scrape_links_from_file('links.txt')
#     all_data = []

#     for url in links:
#         data = fetch_metacritic_data(url, user_agents)
#         if data:
#             all_data.append(data)

#     if all_data:
#         df = pd.DataFrame(all_data)
#         df.to_csv('metacritic_data.csv', index=False)
#         print('Data saved to metacritic_data.csv')
#     else:
#         print('No data to save.')

In [None]:
def fetch_metacritic_data(url, user_agents):
    try:
        response = requests.get(url, headers={'User-Agent': random.choice(user_agents)})
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        title = soup.find('div', class_='c-productHero_title').find('h1').text.strip() if soup.find('div', class_='c-productHero_title') else None
        must_watch = 1 if soup.find('img', class_='c-productScoreInfo_must') else 0
        _scores = [
            span.get_text(strip=True)
            for span in soup.find('div', class_='c-productHero_scoreInfo').find_all("span")
            if span.get_text(strip=True)
        ]
        scores = [
            _scores[i] for i in range(len(_scores))
            if i == 0 or _scores[i] != _scores[i - 1]
        ]
        production_companies = [
            li.get_text(strip=True)
            for li in soup.find('span', string='Production Company:').find_next('ul').find_all('li')
        ] if soup.find('span', string='Production Company:') else []
        initial_release_date = soup.find('span', string='Initial Release Date:').find_next('span').get_text(strip=True) if soup.find('span', string='Initial Release Date:') else None
        number_of_seasons = soup.find('span', string='Number of seasons:').find_next('span').get_text(strip=True) if soup.find('span', string='Number of seasons:') else None
        rating = soup.find('span', string='Rating:').find_next('span').get_text(strip=True) if soup.find('span', string='Rating:') else None
        genres = list(set([
            genre.get_text(strip=True) for genre in soup.select('.c-genreList_item .c-globalButton_label')
        ])) if soup.select('.c-genreList_item .c-globalButton_label') else None
        awards = [
            [award.find("div", class_="g-text-bold").get_text(strip=True), 
            award.find_all("div")[1].get_text(strip=True).replace('•', '')]
            for award in soup.find_all("div", class_="c-productionAwardSummary_award")
        ]

        return {
            'Title': title,
            'Must Watch': must_watch,
            'Scores': scores,
            'Production Companies': production_companies,
            'Initial Release Date': initial_release_date,
            'Number of Seasons': number_of_seasons,
            'Rating': rating,
            'Genres': genres,
            'Awards': awards,
            'Link': url
        }
    except Exception as e:
        print(f'Failed to fetch data from {url}: {e}')
        return None

def scrape_links_from_file(filename):
    with open(filename, 'r') as file:
        return [line.strip() for line in file.readlines()]

def main():
    user_agents = load_user_agents('user_agents.txt')
    links = scrape_links_from_file('metacritic_links.txt')
    all_data = []
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = {executor.submit(fetch_metacritic_data, url, user_agents): url for url in links}

        for future in as_completed(futures):
            data = future.result()
            if data:
                all_data.append(data)

    if all_data:
        df = pd.DataFrame(all_data)
        df.to_csv('metacritic_data.csv', index=False)
        print('Data saved to metacritic_data.csv')
    else:
        print('No data to save.')

In [26]:
if __name__ == '__main__':
    main()

Data saved to data.csv
