# Anime Websites:
1. Kitso - https://kitsu.app/explore/anime
2. My Animel List (top anime airing) - https://myanimelist.net/topanime.php?type=airing
3. Anilist
4. Ranker (many lists; this one is best anime airing right now) https://www.ranker.com/list/best-current-anime-airing-now/ranker-anime
5. **My Anime List** (top anime in general) https://myanimelist.net/topanime.php




# Anime Top 10 List Code

# Instructions
1. **Everything you need is here on colab**.

2. Run (**hit the play button**) the code that starts with '!pip'. It will install **rapidfuzz** (a python library) onto your system that's needed in order for the anime list code to work. You only have to do this once. Every other time you run this code in this colab you can just run the anime code and won't need to run '!pip'

3. Run the code for the anime list (**the really long piece of code; the second code block in this colab**).

4. If it works correctly, you'll get a top 10 list as your output

5.  Run the last code block (third one). It will save the code output (anime list) as a 'csv' file to your computer so that you will have the list for every instance that you run it. Ex: 'top_10_grouped_anime.csv'

# Potential Errors
1. If you experience any error dealing with the library rapidfuzz, just run the '!pip install rapidfuzz' line of code again and then the anime codeblock.



In [None]:
# RUN THIS CODE FIRST; you only have to do it once for the first time and then rapidfuzz will be installed in your system for good
# If it's already downloaded, you're good to go
!pip install rapidfuzz

Collecting rapidfuzz
  Downloading rapidfuzz-3.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading rapidfuzz-3.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/3.1 MB[0m [31m33.1 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━[0m [32m2.6/3.1 MB[0m [31m35.6 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.1/3.1 MB[0m [31m32.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.10.0


# Mode Code for Anime

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from rapidfuzz import fuzz

# Function to clean and normalize titles for comparison
def normalize_title(title):
    # Lowercase, remove special characters, and strip whitespace
    title = title.lower()
    title = re.sub(r'[^a-z0-9]', '', title)
    return title

# Function to get top 10 anime from MyAnimeList
def get_top_ten_anime():
    url = 'https://myanimelist.net/topanime.php'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    anime_list = soup.find_all('tr', class_='ranking-list')[:10]

    top_anime = []
    for anime in anime_list:
        title = anime.find('h3').text.strip()
        top_anime.append({'Source': 'MyAnimeList', 'Title': title})
    return top_anime

# Function to get top 10 airing anime from MyAnimeList
def get_top_ten_anime_airing():
    url = 'https://myanimelist.net/topanime.php?type=airing'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    anime_list = soup.find_all('tr', class_='ranking-list')[:10]

    top_anime = []
    for anime in anime_list:
        title = anime.find('h3').text.strip()
        top_anime.append({'Source': 'MyAnimeList (Airing)', 'Title': title})
    return top_anime

# Function to get top trending anime from Kitsu
def get_kitsu_top_current_anime():
    url = 'https://kitsu.io/api/edge/trending/anime'
    response = requests.get(url)
    data = response.json()['data']

    top_current_anime = []
    for i, anime in enumerate(data[:10]):
        title = anime['attributes']['canonicalTitle']
        top_current_anime.append({'Source': 'Kitsu', 'Title': title})
    return top_current_anime

# Function to get top 10 anime from AniList
def get_anilist_top_10():
    query = '''
    query ($page: Int, $perPage: Int) {
        Page(page: $page, perPage: $perPage) {
            media(sort: SCORE_DESC, type: ANIME, status: FINISHED) {
                title {
                    romaji
                    english
                }
                averageScore
                popularity
            }
        }
    }
    '''
    variables = {'page': 1, 'perPage': 10}
    url = 'https://graphql.anilist.co'
    response = requests.post(url, json={'query': query, 'variables': variables})

    if response.status_code == 200:
        data = response.json()['data']['Page']['media']
        top_anilist = []
        for anime in data:
            title = anime['title']['english'] or anime['title']['romaji']
            top_anilist.append({'Source': 'AniList', 'Title': title})
        return top_anilist
    else:
        raise Exception(f"Failed to fetch data from AniList API. Status Code: {response.status_code}")

# Function to get top 6 anime from Ranker
def get_ranker_top_6():
    url = 'https://www.ranker.com/list/best-current-anime-airing-now/ranker-anime'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    list_items = soup.find_all('li', role='listitem')

    top_ranker = []
    for item in list_items[:6]:
        title_tag_a = item.find('a', class_='NodeNameUI_main__tvvXB')
        title = title_tag_a['title'] if title_tag_a else item.find('h2', class_='NodeNameUI_main__tvvXB').get_text(strip=True)
        top_ranker.append({'Source': 'Ranker', 'Title': title})
    return top_ranker

# Function to combine all lists and normalize titles
def combine_anime_lists():
    # Gather all titles from different sources
    top_anime_list = []
    top_anime_list.extend(get_top_ten_anime())
    top_anime_list.extend(get_top_ten_anime_airing())
    top_anime_list.extend(get_kitsu_top_current_anime())
    top_anime_list.extend(get_anilist_top_10())
    top_anime_list.extend(get_ranker_top_6())

    # Normalize titles for comparison
    for anime in top_anime_list:
        anime['NormalizedTitle'] = normalize_title(anime['Title'])

    return top_anime_list

# Function to group similar titles based on fuzzy matching and substring matching
def group_similar_titles(anime_list, threshold=85):
    grouped_anime = []
    grouped_titles = []

    for anime in anime_list:
        title = anime['Title']
        normalized_title = anime['NormalizedTitle']

        found_match = False

        for group in grouped_anime:
            # Use both fuzzy ratio and check if the normalized title is a substring
            if (fuzz.ratio(normalized_title, group['NormalizedTitle']) >= threshold or
                normalized_title in group['NormalizedTitle'] or
                group['NormalizedTitle'] in normalized_title):
                group['Count'] += 1
                group['Sources'].append(anime['Source'])
                found_match = True
                break

        if not found_match:
            grouped_anime.append({
                'Title': title,
                'NormalizedTitle': normalized_title,
                'Count': 1,
                'Sources': [anime['Source']]
            })

    # Sort the list by frequency count in descending order
    grouped_anime.sort(key=lambda x: x['Count'], reverse=True)

    # Ensure that only one title per group of similar titles is included in the top 10
    final_grouped_anime = []
    used_normalized_titles = set()

    for anime in grouped_anime:
        if anime['NormalizedTitle'] not in used_normalized_titles:
            final_grouped_anime.append(anime)
            used_normalized_titles.add(anime['NormalizedTitle'])
            # Stop once we have 10 unique titles
            if len(final_grouped_anime) == 10:
                break

    return final_grouped_anime

# Get combined anime list
anime_list = combine_anime_lists()

# Group similar titles and get the top 10 most frequent ones without duplicates
top_10_anime = group_similar_titles(anime_list)

# Top 10 results
df_top_10 = pd.DataFrame(top_10_anime)
print(df_top_10)

# save the result as a csv on colab
df_top_10.to_csv('top_10_grouped_anime.csv', index=False)


                                     Title                  NormalizedTitle  \
0                                 Gintama°                          gintama   
1         Fullmetal Alchemist: Brotherhood    fullmetalalchemistbrotherhood   
2                  Boku no Hero Academia 2              bokunoheroacademia2   
3  Bleach: Sennen Kessen-hen - Soukoku-tan  bleachsennenkessenhensoukokutan   
4                              Steins;Gate                       steinsgate   
5                   Hunter x Hunter (2011)                hunterxhunter2011   
6                                One Piece                         onepiece   
7                                 Dandadan                         dandadan   
8                          Attack on Titan                    attackontitan   
9                        Sousou no Frieren                  sousounofrieren   

   Count                                            Sources  
0      6  [MyAnimeList, MyAnimeList, MyAnimeList, MyAnim...  
1     

In [None]:
# Saves the csv file from colab to your local device (i.e. laptop, computer, phone,)
from google.colab import files
files.download('top_10_grouped_anime.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Averaging Code for Anime

In [None]:
#Average Version
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from rapidfuzz import fuzz

# Function to clean and normalize titles for comparison
def normalize_title(title):
    # Lowercase, remove special characters, and strip whitespace
    title = title.lower()
    title = re.sub(r'[^a-z0-9]', '', title)
    return title

# Function to get top 10 anime from MyAnimeList
def get_top_ten_anime():
    url = 'https://myanimelist.net/topanime.php'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    anime_list = soup.find_all('tr', class_='ranking-list')[:10]

    top_anime = []
    for rank, anime in enumerate(anime_list, start=1):
        title = anime.find('h3').text.strip()
        weight = 11 - rank  # 1st place gets weight 10, 10th place gets weight 1
        top_anime.append({'Source': 'MyAnimeList', 'Title': title, 'Rank': rank, 'Weight': weight})
    return top_anime

# Function to get top 10 airing anime from MyAnimeList
def get_top_ten_anime_airing():
    url = 'https://myanimelist.net/topanime.php?type=airing'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    anime_list = soup.find_all('tr', class_='ranking-list')[:10]

    top_anime = []
    for rank, anime in enumerate(anime_list, start=1):
        title = anime.find('h3').text.strip()
        weight = 11 - rank
        top_anime.append({'Source': 'MyAnimeList (Airing)', 'Title': title, 'Rank': rank, 'Weight': weight})
    return top_anime

# Function to get top trending anime from Kitsu
def get_kitsu_top_current_anime():
    url = 'https://kitsu.io/api/edge/trending/anime'
    response = requests.get(url)
    data = response.json()['data']

    top_current_anime = []
    for rank, anime in enumerate(data[:10], start=1):
        title = anime['attributes']['canonicalTitle']
        weight = 11 - rank
        top_current_anime.append({'Source': 'Kitsu', 'Title': title, 'Rank': rank, 'Weight': weight})
    return top_current_anime

# Function to get top 10 anime from AniList
def get_anilist_top_10():
    query = '''
    query ($page: Int, $perPage: Int) {
        Page(page: $page, perPage: $perPage) {
            media(sort: SCORE_DESC, type: ANIME, status: FINISHED) {
                title {
                    romaji
                    english
                }
                averageScore
                popularity
            }
        }
    }
    '''
    variables = {'page': 1, 'perPage': 10}
    url = 'https://graphql.anilist.co'
    response = requests.post(url, json={'query': query, 'variables': variables})

    if response.status_code == 200:
        data = response.json()['data']['Page']['media']
        top_anilist = []
        for rank, anime in enumerate(data, start=1):
            title = anime['title']['english'] or anime['title']['romaji']
            weight = 11 - rank
            top_anilist.append({'Source': 'AniList', 'Title': title, 'Rank': rank, 'Weight': weight})
        return top_anilist
    else:
        raise Exception(f"Failed to fetch data from AniList API. Status Code: {response.status_code}")

# Function to get top 6 anime from Ranker
def get_ranker_top_6():
    url = 'https://www.ranker.com/list/best-current-anime-airing-now/ranker-anime'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    list_items = soup.find_all('li', role='listitem')

    top_ranker = []
    for rank, item in enumerate(list_items[:6], start=1):
        title_tag_a = item.find('a', class_='NodeNameUI_main__tvvXB')
        title = title_tag_a['title'] if title_tag_a else item.find('h2', class_='NodeNameUI_main__tvvXB').get_text(strip=True)
        weight = 11 - rank
        top_ranker.append({'Source': 'Ranker', 'Title': title, 'Rank': rank, 'Weight': weight})
    return top_ranker

# Function to combine all lists and normalize titles
def combine_anime_lists():
    # Gather all titles from different sources
    top_anime_list = []
    top_anime_list.extend(get_top_ten_anime())
    top_anime_list.extend(get_top_ten_anime_airing())
    top_anime_list.extend(get_kitsu_top_current_anime())
    top_anime_list.extend(get_anilist_top_10())
    top_anime_list.extend(get_ranker_top_6())

    # Normalize titles for comparison
    for anime in top_anime_list:
        anime['NormalizedTitle'] = normalize_title(anime['Title'])

    return top_anime_list

# Function to group similar titles based on fuzzy matching and rank weighting
def group_similar_titles(anime_list, threshold=85):
    grouped_anime = []

    for anime in anime_list:
        title = anime['Title']
        normalized_title = anime['NormalizedTitle']
        weight = anime['Weight']

        found_match = False

        for group in grouped_anime:
            # Use both fuzzy ratio and check if the normalized title is a substring
            if (fuzz.ratio(normalized_title, group['NormalizedTitle']) >= threshold or
                normalized_title in group['NormalizedTitle'] or
                group['NormalizedTitle'] in normalized_title):
                group['Weight'] += weight
                group['Sources'].append(anime['Source'])
                found_match = True
                break

        if not found_match:
            grouped_anime.append({
                'Title': title,
                'NormalizedTitle': normalized_title,
                'Weight': weight,
                'Sources': [anime['Source']]
            })

    # Sort the list by weight (higher ranks) in descending order
    grouped_anime.sort(key=lambda x: x['Weight'], reverse=True)

    # Ensure that only one title per group of similar titles is included in the top 10
    final_grouped_anime = []
    used_normalized_titles = set()

    for anime in grouped_anime:
        if anime['NormalizedTitle'] not in used_normalized_titles:
            final_grouped_anime.append(anime)
            used_normalized_titles.add(anime['NormalizedTitle'])
            # Stop once we have 10 unique titles
            if len(final_grouped_anime) == 10:
                break

    return final_grouped_anime

# Get combined anime list
anime_list = combine_anime_lists()

# Group similar titles and get the top 10 most frequent ones without duplicates
top_10_anime = group_similar_titles(anime_list)

# Top 10 results
df_top_10 = pd.DataFrame(top_10_anime)
print(df_top_10)

# save the result as a csv on colab
df_top_10.to_csv('top_10_grouped_anime.csv', index=False)

                                     Title                   NormalizedTitle  \
0                                 Gintama°                           gintama   
1                  Boku no Hero Academia 2               bokunoheroacademia2   
2  Bleach: Sennen Kessen-hen - Soukoku-tan   bleachsennenkessenhensoukokutan   
3         Fullmetal Alchemist: Brotherhood     fullmetalalchemistbrotherhood   
4                          Attack on Titan                     attackontitan   
5                                One Piece                          onepiece   
6                        Sousou no Frieren                   sousounofrieren   
7            Frieren: Beyond Journey’s End          frierenbeyondjourneysend   
8          Bleach: Thousand-Year Blood War        bleachthousandyearbloodwar   
9  Monogatari Series: Off & Monster Season  monogatariseriesoffmonsterseason   

   Weight                                            Sources  
0      30  [MyAnimeList, MyAnimeList, MyAnimeList, MyAni

In [None]:
# Saves the csv file from colab to your local device (i.e. laptop, computer, phone,)
from google.colab import files
files.download('top_10_grouped_anime.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Video Game Top 10 Code

Scraping of IGN Review Ratings

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://www.ign.com/reviews/games"

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

response = requests.get(url, headers=headers)

if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract titles and ratings
    titles = soup.find_all('span', class_='interface jsx-326752785 jsx-4254096439 item-title bold')
    rating_wrappers = soup.find_all('span', class_='hexagon-content-wrapper')

    # Extract ratings from the <figcaption> tags inside the rating wrappers
    ratings = [wrapper.find('figcaption').text.strip() for wrapper in rating_wrappers if wrapper.find('figcaption')]

    # Ensure ratings and titles are aligned
    ratings = ratings[:len(titles)]

    print(f"Number of titles found: {len(titles)}")
    print(f"Number of ratings found: {len(ratings)}")

    # Prepare data for DataFrame
    data = []
    for title, rating in zip(titles[:10], ratings[1:11]):  # Limit both to 10
        title_text = title.text.strip().split(" Review")[0]  # Remove " Review" and everything after
        rating_text = rating.strip()
        data.append({'Title': title_text, 'Rating': rating_text})

    # Create DataFrame
    ign = pd.DataFrame(data)

    # Print the DataFrame
    print("Top 10 Reviews and Ratings DataFrame:")
    print(ign)

else:
    print("Failed to retrieve the page. Status code:", response.status_code)


Number of titles found: 10
Number of ratings found: 10
Top 10 Reviews and Ratings DataFrame:
                              Title Rating
0           Kong: Survivor Instinct      5
1     A Quiet Place: The Road Ahead      7
2        Sonic X Shadow Generations      9
3  Starship Troopers: Extermination      6
4              Unknown 9: Awakening      5
5              MechWarrior 5: Clans      8
6        Super Mario Party Jamboree      9
7       Dragon Ball: Sparking! Zero      7
8                        Until Dawn      5


Scraping for OpenCritic reviews. Scores has been converted from int to float to match scoring format of the other websites. For example, a score of 95 is converted to 9.5.

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re

# Base URL for pagination
base_url = "https://opencritic.com/browse/all/last90/date?page={}"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

def scrape_opencritic(page_number, headers):
    """
    Scrapes game titles and ratings from OpenCritic for a given page number.

    Args:
        page_number (int): The page number to scrape.
        headers (dict): The headers to use for the HTTP request.

    Returns:
        pd.DataFrame: A DataFrame containing game titles and ratings for the page.
    """
    try:
        url = base_url.format(page_number)
        response = requests.get(url, headers=headers)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        titles = soup.find_all('div', class_='game-name col ml-2')
        ratings = soup.find_all('div', class_='inner-orb small-orb')

        if len(ratings) > len(titles):
            ratings = ratings[:len(titles)]

        top_ten = []
        for title, rating in zip(titles, ratings):
            title_text = title.text.strip()
            rating_text = rating.text.strip()

            title_text = re.sub(r'\s*\(\d{4}\)', '', title_text)

            if rating_text.isdigit():
                rating_value = float(rating_text) / 10
                top_ten.append((title_text, rating_value))

        if top_ten:
            return pd.DataFrame(top_ten, columns=['Title', 'Rating'])
        else:
            print(f"No valid ratings found on page {page_number}.")
            return pd.DataFrame(columns=['Title', 'Rating'])

    except requests.exceptions.RequestException as e:
        print("An error occurred:", e)
        return pd.DataFrame(columns=['Title', 'Rating'])

# Scrape multiple pages and combine results
opencritic_df = pd.DataFrame()

for page in range(1, 5):  # Adjust the range as needed for more pages
    print(f"Scraping page {page}...")
    page_df = scrape_opencritic(page, headers)
    opencritic_df = pd.concat([opencritic_df, page_df], ignore_index=True)
    time.sleep(2)  # Sleep to be polite to the server

# Check the combined DataFrame
if not opencritic_df.empty:
    print("Combined Reviews and Ratings DataFrame:")
    print(opencritic_df)
    # Save to CSV if needed
    opencritic_df.to_csv('opencritic_reviews_combined.csv', index=False)
else:
    print("No data found across all pages.")


Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Combined Reviews and Ratings DataFrame:
                                              Title  Rating
0                           Kong: Survivor Instinct     6.6
1                        Sonic X Shadow Generations     7.9
2                                Fear The Spotlight     8.1
3   Teenage Mutant Ninja Turtles: Mutants Unleashed     7.5
4                        Super Mario Party Jamboree     8.1
5                              MechWarrior 5: Clans     7.9
6                         Killing Time: Resurrected     8.5
7                              Unknown 9: Awakening     6.4
8                                RetroRealms Arcade     6.9
9                           Arizona Sunshine Remake     7.8
10                    A Quiet Place: The Road Ahead     6.6
11                                             Neva     8.7
12                    Nikoderiko: The Magical World     7.5
13                          Just Dance 2025 

Scraping for Gamespot reviews

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Define the base URL and headers
base_url = "https://www.gamespot.com/games/reviews/"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Function to scrape a given page
def scrape_page(url):
    # Send GET request
    response = requests.get(url, headers=headers)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract titles from different formats
        first_title = soup.find('h2')  # First title
        additional_titles = soup.find_all('h3', class_='media-title vertical-spacing-small-top-rem')
        original_titles = soup.find_all('h4', class_='card-item__title')

        # Extract all ratings
        all_ratings = soup.find_all('div', class_='review-ring-score__score text-bold')

        # Prepare the titles list, skipping "Latest Reviews"
        titles = []
        if first_title and "Latest Reviews" not in first_title.text:  # Check first title
            titles.append(first_title)
        for title in additional_titles + original_titles:  # Combine titles
            title_text = title.text.strip()
            if "Latest Reviews" not in title_text:  # Skip "Latest Reviews"
                titles.append(title)

        # Collect up to top titles and ratings, removing "Review" and after
        top_ten = []
        for title, rating in zip(titles, all_ratings):
            if title is not None:  # Check for None
                # Remove "Review" and everything after it
                title_text = title.text.split('Review')[0].strip().replace("Remake", "").strip()
                rating_text = rating.text.strip()
                if title_text:  # Ensure title is not empty
                    top_ten.append((title_text, rating_text))

        return top_ten
    else:
        print(f"Failed to retrieve the page {url}. Status code:", response.status_code)
        return []

# Scrape both pages
first_page_data = scrape_page(base_url)
second_page_data = scrape_page(base_url + "?page=2")

# Combine data from both pages
all_data = first_page_data + second_page_data

# Create a DataFrame
gamespot = pd.DataFrame(all_data, columns=['Title', 'Rating'])

# Output the DataFrame
print("Top Reviews and Ratings DataFrame:")
print(gamespot)


Top Reviews and Ratings DataFrame:
                                    Title Rating
0           A Quiet Place: The Road Ahead      6
1                      Fear The Spotlight      7
2              Sonic X Shadow Generations      6
3                             RetroRealms      8
4           Mortal Kombat 1: Khaos Reigns      6
5              Super Mario Party Jamboree      6
6                   Backyard Baseball '97      8
7              Dragon Ball: Sparking Zero      6
8              Diablo 4: Vessel Of Hatred      8
9                           Silent Hill 2      9
10                   Metaphor: ReFantazio     10
11                        EA Sports FC 25      6
12                           Funko Fusion      4
13                    God Of War Ragnarok      9
14  The Legend Of Zelda: Echoes Of Wisdom      9
15                                 UFO 50      9
16            Dead Rising Deluxe Remaster      7
17                            Frostpunk 2      8
18                      The Plucky

Scraping of Metacritic reviews


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Define the URL and headers
url = "https://www.metacritic.com/game/"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Send GET request
response = requests.get(url, headers=headers)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract game titles
    titles = soup.find_all('h3', class_='c-globalProductCard_title g-color-gray80 g-text-xsmall')

    # Extract ratings (both green and yellow)
    ratings = soup.find_all('div', class_='c-siteReviewScore')

    # Filter ratings for green and yellow
    valid_ratings = [rating for rating in ratings if 'c-siteReviewScore_green' in rating['class'] or 'c-siteReviewScore_yellow' in rating['class']]

    # Ensure the number of ratings matches the number of titles
    if len(valid_ratings) > len(titles):
        valid_ratings = valid_ratings[:len(titles)]

    # Print the count of titles and ratings found
    print(f"Number of titles found: {len(titles)}")
    print(f"Number of ratings found: {len(valid_ratings)}")

    # Collect top ten titles and ratings
    top_ten = []
    for title, rating in zip(titles[:30], valid_ratings[:30]):
        title_text = title.text.strip()
        rating_text = rating.find('span').text.strip()  # Get the rating score from the <span>
        top_ten.append((title_text, rating_text))

    # Create a DataFrame
    metacritic = pd.DataFrame(top_ten, columns=['Title', 'Rating'])
    metacritic['Rating'] = metacritic['Rating'].astype(float) / 10

    # Output the DataFrame
    print("Top 10 Reviews and Ratings DataFrame:")
    print(metacritic)

else:
    print("Failed to retrieve the page. Status code:", response.status_code)

Number of titles found: 59
Number of ratings found: 59
Top 10 Reviews and Ratings DataFrame:
                                   Title  Rating
0                     Fear the Spotlight     8.1
1                   Unknown 9: Awakening     6.1
2                              Citadelum     6.8
3                Arizona Sunshine Remake     7.7
4             Super Mario Party Jamboree     8.2
5                   MechWarrior 5: Clans     8.0
6                               9 R.I.P.     8.3
7                Just Dance 2025 Edition     7.1
8                                   Neva     8.7
9          Nikoderiko: The Magical World     8.1
10                   New World: Aeternum     8.0
11         Transformers: Galactic Trials     6.4
12      Starship Troopers: Extermination     7.7
13                                Europa     7.1
14                            Undisputed     9.4
15                  Metaphor: ReFantazio     8.1
16           Dragon Ball: Sparking! Zero     8.6
17                       

Attempt to scrape ScreenRant reviews and ratings

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# User-Agent header to mimic a browser request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Base URL for the game reviews
base_url = 'https://screenrant.com/game-reviews/'

game_reviews = []

# Loop through a range of pages
for page in range(1, 4):  # Change 4 to the number of pages you want to scrape
    print(f"Fetching page {page}...")
    response = requests.get(f"{base_url}{page}/", headers=headers)

    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all the game reviews
        reviews = soup.find_all('h5', class_='display-card-title')

        for review in reviews:
            title_tag = review.find('a')
            title = title_tag.get_text(strip=True)
            review_link = 'https://screenrant.com' + title_tag['href']

            try:
                review_response = requests.get(review_link, headers=headers)
                review_response.raise_for_status()

                review_soup = BeautifulSoup(review_response.content, 'html.parser')

                # Find the Screen Rant rating
                rating_div = review_soup.find('div', class_='w-rating')
                if rating_div:
                    screenrant_rating = rating_div.find('div', class_='rate-number')
                    if screenrant_rating:
                        rating_number = screenrant_rating.get_text(strip=True).split('/')[0]
                        rating_text = rating_number
                    else:
                        print(f"Screen Rant rating not found for {title}. Skipping...")
                        continue  # Skip if rating not found
                else:
                    print(f"Rating section not found for {title}. Skipping...")
                    continue  # Skip if rating section not found

                title = title.split("Review")[0].strip()

                # Remove the word "Remake" from the title
                title = title.replace("Remake", "").strip()

                game_reviews.append({'Title': title, 'Rating': rating_text})

                time.sleep(1)  # Be polite to the server
            except requests.exceptions.RequestException as e:
                print(f"Failed to fetch {review_link}: {e}")
    else:
        print(f"Failed to fetch page {page}.")

# Create a DataFrame from the results
screenrant = pd.DataFrame(game_reviews)

# Check for ratings that are '0' and print titles
problematic_titles = screenrant[screenrant['Rating'] == '0']
if not problematic_titles.empty:
    print("Titles with a rating of 0:")
    print(problematic_titles)

# Print the DataFrame
print(screenrant)

Fetching page 1...
Rating section not found for Metaphor: ReFantazio Preview: Combining The Best Of SMT & Persona Into The Next Big JRPG. Skipping...
Fetching page 2...
Rating section not found for Metaphor: ReFantazio Preview: Combining The Best Of SMT & Persona Into The Next Big JRPG. Skipping...
Rating section not found for Dustborn Review: Never Really Delivering On The Promise. Skipping...
Rating section not found for The Crush House Review: A Scintillating Social Sim Fever Dream. Skipping...
Rating section not found for Hidden Through Time 2: Discovery Review - More Like An Expansion. Skipping...
Rating section not found for Eden Genesis Review: Unnecessarily Cumbersome But With A Smart Challenge. Skipping...
Screen Rant rating not found for Star Wars: Bounty Hunter Review - Jango Fett Might Not Be Worth The Remaster. Skipping...
Rating section not found for I Think The Razer Huntsman V3 Pro's Snap Tap Actually Made Me A Better Gamer. Skipping...
Fetching page 3...
Rating section

In [None]:
print("top IGN newest reviews")
print(ign)
print(" ")
print("top Opencritic newest reviews")
print(opencritic_df)
print(" ")
print("top Gamespot newest reviews")
print(gamespot)
print(" ")
print("top Metacritic newest reviews")
print(metacritic)
print(" ")
print("top ScreenRant newest reviews")
print(screenrant)

top IGN newest reviews
                              Title Rating
0           Kong: Survivor Instinct      5
1     A Quiet Place: The Road Ahead      7
2        Sonic X Shadow Generations      9
3  Starship Troopers: Extermination      6
4              Unknown 9: Awakening      5
5              MechWarrior 5: Clans      8
6        Super Mario Party Jamboree      9
7       Dragon Ball: Sparking! Zero      7
8                        Until Dawn      5
 
top Opencritic newest reviews
                                              Title  Rating
0                           Kong: Survivor Instinct     6.6
1                        Sonic X Shadow Generations     7.9
2                                Fear The Spotlight     8.1
3   Teenage Mutant Ninja Turtles: Mutants Unleashed     7.5
4                        Super Mario Party Jamboree     8.1
5                              MechWarrior 5: Clans     7.9
6                         Killing Time: Resurrected     8.5
7                              Unkn

Attempt at implementing a function that singles out titles at appear in all four dataframes and takes the average rating. (Having trouble getting consistent expansion of the list between the dataframes. Could be a HTML issue. The function below should work as all it does is the merge the rows from all four dataframes and picks out the titles that exists in all four dataframes)

In [None]:
import pandas as pd

def average_common_ratings(ign, opencritic, gamespot, metacritic, screenrant):
    # Check if required columns are present
    required_columns = ['Title', 'Rating']
    for df in [ign, opencritic, gamespot, metacritic, screenrant]:
        if not all(col in df.columns for col in required_columns):
            raise ValueError("All DataFrames must contain 'Title' and 'Rating' columns.")

    # Merge the DataFrames on 'Title' using inner joins to keep only common titles
    merged_df = (ign.merge(opencritic, on='Title', how='inner', suffixes=('_ign', '_opencritic'))
                  .merge(gamespot, on='Title', how='inner', suffixes=('', '_gamespot'))
                  .merge(metacritic, on='Title', how='inner', suffixes=('', '_metacritic'))
                  .merge(screenrant, on='Title', how='inner', suffixes=('', '_screenrant')))

    # Rename 'Rating' column from gamespot and screenrant for consistency
    merged_df = merged_df.rename(columns={'Rating': 'Rating_gamespot', 'Rating_screenrant': 'Rating_screenrant'})

    # List of rating columns
    rating_columns = ['Rating_ign', 'Rating_opencritic', 'Rating_gamespot', 'Rating_metacritic', 'Rating_screenrant']

    # Convert rating columns to numeric, handling errors by coercing to NaN
    for col in rating_columns:
        merged_df[col] = pd.to_numeric(merged_df[col], errors='coerce')

    # Calculate average rating
    merged_df['Average_Rating'] = merged_df[rating_columns].mean(axis=1, skipna=True)

    # Filter for valid average ratings
    result = merged_df[merged_df['Average_Rating'].notnull()][['Title', 'Average_Rating']]

    return result

# Example usage:
# average_ratings = average_common_ratings(ign, opencritic, gamespot, metacritic, screenrant)
# print(average_ratings)


In [None]:
average_ratings = average_common_ratings(ign, opencritic_df, gamespot, metacritic, screenrant)
print(average_ratings)

Empty DataFrame
Columns: [Title, Average_Rating]
Index: []


Attempt at implementing a function that requires a title to exist in at least 3 of the 4 dataframes. This is a work around for not being able to expand the dataframes at the moment.

In [None]:
import pandas as pd

def average_common_ratings(ign, opencritic, gamespot, metacritic, screenrant):
    # Merge the DataFrames on 'Title' to find common titles with outer joins
    merged_df = (ign.merge(opencritic, on='Title', how='outer', suffixes=('_ign', '_opencritic'))
                  .merge(gamespot, on='Title', how='outer', suffixes=('', '_gamespot'))  # Add suffixes here
                  .merge(metacritic, on='Title', how='outer', suffixes=('', '_metacritic'))  # Add suffixes here
                  .merge(screenrant, on='Title', how='outer', suffixes=('', '_screenrant')))  # Include ScreenRant

    # Rename the 'Rating' column to 'Rating_gamespot' and 'Rating' from ScreenRant for consistency
    merged_df = merged_df.rename(columns={
        'Rating': 'Rating_gamespot',
        'Rating_screenrant': 'Rating_screenrant'
    })

    # List of rating columns
    rating_columns = ['Rating_ign', 'Rating_opencritic', 'Rating_gamespot', 'Rating_metacritic', 'Rating_screenrant']

    # Convert rating columns to numeric, handling errors by coercing to NaN
    for col in rating_columns:
        merged_df[col] = pd.to_numeric(merged_df[col], errors='coerce')

    # Count the number of non-null ratings
    rating_counts = merged_df[rating_columns].notnull().sum(axis=1)

    # Filter for titles with at least three ratings
    merged_df['Average_Rating'] = merged_df[rating_columns].mean(axis=1, skipna=True)
    result = merged_df[(rating_counts >= 3) & (merged_df['Average_Rating'].notnull())][['Title', 'Average_Rating']]

    # Sort the result by 'Average_Rating' in descending order
    result = result.sort_values(by='Average_Rating', ascending=False)

    return result

# Example usage:
# average_ratings = average_common_ratings(ign, opencritic, gamespot, metacritic, screenrant)
# print(average_ratings)


In [None]:
average_ratings = average_common_ratings(ign, opencritic_df, gamespot, metacritic, screenrant)
print(average_ratings)

                             Title  Average_Rating
59            Metaphor: ReFantazio        9.075000
64                            Neva        8.800000
76                   Silent Hill 2        8.250000
23     Dragon Ball: Sparking! Zero        8.200000
58            MechWarrior 5: Clans        7.966667
90      Super Mario Party Jamboree        7.825000
79      Sonic X Shadow Generations        7.633333
1    A Quiet Place: The Road Ahead        6.533333
109           Unknown 9: Awakening        5.833333
