In [1]:
import pandas as pd
import numpy as np

import requests
from bs4 import BeautifulSoup

In [2]:
# URL of the website to scrape
url = "https://www.vgchartz.com/games/games.php?page=1"

# Send a GET request to the URL
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content of the page
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find the <select> tag with name="console"
    console_select = soup.find('select', {'name': 'console'})
    
    if console_select:
        # Extract the options and their values
        options = console_select.find_all('option')
        
        # Initialize lists to store data
        options_data = []
        
        # Loop through the options and extract the text and values
        for option in options:
            text = option.text.strip()
            value = option['value']
            options_data.append([text, value])
        
        # Create a Pandas DataFrame for the options
        genre_df = pd.DataFrame(options_data, columns=["Console Name", "Console Value"])
        
        # Display the DataFrame
    else:
        print("Select tag with name 'console' not found on the page.")
else:
    print("Failed to retrieve the webpage. Status code:", response.status_code)

In [3]:
genre_df = genre_df.drop(0)
genre_df.head()

Unnamed: 0,Console Name,Console Value
1,3DO Interactive Multiplayer,3DO
2,Acorn Electron,Aco
3,All,All
4,Amiga,Amig
5,Amiga CD32,CD32


In [4]:
combined_df = pd.DataFrame()

In [5]:
for page_num in range(1, 6):
    # Construct the URL with the current page number
    print(f"Working on Page {page_num}")
    url = f"https://www.vgchartz.com/games/games.php?page={page_num}&results=200&order=Sales&ownership=Both&direction=DESC&showtotalsales=1&shownasales=1&showpalsales=1&showjapansales=1&showothersales=1&showpublisher=1&showdeveloper=1&showreleasedate=1&showlastupdate=1&showvgchartzscore=0&showcriticscore=0&showuserscore=0&showshipped=1&showmultiplat=Yes"
    
    # Send a GET request to the URL
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Find the div element with ID "generalBody"
        general_body_div = soup.find('div', {'id': 'generalBody'})
        
        if general_body_div:
            # Find the table inside the "generalBody" div
            table = general_body_div.find('table')
            
            if table:
                # Initialize lists to store data
                rows_data = []
                
                # Loop through each row (tr) in the table
                for row in table.find_all('tr'):
                    # Check if the row has td tags
                    if row.find_all('td'):
                        # Extract the text from each cell (td) in the row
                        row_data = [cell.text.strip() for cell in row.find_all('td')]
                        rows_data.append(row_data)
                
                # Create a Pandas DataFrame with named columns
                columns = [
                    "Position", "Logo","Game", "Console", "Publisher", "Developer",
                    "Total Shipped", "Total Sales", "NA Sales", "PAL Sales",
                    "Japan Sales", "Other Sales", "Release Date", "Last Update"
                ]
                df = pd.DataFrame(rows_data, columns=columns)
                
                # Append the data from the current page to the combined DataFrame
                combined_df = combined_df.append(df, ignore_index=True)
            else:
                print(f"Table not found within the 'generalBody' div on page {page_num}.")
        else:
            print(f"Div with ID 'generalBody' not found on page {page_num}.")
    else:
        print(f"Failed to retrieve the webpage for page {page_num}. Status code:", response.status_code)

Working on Page 1


  combined_df = combined_df.append(df, ignore_index=True)


Working on Page 2


  combined_df = combined_df.append(df, ignore_index=True)


Working on Page 3


  combined_df = combined_df.append(df, ignore_index=True)


Working on Page 4


  combined_df = combined_df.append(df, ignore_index=True)


Working on Page 5


  combined_df = combined_df.append(df, ignore_index=True)


In [6]:
combined_df.head()

Unnamed: 0,Position,Logo,Game,Console,Publisher,Developer,Total Shipped,Total Sales,NA Sales,PAL Sales,Japan Sales,Other Sales,Release Date,Last Update
0,1,,Tetris,,The Tetris Company,Alexey Pajitnov,496.40m,,,,,,01st Jan 88,27th Feb 20
1,2,,Pokemon,,Nintendo,Game Freak,480.66m,,,,,,28th Sep 98,03rd Feb 20
2,3,,Call of Duty,,Activision,Infinity Ward,425.00m,,,,,,29th Oct 03,03rd Feb 20
3,4,,Grand Theft Auto,,Rockstar Games,Rockstar North,405.00m,,,,,,27th Mar 98,03rd Feb 20
4,5,,Super Mario,,Nintendo,Nintendo,402.36m,,,,,,20th Jul 83,20th Feb 20


In [7]:
combined_df = combined_df.drop("Position", axis=1)
combined_df = combined_df.drop("Logo", axis=1)

In [8]:
combined_df.shape

(1000, 12)

In [9]:
combined_df = combined_df.replace("N/A", np.nan)

columns_to_check = ["Total Sales", "NA Sales", "PAL Sales", "Japan Sales", "Other Sales"]

# Remove rows with N/A values in any of the specified columns
combined_df = combined_df.dropna(subset=columns_to_check, how="any")

In [10]:
combined_df.shape

(136, 12)