# Web-scraping for Video Game Prices

In [1]:
# Dependancies
import pandas as pd
import lxml.html as lh
from bs4 import BeautifulSoup as bs
import requests

In [2]:
# URLs to scrape
base_url = "https://gamevaluenow.com/"
console = ["atari-2600",
            "nintendo-nes",
            "sega-genesis",
            "super-nintendo",
            "nintendo-64",
            "sega-cd",
            "sega-saturn",
            "playstation-1-ps1"]
console_col = ["2600",
                "NES",
                "GEN",
                "SNES",
                "N64",
                "SCD",
                "SAT",
                "PS"]

In [3]:
# Put the all the console complete prices data in a list
complete_list = []

for name in range(len(console)):
    all_prices = []
    
    # Retrieve page with the requests module
    response = requests.get(base_url + console[name])
    # Create a Beautiful Soup object
    soup = bs(response.text, 'html.parser')
    
    prices_table = soup.find("table")
    prices_data = prices_table.find_all("tr")
    
    # Get all the price data
    for item in range(len(prices_data)):
        for td in prices_data[item].find_all("td"):
            # Remove all the markup from the text
            all_prices.append(td.text.strip())
        
        all_prices.append(console_col[name])
        # Make a list of the item names from every fifth index eg 1,6,10 et
        game_title = all_prices[1::5]             
        # Make a list of the complete price from starting at the fourth index
        price_complete = all_prices[3::5]
        # Make a list of the console types from every fifth index eg 0,5,9 etc
        console_name = all_prices[5::5] 
        # Make the lists in to a datframe
        game_prices_df = pd.DataFrame({'Console' : console_name, 'Game Title' : game_title, 'Price' : price_complete})
    
    # Create a list of data frames
    complete_list.append(game_prices_df)

In [4]:
game_prices_df

Unnamed: 0,Console,Game Title,Price
0,PS,007 Racing,7.15
1,PS,007 Racing [Collector's Edition],2.86
2,PS,007: The World Is Not Enough,11.09
3,PS,007: The World Is Not Enough [Greatest Hits],6.68
4,PS,007: Tomorrow Never Dies,11.63
...,...,...,...
1552,PS,You Don't Know Jack: Mock 2,5.24
1553,PS,Yu-Gi-Oh! Forbidden Memories,15.19
1554,PS,Zero Divide,24.64
1555,PS,Zoboomafoo,3.71


In [5]:
# Concatinate the list of data frames in to one
game_price_list = pd.concat(complete_list)

In [6]:
game_price_list

Unnamed: 0,Console,Game Title,Price
0,2600,3-D Tic-Tac-Toe [Atari],23.31
1,2600,3-D Tic-Tac-Toe [Sears],83.88
2,2600,The Activision Decathlon,23.94
3,2600,Adventure [Atari],78.27
4,2600,Adventure [Sears],111.35
...,...,...,...
1552,PS,You Don't Know Jack: Mock 2,5.24
1553,PS,Yu-Gi-Oh! Forbidden Memories,15.19
1554,PS,Zero Divide,24.64
1555,PS,Zoboomafoo,3.71


In [154]:
game_price_list.to_csv("data/all_console_prices.csv")

### Use Excel to remove the comma for price values over 1k and bring back in CSV

In [164]:
rem_price_commas = pd.read_csv("data/all_console_prices.csv")

In [165]:
rem_price_commas["Price"] = rem_price_commas["Price"].astype(float)

In [166]:
rem_price_commas.dtypes

Unnamed: 0        int64
Unnamed: 0.1      int64
Console          object
Game Title       object
Price           float64
dtype: object

In [167]:
rem_price_commas.to_csv("data/all_console_prices.csv")