# Web-scraping for Video Game Prices

In [None]:
# Dependancies
import pandas as pd
import lxml.html as lh
from bs4 import BeautifulSoup as bs
import requests

In [None]:
# URLs to scrape
base_url = "https://gamevaluenow.com/"
console = ["atari-2600",
            "nintendo-nes",
            "sega-genesis",
            "super-nintendo",
            "nintendo-64",
            "sega-cd",
            "sega-saturn",
            "playstation-1-ps1"]
console_col = ["2600",
                "NES",
                "GEN",
                "SNES",
                "N64",
                "SCD",
                "SAT",
                "PS"]

In [168]:
# Put the all the console complete prices data in a list
complete_list = []

for name in range(len(console)):
    all_prices = []
    
    # Retrieve page with the requests module
    response = requests.get(base_url + console[name])
    # Create a Beautiful Soup object
    soup = bs(response.text, 'html.parser')
    
    prices_table = soup.find("table")
    prices_data = prices_table.find_all("tr")
    
    # Get all the price data
    for item in range(len(prices_data)):
        for td in prices_data[item].find_all("td"):
            # Remove all the markup from the text
            all_prices.append(td.text.strip())
        
        all_prices.append(console_col[name])
        # Make a list of the item names from every fifth index eg 1,6,10 et
        game_title = all_prices[1::5]             
        # Make a list of the complete price from starting at the fourth index
        price_complete = all_prices[3::5]
        # Make a list of the console types from every fifth index eg 0,5,9 etc
        console_name = all_prices[5::5] 
        # Make the lists in to a datframe
        game_prices_df = pd.DataFrame({'Game Title' : game_title, 'Console' : console_name, 'Price' : price_complete})
    
    # Create a list of data frames
    complete_list.append(game_prices_df)

In [172]:
# Concatinate the list of data frames in to one
game_price_list = pd.concat(complete_list)
game_price_list.reset_index(drop=True)

Unnamed: 0,Game Title,Console,Price
0,3-D Tic-Tac-Toe [Atari],2600,23.31
1,3-D Tic-Tac-Toe [Sears],2600,83.88
2,The Activision Decathlon,2600,23.94
3,Adventure [Atari],2600,79.72
4,Adventure [Sears],2600,111.35
...,...,...,...
5406,You Don't Know Jack: Mock 2,PS,5.24
5407,Yu-Gi-Oh! Forbidden Memories,PS,16.52
5408,Zero Divide,PS,24.64
5409,Zoboomafoo,PS,3.71


In [173]:
game_price_list.to_csv("data/all_console_prices.csv")

### Use Excel to remove the comma for price values over 1k and bring back in CSV

In [164]:
rem_price_commas = pd.read_csv("data/all_console_prices.csv")

In [165]:
rem_price_commas["Price"] = rem_price_commas["Price"].astype(float)

In [166]:
rem_price_commas.dtypes

Unnamed: 0        int64
Unnamed: 0.1      int64
Console          object
Game Title       object
Price           float64
dtype: object

In [167]:
rem_price_commas.to_csv("data/all_console_prices.csv")