## Game Prices Scraper Concept
Notebook to test of webscraper to aquire video-games data. 

### Import dependencies

In [30]:
from bs4 import BeautifulStoneSoup as sp
import requests
import pandas as pd
from time import sleep

### Get 'all' games
Considering that Steam's API has rate limits and for the purpose of this learning application, it's advisable not to attempt querying all 300k games. Instead, for the purpose of diversity in genres, the provided array, generated with the assistance of ChatGPT, represents the top 10 most played games in 2021 across different genres. This selection ensures a diverse range of genres for effective learning and demonstration of the desired functionality.


In [31]:
top_100_games = [
    # Horror
    "Phasmophobia", "Resident Evil Village", "Dead by Daylight", "Amnesia: Rebirth", "Alien: Isolation",
    "Outlast 2", "The Forest", "Layers of Fear", "SOMA", "Little Nightmares",
  
    # Adventure
    "Grand Theft Auto V", "Red Dead Redemption 2", "The Witcher 3: Wild Hunt", "Assassin's Creed Odyssey",
    "Far Cry 5", "Tomb Raider (2013)", "Subnautica", "Control", "A Plague Tale: Innocence", "The Long Dark",
  
    # RPG
    "The Elder Scrolls V: Skyrim", "Fallout 4", "Dark Souls III", "Monster Hunter: World", "Divinity: Original Sin II",
    "Disco Elysium", "NieR: Automata", "Pillars of Eternity II: Deadfire", "Dragon Quest XI: Echoes of an Elusive Age",
    "Kingdom Come: Deliverance",
  
    # Roguelike
    "Risk of Rain 2", "Dead Cells", "Hades", "Enter the Gungeon", "Spelunky 2",
    "Slay the Spire", "Binding of Isaac: Rebirth", "Rogue Legacy", "UnderMine", "Children of Morta",
  
    # Strategy
    "Sid Meier's Civilization VI", "XCOM 2", "Total War: WARHAMMER II", "Crusader Kings III", "Age of Empires II: Definitive Edition",
    "Stellaris", "Company of Heroes 2", "Cities: Skylines", "Northgard", "They Are Billions",
  
    # Multiplayer
    "Dota 2", "Counter-Strike: Global Offensive", "PlayerUnknown's Battlegrounds", "Among Us", "Apex Legends",
    "Team Fortress 2", "Rust", "Warframe", "Rocket League", "Fall Guys: Ultimate Knockout",
  
    # Simulation
    "Euro Truck Simulator 2", "Microsoft Flight Simulator", "The Sims 4", "Stardew Valley", "Planet Zoo",
    "Cities: Skylines", "American Truck Simulator", "Farming Simulator 19", "Oxygen Not Included", "Kerbal Space Program",
  
    # Action
    "Grand Theft Auto V", "Tom Clancy's Rainbow Six Siege", "Apex Legends", "Call of Duty: Warzone", "DOOM Eternal",
    "Sekiro: Shadows Die Twice", "Control", "Metro Exodus", "Resident Evil 2 Remake", "Ghost of Tsushima",
  
    # Sports
    "FIFA 21", "NBA 2K21", "eFootball PES 2021", "Madden NFL 21", "F1 2021",
    "PGA TOUR 2K21", "Tony Hawk's Pro Skater 1 + 2", "WRC 9 FIA World Rally Championship", "UFC 4", "Tennis World Tour 2"
]


In [33]:
# Make a request to retrieve the list of games
response = requests.get('https://api.steampowered.com/ISteamApps/GetAppList/v2/')

games = []

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the JSON response
    data = response.json()

    # Retrieve the list of apps from the response
    app_list = data['applist']['apps']

    # Create a dictionary to store game names and app IDs
    game_dict = {}

    # Iterate over each app and store the game name and app ID in the dictionary
    for app in app_list:
        game_name = app['name']
        app_id = app['appid']
        
        if game_name == '' or app_id == '':
            continue

        # Remove demos
        if game_name.lower().split(' ')[-1] == 'demo':
            continue

        games.append({
            "name":game_name,
            "id":app_id
        })
else:
    print("Failed to retrieve the list of games.")


df_games = pd.DataFrame(games)
df_games = df_games[df_games['name'].isin(top_100_games)]
df_games.to_csv('steam_list.csv', sep=';', index=False)

### Get game details using Steam Api

In [34]:
def get_game_info(game_id):
    print(f'Checking: {game_id}')

    # Make request to game
    r = requests.get(f'https://store.steampowered.com/api/appdetails?appids={game_id}&l=english', timeout=30)
    
    if r.status_code == 429:
        print('Timeout - trying in 30 secs ') 
        sleep(30)

    # Content
    try:
        content = r.json()[str(game_id)]
        

        # Meta Data
        data = content['data']
        name = data['name']
        type_product = data['type']
        short_description = data['short_description']
        header_image = data['header_image']
        capsule_image = data['capsule_image']
        capsule_imagev5 = data['capsule_imagev5']
        genres = [x.get('description') for x in data['genres']]

        price_overview = data['price_overview']

        # Get release date
        try:
            if data['release_date']['coming_soon']:
                release_date = data['release_date']['data']
            else:
                release_date = "Coming Soon"
        except:
            release_date = None

        # Skip free games
        if not price_overview: 
            return False

        # Get game prices
        original_price = '{:,.2f}€'.format(price_overview['initial']/100)
        discount_percent = None
        discount_price = None
        if 'discount_percent' in price_overview:
            on_sale = True
            discount_percent = str(price_overview['discount_percent']) + "%"
            discount_price = '{:,.2f}€'.format(price_overview['final']/100)
        else:
            on_sale = False

        # Dict to get data
        game_info = {
            "game_id":game_id,
            "name":name,
            "release_date":release_date,
            "type_product":type_product,
            "description":short_description,
            'genres':genres,
            "header_image":header_image,
            "capsule_image":capsule_image,
            "capsule_imagev5":capsule_imagev5,
            "price":original_price,
            "on_sale":on_sale,
            'discount_percent':discount_percent,
            'discount_price':discount_price
        }

        return game_info
    except Exception as e:
        print(f'\t Couldnt parse {game_id}')
        return False

In [35]:
data = []
for index, row in df_games.iterrows():
    game_id = row['id']

    game_content = get_game_info(game_id)
    if not game_content:
        continue

    data.append(game_content)
    print(f'Total: {len(data)}')


Checking: 1267540
Total: 1
Checking: 1250410
Total: 2
Checking: 1239520
	 Couldnt parse 1239520
Checking: 1223910
Total: 3
Checking: 1174180
Total: 4
Checking: 1172470
	 Couldnt parse 1172470
Checking: 1158310
Total: 5
Checking: 1145360
Total: 6
Checking: 813780
Total: 7
Checking: 812140
Total: 8
Checking: 787860
Total: 9
Checking: 782330
Total: 10
Checking: 752590
Total: 11
Checking: 656350
Total: 12
Checking: 646570
Total: 13
Checking: 644930
Total: 14
Checking: 632360
Total: 15
Checking: 632470
Total: 16
Checking: 594570
Total: 17
Checking: 588650
Total: 18
Checking: 739630
Total: 19
Checking: 703080
Total: 20
Checking: 1016120
Total: 21
Checking: 999220
Total: 22
Checking: 381210
Total: 23
Checking: 379430
Total: 24
Checking: 377160
Total: 25
Checking: 362003
	 Couldnt parse 362003
Checking: 359550
Total: 26
Checking: 330020
Total: 27
Checking: 980030
	 Couldnt parse 980030
Checking: 945360
Total: 28
Checking: 311690
Total: 29
Checking: 292030
Total: 30
Checking: 289070
Total: 31
C

### Create Dataframe

In [43]:
df_completo = pd.DataFrame(data)

# Save to CSV
df_completo.to_csv('top_100_games_2021.csv')

df_completo.head(5)

Unnamed: 0,game_id,name,release_date,type_product,description,genres,header_image,capsule_image,capsule_imagev5,price,on_sale,discount_percent,discount_price
0,1267540,WRC 9 FIA World Rally Championship,Coming Soon,game,WRC is the leading off-road simulation franchi...,"[Racing, Simulation, Sports]",https://cdn.akamai.steamstatic.com/steam/apps/...,https://cdn.akamai.steamstatic.com/steam/apps/...,https://cdn.akamai.steamstatic.com/steam/apps/...,29.99€,True,0%,29.99€
1,1250410,Microsoft Flight Simulator 40th Anniversary Ed...,Coming Soon,game,From gliders and helicopters to wide-body jets...,[Simulation],https://cdn.akamai.steamstatic.com/steam/apps/...,https://cdn.akamai.steamstatic.com/steam/apps/...,https://cdn.akamai.steamstatic.com/steam/apps/...,69.99€,True,0%,69.99€
2,1223910,Tennis World Tour 2,Coming Soon,game,"Play as the world's top players, master each s...","[Simulation, Sports]",https://cdn.akamai.steamstatic.com/steam/apps/...,https://cdn.akamai.steamstatic.com/steam/apps/...,https://cdn.akamai.steamstatic.com/steam/apps/...,39.99€,True,0%,39.99€
3,1174180,Red Dead Redemption 2,Coming Soon,game,Winner of over 175 Game of the Year Awards and...,"[Action, Adventure]",https://cdn.akamai.steamstatic.com/steam/apps/...,https://cdn.akamai.steamstatic.com/steam/apps/...,https://cdn.akamai.steamstatic.com/steam/apps/...,59.99€,True,0%,59.99€
4,1158310,Crusader Kings III,Coming Soon,game,"Love, fight, scheme, and claim greatness. Dete...","[RPG, Simulation, Strategy]",https://cdn.akamai.steamstatic.com/steam/apps/...,https://cdn.akamai.steamstatic.com/steam/apps/...,https://cdn.akamai.steamstatic.com/steam/apps/...,49.99€,True,0%,49.99€


### Retrieve different categories

There might be better and faster ways to do this.

In [41]:
categories = []
for idx, row in df_completo.iterrows():
    for categorie in row['genres']:
        if categorie not in categories:
            categories.append(categorie)

print(categories)

# Save categories in .csv
df_categories = pd.DataFrame(categories)
df_categories.to_csv('categories.csv')

['Racing', 'Simulation', 'Sports', 'Action', 'Adventure', 'RPG', 'Strategy', 'Indie', 'Early Access', 'Casual', 'Massively Multiplayer']


The list for categories is not as diverse as I thought it would be. But that's ok. Will add a categorie endpoint in webapp/admin to add categories that can be added to game manually :D