In [None]:
import requests
import pandas as pd
import numpy as np
import pickle
import time
import random

Load the Steam API key (not strictly required, but it may help avoid being rate limited)

In [None]:
# Read the API key from key.txt. The context manager closes the file handle
# deterministically, and strip() drops the trailing newline most editors add,
# which would otherwise be sent as part of the `key` query parameter.
with open('key.txt') as key_file:
    key = key_file.read().strip()

Load apps list from pickle file (stores ids as integers)

In [None]:
# Deserialize the saved app list. pickle can execute arbitrary code while
# loading, so all_apps.pkl must come from a trusted source.
with open('all_apps.pkl', mode='rb') as apps_file:
    all_apps = pickle.load(apps_file)

Groups apps into smaller segments (20 seems like a good number)

In [None]:
# Number of roughly equal chunks the app list is split into (20 is probably a
# reasonable number). Valid chunk indices run from 0 to NUM_CHUNKS - 1.
NUM_CHUNKS = 20
app_chunks = np.array_split(all_apps, NUM_CHUNKS)

Set the range of chunks you want to collect here. Both values are inclusive and must be in the range [0, 19]; to collect a single chunk, set both to the same number.

In [None]:
# Inclusive range of chunk indices to collect in this run.
START_NUM = 5
END_NUM = 5

# Fail fast on a bad range: a reversed range would silently collect nothing,
# and an index past the last chunk would only fail once the loop reaches it.
if not 0 <= START_NUM <= END_NUM < len(app_chunks):
    raise ValueError(
        f"Chunk range [{START_NUM}, {END_NUM}] is invalid; both values must be "
        f"within [0, {len(app_chunks) - 1}] and START_NUM must not exceed END_NUM"
    )

Collection stuff below

In [None]:
# Key paths to pull out of each store response. A one-element path reads a
# top-level field; a two-element path reads a field nested one level down.
categories = [
    ['name'],
    ['type'],
    ['steam_appid'],
    ['developers'],
    ['publishers'],
    ['is_free'],
    ['price_overview', 'initial'],
    ['achievements', 'total'],
    ['release_date', 'date'],
    ['metacritic', 'score'],
]

# Comma-separated value for the `filters` request parameter: the top-level key
# of every path above, followed by the three fixed groups.
filters = ','.join(
    [path[0] for path in categories] + ['basic', 'genres', 'categories']
)

In [None]:
def fetch(game_info, categories):
    """Flatten selected fields of one app's store data into a single record.

    Parameters
    ----------
    game_info : dict
        The data object for one app, as a nested dictionary.
    categories : list of list of str
        Key paths to extract. Each path is followed step by step through
        nested dictionaries.

    Returns
    -------
    dict
        One entry per path, keyed by the path's parts joined with a space
        (e.g. ``'price_overview initial'``). The value is whatever was found
        at the end of the path, or NaN when any step is missing. In addition,
        every entry of ``game_info['genres']`` / ``game_info['categories']``
        adds a ``'Genre: <description>'`` / ``'Category: <description>'`` key
        set to ``True``.
    """
    info = {}
    for category in categories:
        cur_info = game_info
        for subpart in category:
            # A step only succeeds on a dict that has the key. Anything else
            # (missing key, or a string/list/None where a nested object was
            # expected) counts as missing instead of raising AttributeError.
            if isinstance(cur_info, dict) and subpart in cur_info:
                cur_info = cur_info[subpart]
            else:
                # np.nan rather than np.NaN: the latter alias was removed in NumPy 2.0.
                cur_info = np.nan
                break
        info[' '.join(category)] = cur_info

    if isinstance(game_info, dict):
        # `or []` also covers a key that is present but empty/None.
        for genre in game_info.get('genres') or []:
            info['Genre: ' + genre['description']] = True
        for store_category in game_info.get('categories') or []:
            info['Category: ' + store_category['description']] = True

    return info

API query helper functions (inspired by https://medium.com/clover-platform-blog/conquering-api-rate-limiting-dcac5552714d)

In [None]:
def _get_with_retry(url, params, label, app_id, max_attempts=11, wait_seconds=30, timeout=30):
    """GET ``url``, retrying while the server answers 429 or the request fails.

    Parameters
    ----------
    url : str
        Address to request.
    params : dict
        Query-string parameters passed to ``requests.get``.
    label : str
        Name of the query kind, used only in the failure message.
    app_id
        App id being queried, used only in the failure message.
    max_attempts : int
        Number of requests made before giving up.
    wait_seconds : float
        Base pause between attempts; up to one second of random jitter is added.
    timeout : float
        Per-request timeout in seconds, so a stalled connection cannot block
        the whole collection run indefinitely.

    Returns
    -------
    requests.Response or None
        The first response whose status is not 429, or ``None`` (after
        printing an error) once every attempt has been used up.
    """
    for _ in range(max_attempts):
        try:
            response = requests.get(url=url, params=params, timeout=timeout)
        except requests.exceptions.RequestException:
            # Connection problems and timeouts are handled like a rate limit:
            # wait, then try again.
            response = None

        if response is not None and response.status_code != 429:
            return response

        time.sleep(wait_seconds + random.random())

    print(f"ERROR: {label} query timeout on {app_id}. Copy this entire message into failedIDs.txt")
    return None


def store_query(app_id):
    """Request the store details of ``app_id``.

    Reads the notebook-level ``filters`` and ``key`` variables when called.

    Returns
    -------
    requests.Response or None
        The response, or ``None`` if the request kept being rate limited or
        kept failing.
    """
    # HTTPS so the API key is not sent in clear text.
    url = f"https://store.steampowered.com/api/appdetails/?appids={app_id}"
    # The key is probably not needed, but maybe it helps with rate limiting.
    # strip() guards against a trailing newline picked up from the key file.
    params = {'filters': filters, 'l': 'en', 'key': key.strip()}
    return _get_with_retry(url, params, "Store", app_id)


def review_query(app_id):
    """Request the review summary of ``app_id`` (no individual reviews).

    Returns
    -------
    requests.Response or None
        The response, or ``None`` if the request kept being rate limited or
        kept failing.
    """
    url = f"https://store.steampowered.com/appreviews/{app_id}?json=1"
    params = {'num_per_page': 0, 'language': 'all', 'purchase_type': 'all'}
    return _get_with_retry(url, params, "Review", app_id)

Getting all info for chunk

In [None]:
# Collect every app of each selected chunk and save one pickled DataFrame per chunk.
for CHUNK_NUM in range(START_NUM, END_NUM+1):
    print(f"Starting df{CHUNK_NUM}")

    my_chunk = app_chunks[CHUNK_NUM]
    all_info = []      # one record (dict) per successfully collected app
    progress_num = 100  # print a progress line every this many apps

    for num_done, app_id in enumerate(my_chunk):
        if num_done % progress_num == 0:
            print(f"Finished\t{num_done} / {len(my_chunk)}")

        # The store query is inside the try as well, so a single failing
        # request is reported and skipped instead of aborting the whole chunk.
        try:
            # make main api query
            store_response = store_query(app_id)
            if store_response is None:
                continue

            # Parse the body once; the payload is keyed by the app id as a string.
            store_payload = store_response.json()[str(app_id)]
            if not store_payload['success']:
                continue

            info = fetch(store_payload['data'], categories)

            # make review api query
            review_response = review_query(app_id)
            if review_response is not None:
                summary = review_response.json()
                if 'query_summary' in summary:
                    q_summary = summary['query_summary']
                    # np.nan rather than np.NaN: the latter alias was removed in NumPy 2.0.
                    info['total_positive'] = q_summary.get('total_positive', np.nan)
                    info['total_reviews'] = q_summary.get('total_reviews', np.nan)

            all_info.append(info)
        except Exception:
            # `except Exception` rather than a bare `except`, so interrupting
            # the kernel (KeyboardInterrupt) stops the run instead of being
            # reported as a failed app.
            print(f"ERROR: Epic fail on {app_id}. Copy this entire message into failedIDs.txt")

    df = pd.DataFrame(all_info)
    df.to_pickle(f"df{CHUNK_NUM}.pkl")
    print(f"Finished df{CHUNK_NUM}!")

Once all chunks are created, the complete dataframe can be created with `df = pd.concat([df0, df1, df2, ... , df19], ignore_index=True, sort=False)`