I have reason to believe we are limited to 1.5 requests/second, which puts all data collection at roughly 12.43 hours in total. That's a lot, but I think it is doable. If not, we can always take a random sample. Also, I obtained the apps list in a new way, shown in the PNG. This should mean that only games are included, with no DLC either, which trims the list down considerably.

In [1]:
import requests
import pandas as pd
import numpy as np
import pickle
import time
import random

Get key (not needed but might help to not get rate limited)

In [2]:
# Read the Steam API key from disk. The context manager closes the file handle
# right away, and strip() drops the trailing newline most editors append so the
# key is passed to the store query without stray whitespace.
with open('key.txt') as key_file:
    key = key_file.read().strip()

Load apps list from pickle file (stores ids as integers)

In [3]:
# Deserialize the previously collected list of app ids.
# NOTE: unpickling can execute arbitrary code; only load files you created yourself.
with open('all_apps.pkl', mode='rb') as pickle_file:
    all_apps = pickle.load(pickle_file)

Groups apps into smaller segments (20 seems like a good number)

In [4]:
# Split the id list into 20 near-equal pieces so each piece can be collected in
# its own run (array_split tolerates lengths that are not a multiple of 20).
app_chunks = np.array_split(all_apps, indices_or_sections=20)

Set the chunk number you want to collect here (should be in the range [0, 19]).

In [5]:
# Index into app_chunks of the chunk this run collects; it also names the
# output file written at the end (df<CHUNK_NUM>.pkl).
CHUNK_NUM = 2

Collection stuff below

In [6]:
# Each entry is a key path into the store response for one app: the first
# element is the top-level field, any further elements drill into nested dicts.
categories = [
    ['name'],
    ['type'],
    ['steam_appid'],
    ['developers'],
    ['publishers'],
    ['is_free'],
    ['price_overview', 'initial'],
    ['achievements', 'total'],
    ['release_date', 'date'],
    ['metacritic', 'score'],
]

# Comma-separated value for the `filters` request parameter: the top-level
# field of every path above, followed by three fixed extras.
filters = ','.join([path[0] for path in categories] + ['basic', 'genres', 'categories'])

In [7]:
def fetch(game_info, categories):
    """Flatten one app's store record into a single-level dict.

    Args:
        game_info: Mapping holding the details of one app (the collection loop
            passes the ``data`` entry of the store response).
        categories: Key paths to extract. Each path is a list of nested keys,
            e.g. ``['price_overview', 'initial']``.

    Returns:
        A dict with one entry per path, keyed by the path elements joined with
        a space. A path that cannot be followed (missing key, or an
        intermediate value that is not a dict) maps to ``np.nan``. In addition,
        every item under ``'genres'`` and ``'categories'`` contributes a
        ``'Genre: <description>'`` / ``'Category: <description>'`` key set to
        ``True``.
    """
    info = {}
    for path in categories:
        value = game_info
        for key_part in path:
            # Only descend into real dicts: a list or scalar where a nested
            # object was expected counts as "missing" instead of raising.
            if isinstance(value, dict) and key_part in value:
                value = value[key_part]
            else:
                # np.nan rather than np.NaN: the latter alias was removed in NumPy 2.0.
                value = np.nan
                break
        info[' '.join(path)] = value

    # One boolean flag column per genre / category description.
    for field, prefix in (('genres', 'Genre: '), ('categories', 'Category: ')):
        for tag in game_info.get(field) or ():
            info[prefix + tag['description']] = True

    return info

API query helper functions (inspired by https://medium.com/clover-platform-blog/conquering-api-rate-limiting-dcac5552714d)

In [8]:
def _query_with_retry(url, params, app_id, label,
                      max_attempts=11, wait_seconds=30, timeout=60):
    """GET ``url`` and retry while the server keeps answering HTTP 429.

    Args:
        url: Endpoint to request.
        params: Query parameters passed to ``requests.get``.
        app_id: App id, used only in the progress / error messages.
        label: Lower-case query name (``'store'`` or ``'review'``) used in the
            messages.
        max_attempts: Number of tries before giving up.
        wait_seconds: Base pause between tries; a random fraction of a second
            is added as jitter.
        timeout: Per-request timeout in seconds, so a stalled connection cannot
            hang the collection run.

    Returns:
        The first response whose status code is not 429, or ``None`` when
        every attempt was rate limited or failed at the network level.
    """
    for attempt in range(max_attempts):
        try:
            response = requests.get(url=url, params=params, timeout=timeout)
        except requests.exceptions.RequestException as err:
            # A dropped connection or timeout is handled like a rate limit:
            # report it, wait, and try again instead of aborting the whole run.
            print(f"request error on {label} query for {app_id}: {err}")
        else:
            if response.status_code != 429:
                return response

        print(f"waiting on {label} query for {app_id} ... (attempt {attempt}/{max_attempts})")
        time.sleep(wait_seconds + random.random())

    print(f"ERROR: {label.capitalize()} query timeout on {app_id}. Copy this entire message into failedIDs.text")
    return None


def store_query(app_id):
    """Request the store details of ``app_id``, retrying while rate limited.

    Reads the module-level ``filters`` and ``key``.

    Returns:
        The ``requests`` response, or ``None`` if every attempt failed.
    """
    # HTTPS so the API key is not sent in clear text.
    url = f"https://store.steampowered.com/api/appdetails/?appids={app_id}"
    # The key is probably not required, but it may help against rate limiting.
    params = {'filters': filters, 'l': 'en', 'key': key}
    return _query_with_retry(url, params, app_id, 'store')


def review_query(app_id):
    """Request the review summary of ``app_id``, retrying while rate limited.

    ``num_per_page`` is 0 because only the summary counts are read by the
    collection loop, not the individual reviews.

    Returns:
        The ``requests`` response, or ``None`` if every attempt failed.
    """
    url = f"https://store.steampowered.com/appreviews/{app_id}?json=1"
    params = {'num_per_page': 0, 'language': 'all', 'purchase_type': 'all'}
    return _query_with_retry(url, params, app_id, 'review')

Getting all info for chunk

In [9]:
# Collect one flat record per app in the selected chunk.
my_chunk = app_chunks[CHUNK_NUM]

all_info = []

progress_num = 100  # print a progress line every this many apps


def _json_or_none(response):
    """Return the decoded JSON body of ``response``, or None if unavailable.

    Covers both a failed query (``response is None``) and a body that is not
    valid JSON; the decode errors raised by ``requests`` derive from ValueError.
    """
    if response is None:
        return None
    try:
        return response.json()
    except ValueError:
        return None


for num_done, app_id in enumerate(my_chunk):
    if num_done % progress_num == 0:
        print(f"Finished\t{num_done} / {len(my_chunk)}")

    # Main store query. The payload is keyed by the app id as a string.
    store_payload = _json_or_none(store_query(app_id))
    entry = store_payload.get(str(app_id)) if isinstance(store_payload, dict) else None
    if not isinstance(entry, dict) or not entry.get('success'):
        continue
    game_info = entry.get('data')
    if not isinstance(game_info, dict):
        continue

    info = fetch(game_info, categories)

    # Review query. Default both counters to NaN so every record has the same
    # keys even when the review lookup fails or lacks a summary.
    info['total_positive'] = np.nan
    info['total_reviews'] = np.nan
    review_payload = _json_or_none(review_query(app_id))
    q_summary = review_payload.get('query_summary') if isinstance(review_payload, dict) else None
    if isinstance(q_summary, dict):
        info['total_positive'] = q_summary.get('total_positive', np.nan)
        info['total_reviews'] = q_summary.get('total_reviews', np.nan)

    all_info.append(info)

Finished	0 / 3355
Finished	100 / 3355
Finished	200 / 3355
waiting on store query for 398250 ... (attempt 0/11)
waiting on store query for 398250 ... (attempt 1/11)
waiting on store query for 398250 ... (attempt 2/11)
waiting on store query for 398250 ... (attempt 3/11)
waiting on store query for 398250 ... (attempt 4/11)
waiting on store query for 398250 ... (attempt 5/11)
Finished	300 / 3355
Finished	400 / 3355
waiting on store query for 404420 ... (attempt 0/11)
waiting on store query for 404420 ... (attempt 1/11)
waiting on store query for 404420 ... (attempt 2/11)
waiting on store query for 404420 ... (attempt 3/11)
waiting on store query for 404420 ... (attempt 4/11)
Finished	500 / 3355
Finished	600 / 3355
waiting on store query for 410470 ... (attempt 0/11)
waiting on store query for 410470 ... (attempt 1/11)
waiting on store query for 410470 ... (attempt 2/11)
waiting on store query for 410470 ... (attempt 3/11)
waiting on store query for 410470 ... (attempt 4/11)
waiting on sto

In [10]:
# Build one row per collected record; a key absent from a record becomes a
# missing value in that row.
df = pd.DataFrame(all_info)
print(f"Length: {df.shape[0]}")
df.head()

Length: 3354


Unnamed: 0,name,type,steam_appid,developers,publishers,is_free,price_overview initial,achievements total,release_date date,metacritic score,...,Genre: Massively Multiplayer,Genre: Violent,Category: Commentary available,Category: Steam Turn Notifications,Genre: Nudity,Genre: Gore,Genre: Sexual Content,Category: SteamVR Collectibles,Category: Includes Source SDK,Genre: Movie
0,Egyptian Senet,game,391580,[Ezzat Studios],[Ezzat Studios],False,399.0,,"Aug 20, 2015",,...,,,,,,,,,,
1,Dream Chamber,game,391590,[Forge Reply],[Microids Indie],False,999.0,,"Jul 31, 2015",,...,,,,,,,,,,
2,King Lucas,game,391600,[DevilishGames],[DevilishGames],False,499.0,20.0,"Dec 1, 2016",,...,,,,,,,,,,
3,Stage Presence,game,391640,[Sea Green Games],[tinyBuild],False,999.0,10.0,"Feb 28, 2017",,...,,,,,,,,,,
4,Escape From BioStation,game,391650,[Tusky Games],[Tusky Games],False,399.0,11.0,"Jul 3, 2017",,...,,,,,,,,,,


In [11]:
# Persist this chunk's frame as df<CHUNK_NUM>.pkl so the per-chunk files can be
# concatenated into the full dataset later.
df.to_pickle(f"df{CHUNK_NUM}.pkl")

Once all chunks are created, the complete dataframe can be created with `df = pd.concat([df0, df1, df2, ... , df19], ignore_index=True, sort=False)`