# This Notebook...
...queries the Steam store's user-review endpoint repeatedly, formats the acquired data (Steam game reviews) into a pandas DataFrame, applies some light preprocessing, and then saves the result as "data_raw.csv" in the current working directory.

# Dependencies

In [77]:
import requests
import json

import os
from urllib.parse import quote
from dotenv import load_dotenv
load_dotenv()

import pandas as pd

# Query the API for data with a series of requests

In [78]:
app_id = "281990" # Stellaris  #"221001" # FTL

# Safety cap on the number of reviews to download; the loop normally ends earlier,
# as soon as Steam stops handing out new pages.
MAX_REVIEWS = 1_000_000
# Seconds to wait for Steam to answer before giving up on a request.
REQUEST_TIMEOUT = 30

# Endpoint and the query parameters that stay constant between requests.
# NOTE(review): the Steamworks docs appear to say "day_range" only applies to
# filter=all; it is kept here (without the stray trailing "?") for parity - confirm.
url = f"https://store.steampowered.com/appreviews/{app_id}"
params = {
    "json": 1,
    "filter": "updated",
    "review_type": "all",
    "purchase_type": "all",
    "num_per_page": 100,
    "language": "english",
    "day_range": 365,
}

# Setup requests
data_list = []     # One entry per stored API page (the decoded JSON response)
cursor = "*"       # Start cursor is "*"
n_tot_reviews = 0  # Number of reviews actually stored in data_list

while n_tot_reviews < MAX_REVIEWS:
    # Let requests build the query string so the cursor is URL-encoded correctly.
    try:
        response = requests.get(url, params={**params, "cursor": cursor}, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as err:
        # Network error, HTTP error status or a non-JSON body:
        # stop here and keep the pages collected so far.
        print(f"Request failed at cursor {cursor!r}: {err}")
        break

    # Get a query summary from first request
    if cursor == "*":
        print(data.get("query_summary"))

    reviews = data.get("reviews") or []
    next_cursor = data.get("cursor")

    # Stop when the response carries no reviews, no cursor, or the same cursor
    # we just sent - in all three cases there is nothing new to page through.
    if not reviews or not next_cursor or next_cursor == cursor:
        break

    # Only count what is stored, so the counter and data_list always agree.
    data_list.append(data)
    n_tot_reviews += len(reviews)
    cursor = next_cursor

print(f"\nObtained {n_tot_reviews} reviews.")
#print("\nLengths:")
#[print(len(i["reviews"])) for i in data_list]

#print("\nCursors:")
#[print(i["cursor"]) for i in data_list]#, data1["cursor"], " and Cursor 2: ", data2["cursor"], " Equal: ", data1["cursor"] == data2["cursor"])

#print("\nFirst reviews:")
#[print(i["reviews"][0]["review"] + "\n-----") for i in data_list] # data1["reviews"][0]["review"] == data2["reviews"][0]["review"])

{'num_reviews': 100, 'review_score': 8, 'review_score_desc': 'Very Positive', 'total_positive': 83923, 'total_negative': 11300, 'total_reviews': 95223}

Obtained 95251 reviews.


# Format the data into a dataframe

In [89]:
# Each element in data_list is a "package" (one API page) holding a list of reviews.

# Make a list of rows representing our observations. Only four fields are kept:
# the review text, the voted_up bool, the playtime at the time of the review,
# and the weighted vote score ("helpfulness"). Everything else - including the
# optional developer-response fields - is ignored, and the raw payload in
# data_list is left untouched.
colnames = ["review", "voted_up", "playtime", "helpfulness"]
data = []
n_skipped = 0  # Reviews dropped because one of the four fields was missing

for pack in data_list:
    for review in pack["reviews"]:
        try:
            data.append([
                review["review"],
                review["voted_up"],
                review["author"]["playtime_at_review"],
                review["weighted_vote_score"],
            ])
        except (KeyError, TypeError):
            # Missing key, or "author" is not a mapping: skip this review only.
            # (A bare except would also swallow KeyboardInterrupt.)
            n_skipped += 1

if n_skipped:
    print(f"Skipped {n_skipped} reviews with missing fields.")

df = pd.DataFrame(data, columns=colnames)

# Perform some simple/quick preprocessing

In [90]:
def _scale_by_max(values):
    """Divide a numeric Series by its maximum so the largest value becomes 1.

    The Series is returned unchanged when it is empty, all-NaN or has a maximum
    of 0, because dividing would either fail or fill the column with NaN.
    """
    peak = values.max()  # Series.max() skips NaN, unlike the builtin max()
    if pd.isna(peak) or peak == 0:
        return values
    return values / peak


df = df.assign(
    # Length of each review (in characters), scaled to [0, 1] by the longest review
    length=_scale_by_max(df["review"].str.len()),
    # Playtime at review, scaled to [0, 1] by the largest playtime
    playtime=_scale_by_max(df["playtime"]),
    # NOTE(review): Steam appears to deliver weighted_vote_score as a string for
    # non-zero values - confirm. A string never compares equal to 0, so coerce to
    # numbers before filtering; unparseable values become NaN and are kept.
    helpfulness=pd.to_numeric(df["helpfulness"], errors="coerce"),
)

# Filter away reviews with zero weighted_vote_score ("helpfulness").
# .copy() detaches the result so re-running this cell does not assign into a slice.
df = df[df["helpfulness"] != 0].copy()

# Save the raw data

In [94]:
# Write the preprocessed frame to disk; an existing file with this name is replaced.
output_path = "data_raw.csv"
df.to_csv(output_path)

In [93]:
# Tell the reader that this cell has been reached and executed.
done_message = "Finished querying data!"
print(done_message)

Finished querying data!
