In [1]:
import pandas as pd
import requests
import json
import os
import sys
from IPython.display import clear_output
import time

from config import GJ_COOKIE 
# If this fails, it might be because you created the file after launching the notebook.
# In that case, you need to restart the kernel and try again.



# Scraping Gamejolt Supporters Data
This file allows you to get the data from your supporters page on Gamejolt. This works as of the `12th of May 2024` (front-end updates may break this).

You must be a **Gamejolt Creator** to use this script. This script will fetch the data from the API and save it to a CSV file.

## Step 1: GJ Authentication
Enter it in the `config.py` file as explained in the [README](README.md).

The code below checks if the authentication was entered correctly.

In [2]:
# Stop right away when config.py did not provide a session cookie
cookie_is_missing = not GJ_COOKIE
if cookie_is_missing:
    print("GJ_COOKIE is not set. Please set the GJ_COOKIE environment variable in config.py")
    sys.exit(1)

## Step 2: Requesting and Saving the Raw Data
We call the following API endpoint to get the data: `https://gamejolt.com/site-api/mobile/dash/creators/supporters`

There are two parameters that we need to specify when calling the API:

- `perPage`: The number of supporters to get per request. We can only get 50 supporters per request.

- `pos`: The position of the first supporter to get. This is used as an offset.

We can get all the supporters by making multiple requests, advancing the `pos` parameter each time, until we get an empty response. On the frontend client, `pos` is simply the timestamp of the last supporter in the previous response (it is not a numeric offset incremented by `perPage`). We use this same logic to get all the supporters.

### Setting Timestamps: FROM and TO

Set the `DATE_FROM` and `DATE_TO` variables to specify the date range of the supporters to get. The date format is `YYYY-MM-DD` (it will be converted to a timestamp):
- `DATE_FROM`: The oldest day to start getting supporters from. Set to `None` to go until the beginning of time.
- `DATE_TO`: The newest day to get supporters from. Set to `None` to go until the current day.


Set either date to `None` to leave that side of the date range open.

__Note:__ Only the `DATE_TO` is deterministic (the search will start from there). `DATE_FROM` is used to halt the search (if ever reached).

In [3]:
# Set these variables to the desired date range ("YYYY-MM-DD", or None to leave it open):
DATE_FROM = None
DATE_TO = None


def to_epoch_ms(date):
    """Convert a date to an EPOCH timestamp in milliseconds.

    Args:
        date: Anything `pd.to_datetime` accepts (e.g. "2024-05-12"), or None.

    Returns:
        The timestamp as an int number of milliseconds, or None when `date`
        is None/empty. Timezone-naive values are interpreted as UTC, which is
        how `pd.Timestamp.timestamp()` treats them.
    """
    if not date:
        return None
    # A Timestamp cannot be multiplied by an int directly; go through the
    # POSIX timestamp (float seconds) first.
    return int(pd.to_datetime(date).timestamp() * 1000)


# Convert dates to EPOCH timestamps in milliseconds
DATE_FROM = to_epoch_ms(DATE_FROM)
DATE_TO = to_epoch_ms(DATE_TO)

# If no end date is provided, use the current time. The clock is read in UTC on
# purpose: a naive local `Timestamp.now()` would be treated as UTC by
# `.timestamp()` and shift the cursor by the local UTC offset.
if DATE_TO is None:
    DATE_TO = int(pd.Timestamp.now(tz="UTC").timestamp() * 1000)

print(f"DATE_FROM: {DATE_FROM}, DATE_TO: {DATE_TO}")

DATE_FROM: None, DATE_TO: 1715529346364


In [4]:
# To refresh data you already downloaded, set this to the username that the existing
# CSV belongs to: DATE_FROM is then replaced by the newest timestamp stored in that file
EXISTING_USERNAME = "ddemkoo"

# Perform update
if EXISTING_USERNAME:
    file_path = f"data/{EXISTING_USERNAME}-supports.csv"
    if not os.path.exists(file_path):
        print("Could not load existing data")
    else:
        prev_data = pd.read_csv(file_path)
        newest_timestamp = prev_data["timestamp"].max()
        DATE_FROM = int(newest_timestamp)
        print(DATE_FROM)

1715519236072


The computation below will do the scraping and save the data to a tmp JSON file as `data/tmp/USERNAME-supports.json`, where `USERNAME` is the username of the account linked to the session cookie.

We save a list of the following data for each charged sticker:

- `id`: The ID of the charged sticker action (support action).

- `user_name`: The username of the supporter.

- `user_id`: The ID of the supporter user.

- `user_avatar`: The URL of the avatar of the supporter.

- `follower_count`: The number of followers the supporter has on Gamejolt.

- `timestamp`: The timestamp of when the user supported.

- `post_id`: The ID of the post the supporter supported.

We also save some data for the posts in a different table (to avoid redundancy), in a tmp file named `data/tmp/USERNAME-supports-posts.json` as a dictionary with the post ID as the key and the following data as the value:

- `slug`: The slug of the post.

- `like_count`: The number of likes the post has.

- `comment_count`: The number of comments the post has.

- `view_count`: The number of views the post has.

- `published_on`: The timestamp of when the post was published. 

- `lead_srt`: The lead string of the post (title). Note that the key is spelled `lead_srt` in the saved data.

In [5]:
# Endpoint that receives the POST requests
url = "https://gamejolt.com/site-api/mobile/dash/creators/supporters"

# Request headers, listed in the same order as the GameJolt client sends them.
# NOTE(review): advertising `br` in Accept-Encoding presumably requires a brotli
# decoder to be installed for `requests` to decode such responses — confirm.
headers = dict([
    ("Host", "gamejolt.com"),
    ("User-Agent", "Anonymous"),
    ("Accept", "image/webp,*/*"),
    ("Accept-Language", "en-US,en;q=0.5"),
    ("Accept-Encoding", "gzip, deflate, br"),
    ("Content-Type", "application/json"),
    ("Origin", "https://gamejolt.com"),
    ("Connection", "keep-alive"),
    ("Referer", "https://gamejolt.com/dashboard/creator"),
    # Session cookie imported from config.py
    ("Cookie", f"frontend={GJ_COOKIE};"),
    ("Sec-Fetch-Dest", "empty"),
    ("Sec-Fetch-Mode", "cors"),
    ("Sec-Fetch-Site", "same-origin"),
    ("Sec-GPC", "1"),
    ("Pragma", "no-cache"),
    ("Cache-Control", "no-cache"),
    ("TE", "trailers"),
])

# Pagination cursor: start at DATE_TO and walk backwards, one page per request
last_pos = DATE_TO
resp_data = []
resp_posts_data = {} # id: {slug, like_count, comment_count, view_count, published_on, lead_srt}

stats = {
    # Some stats for you :)
    "creator_name": "",
    "nb_reqs": 0,
    "start_time": pd.Timestamp.now(),
    "end_time": pd.Timestamp.now(),
    "success": False,
    "error": None
}

while True:
    # JSON body of the POST request
    data = {
        "_fields": {
            "actions": {
                "perPage": 50,
                "pos": last_pos
            }
        }
    }

    # Send request. The timeout keeps a stalled connection from hanging the
    # notebook; network errors are recorded like any other failure so the data
    # collected so far is still saved below.
    try:
        response = requests.post(url, headers=headers, json=data, timeout=30)
    except requests.RequestException as exc:
        print(f"Request failed ({exc}) at timestamp {last_pos}")
        stats["error"] = str(exc)
        break
    stats["nb_reqs"] += 1

    # Check if the request was successful
    if response.status_code != 200:
        print(f"Request failed with status code {response.status_code} at timestamp {last_pos}")
        stats["error"] = response.status_code
        break

    # Parse the response; anything that is not JSON or lacks the expected keys is malformed
    try:
        resp = response.json()
        pl = resp["payload"]["actions"]
    except (ValueError, KeyError, TypeError):
        stats["error"] = "Malformed response"
        break

    if not isinstance(pl, list):
        # Could not find the data in the response
        stats["error"] = "Malformed response"
        break

    # Stats (the creator is the same for every action of the response)
    stats["end_time"] = pd.Timestamp.now()
    creator = resp.get("user") or {}
    stats["creator_name"] = creator.get("username", stats["creator_name"])

    # An empty page means every supporter has been walked through: we are done
    if not pl:
        stats["success"] = True
        break

    previous_pos = last_pos

    # For each action, append the data to the list
    for p in pl:
        # Post details are only available when the charge targets an existing post
        has_post = p["resource_type"] == "Fireside_Post" and bool(p["resource"])

        resp_data.append({
            "id": p["id"],
            "user_name": p["user"]["username"],
            "user_id": p["user"]["id"],
            "user_avatar": p["user"]["img_avatar"],
            "follower_count": p["user"]["follower_count"],
            "timestamp": p["added_on"],
            "post_id": p["resource"]["id"] if has_post else None,
        })
        last_pos = p["added_on"]

        # Store post data if available
        if has_post:
            resource = p["resource"]
            # In case the post is deleted, not sure how API handles this
            resp_posts_data[resource["id"]] = {
                "slug": resource["slug"] or None,
                "like_count": resource["like_count"] or 0,
                "comment_count": resource["comment_count"] or 0,
                "view_count": resource["view_count"] or 0,
                "published_on": resource["published_on"] or None,
                # The key is spelled "lead_srt" (sic): kept as is because the
                # previously saved CSV files use this column name
                "lead_srt": resource["leadStr"] or None,
            }

        # Check if the last timestamp is less than the date_from
        if DATE_FROM and last_pos < DATE_FROM:
            stats["success"] = True
            break

    if stats["success"]:
        break

    # Safety net: the next request would be identical to this one, so stop
    # instead of looping forever
    if last_pos == previous_pos:
        stats["error"] = "Pagination cursor did not advance"
        break

    # Print progress
    clear_output(wait=True)
    print(f"Fetching for {stats['creator_name']}... Iteration {stats['nb_reqs']}, collected {len(resp_data)} charges\nTarget date: {DATE_FROM} - Currently at: {pd.to_datetime(last_pos, unit='ms')}")

    # Wait for a bit 
    time.sleep(0.5)


# Done: report the elapsed time (whole seconds) and the range that was covered
time_taken = pd.to_timedelta(stats["end_time"] - stats["start_time"]).total_seconds()
formatted_time = str(time_taken).split(".")[0]
print(f"\nDone ({formatted_time}s)! Collected {len(resp_data)} charges | From {DATE_FROM} to {str(pd.to_datetime(last_pos, unit='ms')).split('.')[0]}")

# Make sure the output folders exist
for folder in ("data", "data/tmp"):
    if not os.path.exists(folder):
        os.makedirs(folder)

# Save the raw data: one tmp JSON file for the charges, one for the posts
tmp_dumps = {
    f"data/tmp/{stats['creator_name']}-supports.json": resp_data,
    f"data/tmp/{stats['creator_name']}-supports-posts.json": resp_posts_data,
}
for tmp_path, payload in tmp_dumps.items():
    with open(tmp_path, "w") as f:
        json.dump(payload, f)



Done (1s)! Collected 2 charges | From 1715519236072 to 2024-05-12 12:30:17


## Step 3: Clean the data
We will finally clean the data and save it to a CSV file, so that it is lightweight and easy to use. We will also remove the temporary JSON files.

The CSV file will be saved as `data/USERNAME-supports.csv` and will contain the same columns as the JSON file.

The posts table will be saved as `data/USERNAME-supports-posts.csv` and will contain the same columns as the posts JSON file, with the addition of the `post_id` as well.

In [6]:
# Import data. The columns are spelled out so that both frames keep their
# schema even when nothing was scraped (otherwise the dtype conversion below
# fails on a missing 'post_id' column).
SUPPORTS_COLUMNS = ["id", "user_name", "user_id", "user_avatar", "follower_count", "timestamp", "post_id"]
POSTS_COLUMNS = ["slug", "like_count", "comment_count", "view_count", "published_on", "lead_srt"]

supports_df = pd.DataFrame(resp_data, columns=SUPPORTS_COLUMNS)
posts_df = (
    pd.DataFrame.from_dict(resp_posts_data, orient="index", columns=POSTS_COLUMNS)
    .rename_axis("post_id")
    .reset_index()
)

# Convert columns (nullable integer: some charges have no linked post)
supports_df = supports_df.astype({'post_id': 'Int64'})

# Sanity checks (charges without a linked post are left out of the membership check)
linked_post_ids = supports_df['post_id'].dropna()
print(f"post_id is unique: {posts_df['post_id'].is_unique}")
print(f"All posts in supports_df are in posts_df: {linked_post_ids.isin(posts_df['post_id']).all()}")

display(supports_df.head())
print(f"Length of supports_df: {len(supports_df)}")
display(posts_df.head())
print(f"Length of posts_df: {len(posts_df)}")

# If username is set, import the existing data and append the new data
file_path_charges = f"data/{stats['creator_name']}-supports.csv"
file_path_posts = f"data/{stats['creator_name']}-supports-posts.csv"
if EXISTING_USERNAME and os.path.exists(file_path_charges) and os.path.exists(file_path_posts):
    existing_data = pd.read_csv(file_path_charges)
    existing_posts = pd.read_csv(file_path_posts)

    # New rows go first so that, for duplicates, the freshly scraped version is kept
    supports_df = pd.concat([supports_df, existing_data], ignore_index=True)
    posts_df = pd.concat([posts_df, existing_posts], ignore_index=True)

    # Drop duplicates
    supports_df = supports_df.drop_duplicates(subset="id")
    posts_df = posts_df.drop_duplicates(subset="post_id")

    # Print
    print(f"Scraped {len(resp_data)} new charges for {EXISTING_USERNAME}, new additions: {len(supports_df) - len(existing_data)}")
    print(f"Scraped {len(resp_posts_data)} new posts for {EXISTING_USERNAME}, new additions: {len(posts_df) - len(existing_posts)}")

# Save the data
supports_df.to_csv(file_path_charges, index=False)
posts_df.to_csv(file_path_posts, index=False)

# Delete the tmp files (skipped when already gone, so the cell can be re-run)
for tmp_path in (
    f"data/tmp/{stats['creator_name']}-supports.json",
    f"data/tmp/{stats['creator_name']}-supports-posts.json",
):
    if os.path.exists(tmp_path):
        os.remove(tmp_path)

post_id is unique: True
All posts in supports_df are in posts_df: True


Unnamed: 0,id,user_name,user_id,user_avatar,follower_count,timestamp,post_id
0,949157,Megamaster64,7141705,https://m.gjcdn.net/user-avatar/200/7141705-ll...,262,1715519236072,16858259
1,949079,Dramen_Lore,3310257,https://m.gjcdn.net/user-avatar/200/3310257-cr...,13,1715517017608,16858259


Length of supports_df: 2


Unnamed: 0,post_id,comment_count,lead_srt,like_count,published_on,slug,view_count
0,16858259,38,I'm working on an OFFICIAL website for PYRO-IL...,250,1715452062745,i-m-working-on-an-official-website-for-pyro-il...,3163


Length of posts_df: 1
Scraped 2 new charges for ddemkoo, new additions: 0
Scraped 1 new posts for ddemkoo, new additions: 0


Should be good, enjoy! :)