### Testing all System Components + API outputs


In [86]:
import httpx
import time
from loguru import logger

MIRRORS = [
    "https://api.pullpush.io/reddit/search/submission/",
    "https://elastic.pushshift.io/reddit/search/submission/",
    "http://api.pushshift.io/reddit/search/submission/"
]

# This seems to be down and not up
# MIRRORS_COMMENT = [
#     "https://api.pullpush.io/reddit/search/comment/",
#     "https://elastic.pushshift.io/reddit/search/comment/",
#     "http://api.pushshift.io/reddit/search/comment/"
# ]

MIRRORS_COMMENT_IDS = [
    "https://api.pullpush.io/reddit/submission/comment_ids/",
    "https://elastic.pushshift.io/submission/comment_ids/",
    "http://api.pushshift.io/reddit/submission/comment_ids/"
]

MIRRORS_COMMENT = [
    "https://api.pullpush.io/reddit/search/comment/",
    "https://elastic.pushshift.io/reddit/search/comment/",
    "http://api.pushshift.io/reddit/search/comment/"
]

In [85]:
def try_fetch(url, params):
    try:
        with httpx.Client(follow_redirects=True, timeout=12) as client:
            resp = client.get(url, params=params)
            resp.raise_for_status()
            return resp.json().get("data", [])
    except Exception as e:
        logger.warning(f"{url} failed: {e}")
        return None

def fetch_submissions(subreddit: str, after: int, before: int, size: int = 100, retries: int = 3):
    params = {
        "subreddit": subreddit,
        "size": size,
        "sort": "asc",
        "sort_type": "created_utc",
        "after": after,
        "before": before
    }

    for attempt in range(1, retries + 1):
        logger.info(f"Attempt {attempt}/{retries} to fetch r/{subreddit}")
        for mirror in MIRRORS:
            data = try_fetch(mirror, params) 
            if data:
                logger.success(f"‚úÖ Success via {mirror}")
                return data

        logger.error("All mirrors failed. Retrying‚Ä¶")
        time.sleep(3)

    logger.critical("‚ùå Pushshift Submissions URL offline for this time range")
    return []

# Best path left is to fetch commment ids for our filtered submission ids
def fetch_comment_ids(submission_id: str, retries: int = 3):
    for attempt in range(1, retries + 1):
        logger.info(f"Attempt {attempt}/{retries} to fetch comment IDs for submission: {submission_id}")
        
        for mirror in MIRRORS_COMMENT_IDS:
            url = f"{mirror}{submission_id}"
            data = try_fetch(url, params=None)
            
            if data:
                logger.success(f"‚úÖ Found {len(data)} comment IDs via {mirror}")
                return data
        
        logger.error("All mirrors failed. Retrying‚Ä¶")
        time.sleep(2)

    logger.critical("‚ùå Failed to fetch comment IDs.")
    return []

# Pass the searched ids to retrieve all comment objects
def fetch_comments_by_ids(comment_ids: list[str], batch_size: int = 100, retries: int = 3):
    results = []

    for i in range(0, len(comment_ids), batch_size):
        batch = comment_ids[i:i + batch_size]
        params = {"ids": ",".join(batch)}

        for attempt in range(1, retries + 1):
            logger.info(f"Attempt {attempt}/{retries} to fetch comments by comment ids")
            logger.info(f"Fetching comment batch {i//batch_size+1} (size {len(batch)})‚Ä¶")

            for mirror in MIRRORS_COMMENT:
                comments = try_fetch(mirror, params)
                if comments:
                    logger.success(f"‚úÖ Batch {i//batch_size+1}: {len(comments)} comments via {mirror}")
                    results.extend(comments)
                    break
            else:
                logger.error("All mirrors failed. Retrying batch‚Ä¶")
                time.sleep(2)
                continue  # retry attempt
            
            break  # success ‚Üí proceed to next batch

    logger.success(f"üéØ Total comments fetched: {len(results)}")
    return results

In [None]:
from datetime import datetime

after = int(datetime(2020, 1, 1).timestamp())
before = int(datetime(2020, 6, 1).timestamp())
posts = fetch_submissions("Workout", after, before)

[32m2025-10-26 17:58:27.567[0m | [1mINFO    [0m | [36m__main__[0m:[36mfetch_submissions[0m:[36m22[0m - [1mAttempt 1/3 to fetch r/Workout[0m
[32m2025-10-26 17:58:29.727[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mfetch_submissions[0m:[36m26[0m - [32m[1m‚úÖ Success via https://api.pullpush.io/reddit/search/submission/[0m


Constructing a custom POST Meta data for detailed pull through


In [79]:
import json

post_meta_data = []

for p in posts:
    title = p.get("title", "")
    selftext = p.get("selftext", "")
    num_comments = p.get("num_comments", 0)
    is_self = p.get("is_self", False)

    # Filter rules:
    # Post's that are not a text post AND no discussion they are pretty much trash, cause im sure it's a dude screaming for discounts
    # Or someone promotting something
    if (not is_self) and num_comments == 0:
        continue

    # I am doubtful on this, I flagged it as weak discussion cause I genuinely dont thing there might be much to debate
    # on the off chance might have something insane hidding thing
    if selftext in ("", "[removed]", "[deleted]") and num_comments < 3:
        continue

    # Appending Posts
    post_meta_data.append({
        "id": p.get("id"),
        "subreddit": p.get("subreddit"),
        "title": title,
        "selftext": selftext,
        "comment_count": num_comments,
        "score": p.get("score", 0),
        "total_awards_received": p.get("total_awards_received", 0),
        "is_self": is_self,
        "over_18": p.get("over_18", False),
        "permalink": p.get("permalink")
    })

print(f"Filtered posts: {len(post_meta_data)} / {len(posts)}")
for meta in post_meta_data:
    print(json.dumps(meta, indent=4))
# print(json.dumps(posts, indent=5))

Filtered posts: 48 / 100
{
    "id": "eijruc",
    "subreddit": "workout",
    "title": "Starting to do pushups",
    "selftext": "I'm a semi-serious cyclist so my legs have plenty of working out to do but my upper body strength is a mediocre joke. \n\nI cannot currently do a full pushup, sure I can keep proper form all the way down and hold it for a bit but there's no chance of me coming up again. \nI'd like to change that, so:\n\nShould I start doing full pushups on inclined surfaces gradually lowering them as I get stronger, or, should I attempt full pushups on horizontal ground but only go as far down as I can get back up and then gradually go lower as I get stronger.\n\nAny help appreciated, thanks.",
    "comment_count": 11,
    "score": 21,
    "total_awards_received": 0,
    "is_self": true,
    "over_18": false,
    "permalink": "/r/workout/comments/eijruc/starting_to_do_pushups/"
}
{
    "id": "eim3u3",
    "subreddit": "workout",
    "title": "Workouts with little to no rest

Creating and fetching comments on quality basises from filtered posts


In [87]:
submission_comments = {}
# Testing with batch 3 will build final pipeline later
for p in posts[:3]:
    sid = p["id"]
    submission_key = f"t3_{sid}"

    print(f"\nüìå Processing {submission_key} ‚Äî {p['title']}")

    comment_ids = fetch_comment_ids(sid)
    if not comment_ids:
        print("‚ö†Ô∏è No comment IDs found")
        continue

    comments = fetch_comments_by_ids(comment_ids)

    submission_comments[submission_key] = {
        "submission_id": sid,
        "title": p["title"],
        "selftext": p["selftext"],
        "comment_count_est": len(comment_ids),
        "comments": comments  # list of Pushshift comment objects
    }

print("\n‚úÖ Done fetching test batch")


[32m2025-10-27 19:37:06.259[0m | [1mINFO    [0m | [36m__main__[0m:[36mfetch_comment_ids[0m:[36m38[0m - [1mAttempt 1/3 to fetch comment IDs for submission: eii8d2[0m



üìå Processing t3_eii8d2 ‚Äî This Is Why Your Muscles Are Shaking During Your Workout


For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/404[0m
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/404[0m
[32m2025-10-27 19:37:07.924[0m | [31m[1mERROR   [0m | [36m__main__[0m:[36mfetch_comment_ids[0m:[36m48[0m - [31m[1mAll mirrors failed. Retrying‚Ä¶[0m
[32m2025-10-27 19:37:09.927[0m | [1mINFO    [0m | [36m__main__[0m:[36mfetch_comment_ids[0m:[36m38[0m - [1mAttempt 2/3 to fetch comment IDs for submission: eii8d2[0m
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/404[0m
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/404[0m
[32m2025-10-27 19:37:11.124[0m | [31m[1mERROR   [0m | [36m__main__[0m:[36mfetch_comment_ids[0m:[36m48[0m - [31m[1mAll mirrors failed. Retrying‚Ä¶[0m
[32m2025-10-27 19:37:13.127[0m | [1mINFO    [0m | [36m__main__[0m:[36mfetch_comment_ids[0m:[36m38[0m - [1mAttempt 

‚ö†Ô∏è No comment IDs found

üìå Processing t3_eiijkw ‚Äî This Is Why Fitness Is Important To Our Health, But It‚Äôs Critical For Your Business.


For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/404[0m
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/404[0m
[32m2025-10-27 19:37:17.528[0m | [31m[1mERROR   [0m | [36m__main__[0m:[36mfetch_comment_ids[0m:[36m48[0m - [31m[1mAll mirrors failed. Retrying‚Ä¶[0m
[32m2025-10-27 19:37:19.531[0m | [1mINFO    [0m | [36m__main__[0m:[36mfetch_comment_ids[0m:[36m38[0m - [1mAttempt 2/3 to fetch comment IDs for submission: eiijkw[0m
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/404[0m
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/404[0m
[32m2025-10-27 19:37:20.833[0m | [31m[1mERROR   [0m | [36m__main__[0m:[36mfetch_comment_ids[0m:[36m48[0m - [31m[1mAll mirrors failed. Retrying‚Ä¶[0m
[32m2025-10-27 19:37:22.842[0m | [1mINFO    [0m | [36m__main__[0m:[36mfetch_comment_ids[0m:[36m38[0m - [1mAttempt 

KeyboardInterrupt: 

In [77]:
for key, comments in submission_comments.items():
    print(f"\nüîπ {key} - Total comments: {len(comments)}")
    for c in comments:
        print(json.dumps(c, indent=2))


üîπ t3_eii8d2 - Total comments: 10
{
  "archived": true,
  "author": "[deleted]",
  "author_flair_css_class": null,
  "author_flair_text": null,
  "body": "[deleted]",
  "controversiality": 0,
  "created_utc": "1266980694",
  "distinguished": null,
  "downs": 0,
  "edited": false,
  "gilded": 0,
  "id": "c0l3cc5",
  "link_id": "t3_b5q4d",
  "name": "t1_c0l3cc5",
  "parent_id": "t3_b5q4d",
  "retrieved_on": 1426223656,
  "score": 1,
  "score_hidden": false,
  "subreddit": "workout",
  "subreddit_id": "t5_2rir8",
  "ups": 1
}
{
  "archived": true,
  "author": "biovn",
  "author_flair_css_class": null,
  "author_flair_text": null,
  "body": "Good site about workout ",
  "controversiality": 0,
  "created_utc": "1319696374",
  "distinguished": null,
  "downs": 0,
  "edited": false,
  "gilded": 0,
  "id": "c2utwgk",
  "link_id": "t3_lqp4h",
  "name": "t1_c2utwgk",
  "parent_id": "t3_lqp4h",
  "retrieved_on": 1427789623,
  "score": 1,
  "score_hidden": false,
  "subreddit": "workout",
  "su