### V2 REDDIT EXTRACTION + KNOWLEDGE BASE PREPARATION


In [4]:
import os
from dotenv import load_dotenv

# Reads .env if present, before the credentials below are looked up.
load_dotenv()

# Reddit API credentials; client id and secret are None when the variable is unset.
REDDIT_CLIENT_ID = os.environ.get("REDDIT_CLIENT_ID")
REDDIT_CLIENT_SECRET = os.environ.get("REDDIT_CLIENT_SECRET")
# An unset *or empty* user agent falls back to the default string.
REDDIT_USER_AGENT = (
    os.environ.get("REDDIT_USER_AGENT")
    or "knowledge-collection-service/0.1"
)

Scoring Functions


In [None]:
import re
import math
from typing import List

# --- Heuristics - Regex ---
# Two independent signals, either of which marks a text as advice-like:
#   1. a line that opens with a bullet / numbered-list marker ("- ", "* ", "1. ", "2) ")
#   2. a directive word anywhere in the text
# The "^" anchor lives inside the first alternative only, and re.M lets it match
# at the start of every line instead of only at the start of the whole string.
# Group numbering is unchanged: group 1 is the whole match, group 2 the directive word.
STEP_PAT = re.compile(
    r"(^\s*[-*\d\.)]+\s+|\b(try|consider|start|first|next|then|finally|should)\b)",
    re.I | re.M,
)

def advice_like(text: str) -> bool:
    """Return True when ``text`` looks like it contains actionable advice.

    Args:
        text: Raw comment or sentence text. ``None`` and empty strings are accepted.

    Returns:
        False for missing text or text shorter than 30 characters after
        stripping; otherwise True when the text contains a bullet / numbered
        list line or one of the directive words in ``STEP_PAT``.
    """
    if not text:
        return False
    stripped = text.strip()
    if len(stripped) < 30:  # too short to be useful advice
        return False
    # contains bullets, numbers, or directive verbs
    return bool(STEP_PAT.search(stripped))

#Not implemented - should be discussed!!!
# -- WILL MOVE TO HAVING LLM GENERATE ADVICE, VIA STEPS [PROVIDED WITH CONTEXT]
# Leading list marker: a numbered prefix ("1." / "2)") that is not followed by
# another digit (so decimals such as "1.5" stay intact), or a run of "-", "*"
# and spaces.
_STEP_MARKER = re.compile(r"^(?:\d+[.)](?!\d)\s*|[-* ]+)")

def extract_steps(text: str) -> List[str]:
    """Pull up to ten distinct step-like snippets out of ``text``.

    Lines that look like bullet or numbered list items are collected with
    their list marker removed. When no such line exists, the text is split
    into sentences and every sentence accepted by ``advice_like`` is used
    instead.

    Args:
        text: Raw comment text. ``None`` and empty strings yield ``[]``.

    Returns:
        At most 10 steps in order of first appearance, de-duplicated
        case-insensitively. Empty steps are never returned.
    """
    if not text:
        return []

    # crude split by lines that look like bullet/numbered steps
    steps = []
    for line in text.splitlines():
        lt = line.strip()
        if len(lt) >= 4 and (lt.startswith(("-", "*")) or re.match(r"\d+[.)]", lt)):
            # Strip numbered markers as well as bullets, so "1. foo" and
            # "- foo" normalise to the same step.
            step = _STEP_MARKER.sub("", lt).strip()
            # Marker-only lines such as "----" carry no content; skip them
            # so they neither pollute the result nor suppress the fallback.
            if step:
                steps.append(step)

    # fallback: look for sentences with directive verbs
    if not steps:
        for sentence in re.split(r"(?<=[.!?])\s+", text):
            if advice_like(sentence):
                steps.append(sentence.strip())

    # keep unique-ish (case-insensitive, first occurrence wins)
    seen = set()
    uniq = []
    for s in steps:
        k = s.lower()
        if k not in seen:
            seen.add(k)
            uniq.append(s)
    return uniq[:10]

def score_comment(score: int, num_replies: int, awards: int, length: int) -> float:
    """Blend a comment's engagement signals into one heuristic score.

    Each count is floored at zero, damped with ``log1p`` and weighted
    (score 0.6, replies 0.3, awards 0.1). A comment whose length falls in
    the inclusive range 60..1200 earns a flat 0.2 bonus.
    """
    def damped(count: int, weight: float) -> float:
        # Negative counts contribute nothing rather than a math domain error.
        return math.log1p(max(count, 0)) * weight

    # reward reasonable length
    length_bonus = 0.2 if 60 <= length <= 1200 else 0.0
    return damped(score, 0.6) + damped(num_replies, 0.3) + damped(awards, 0.1) + length_bonus

def score_submission(score: int, num_comments: int, upvote_ratio: float) -> float:
    """Combine a submission's engagement signals into one heuristic score.

    Args:
        score: Net score of the post; negative values are treated as 0.
        num_comments: Comment count; negative values are treated as 0.
        upvote_ratio: Upvote ratio of the post. ``None`` means "unknown" and
            is replaced by a neutral 0.5.

    Returns:
        ``log1p(score) * 0.5 + log1p(num_comments) * 0.3 + upvote_ratio * 0.2``.
    """
    s = math.log1p(max(score, 0)) * 0.5
    c = math.log1p(max(num_comments, 0)) * 0.3
    # Only a missing ratio gets the neutral default; a genuine 0.0 must stay
    # 0.0 instead of being promoted to 0.5 by a truthiness check.
    ratio = 0.5 if upvote_ratio is None else upvote_ratio
    u = ratio * 0.2
    return s + c + u

Extraction


In [15]:
import praw
import uuid

# Keyword arguments for the PRAW client; every value must be truthy.
_reddit_credentials = {
    "client_id": REDDIT_CLIENT_ID,
    "client_secret": REDDIT_CLIENT_SECRET,
    "user_agent": REDDIT_USER_AGENT,
}

# Stop here instead of building a client with missing credentials.
if not all(_reddit_credentials.values()):
    raise SystemExit("Missing Reddit creds. Fill .env first (REDDIT_CLIENT_ID/SECRET/USER_AGENT).")

reddit = praw.Reddit(**_reddit_credentials)


LIMIT = 20  # per subreddit
submissions = list()  # filled by the collection loop
seen_ids = set()      # submission ids already visited, to skip duplicates

# Fetching for time-insensitive domains
# One row per domain: (tag, display name, subreddits to pull from).
_DOMAIN_SPECS = (
    (
        "FAC",
        "Family and Caregiving",
        ("family", "relationship_advice", "Parenting", "caregiving"),
    ),
    (
        "PH",
        "Physical Health",
        ("fitness", "loseit", "nutrition", "xxfitness"),
    ),
    (
        "MH",
        "Mental Health",
        ("mentalhealth", "GetDisciplined", "DecidingToBeBetter", "adhdwomen", "Anxietyhelp"),
    ),
)

# Each domain gets a fresh random UUID every time this cell runs.
domains = [
    {
        "domain_id": uuid.uuid4(),
        "domain_tag": tag,
        "domain_name": name,
        "subreddits": list(subreddit_names),
    }
    for tag, name, subreddit_names in _DOMAIN_SPECS
]

# The KB will be built without search queries, to avoid biasing the extracted dataset.

In [16]:
# Building my submissions object
from loguru import logger

# Posts with fewer comments than this are skipped.
MIN_COMMENTS = 5

# Walk every subreddit of every domain and append one flat record per kept
# post to `submissions`. `seen_ids` keeps a post from being stored twice.
logger.info("üöÄ Collecting submissions across domains...")
for domain in domains:
    logger.info(f"Fetching Top Submissions from Subreddits for Domain --> [{domain['domain_tag']}: {domain['domain_name']}]")
    for sub_name in domain["subreddits"]:
        subreddit = reddit.subreddit(sub_name)
        logger.info(f"  ‚Ü≥ Fetching from r/{sub_name}")

        try:
            for submission in subreddit.top(limit=LIMIT):

                # Skip duplicates
                if submission.id in seen_ids:
                    continue
                seen_ids.add(submission.id)

                # Filter useless posts
                if submission.num_comments < MIN_COMMENTS:
                    continue

                # NOTE: a former "empty placeholder" check (blank / "[removed]" /
                # "[deleted]" selftext AND zero comments) was dropped here: it
                # could never fire, because zero-comment posts are already
                # rejected by the MIN_COMMENTS filter above.

                submissions.append({
                    "submission_id": submission.id,
                    "domain_tag": domain["domain_tag"],
                    "domain_name": domain["domain_name"],
                    "subreddit": sub_name,
                    "title": submission.title,
                    "selftext": submission.selftext,
                    "score": submission.score,
                    "upvote_ratio": submission.upvote_ratio,
                    "comment_count": submission.num_comments,
                    "permalink": f"https://www.reddit.com{submission.permalink}",
                    "created_utc": submission.created_utc,
                })

        except Exception as e:
            # Best effort: a failing subreddit is logged and the run continues.
            logger.error(f"Error loading r/{sub_name}: {e}")


[32m2025-10-29 10:02:33.862[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1müöÄ Collecting submissions across domains...[0m
[32m2025-10-29 10:02:33.864[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mFetching Top Submissions from Subreddits for Domain --> [FAC: Family and Caregiving][0m
[32m2025-10-29 10:02:33.864[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1m  ‚Ü≥ Fetching from r/family[0m
[32m2025-10-29 10:02:36.783[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1m  ‚Ü≥ Fetching from r/relationship_advice[0m
[32m2025-10-29 10:02:37.427[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1m  ‚Ü≥ Fetching from r/Parenting[0m
[32m2025-10-29 10:02:37.956[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1m  ‚Ü≥ Fetching from r/caregiving[0m
[32m2025-10-29 10:02:38.402[0m | [1mINFO    [0m | [36m__main__

Exploring what we got


In [23]:
import json
from collections import Counter

# Quick sanity summary of what the collection step produced.
print(f"Total submissions: {len(submissions)}")
# domain distribution
domain_counts = Counter(s["domain_tag"] for s in submissions)
print("\nBy Domain:", dict(domain_counts))
# subreddit distribution
sub_counts = Counter(s["subreddit"] for s in submissions)
print("\nBy Subreddit:", dict(sub_counts))
# average comment count; guarded because `submissions` is empty when every
# fetch failed, and dividing by zero would crash the cell.
avg_comments = (
    sum(s["comment_count"] for s in submissions) / len(submissions)
    if submissions
    else 0.0
)
print(f"\nAvg comments per post: {avg_comments:.2f}")

# Basic checkout: dump up to 10 r/loseit records from the PH domain
count = 0
for submission in submissions:
    if submission['domain_tag'] == 'PH' and submission['subreddit'] == 'loseit':
        print(json.dumps(submission, indent=5))
        count += 1
        if count >= 10:
            break


Total submissions: 254

By Domain: {'FAC': 74, 'PH': 80, 'MH': 100}

By Subreddit: {'family': 20, 'relationship_advice': 20, 'Parenting': 20, 'caregiving': 14, 'fitness': 20, 'loseit': 20, 'nutrition': 20, 'xxfitness': 20, 'mentalhealth': 20, 'GetDisciplined': 20, 'DecidingToBeBetter': 20, 'adhdwomen': 20, 'Anxietyhelp': 20}

Avg comments per post: 617.22
{
     "submission_id": "83re4p",
     "domain_tag": "PH",
     "domain_name": "Physical Health",
     "subreddit": "loseit",
     "title": "So apparently my husband has a new girlfriend.",
     "selftext": "My husband is a member of a club that meets about twice monthly, and has been going to this club for about 6 years. I usually don't go because they're all talking about eye crossingly booring stuff (electronics and stuff), but every once in a while I go if they're doing something halfway interesting, just enough so that I know most of them at least a little bit. \n\nLast week was the monthly breakfast social at a diner that makes 

Fetching comments on score and relevance to advice


In [25]:
import time

MAX_COMMENTS = 20
REQUEST_DELAY = 1.2 # Important RL metric : NEED TO CHANGE LATER REMEMBER

start_time = time.time()
logger.info(f"üöÄ Starting comments Mining for {len(submissions)} submissions")

# For each collected submission: fetch its comments sorted by "top", look at
# the first MAX_COMMENTS of the flattened list, keep the usable ones and
# attach them to the submission record in place.
for i, submission in enumerate(submissions):
    sid = submission["submission_id"]
    title = submission["title"]
    # Titles longer than 50 characters are truncated with an ellipsis.
    title_preview = f"{title[:50]}..." if len(title) > 50 else title
    logger.info(f"[{i+1}/{len(submissions)}] Fetching comments for: {sid} ‚Äî {title_preview}")

    try:
        submission_praw = reddit.submission(id=sid)
        submission_praw.comment_sort = "top"
        submission_praw.comments.replace_more(limit=0)

        top_comments = []
        candidates = submission_praw.comments.list()[:MAX_COMMENTS]
        for comment in candidates:
            body = (comment.body or "").strip()
            # Filter junk: empty or placeholder bodies, and anything under 30 characters.
            is_placeholder = body in ("", "[deleted]", "[removed]")
            if is_placeholder or len(body) < 30:
                continue

            author = str(comment.author) if comment.author else None
            top_comments.append(
                dict(
                    comment_id=comment.id,
                    body=body,
                    score=comment.score,
                    author=author,
                    created_utc=comment.created_utc,
                    replies_count=len(comment.replies),
                )
            )

        submission.update(
            top_comments=top_comments,
            num_fetched_comments=len(top_comments),
        )
        logger.success(f"‚úÖ Stored {len(top_comments)} comments for {sid}")

    except Exception as e:
        # A failed submission gets an empty comment list so every record has the same keys.
        logger.error(f"‚ùå Failed on submission {sid}: {str(e)}")
        submission.update(top_comments=[], num_fetched_comments=0)

    # API cooldown, otherwise everything will blow up
    time.sleep(REQUEST_DELAY)

    # Progress checkpoint every 25 submissions
    if not (i + 1) % 25:
        elapsed = time.time() - start_time
        logger.info(f"Current Progress: {i+1}/{len(submissions)} | Elapsed: {elapsed:.1f}s")

# Summary
total_time = time.time() - start_time
logger.info(f"Completed comment fetching in: {total_time:.2f} seconds")

[32m2025-10-29 13:00:51.942[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1müöÄ Starting comments Mining for 254 submissions[0m
[32m2025-10-29 13:00:51.944[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [1m[1/254] Fetching comments for: mldrr6 ‚Äî [UPDATE] I made my dad choose between me and his n...[0m
[32m2025-10-29 13:00:52.899[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [32m[1m‚úÖ Stored 20 comments for mldrr6[0m
[32m2025-10-29 13:00:54.100[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [1m[2/254] Fetching comments for: mj358z ‚Äî I made my dad choose between me and his new family...[0m
[32m2025-10-29 13:00:54.713[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [32m[1m‚úÖ Stored 19 comments for mj358z[0m
[32m2025-10-29 13:00:55.915[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [1m[3/