In [1]:
# =========================
# Reddit A+B+C (Gaming Only) Dataset Collector
# Adds a GAME label for every POST and every COMMENT (inherits from parent post)
# Use cases:
#   A = image_generation
#   B = gameplay_ai
#   C = moderation_ai
# Output:
#   FINAL_gaming_ai_use_cases_with_game.csv
#   FINAL_gaming_ai_use_cases_with_game.parquet
# =========================

# 0) Install (Colab/Jupyter)
!pip -q install asyncpraw nest_asyncio pandas pyarrow

import os
import re
import math
import asyncio
import pandas as pd
import nest_asyncio
import asyncpraw
from datetime import datetime

nest_asyncio.apply()

# -------------------------
# 1) Reddit client (SAFE)
# -------------------------
reddit_read_only = asyncpraw.Reddit(
    client_id="mIZP0PVhSvc4PMZ0-9IXoA",
    client_secret="BVoslLbuJWJYsE32I9vlw6jQEL-1Ig",
    user_agent="BThesisAIGenContentRed"
)

# If you want to hardcode (NOT recommended), uncomment and fill:
# reddit = asyncpraw.Reddit(client_id="...", client_secret="...", user_agent="BThesisAIGenContentRed")

# -------------------------
# 2) Subreddits (gaming only)
# -------------------------
GAMING_SUBREDDITS = [
    "gaming", "Games", "pcgaming",
    "CallOfDuty", "blackops", "Warzone",
    "gamedev", "truegaming"
]

# -------------------------
# 3) Queries for A, B, C
# Keep them tight to avoid explosion.
# -------------------------
QUERIES_A = [
    # A) AI image generation in games (art/assets/cosmetics/calling cards)
    '("AI art" OR "AI-generated" OR "generative AI" OR "generated images") (game OR gaming)',
    '("calling card" OR "calling cards" OR cosmetic OR skin) (AI OR "AI-generated" OR "generative")',
    '(AI OR "generated") (art OR images OR assets OR texture OR cosmetics) (game OR gaming)'
]

QUERIES_B = [
    # B) Gameplay / NPC AI (immersion, difficulty, fairness)
    '("AI NPC" OR "enemy AI" OR "smart enemies" OR "adaptive AI")',
    '(AI) (difficulty OR unfair OR cheating OR immersion OR "rubber band")',
    '("AI gameplay" OR "combat AI" OR "companion AI")'
]

QUERIES_C = [
    # C) AI moderation / anti-cheat (bans, false bans, voice moderation)
    '("AI ban" OR "automated ban" OR "AI moderation") (game OR gaming OR anti-cheat)',
    '(AI) (anti-cheat OR cheating OR "false ban" OR "shadowban")',
    '("AI voice moderation" OR "voice chat moderation" OR "AI report system")'
]

# -------------------------
# 4) Game inference patterns
# Start with a strong base; add more as you discover games in your dataset.
# -------------------------
# -------------------------
# 4) Expanded Game Inference Patterns
# -------------------------
GAME_PATTERNS = {
    "Call of Duty": r"\b(call of duty|cod|black ops|bo\d|warzone|modern warfare|mw\d)\b",
    "Battlefield": r"\b(battlefield|bf6|bf2042|dice games)\b",
    "Clair Obscur: Expedition 33": r"\b(clair obscur|expedition 33|expedition33|sandfall)\b",
    "Arc Raiders": r"\b(arc raiders|embark studios)\b",
    "Monster Hunter": r"\b(monster hunter|mh wilds|mhwilds|capcom)\b",
    "Kingdom Come": r"\b(kingdom come|kcd2|deliverance)\b",
    "Assassin's Creed": r"\b(assassin'?s creed|ac shadows|ubisoft)\b",
    "Civilization": r"\b(civilization|civ 7|civ7|firaxis)\b",
    "Dragon Age": r"\b(dragon age|the veilguard|bioware)\b",
    "Fortnite": r"\b(fortnite|epic games)\b",
    "GTA": r"\b(gta\b|grand theft auto|rockstar)\b",
    "Minecraft": r"\b(minecraft|mojang)\b",
    "Roblox": r"\b(roblox)\b",
    "Valorant": r"\b(valorant|riot games)\b",
    "Overwatch": r"\b(overwatch|blizzard)\b",
    "Apex Legends": r"\b(apex legends|respawn)\b",
    "League of Legends": r"\b(league of legends|\blol\b)\b",
    "Counter-Strike": r"\b(counter-?strike|cs2|cs:go)\b",
}

compiled_game_patterns = [(g, re.compile(pat, re.IGNORECASE)) for g, pat in GAME_PATTERNS.items()]

def clean_text(s: str) -> str:
    if not isinstance(s, str):
        return ""
    s = re.sub(r"http\S+", "", s)          # remove URLs
    s = re.sub(r"\s+", " ", s).strip()     # normalize spaces
    return s

def infer_game(text: str, subreddit: str = "") -> str:
    combined = f"{subreddit} {text or ''}"
    for game, pat in compiled_game_patterns:
        if pat.search(combined):
            return game
    return "Unknown/General"

# Optional: basic "reaction language" flag (useful for later analysis)
REACTION_TERMS = re.compile(
    r"\b(love|hate|trash|garbage|lazy|cool|awesome|cringe|scam|unfair|"
    r"stolen|copyright|plagiarism|ethics|soulless|low effort|ai slop|"
    r"boycott|backlash|disgusting|amazing|beautiful|false ban|banned)\b",
    flags=re.IGNORECASE
)

def has_reaction_terms(text: str) -> int:
    return 1 if REACTION_TERMS.search(text or "") else 0

# -------------------------
# 5) Collector (posts + top comments)
# Each comment inherits game from its parent post.
# -------------------------
async def collect_use_case_with_game(
    reddit_client: asyncpraw.Reddit,
    subreddits: list[str],
    queries: list[str],
    ai_use_case: str,
    time_filter: str = "year",
    sort: str = "new",
    max_posts_per_query: int = 15,
    max_comments_per_post: int = 20,
    comment_sort: str = "top"
) -> list[dict]:
    rows = []

    for sub in subreddits:
        subreddit = await reddit_client.subreddit(sub)

        for q in queries:
            search_gen = subreddit.search(
                query=q,
                sort=sort,
                time_filter=time_filter,
                limit=max_posts_per_query
            )

            async for post in search_gen:
                title = clean_text(post.title or "")
                selftext = clean_text(post.selftext or "")
                post_text = clean_text(f"{title}\n{selftext}".strip())
                game = infer_game(post_text, subreddit=sub)

                # Post row
                rows.append({
                    "ai_use_case": ai_use_case,
                    "game": game,
                    "subreddit": sub,
                    "query_used": q,
                    "kind": "post",
                    "post_id": post.id,
                    "comment_id": None,
                    "parent_post_id": None,
                    "created_utc": post.created_utc,
                    "score": post.score,
                    "num_comments": post.num_comments,
                    "permalink": f"https://www.reddit.com{post.permalink}",
                    "title": title,
                    "text": selftext,
                    "combined_text": post_text,
                    "has_reaction_terms": has_reaction_terms(post_text)
                })

                # Comments (inherit game)
                try:
                    # Note: post is a Submission; we can set comment sort like this
                    post.comment_sort = comment_sort
                    await post.comments.replace_more(limit=0)
                    comment_list = post.comments.list()[:max_comments_per_post]

                    for c in comment_list:
                        body = clean_text(getattr(c, "body", "") or "")
                        if not body:
                            continue

                        rows.append({
                            "ai_use_case": ai_use_case,
                            "game": game,  # <-- inherited from parent post
                            "subreddit": sub,
                            "query_used": q,
                            "kind": "comment",
                            "post_id": post.id,
                            "comment_id": getattr(c, "id", None),
                            "parent_post_id": post.id,
                            "created_utc": getattr(c, "created_utc", None),
                            "score": getattr(c, "score", None),
                            "num_comments": None,
                            "permalink": f"https://www.reddit.com{getattr(c, 'permalink', '')}" if getattr(c, "permalink", None) else f"https://www.reddit.com{post.permalink}",
                            "title": None,
                            "text": body,
                            "combined_text": body,
                            "has_reaction_terms": has_reaction_terms(body)
                        })
                except Exception:
                    # keep going; don't crash the run if comment fetch fails
                    pass
                await asyncio.sleep(0.5) # Small delay after processing each post's comments
            await asyncio.sleep(1) # Delay after processing each query
        await asyncio.sleep(2) # Delay after processing each subreddit

    return rows

# -------------------------
# 6) Run A + B + C with caps (keeps dataset manageable)
# -------------------------
async def run_abc_collection():
    rows = []

    # Tune these to control dataset size:
    # - increase max_posts_per_query if you need more data
    # - increase max_comments_per_post for richer "social reaction"
    max_posts_per_query = 15
    max_comments_per_post = 20

    rows += await collect_use_case_with_game(
        reddit_client=reddit_read_only,
        subreddits=GAMING_SUBREDDITS,
        queries=QUERIES_A,
        ai_use_case="image_generation",
        max_posts_per_query=max_posts_per_query,
        max_comments_per_post=max_comments_per_post
    )

    rows += await collect_use_case_with_game(
        reddit_client=reddit_read_only,
        subreddits=GAMING_SUBREDDITS,
        queries=QUERIES_B,
        ai_use_case="gameplay_ai",
        max_posts_per_query=max_posts_per_query,
        max_comments_per_post=max_comments_per_post
    )

    rows += await collect_use_case_with_game(
        reddit_client=reddit_read_only,
        subreddits=GAMING_SUBREDDITS,
        queries=QUERIES_C,
        ai_use_case="moderation_ai",
        max_posts_per_query=max_posts_per_query,
        max_comments_per_post=max_comments_per_post
    )

    df = pd.DataFrame(rows)

    if not df.empty:
        # Deduplicate (important because searches can overlap)
        df = df.drop_duplicates(subset=["kind", "post_id", "comment_id", "ai_use_case", "subreddit"], keep="first")

        # Convert time
        df["created_dt"] = pd.to_datetime(df["created_utc"], unit="s", utc=True, errors="coerce")

        # Text length
        df["text_len"] = df["combined_text"].fillna("").str.len()

        # Add log metrics (helpful later)
        df["score_filled"] = df["score"].fillna(0)
        df["log_score"] = (df["score_filled"] + 1).apply(lambda x: math.log(x))

    return df

df = asyncio.run(run_abc_collection())

print("Dataset shape:", df.shape)
print("\nCounts by use case + kind:")
print(df.groupby(["ai_use_case","kind"]).size())

print("\nTop games detected:")
print(df["game"].value_counts().head(20))

# -------------------------
# 7) Save outputs
# -------------------------
out_csv = "FINAL_gaming_ai_use_cases_with_game.csv"
out_parquet = "FINAL_gaming_ai_use_cases_with_game.parquet"

df.to_csv(out_csv, index=False)
df.to_parquet(out_parquet, index=False)

print("\nSaved:")
print("-", out_csv)
print("-", out_parquet)


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/196.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m196.4/196.4 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-adk 1.21.0 requires aiosqlite>=0.21.0, but you have aiosqlite 0.17.0 which is incompatible.[0m[31m
[0mDataset shape: (334, 20)

Counts by use case + kind:
ai_use_case       kind
gameplay_ai       post    120
image_generation  post    178
moderation_ai     post     36
dtype: int64

Top games detected:
game
Unknown/General                195
Call of Duty                    74
Civilization                     9
Battlefield                      8
Clair Obscur: Expedition 33      6
League of Legends                6
Fortnite                         5
Assassin's Creed   