In [None]:
import os
import re
import json
from pathlib import Path
from typing import Iterator, Iterable, Dict, List, Tuple, Optional

import pandas as pd
import numpy as np
import glob

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA

try:
    import pyarrow as pa
    import pyarrow.parquet as pq
except Exception as e:
    raise RuntimeError('pyarrow is required. install with: pip install pyarrow') from e

# display prefs
pd.set_option('display.max_colwidth', 300)


In [22]:
# configurations
DATA_DIR = Path('data')
COMMENTS_FILE = DATA_DIR / 'amitheasshole_comments.ndjson'
SUBMISSIONS_FILE = DATA_DIR / 'amitheasshole_submissions.ndjson'

remake_datafile=False

INTERIM_DIR = Path('interim')
ARTIFACTS_DIR = Path('artifacts')
for p in [INTERIM_DIR, ARTIFACTS_DIR]:
    p.mkdir(parents=True, exist_ok=True)

# parquet dirs
SUBMISSIONS_PARQUET = INTERIM_DIR / 'submissions_minimal.parquet'
JOINED_DIR = INTERIM_DIR / 'joined_parquet'

# peek settings
PEEK_N = 1000 

# modelling sample size
SUBMISSION_SAMPLE_N = 1000
RANDOM_SEED = 42


In [23]:
# why: the comments file is ~22 GB; never load fully into RAM. stream lines lazily.

def iter_ndjson_lines(path: Path) -> Iterator[dict]:
    with path.open('r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            try:
                yield json.loads(line)
            except json.JSONDecodeError:
                # corrupted line guard; skip
                continue


def peek_ndjson(path: Path, n: int) -> pd.DataFrame:
    # why: small materialisation to learn schema
    rows = []
    for i, obj in enumerate(iter_ndjson_lines(path)):
        rows.append(obj)
        if i + 1 >= n:
            break
    return pd.DataFrame(rows)


def normalize_submission_id_from_link_id(link_id: str) -> Optional[str]:
    # why: comments store parent submission as 't3_<id>'; we need bare '<id>'
    if not link_id:
        return None
    parts = link_id.split('_', 1)
    return parts[1] if len(parts) == 2 else link_id


In [24]:
# why: you want to see the schema before committing to transforms
df_sub_peek = peek_ndjson(SUBMISSIONS_FILE, PEEK_N)
df_com_peek = peek_ndjson(COMMENTS_FILE, PEEK_N)

print('submissions columns:', sorted(df_sub_peek.columns.tolist()))
print('comments columns:', sorted(df_com_peek.columns.tolist()))

display(df_sub_peek.head(5))
display(df_com_peek.head(5))


submissions columns: ['all_awardings', 'allow_live_comments', 'archived', 'author', 'author_cakeday', 'author_created_utc', 'author_flair_background_color', 'author_flair_css_class', 'author_flair_richtext', 'author_flair_template_id', 'author_flair_text', 'author_flair_text_color', 'author_flair_type', 'author_fullname', 'author_patreon_flair', 'author_premium', 'awarders', 'banned_by', 'call_to_action', 'can_gild', 'can_mod_post', 'category', 'content_categories', 'contest_mode', 'created_utc', 'discussion_type', 'distinguished', 'domain', 'edited', 'gilded', 'gildings', 'hidden', 'hide_score', 'id', 'is_created_from_ads_ui', 'is_crosspostable', 'is_meta', 'is_original_content', 'is_reddit_media_domain', 'is_robot_indexable', 'is_self', 'is_video', 'link_flair_background_color', 'link_flair_css_class', 'link_flair_richtext', 'link_flair_template_id', 'link_flair_text', 'link_flair_text_color', 'link_flair_type', 'locked', 'media', 'media_embed', 'media_only', 'name', 'no_follow', 'nu

Unnamed: 0,all_awardings,allow_live_comments,archived,author,author_created_utc,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_template_id,author_flair_text,...,total_awards_received,treatment_tags,upvote_ratio,url,view_count,whitelist_status,wls,link_flair_template_id,call_to_action,author_cakeday
0,[],False,False,beanstressed,1627668000.0,,,[],,,...,0,[],0.71,https://www.reddit.com/r/AmItheAsshole/comments/v2fbg0/wibta_if_i_get_my_hair_braided/,,all_ads,6,,,
1,[],False,False,Good-Barracuda5143,1609642000.0,,,[],,,...,0,[],1.0,https://www.reddit.com/r/AmItheAsshole/comments/v2fdaf/aita_for_uninviting_a_best_friend_to_my_gender/,,all_ads,6,,,
2,[],True,False,[deleted],,,,,,,...,0,[],0.77,https://www.reddit.com/r/AmItheAsshole/comments/v2fdq0/aita_for_being_mad_at_my_friends_for_not_sticking/,,all_ads,6,20701dd2-d245-11e8-99f1-0e2d925c15f4,,
3,[],False,False,LisKoz1989,1654004000.0,,,[],,,...,0,[],0.9,https://www.reddit.com/r/AmItheAsshole/comments/v2fgt3/aita_for_never_wanting_to_see_a_guy_after_he_lied/,,all_ads,6,,,
4,[],False,False,[deleted],,,,,,,...,0,[],1.0,https://www.reddit.com/r/AmItheAsshole/comments/v2fh5n/aita_for_not_wanting_my_boyfriends_sister_to_live/,,all_ads,6,,,


Unnamed: 0,all_awardings,archived,associated_award,author,author_created_utc,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_template_id,author_flair_text,...,stickied,subreddit,subreddit_id,subreddit_name_prefixed,subreddit_type,top_awarded_type,total_awards_received,treatment_tags,unrepliable_reason,author_cakeday
0,[],False,,Apewash,1639883000.0,,,[],,,...,False,AmItheAsshole,t5_2xhvq,r/AmItheAsshole,public,,0,[],,
1,[],False,,Adventurous_House527,1623161000.0,,,[],,,...,False,AmItheAsshole,t5_2xhvq,r/AmItheAsshole,public,,0,[],,
2,[],False,,chunkytapioca,1649170000.0,,,[],,,...,False,AmItheAsshole,t5_2xhvq,r/AmItheAsshole,public,,0,[],,
3,[],False,,turtles_tszx,1503064000.0,,,[],,,...,False,AmItheAsshole,t5_2xhvq,r/AmItheAsshole,public,,0,[],,
4,[],False,,Boobear7676,1653867000.0,,,[],,,...,False,AmItheAsshole,t5_2xhvq,r/AmItheAsshole,public,,0,[],,


In [25]:
# why: shrink submissions to only the fields we need and store them in a fast columnar format for repeated joins/reads

SUB_FIELDS = ['id', 'title', 'selftext', 'link_flair_text', 'created_utc']

def stream_submissions_to_parquet(src: Path, dst: Path, fields=SUB_FIELDS, batch_size: int = 100_000):
    writer = None
    rows = []
    with src.open('r', encoding='utf-8') as f:
        for i, line in enumerate(f, 1):
            if not line.strip():
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                continue
            rows.append({k: obj.get(k) for k in fields})
            if len(rows) >= batch_size:
                df = pd.DataFrame(rows)
                table = pa.Table.from_pandas(df, preserve_index=False)
                if writer is None:
                    writer = pq.ParquetWriter(dst, table.schema)
                writer.write_table(table)
                rows.clear()
        if rows:
            df = pd.DataFrame(rows)
            table = pa.Table.from_pandas(df, preserve_index=False)
            if writer is None:
                writer = pq.ParquetWriter(dst, table.schema)
            writer.write_table(table)
    if writer is not None:
        writer.close()

if not SUBMISSIONS_PARQUET.exists():
    SUBMISSIONS_PARQUET.parent.mkdir(parents=True, exist_ok=True)
    stream_submissions_to_parquet(SUBMISSIONS_FILE, SUBMISSIONS_PARQUET)
    print('wrote:', SUBMISSIONS_PARQUET)
else:
    print('exists:', SUBMISSIONS_PARQUET)


exists: interim\submissions_minimal.parquet


In [26]:
# why: keep a compact in-RAM table for fast chunk joins; 900 MB NDJSON → much smaller Parquet subset
sub_df = pd.read_parquet(SUBMISSIONS_PARQUET, columns=['id', 'title', 'selftext', 'link_flair_text', 'created_utc'])
sub_df = sub_df.dropna(subset=['id']).drop_duplicates(subset=['id'])
sub_df = sub_df.set_index('id', drop=True)
approx_mb = sub_df.memory_usage(deep=True).sum() / 1e6
print('submissions frame:', sub_df.shape, f'~{approx_mb:.1f} MB in RAM')
display(sub_df.head(3))


submissions frame: (320671, 4) ~290.0 MB in RAM


Unnamed: 0_level_0,title,selftext,link_flair_text,created_utc
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
v2fbg0,WIBTA if I get my hair braided,[removed],,1654084822
v2fdaf,AITA for uninviting a “best friend” to my gender reveal/housewarming party?,[removed],,1654084958
v2fdq0,AITA for being mad at my friends for not sticking to our agreements?,[deleted],Not enough info,1654084986


In [27]:
# why: join comments to their parent submissions without loading all comments into RAM; write results incrementally
JOINED_DIR.mkdir(parents=True, exist_ok=True)

def stream_join_comments(
    comments_path: Path,
    sub_index_df: pd.DataFrame,
    out_dir: Path,
    chunk_lines: int = 500_000,
    out_prefix: str = 'joined'
):
    buf = []
    file_idx = 0

    with comments_path.open('r', encoding='utf-8') as f:
        for i, line in enumerate(f, 1):
            if not line.strip():
                continue
            try:
                c = json.loads(line)
            except json.JSONDecodeError:
                continue

            sub_id = normalize_submission_id_from_link_id(c.get('link_id', ''))
            if not sub_id or sub_id not in sub_index_df.index:
                continue

            s = sub_index_df.loc[sub_id]
            buf.append({
                'submission_id': sub_id,
                'submission_title': s.get('title'),
                'submission_selftext': s.get('selftext'),
                'submission_flair': s.get('link_flair_text'),
                'submission_created_utc': s.get('created_utc'),
                'comment_id': c.get('id'),
                'comment_body': c.get('body'),
                'comment_created_utc': c.get('created_utc'),
                'comment_score': c.get('score'),
                'is_submitter': c.get('is_submitter'),
            })

            if len(buf) >= chunk_lines:
                df = pd.DataFrame(buf)
                pq.write_table(pa.Table.from_pandas(df, preserve_index=False),
                               out_dir / f'{out_prefix}_{file_idx:04d}.parquet')
                file_idx += 1
                buf.clear()

    if buf:
        df = pd.DataFrame(buf)
        pq.write_table(pa.Table.from_pandas(df, preserve_index=False),
                       out_dir / f'{out_prefix}_{file_idx:04d}.parquet')


# stream_join_comments(COMMENTS_FILE, sub_df, JOINED_DIR, chunk_lines=500_000)
# print('wrote joined parts in:', JOINED_DIR)


existing_parts = sorted(glob.glob(str(JOINED_DIR / 'joined_*.parquet')))

if remake_datafile or not existing_parts:
    # optional: clean old parts if forcing a rebuild
    if remake_datafile:
        for p in existing_parts:
            try:
                Path(p).unlink()
            except FileNotFoundError:
                pass

    stream_join_comments(COMMENTS_FILE, sub_df, JOINED_DIR, chunk_lines=500_000) # adjust chunk_lines to available RAM
    print('wrote joined parts in:', JOINED_DIR)
else:
    print(f'using existing joined parts ({len(existing_parts)} files) in:', JOINED_DIR)

using existing joined parts (29 files) in: interim\joined_parquet


In [28]:
# why: fuse joined parts for downstream analysis; for very large outputs, you can process per-part to keep RAM low
import glob

joined_parts = sorted(glob.glob(str(JOINED_DIR / 'joined_*.parquet')))
if not joined_parts:
    raise RuntimeError('no joined parquet parts found. rerun the previous cell.')

# if the number of parts is large or your RAM is tight, iterate parts instead of concatenating
joined_df = pd.concat([pd.read_parquet(p) for p in joined_parts], ignore_index=True)
print('joined shape:', joined_df.shape)
display(joined_df.head(3))


joined shape: (14492290, 10)


Unnamed: 0,submission_id,submission_title,submission_selftext,submission_flair,submission_created_utc,comment_id,comment_body,comment_created_utc,comment_score,is_submitter
0,v2fbg0,WIBTA if I get my hair braided,[removed],,1654084822,iaryyw3,^^^^AUTOMOD ***Thanks for posting! This comment is a copy of your post so readers can see the original text if your post is edited or removed. This comment is NOT accusing you of copying anything. Read [this](https://www.reddit.com/r/AmItheAsshole/wiki/faq#wiki_post_deletion) before [contacting...,1654084822,1,False
1,v2fdaf,AITA for uninviting a “best friend” to my gender reveal/housewarming party?,[removed],,1654084958,iarz6sa,"#READ THIS CAREFULLY BECAUSE WE WILL PERMANENTLY BAN YOU FOR VIOLATIONS\n\n\nYour post was removed because it exceeds the 3,000 character limit.\n\nPlease consider resubmitting a briefer post. You are **not allowed** to continue your post in the comments or another thread. **You will need to pos...",1654084958,1,False
2,v2fdq0,AITA for being mad at my friends for not sticking to our agreements?,[deleted],Not enough info,1654084986,iarz8fs,^^^^AUTOMOD ***Thanks for posting! This comment is a copy of your post so readers can see the original text if your post is edited or removed. This comment is NOT accusing you of copying anything. Read [this](https://www.reddit.com/r/AmItheAsshole/wiki/faq#wiki_post_deletion) before [contacting...,1654084987,1,False


In [29]:
# why: good cleaning + normalisation improves topic modelling and later mapping to themes

def ensure_spacy(nlp_name: str = 'en_core_web_sm'):
    try:
        return spacy.load(nlp_name, disable=['parser', 'textcat'])
    except OSError as e:
        raise RuntimeError(
            f'spaCy model {nlp_name!r} not installed. run: python -m spacy download {nlp_name}'
        ) from e

NLP = ensure_spacy()

URL_RE = re.compile(r'https?://\S+|www\.\S+')
NONWORD_RE = re.compile(r"[^a-zA-Z']+")
MULTISPACE_RE = re.compile(r'\s+')

def clean_text(text: Optional[str]) -> str:
    if not text:
        return ''
    text = URL_RE.sub(' ', text)
    text = text.lower()
    text = NONWORD_RE.sub(' ', text)
    text = MULTISPACE_RE.sub(' ', text).strip()
    return text

def lemmatize(text: str, nlp=NLP, do_ner: bool = False) -> Tuple[str, List[str]]:
    if not text:
        return '', []
    doc = nlp(text)
    lemmas = [t.lemma_ for t in doc if not (t.is_stop or t.is_punct or t.is_space)]
    ents = [f'{ent.label_}:{ent.text}' for ent in doc.ents] if do_ner else []
    return ' '.join(lemmas), ents

def preprocess_submission_row(row: dict, do_ner: bool = True) -> dict:
    raw = ' '.join([str(row.get('title') or ''), str(row.get('selftext') or '')]).strip()
    cleaned = clean_text(raw)
    lemmas, ents = lemmatize(cleaned, do_ner=do_ner)
    return {
        'id': row.get('id'),
        'flair': row.get('link_flair_text'),
        'created_utc': row.get('created_utc'),
        'text_raw': raw,
        'text_clean': cleaned,
        'text_lemmas': lemmas,
        'ents': ents
    }


In [30]:
# why: many posts have selftext '[removed]'/'[deleted]' or empty; topic modelling needs actual text

REMOVED_MARKERS = {'[removed]', '[deleted]', None, ''}

def is_removed(txt):
    return (txt is None) or (str(txt).strip() in REMOVED_MARKERS)

def text_len(s):
    return 0 if s is None else len(str(s).strip())

# basic diagnostics on your submissions index frame
diag = pd.DataFrame({
    'has_body': ~sub_df['selftext'].apply(is_removed),
    'title_len': sub_df['title'].apply(text_len),
    'body_len': sub_df['selftext'].apply(text_len),
    'flair_none': sub_df['link_flair_text'].isna()
})
print('Total submissions:', len(sub_df))
print('With usable body:', int(diag['has_body'].sum()))
print('Flair available:', int((~diag["flair_none"]).sum()))
display(diag.describe())

# build a "usable" subset:
# keep if (body is usable and body_len >= 50) OR (title_len >= 40)
usable_mask = (~sub_df['selftext'].apply(is_removed) & (sub_df['selftext'].apply(text_len) >= 50)) | (sub_df['title'].apply(text_len) >= 40)
usable_sub_df = sub_df.loc[usable_mask, ['title','selftext','link_flair_text','created_utc']].copy()
print('Usable submissions:', len(usable_sub_df))

# for repeatable sampling
USABLE_SUBMISSIONS_PARQUET = INTERIM_DIR / 'submissions_usable.parquet'
usable_sub_df.to_parquet(USABLE_SUBMISSIONS_PARQUET, index=True)
print('Saved:', USABLE_SUBMISSIONS_PARQUET)


Total submissions: 320671
With usable body: 68011
Flair available: 79223


Unnamed: 0,title_len,body_len
count,320671.0,320671.0
mean,61.227096,420.817483
std,25.048349,883.220962
min,20.0,0.0
25%,44.0,9.0
50%,57.0,9.0
75%,74.0,9.0
max,311.0,23331.0


Usable submissions: 275324
Saved: interim\submissions_usable.parquet


In [31]:
# why: sample from text that’s actually usable, so LDA/NER have signal

import numpy as np
import pandas as pd

def fetch_random_submissions_df_from_usable(usable_index_df: pd.DataFrame, n: int, seed: int = 42) -> pd.DataFrame:
    rng = np.random.default_rng(seed)
    ids = usable_index_df.index.values
    pick = ids if n >= len(ids) else rng.choice(ids, size=n, replace=False)
    df = usable_index_df.loc[pick, ['title', 'selftext', 'link_flair_text', 'created_utc']].reset_index()
    df = df.rename(columns={'index': 'id'})
    return df

def build_corpus(df_sub: pd.DataFrame, do_ner: bool = True) -> pd.DataFrame:
    processed = [preprocess_submission_row(row, do_ner=do_ner) for row in df_sub.to_dict('records')]
    return pd.DataFrame(processed)

# IMPORTANT: sample from usable_sub_df, not sub_df
df_sub_sample = fetch_random_submissions_df_from_usable(usable_sub_df, SUBMISSION_SAMPLE_N, seed=RANDOM_SEED)
print('Submissions sampled for modelling (usable only):', len(df_sub_sample))
display(df_sub_sample.head(3))

df_corpus = build_corpus(df_sub_sample, do_ner=True)
display(df_corpus.head(3)[['id', 'flair', 'text_raw']])


Submissions sampled for modelling (usable only): 1000


Unnamed: 0,id,title,selftext,link_flair_text,created_utc
0,yiud1m,AITA for carding everyone in a party coming into a liquor store?,[removed],,1667264304
1,yvcwyh,AITA for calling out my friend for ditching me for tennis then playing with other people instead,"Throwaway account here. \n\nI (26m) was supposed to play tennis with one of my best friends ""Drew"" (26m) and his brother ""Ivan""(29m) a few weekends ago. On the morning of, Drew sent a message saying he was too hungover to come down (20-25 minute drive), so the three of us made plans to play the ...",Not the A-hole,1668460623
2,x3iyei,AITA for not sleeping on the couch?,"My GF and I had our first ever major argument last night and she basically told me to gtfo from our room and go sleep on the couch. I refused which ultimately lead to a screaming match between the two of us, ending up in her leaving our apartment and spending the night at one of her friend's ins...",Not the A-hole,1662067240


Unnamed: 0,id,flair,text_raw
0,yiud1m,,AITA for carding everyone in a party coming into a liquor store? [removed]
1,yvcwyh,Not the A-hole,"AITA for calling out my friend for ditching me for tennis then playing with other people instead Throwaway account here. \n\nI (26m) was supposed to play tennis with one of my best friends ""Drew"" (26m) and his brother ""Ivan""(29m) a few weekends ago. On the morning of, Drew sent a message saying ..."
2,x3iyei,Not the A-hole,"AITA for not sleeping on the couch? My GF and I had our first ever major argument last night and she basically told me to gtfo from our room and go sleep on the couch. I refused which ultimately lead to a screaming match between the two of us, ending up in her leaving our apartment and spending ..."


In [32]:
# why: RQ2 needs concrete buckets; we map discovered topics to five categories using keyword overlaps

CATEGORY_KEYWORDS = {
    'finances': {
        'money','pay','paid','rent','bill','bills','loan','debt','card','credit','cash','salary','bonus','split','cost','expensive','cheap','wedding','gift','refund','share','finance'
    },
    'relationship': {
        'relationship','boyfriend','girlfriend','partner','date','dating','romantic','love','cheat','ex','fiancé','fiance','fiancee','breakup','trust','jealous'
    },
    'family_conflict': {
        'mom','dad','mother','father','sister','brother','siblings','family','cousin','aunt','uncle','inlaws','in','law','grandma','grandpa','child','baby','pregnant','wedding','name'
    },
    'work': {
        'work','job','boss','coworker','manager','shift','hours','office','remote','payroll','promotion','hr','fire','fired','leave','paternity','maternity'
    },
    'societal_norms': {
        'culture','religion','religious','tradition','gender','pronoun','politics','law','legal','illegal','discrimination','racist','ableist','ethics','value','norm','boundary','consent'
    }
}

def score_topic_to_category(words: List[str]) -> Tuple[str, Dict[str, int]]:
    scores = {cat: 0 for cat in CATEGORY_KEYWORDS}
    wordset = set(words)
    for cat, kw in CATEGORY_KEYWORDS.items():
        scores[cat] = len(wordset & kw)
    best_cat = max(scores, key=scores.get)
    return best_cat, scores

topic_category = []
for i, words in enumerate(topics_top_words):
    best, scores = score_topic_to_category(words)
    topic_category.append({'topic': i, 'category': best, **scores})

df_topic_map = pd.DataFrame(topic_category).sort_values(['category', 'topic'])
display(df_topic_map)


NameError: name 'topics_top_words' is not defined

In [None]:
# why: per-submission labels allow counts, examples, and trends to answer RQ2 precisely

topic_labels = np.argmax(W, axis=1)
df_corpus['topic'] = topic_labels

topic_to_cat = {row['topic']: row['category'] for _, row in df_topic_map.iterrows()}
df_corpus['category_initial'] = df_corpus['topic'].map(topic_to_cat).fillna('societal_norms')

def ner_bias_category(ents: List[str], current: str) -> str:
    labels = [e.split(':', 1)[0] for e in ents]
    if any(lbl in ('NORP', 'LAW') for lbl in labels) and current in ('relationship', 'work', 'finances'):
        return 'societal_norms'
    if any(lbl in ('PERSON',) for lbl in labels) and current == 'societal_norms':
        return 'family_conflict'
    return current

df_corpus['category'] = [
    ner_bias_category(ents, cat) for ents, cat in zip(df_corpus['ents'], df_corpus['category_initial'])
]

category_counts = df_corpus['category'].value_counts().rename_axis('category').reset_index(name='count')
display(category_counts)


In [None]:
# why: quick human check to see if buckets make sense before scaling up

def examples_by_category(df: pd.DataFrame, cat: str, k: int = 5) -> pd.DataFrame:
    ex = df.loc[df['category'] == cat, ['id', 'flair', 'text_raw']].head(k).copy()
    return ex

for cat in ['finances', 'relationship', 'family_conflict', 'work', 'societal_norms']:
    print(f'\n=== {cat.upper()} EXAMPLES ===')
    display(examples_by_category(df_corpus, cat, k=5))
