# Check

In [76]:
import pandas as pd

# Load posts
posts = pd.read_csv(
    "/home/ec2-user/ambivalent/data/Ambivalent_Data_Final.csv",
    dtype={"post_id": str}
)

# Load comments
comments = pd.read_csv(
    "/home/ec2-user/ambivalent/data/Ambivalent_Comments_Final.csv",
    dtype={"post_id": str}
)

print("Posts shape:", posts.shape)
print("Comments shape:", comments.shape)

print("\nPosts columns:")
print(posts.columns)

print("\nComments columns:")
print(comments.columns)


Posts shape: (1471, 14)
Comments shape: (5614, 7)

Posts columns:
Index(['post_id', 'subreddit', 'title', 'author', 'score',
       'num_comments_listed', 'op_replied', 'op_reply_count', 'created_utc',
       'permalink', 'body', 'combined_text', 'matched_ambivalent_phrases',
       'matched_keywords'],
      dtype='object')

Comments columns:
Index(['post_id', 'parent_id', 'depth', 'comment_author', 'comment_body',
       'comment_score', 'is_post_author'],
      dtype='object')


In [81]:
import math

def parse_parent(pid):
    """
    Parse a parent_id like 'UserName@2' into (parent_author, parent_depth).
    Returns (None, None) if it is missing or malformed.
    """
    # Handle NaN or missing
    if isinstance(pid, float) and math.isnan(pid):
        return None, None
    if not isinstance(pid, str) or '@' not in pid:
        return None, None
    
    # Split text like "UserName@2" into "UserName" and "2"
    a, d = pid.split('@', 1)
    
    # Try to convert "2" into an integer
    try:
        d_int = int(d)
    except ValueError:
        d_int = None
    
    return a, d_int

# Apply this to the whole parent_id column
comments["parent_author"], comments["parent_depth"] = zip(
    *comments["parent_id"].map(parse_parent)
)

# Make sure depth columns are integer-like (nullable Int64)
comments["parent_depth"] = comments["parent_depth"].astype("Int64")
comments["depth"] = comments["depth"].astype("Int64")


In [85]:
# 1) Find all OP replies (comments written by post author replying to someone)
op_replies = comments[
    (comments["is_post_author"]) & comments["parent_author"].notna()
].copy()
print("Total OP replies to someone:", len(op_replies))

# 2) Build a table of all comments that are NOT by OP (these are potential parents)
parents = comments[~comments["is_post_author"]].copy()

# Make sure depth is integer
parents["depth"] = parents["depth"].astype("Int64")

# 3) Build a key that uniquely identifies each parent comment:
#    key = post_id || comment_author || depth
parents["parent_key"] = (
    parents["post_id"].astype(str)
    + "||" + parents["comment_author"].astype(str)
    + "||" + parents["depth"].astype(str)
)

# 4) For each OP reply, build the matching key of the comment they replied to
op_replies["parent_depth"] = op_replies["parent_depth"].astype("Int64")
op_replies["parent_key"] = (
    op_replies["post_id"].astype(str)
    + "||" + op_replies["parent_author"].astype(str)
    + "||" + op_replies["parent_depth"].astype(str)
)

# 5) Collect all keys that OP replied to into a set (fast lookup)
replied_keys = set(op_replies["parent_key"].dropna())

# 6) For every non-OP comment, mark if OP replied directly to it
parents["op_replied_here"] = parents["parent_key"].isin(replied_keys)

print("\nInitial counts for op_replied_here:")
print(parents["op_replied_here"].value_counts())


Total OP replies to someone: 1068

Initial counts for op_replied_here:
op_replied_here
False    3381
True     1096
Name: count, dtype: int64


In [88]:
# 1) Give each comment an index inside its post (0,1,2... in the order they appear)
comments["within_post_idx"] = comments.groupby("post_id").cumcount()

# Rebuild parents and op_replies with this index
parents = comments[~comments["is_post_author"]].copy()
parents["depth"] = parents["depth"].astype("Int64")
parents["within_post_idx"] = parents.groupby("post_id").cumcount()

op_replies = comments[
    (comments["is_post_author"]) & comments["parent_author"].notna()
].copy()
op_replies["parent_depth"] = op_replies["parent_depth"].astype("Int64")
op_replies["within_post_idx"] = op_replies.groupby("post_id").cumcount()

# Parent keys again
parents["parent_key"] = (
    parents["post_id"].astype(str)
    + "||" + parents["comment_author"].astype(str)
    + "||" + parents["depth"].astype(str)
)

op_replies["parent_key"] = (
    op_replies["post_id"].astype(str)
    + "||" + op_replies["parent_author"].astype(str)
    + "||" + op_replies["parent_depth"].astype(str)
)

# 2) For each parent comment, get the earliest index where OP replied
min_reply_idx = (
    op_replies.groupby("parent_key")["within_post_idx"]
    .min()
    .rename("op_reply_idx")
)

# 3) Attach this to parents
parents = parents.merge(min_reply_idx, on="parent_key", how="left")

# 4) Compute gap between when comment was posted and when OP replied (in order steps)
parents["reply_index_gap"] = parents["op_reply_idx"] - parents["within_post_idx"]


In [90]:
import re

def text_flags(text):
    if not isinstance(text, str):
        text = ""

    t = text.lower()

    return pd.Series({
        # kindness / support
        "has_thank": bool(re.search(r"\b(thanks|thank you|appreciate)\b", t)),
        "has_sorry": bool(re.search(r"\b(sorry|i'm sorry|im sorry)\b", t)),
        "has_validation": bool(re.search(
            r"\b(you did nothing wrong|not your fault|you were assaulted|you were raped)\b",
            t
        )),

        # harshness / blame
        "has_blaming": bool(re.search(
            r"\b(you should have|your fault|why didn.?t you|why didn.?t u)\b",
            t
        )),
        "has_minimize": bool(re.search(
            r"\b(overreacting|not that bad|wasn.?t really rape|doesn.?t count)\b",
            t
        )),

        # questions (often probing / clarifying)
        "has_question": "?" in t,

        # length features
        "len_chars": len(t),
        "len_words": len(t.split()),
    })

# Apply to all non-OP comments (parents table)
# Reset index so alignment is clean
parents = parents.reset_index(drop=True)
features = parents["comment_body"].apply(text_flags).reset_index(drop=True)

# Assign features column by column (avoids duplicate columns)
for col in features.columns:
    parents[col] = features[col]


In [94]:
# 1) Optional: drop duplicate columns if any
parents = parents.loc[:, ~parents.columns.duplicated()]

# 2) Create the boolean flag: did OP reply to this parent comment?
parents["op_replied_here"] = parents["op_reply_idx"].notna()

# 3) Quick sanity check
print("\nCounts for op_replied_here:")
print(parents["op_replied_here"].value_counts())

print("\nFirst few rows:")
print(parents[[
    "post_id", "comment_author", "comment_body",
    "comment_score", "depth", "within_post_idx",
    "op_reply_idx", "reply_index_gap", "op_replied_here",
    "has_validation", "has_blaming", "has_minimize", "has_question",
    "len_words"
]].head(10))



Counts for op_replied_here:
op_replied_here
False    3381
True     1096
Name: count, dtype: int64

First few rows:
   post_id        comment_author  \
0  1hv4zdy      Dear-Clothes3568   
1  1hv4zdy     SortDifferent2481   
2  1hv4zdy            HowMusikal   
3  1f2kibx               larelya   
4  1f2kibx               larelya   
5  1ccnt06        Dellynightmare   
6  1ccnt06  Waste_Translator_335   
7  1c09z00        Hello_Hangnail   
8  1bautv8       Coolcucumber415   
9  1bautv8      DanielletheMoran   

                                        comment_body  comment_score  depth  \
0  Both SA.\n\nP.S. Keep away with crazy people. ...              3      0   
1  I would say hers is forced because she was lit...              3      0   
2  As a woman who has been SA’d multiple times, I...              2      0   
3  Oh dear, that's a tough series of events to ha...              2      0   
4  I can relate to that desire to make sense of i...              2      2   
5  &gt;made me go i

In [100]:
# Basic info
print(parents.shape)
print(parents.columns)

# How many comments did OP reply / not reply to?
print("\nCounts for op_replied_here:")
print(parents["op_replied_here"].value_counts())

print("\nFirst few rows:")
parents[[
    "post_id", "comment_author", "comment_body",
    "comment_score", "depth", "within_post_idx",
    "op_replied_here", "reply_index_gap",
    "has_validation", "has_blaming", "has_minimize",
    "has_question", "len_words"
]].head(10)


(4477, 22)
Index(['post_id', 'parent_id', 'depth', 'comment_author', 'comment_body',
       'comment_score', 'is_post_author', 'parent_author', 'parent_depth',
       'within_post_idx', 'parent_key', 'op_reply_idx', 'reply_index_gap',
       'op_replied_here', 'has_thank', 'has_sorry', 'has_validation',
       'has_blaming', 'has_minimize', 'has_question', 'len_chars',
       'len_words'],
      dtype='object')

Counts for op_replied_here:
op_replied_here
False    3381
True     1096
Name: count, dtype: int64

First few rows:


Unnamed: 0,post_id,comment_author,comment_body,comment_score,depth,within_post_idx,op_replied_here,reply_index_gap,has_validation,has_blaming,has_minimize,has_question,len_words
0,1hv4zdy,Dear-Clothes3568,Both SA.\n\nP.S. Keep away with crazy people. ...,3,0,0,True,0.0,False,False,False,False,13
1,1hv4zdy,SortDifferent2481,I would say hers is forced because she was lit...,3,0,1,True,0.0,False,False,False,False,71
2,1hv4zdy,HowMusikal,"As a woman who has been SA’d multiple times, I...",2,0,2,True,0.0,False,False,False,True,115
3,1f2kibx,larelya,"Oh dear, that's a tough series of events to ha...",2,0,0,True,0.0,False,False,False,True,111
4,1f2kibx,larelya,I can relate to that desire to make sense of i...,2,2,1,False,,False,False,False,True,124
5,1ccnt06,Dellynightmare,&gt;made me go in the closet\n\nWhat do you me...,1,0,0,True,0.0,False,False,False,True,39
6,1ccnt06,Waste_Translator_335,It was SA either way. Dont justify it. OP was ...,1,1,1,False,,False,False,False,False,13
7,1c09z00,Hello_Hangnail,"Yes, it was assault. You specifically told him...",3,0,0,False,,False,False,False,False,133
8,1bautv8,Coolcucumber415,"yes, this is assault, a person cannot give ent...",5,0,0,False,,False,True,False,False,46
9,1bautv8,DanielletheMoran,"I’m so sorry this happened to you, I definitel...",1,0,1,False,,False,False,False,False,24


In [105]:
import pandas as pd

# Make sure the boolean-like columns are actual bool, no NaN
bool_cols = [
    "has_validation",
    "has_blaming",
    "has_minimize",
    "has_question",
    "op_replied_here",
]

for col in bool_cols:
    parents[col] = parents[col].fillna(False).astype(bool)

# 1) How often does OP reply when there is validation?
print("Validation vs OP reply:")
print(pd.crosstab(
    parents["has_validation"],
    parents["op_replied_here"],
    normalize="index"  # show percentages within each row
))

# 2) How often does OP reply to blaming comments?
print("\nBlaming vs OP reply:")
print(pd.crosstab(
    parents["has_blaming"],
    parents["op_replied_here"],
    normalize="index"
))

# 3) How often does OP reply when the comment minimizes?
print("\nMinimize vs OP reply:")
print(pd.crosstab(
    parents["has_minimize"],
    parents["op_replied_here"],
    normalize="index"
))

# 4) How often does OP reply to comments that ask questions?
print("\nQuestion vs OP reply:")
print(pd.crosstab(
    parents["has_question"],
    parents["op_replied_here"],
    normalize="index"
))


Validation vs OP reply:
op_replied_here     False     True 
has_validation                     
False            0.756923  0.243077
True             0.726190  0.273810

Blaming vs OP reply:
op_replied_here     False     True 
has_blaming                        
False            0.757937  0.242063
True             0.715278  0.284722

Minimize vs OP reply:
op_replied_here     False     True 
has_minimize                       
False            0.756092  0.243908
True             0.666667  0.333333

Question vs OP reply:
op_replied_here     False     True 
has_question                       
False            0.741853  0.258147
True             0.804852  0.195148


In [109]:
numeric_cols = [
    "len_words",          # length of the comment
    "comment_score",      # upvotes
    "within_post_idx",    # earlier vs later in thread
    "reply_index_gap"     # how many comments later OP replied (NaN if no reply)
]

group_means = parents.groupby("op_replied_here")[numeric_cols].mean()
print(group_means)


                 len_words  comment_score  within_post_idx  reply_index_gap
op_replied_here                                                            
False            62.477669       3.317657        10.233363              NaN
True             76.981752       6.378650         6.843978        -4.906934


In [112]:
def show_examples(mask, n=3, label=""):
    print("\n===== EXAMPLES:", label, "=====")
    count = int(mask.sum())
    if count == 0:
        print("No examples found for this condition.")
        return

    sample = parents[mask].sample(min(n, count), random_state=42)
    for _, row in sample.iterrows():
        print("\n---")
        print("Subreddit:", row.get("subreddit", "NA"))
        print("Post ID:", row["post_id"])
        print("Comment author:", row["comment_author"])
        print("Comment score:", row["comment_score"])
        print("within_post_idx:", row["within_post_idx"])
        print("OP replied here:", row["op_replied_here"])
        print("Comment body:\n", row["comment_body"][:500], "\n")

# 1) Validating comments OP replied to
mask1 = (parents["has_validation"]) & (parents["op_replied_here"])
show_examples(mask1, n=3, label="Validating comments that OP replied to")

# 2) Validating comments OP ignored
mask2 = (parents["has_validation"]) & (~parents["op_replied_here"])
show_examples(mask2, n=3, label="Validating comments that OP did NOT reply to")



===== EXAMPLES: Validating comments that OP replied to =====

---
Subreddit: NA
Post ID: k5wyvy
Comment author: [deleted]
Comment score: 5
within_post_idx: 6
OP replied here: True
Comment body:
 No doubt this is assault. You told her no and she blackmailed you. That is not consent. I'm so sorry. Please remember that this was not your fault and you did nothing wrong. It's totally okay to feel hurt and upset about this, but you are not alone. 


---
Subreddit: NA
Post ID: jafsse
Comment author: Lezbehonest2288
Comment score: 3
within_post_idx: 0
OP replied here: True
Comment body:
 If you feel you were assaulted then that's what it was. Your 13 dont try and fix anyone, you need to work on yourself first. I get that he comes from a crap home life but that's not your responsibility that's an adult's responsibility. Eventually you will learn all of this and hopefully you have someone in your life to talk to about the online abuse your seeing. Ask your parents if you can talk to someone pro

In [114]:
# 3) Blaming comments OP replied to
mask3 = (parents["has_blaming"]) & (parents["op_replied_here"])
show_examples(mask3, n=3, label="Blaming comments that OP replied to")

# 4) Blaming comments OP ignored
mask4 = (parents["has_blaming"]) & (~parents["op_replied_here"])
show_examples(mask4, n=3, label="Blaming comments that OP did NOT reply to")



===== EXAMPLES: Blaming comments that OP replied to =====

---
Subreddit: NA
Post ID: 1o2vk9c
Comment author: Cre4mPie777
Comment score: 1
within_post_idx: 1
OP replied here: True
Comment body:
 I had a boyfriend exactly like this. Eventually i got dependent and attached to the point that i couldnt leave. He ended up leaving me and i found out later he had shown intimate pictures around and accused me of making false r@pe allegations. Get out and dont look back. Get a restraining order if you need to. If he hurts or kills himself it is NOT your fault. You are not crazy and none of that is okay. I hope you find your peace and happiness lovely xx :) 


---
Subreddit: NA
Post ID: 1ouqac6
Comment author: gonetohelp
Comment score: 2
within_post_idx: 2
OP replied here: True
Comment body:
 But I want you to know that what happened was not your fault. It wasn’t. You were taken advantage of and used by a very insidious brand of abuser: The Apologizer/Acknowledger. Willing to admit they made a 