## NLP Project - Preprocessing and RoBERTa

Loading Reddit Data


In [None]:
import numpy as np
import pandas as pd
import re
import json 

import matplotlib as plt
import seaborn as sns 

plt.style.use('ggplot')
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from collections import defaultdict

def load_json(filename):
    """Read *filename* as UTF-8 text and return the decoded JSON document."""
    with open(filename, encoding='utf-8') as handle:
        return json.load(handle)

# Placeholder bodies that stand in for deleted or moderator-removed content.
_PLACEHOLDER_TEXTS = ("[deleted]", "[removed]")

# Lower-case fragments identifying bot/moderator boilerplate rather than real content.
_BANNED_PHRASES = (
    "^^^^automod",
    "welcome to /r/amitheasshole.",
    "your post has been removed",
    "#read this carefully",
    "[removed]",
    "[deleted]",
)

# Historical output location; kept as the default so existing calls behave as before.
_DEFAULT_OUTPUT_PATH = "C:/Users/alexb/Desktop/Delft Minor/NLP Project/NLP Git/NLP_project/data/output.json"


def _iter_ndjson(path, max_lines):
    """Yield one dict per well-formed JSON-object line of the NDJSON file *path*."""
    with open(path, 'r', encoding='utf-8') as file:
        for line_no, line in enumerate(file, 1):
            # Same cap as before: stop as soon as the line number reaches max_lines.
            if line_no >= max_lines:
                break
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                continue  # skip blank or corrupt lines instead of aborting the run
            if isinstance(record, dict):
                yield record


def _text_field(record, key):
    """Return record[key] stripped, or '' when it is missing or not a string (e.g. null)."""
    value = record.get(key)
    return value.strip() if isinstance(value, str) else ''


def _has_banned_phrase(text):
    """Return True when *text* contains any boilerplate marker (case-insensitive)."""
    lowered = text.lower()
    return any(phrase in lowered for phrase in _BANNED_PHRASES)


def load_reddit_data(comments_directory, submissions_directory,
                     output_path=_DEFAULT_OUTPUT_PATH, max_lines=10000000):
    """Join submissions with their comments and dump one string per submission.

    Each output row has the shape
    ``<title + body>[======>]<comments joined by a single space> [==rq1==>] score=..., length=..., upvote_ratio=...``.
    Rows are ordered by the first appearance of each submission's id in the
    comments file.

    Args:
        comments_directory: Path to the comments NDJSON file (despite the name,
            a file path, not a directory).
        submissions_directory: Path to the submissions NDJSON file.
        output_path: Where the resulting JSON list is written.
        max_lines: Reading of each file stops when the 1-based line number
            reaches this value.

    Returns:
        The list of combined strings that was written to *output_path*.
    """
    # Submissions are read first so comments can be streamed and filtered on the
    # fly instead of holding every comment in memory.
    submissions = {}
    for record in _iter_ndjson(submissions_directory, max_lines):
        submission_id = _text_field(record, 'name')
        if not submission_id:
            continue  # nothing to join comments against
        title = _text_field(record, 'title')
        body = _text_field(record, 'selftext')
        # Test the body on its own: with a non-empty title the concatenation
        # can never equal a bare placeholder.
        if body in _PLACEHOLDER_TEXTS or (title + body).strip() in _PLACEHOLDER_TEXTS:
            continue
        text = (title + " " + body).strip()
        submissions[submission_id] = {
            "text": text,
            "score": record.get('score'),
            "post_len": len(text),
            "upvote_ratio": record.get('upvote_ratio'),
        }

    grouped = defaultdict(list)
    for record in _iter_ndjson(comments_directory, max_lines):
        link_id = _text_field(record, 'link_id')
        if link_id not in submissions:
            continue
        # Touch the bucket even for rejected comments so row order still follows
        # the first comment seen for each submission.
        bucket = grouped[link_id]
        comment = _text_field(record, 'body')
        if comment and not _has_banned_phrase(comment):
            bucket.append(comment)

    res = []
    for link_id, clean_comments in grouped.items():
        # Submissions left without any usable comment are not exported.
        if not clean_comments:
            continue
        submission = submissions[link_id]
        # NOTE(review): comments are joined with a single space, so individual
        # comment boundaries cannot be recovered from the output.
        combined = (
            submission["text"]
            + "[======>]"
            + " ".join(clean_comments)
            + f" [==rq1==>] score={submission['score']}, length={submission['post_len']}, upvote_ratio={submission['upvote_ratio']}"
        )
        # Second pass over the whole row: also drops rows whose submission text
        # itself carries boilerplate.
        if _has_banned_phrase(combined):
            continue
        res.append(combined)

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(res, f, ensure_ascii=False, indent=2)

    return res


In [None]:
# Absolute paths to the raw comment and submission exports (newline-delimited JSON)
# that load_reddit_data() reads.
comments_dir = "C:/Users/alexb/Desktop/Delft Minor/NLP Project/NLP Git/NLP_project/data/amitheasshole_comments.ndjson"
submissions_dir = "C:/Users/alexb/Desktop/Delft Minor/NLP Project/NLP Git/NLP_project/data/amitheasshole_submissions.ndjson"
# One-off preprocessing step: load_reddit_data() writes output.json, which the next cell loads.
# Left disabled so the raw files are not re-processed on every run.
#reddit_data = load_reddit_data(comments_dir, submissions_dir) # Comment out after running it once

In [None]:
# Each stored row is one string shaped like
#   "<title + body>[======>]<comments> [==rq1==>] score=..., length=..., upvote_ratio=..."
with open("C:/Users/alexb/Desktop/Delft Minor/NLP Project/NLP Git/NLP_project/data/output.json", "r", encoding='utf-8') as f:
    rows = json.load(f)

# Delimiter patterns; the surrounding \s* swallows whitespace around each marker.
regex_seperate_post_comments = r"\s*\[======>\]\s*"
regex_seperate_rq1_values = r"\s*\[==rq1==>\]\s*"

# One row per submission: the untouched string in "raw" plus a 1-based running id.
df = pd.DataFrame({"raw": rows})
df.insert(0, "id", range(1, len(df) + 1))

# First cut: post text | everything after the post marker.
post_and_rest = df["raw"].str.split(regex_seperate_post_comments, n=1, regex=True, expand=True)
df[["post", "comments"]] = post_and_rest

# Second cut: comment text | trailing "score=..., length=..., upvote_ratio=..." metadata.
comments_and_meta = df["comments"].str.split(regex_seperate_rq1_values, n=1, regex=True, expand=True)
df[["comments", "rq1 data"]] = comments_and_meta
df["rq1 data"] = df["rq1 data"].fillna("")
df["comments"] = df["comments"].fillna("")


def _rq1_numeric(pattern):
    """Pull the first capture group of *pattern* from "rq1 data" as numbers (NaN if absent)."""
    captured = df["rq1 data"].str.extract(pattern)[0]
    return pd.to_numeric(captured, errors="coerce")


df["score"] = _rq1_numeric(r"score=\s*(-?\d+(?:\.\d+)?)")
# Post length is rounded to the nearest 100 characters.
df["post_len"] = (_rq1_numeric(r"length=\s*(\d+)") / 100).round() * 100
df["upvote_ratio"] = _rq1_numeric(r"upvote_ratio=([0-9]*\.?[0-9]+)")


def _split_on_blank_lines(text):
    """Split *text* at blank lines and return the non-empty, stripped pieces."""
    pieces = re.split(r"\n\s*\n", text.strip())
    return [piece.strip() for piece in pieces if piece.strip()]


# NOTE(review): load_reddit_data() joins comments with a single space, so this split
# follows blank lines inside the text, not comment boundaries - verify this is intended.
df["comments_list"] = df["comments"].apply(_split_on_blank_lines)

In [None]:
# Display the first 10 rows of df as a quick sanity check.
df.head(10)

In [None]:
# Patterns for spotting a verdict inside a comment.
# Tolerant spelling of "asshole": letters may be separated by whitespace, "-" or "$",
# and some letters may be missing.
asshole_spellings = r"a[\s\-$]*s?[\s\-$]*s?[\s\-$]*h?[\s\-$]*o+[\s\-$]*(?:l+[\s\-$]*e*|e+[\s\-$]*l+)"

# YTA: the acronym, "you('re| are) (an|the) asshole" or "yes the asshole".
regex_spellings_YTA = re.compile(
    rf"\b(?:YTA|you\s*(?:'re|are|re)?\s*(?:an?\s*)?(?:the\s*)?{asshole_spellings}|yes\s+the\s+{asshole_spellings})\b",
    flags=re.IGNORECASE,
)

# NTA: the acronym, "you('re| are) not (an|the) asshole" or "not the asshole".
regex_spellings_NTA = re.compile(
    rf"\b(?:NTA|you\s*(?:'re|are|re)?\s*not\s*(?:an?\s*)?(?:the\s*)?{asshole_spellings}|not\s+the\s+{asshole_spellings})\b",
    flags=re.IGNORECASE,
)

# ESH: the acronym or "everyone sucks here".
regex_spelling_esh = re.compile(
    r"\b(?:ESH|everyone\s+sucks\s+here)\b",
    flags=re.IGNORECASE,
)

# NAH: the acronym or "no assholes here" / "no a-holes here".
regex_spelling_nah = re.compile(
    rf"\b(?:NAH|no\s+(?:{asshole_spellings}s?|a[\s\-$]*-?holes?)\s+here)\b",
    flags=re.IGNORECASE,
)


def extract_verdict(comment):
    """Return "YTA", "NTA" or None for a single comment string.

    ESH counts as "YTA" and NAH counts as "NTA". The YTA/ESH patterns are
    tried first, so a comment matching both groups is labelled "YTA".
    """
    lowered = comment.lower()
    verdict_patterns = (
        ("YTA", (regex_spellings_YTA, regex_spelling_esh)),
        ("NTA", (regex_spellings_NTA, regex_spelling_nah)),
    )
    for label, patterns in verdict_patterns:
        if any(pattern.search(lowered) for pattern in patterns):
            return label
    return None


def summarize_verdicts(comment_list):
    """Aggregate the verdicts found in one post's comments.

    Args:
        comment_list: Iterable of comment strings; None is treated as empty.

    Returns:
        A 5-tuple ``(majority, polarization, n_comments, n_verdicts, counts)``:
        majority is "YTA", "NTA" or None when no verdict was found (a tie
        resolves to "YTA", the first key of *counts*); polarization is
        ``|YTA - NTA| / n_verdicts`` (0 when there are no verdicts);
        counts is the per-label tally.
    """
    comments = comment_list or []
    counts = {"YTA": 0, "NTA": 0}

    for comment in comments:
        verdict = extract_verdict(comment)
        if verdict in counts:
            counts[verdict] += 1

    n_comments = len(comments)
    n_verdicts = sum(counts.values())
    if n_verdicts == 0:
        # Keep the same 5-field shape as the normal return so the caller's
        # column assignment stays aligned (this branch used to return 4 fields).
        return None, 0, n_comments, 0, counts

    majority = max(counts, key=counts.get)
    polarization = abs(counts["YTA"] - counts["NTA"]) / n_verdicts
    return majority, polarization, n_comments, n_verdicts, counts

# Summarise every post once and expand the result tuples into columns with a single
# DataFrame construction; building one pd.Series per row is much slower on large frames.
verdict_columns = ["majority", "polarization", "num_comments", "num_verdicts", "verdict_counts"]
df[verdict_columns] = pd.DataFrame(
    [summarize_verdicts(lst) for lst in df["comments_list"]],
    index=df.index,
    columns=verdict_columns,
)

# Rows ordered from most to fewest entries in comments_list.
sorted_list_num_comments = df.sort_values(by="num_comments", ascending=False).reset_index(drop=True)


In [None]:
# Display the first 10 rows of the frame sorted by num_comments (descending).
sorted_list_num_comments.head(10)

In [None]:

from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm  # progress bar for the scoring loop

# NOTE(review): NLTK's VADER analyser needs the "vader_lexicon" resource to be installed;
# confirm it has been downloaded in this environment.
sia = SentimentIntensityAnalyzer()

# Score every post body; iterating the two columns directly avoids building a Series
# per row the way iterrows() does.
results = {
    post_id: sia.polarity_scores(post_text)
    for post_id, post_text in tqdm(
        zip(sorted_list_num_comments["id"], sorted_list_num_comments["post"]),
        total=len(sorted_list_num_comments),
    )
}

# results maps id -> score dict; transposing gives one row per post, one column per score.
vader_results_posts = pd.DataFrame(results).T
vader_results_posts = vader_results_posts.reset_index().rename(columns={'index': 'id'})

# Attach the post metadata; the join key is spelled out instead of inferred from shared columns.
vader_results_posts_soretd_num_comments = vader_results_posts.merge(
    sorted_list_num_comments, how='left', on='id'
)
vader_results_posts_soretd_num_comments

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax
import torch
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

# --- 1) Take a reproducible 20% random sample (simple random sampling, not stratified) ---
test_df = sorted_list_num_comments.sample(frac=0.20, random_state=42).reset_index(drop=True)

# --- 2) Prepare data from the sample only ---
ids   = test_df["id"].tolist()
texts = test_df["post"].astype(str).tolist()

batch_size = 64
max_length = 256  # posts are truncated to this many tokens

# --- 3) Load the model and score in batches ---
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
# `device` was never defined before; use the GPU when available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL).to(device)
model.eval()

all_ids, all_probs = [], []

for i in tqdm(range(0, len(texts), batch_size), desc="RoBERTa scoring (20% sample, batched)"):
    batch_texts = texts[i:i+batch_size]
    enc = tokenizer(
        batch_texts,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_length
    ).to(device)  # inputs must live on the same device as the model

    with torch.no_grad():
        logits = model(**enc).logits

    probs = softmax(logits.cpu().numpy(), axis=1)

    # Column order assumed to be negative / neutral / positive, as in the original
    # cell's comments - TODO confirm against the model card.
    neg_prob = probs[:, 0]
    pos_prob = probs[:, 2]

    # VADER-like single score in [-1, 1].
    compound = pos_prob - neg_prob

    all_ids.extend(ids[i:i+batch_size])
    all_probs.extend(compound.tolist())  # was the undefined name `hate_prob`

# --- 4) Build results for the sample and merge (like VADER flow) ---
roberta_scores = pd.DataFrame({"id": all_ids, "sent_compound": all_probs})
# Later cells still select this score under the legacy column name "hate_prob";
# keep an alias so they run. It is a sentiment score, not a hate-speech probability.
roberta_scores["hate_prob"] = roberta_scores["sent_compound"]

# Join the scores onto the post metadata. The previous merge joined the full table with
# test_df, which dropped the scores and duplicated every column with _x/_y suffixes.
roberta_results_posts_sorted_num_comments = roberta_scores.merge(
    sorted_list_num_comments, how="left", on="id"
)

roberta_results_posts_sorted_num_comments.head()



In [None]:
# Binary target for modelling: YTA -> 1, NTA -> 0; any other majority value becomes NaN.
roberta_results_posts_sorted_num_comments["verdict_binary"] = (
    roberta_results_posts_sorted_num_comments["majority"]
    .map({"YTA": 1, "NTA": 0})
)

In [None]:
# Keep only rows with a usable binary verdict. The bare `.loc[...]` column selection that
# used to open this cell was dropped: its result was neither stored nor displayed.
df_clean_roberta = roberta_results_posts_sorted_num_comments.dropna(subset=["verdict_binary"]).copy()
df_clean_roberta.head()

# Data Exploration and Plots

## Exploration

In [None]:
# Binary target for modelling: YTA -> 1, NTA -> 0; any other majority value becomes NaN.
vader_results_posts_soretd_num_comments["verdict_binary"] = (
    vader_results_posts_soretd_num_comments["majority"]
    .map({"YTA": 1, "NTA": 0})
)

In [None]:
# Keep only rows with a usable binary verdict. The bare `.loc[...]` column selection that
# used to open this cell was dropped: its result was neither stored nor displayed.
df_clean_vader = vader_results_posts_soretd_num_comments.dropna(subset=["verdict_binary"]).copy()
df_clean_vader.head()

### Logistic Regression on post length vs YTA verdict

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix


features = ["post_len", "score", "polarization", "num_comments"]

# Coerce to numeric and drop incomplete rows: the estimator cannot be fitted on NaN.
model_df = (
    df_clean_vader[features + ["verdict_binary"]]
    .apply(pd.to_numeric, errors="coerce")
    .dropna()
)
X = model_df[features]
y = model_df["verdict_binary"]

# The pipeline must be created here: this cell previously used `log_reg_pipeline`
# before any cell had defined it.
log_reg_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(max_iter=1000))
])
log_reg_pipeline.fit(X, y)

# Range of post lengths
post_len_range = np.linspace(model_df["post_len"].min(), model_df["post_len"].max(), 500)

# Build prediction DataFrame: vary post_len, hold the other features at their mean
X_pred = pd.DataFrame({
    "post_len": post_len_range,
    "score": model_df["score"].mean(),
    "polarization": model_df["polarization"].mean(),
    "num_comments": model_df["num_comments"].mean(),
})[features]  # enforce the column order used for fitting

# Predict probability of YTA
y_pred_prob = log_reg_pipeline.predict_proba(X_pred)[:, 1]

# Plot
plt.figure(figsize=(8,5))
plt.plot(post_len_range, y_pred_prob, color="tomato")
plt.xlabel("Post Length")
plt.ylabel("Predicted Probability (YTA)")
plt.title("Predicted Probability of YTA vs Post Length")
plt.grid(True, linestyle="--", alpha=0.6)
plt.show()

# Coefficients (on standardised features, so magnitudes are comparable)
coef_df = pd.DataFrame({
    "Feature": features,
    "Coefficient": log_reg_pipeline.named_steps["model"].coef_[0]
}).sort_values("Coefficient", ascending=False)

plt.figure(figsize=(7,4))
sns.barplot(x="Coefficient", y="Feature", data=coef_df, palette="coolwarm")
plt.title("Feature Influence on YTA vs NTA (Logistic Regression)")
plt.axvline(0, color="black", linestyle="--")
plt.show()

# In-sample metrics (the model is evaluated on the data it was fitted on)
y_pred = log_reg_pipeline.predict(X)
cm = confusion_matrix(y, y_pred)  # computed once, reused for the printout and the heatmap
print("Accuracy:", accuracy_score(y, y_pred))
print("\nConfusion matrix:\n", cm)

sns.heatmap(cm, annot=True, fmt="d", cmap="Reds",
            xticklabels=["Pred NTA", "Pred YTA"],
            yticklabels=["True NTA", "True YTA"])
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix: Logistic Regression (YTA vs NTA)")
plt.show()


## LR with Vader


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

# ---- take 20% sample for fair comparison ----
dfv = df_clean_vader.sample(frac=0.20, random_state=42).reset_index(drop=True)

features = ["post_len","compound" ,"score", "polarization", "num_comments"]
# ensure numeric & drop missing rows for modeling
model_columns = features + ["verdict_binary"]
dfv[model_columns] = dfv[model_columns].apply(pd.to_numeric, errors="coerce")
dfv = dfv.dropna(subset=model_columns)

X = dfv[features]
y = dfv["verdict_binary"]

# pipeline: standardise the features, then fit a logistic regression
log_reg_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(max_iter=1000))
])
log_reg_pipeline.fit(X, y)

# ---- partial dependence: vary post_len, hold others at mean ----
post_len_range = np.linspace(dfv["post_len"].min(), dfv["post_len"].max(), 500)
means = dfv[features].mean(numeric_only=True)

# Every feature starts as a constant column at its mean; post_len is then replaced by
# the sweep. Building from `features` keeps the column order used for fitting.
X_pred = pd.DataFrame({
    name: np.full_like(post_len_range, means[name], dtype=float) for name in features
})
X_pred["post_len"] = post_len_range

# Predict probability of YTA
y_pred_prob = log_reg_pipeline.predict_proba(X_pred)[:, 1]

# Plot P(YTA) vs post_len
plt.figure(figsize=(8,5))
plt.plot(post_len_range, y_pred_prob, color="tomato")
plt.xlabel("Post Length")
plt.ylabel("Predicted Probability (YTA)")
plt.title("Predicted Probability of YTA vs Post Length (VADER)")
plt.grid(True, linestyle="--", alpha=0.6)
plt.show()

# Coefficients
coef_df = pd.DataFrame({
    "Feature": features,
    "Coefficient": log_reg_pipeline.named_steps["model"].coef_[0]
}).sort_values("Coefficient", ascending=False)

plt.figure(figsize=(7,4))
sns.barplot(x="Coefficient", y="Feature", data=coef_df, palette="coolwarm")
plt.title("Feature Influence on YTA vs NTA (VADER)")
plt.axvline(0, color="black", linestyle="--")
plt.show()

# In-sample metrics: the model is scored on the same 20% sample it was fitted on
y_pred = log_reg_pipeline.predict(X)
print("Accuracy:", accuracy_score(y, y_pred))
cm = confusion_matrix(y, y_pred)
print("\nConfusion matrix Vader:\n", cm)  # label previously read "matri"

sns.heatmap(cm, annot=True, fmt="d", cmap="Reds",
            xticklabels=["Pred NTA", "Pred YTA"],
            yticklabels=["True NTA", "True YTA"])
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix: Logistic Regression Vader")
plt.show()


### LR with Roberta
- using compound and upvote ratio to try to improve the regression

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression

# Model inputs; the target is the binary verdict column.
features = ["post_len", "score", "upvote_ratio", "polarization", "num_comments", "hate_prob"]

# Work on a copy, force the inputs to numeric and discard incomplete rows.
dfm = df_clean_roberta.copy()
dfm[features] = dfm[features].apply(pd.to_numeric, errors="coerce")
dfm = dfm.dropna(subset=features + ["verdict_binary"]).reset_index(drop=True)

X = dfm[features]
y = dfm["verdict_binary"]

# Standardise, then fit a logistic regression.
log_reg_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(max_iter=1000))
])
log_reg_pipeline.fit(X, y)

# Sweep post_len across its observed range while every other input sits at its mean.
post_len_range = np.linspace(dfm["post_len"].min(), dfm["post_len"].max(), 500)
means = dfm[features].mean(numeric_only=True)

X_pred = pd.DataFrame({
    name: np.full_like(post_len_range, means[name], dtype=float) for name in features
}).assign(post_len=post_len_range)  # same column order as `features`

# Predicted P(YTA) along the sweep.
y_pred_prob = log_reg_pipeline.predict_proba(X_pred)[:, 1]
plt.figure(figsize=(8,5))
plt.plot(post_len_range, y_pred_prob, color="tomato")
plt.xlabel("Post Length")
plt.ylabel("Predicted Probability (YTA)")
plt.title("Predicted Probability of YTA vs Post Length (RoBerta)")
plt.grid(True, linestyle="--", alpha=0.6)
plt.show()

# Fitted coefficients, largest first.
fitted_weights = log_reg_pipeline.named_steps["model"].coef_[0]
coef_df = pd.DataFrame({"Feature": features, "Coefficient": fitted_weights})
coef_df = coef_df.sort_values("Coefficient", ascending=False)

plt.figure(figsize=(7,4))
sns.barplot(x="Coefficient", y="Feature", data=coef_df, palette="coolwarm")
plt.title("Feature Influence on YTA vs NTA (RoBerta)")
plt.axvline(0, color="black", linestyle="--")
plt.show()

# Accuracy and confusion matrix on the data the model was fitted on.
y_pred = log_reg_pipeline.predict(X)
print("Accuracy:", accuracy_score(y, y_pred))
cm = confusion_matrix(y, y_pred)
print("\nConfusion matrix:\n", cm)

sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Reds",
    xticklabels=["Pred NTA", "Pred YTA"],
    yticklabels=["True NTA", "True YTA"],
)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix: Logistic Regression (RoBerta)")
plt.show()


In [None]:
# Bar chart of the coefficients of whichever `log_reg_pipeline` was fitted last.
fitted_weights = log_reg_pipeline.named_steps["model"].coef_[0]
coef_df = pd.DataFrame({"Feature": features, "Coefficient": fitted_weights})
coef_df = coef_df.sort_values("Coefficient", ascending=False)

plt.figure(figsize=(7,4))
sns.barplot(data=coef_df, x="Coefficient", y="Feature", palette="coolwarm")
plt.axvline(0, color="black", linestyle="--")  # zero line separates positive from negative weights
plt.title("Feature Influence on YTA vs NTA (Logistic Regression)")
plt.show()

## Plot 

Polarization by Verdict

In [None]:
# Density of the polarization score, drawn separately for each binary verdict.
# common_norm=False normalises each verdict's curve on its own.
plt.figure(figsize=(7,4))
sns.kdeplot(data=dfv, x="polarization", hue="verdict_binary", fill=True, common_norm=False)
# Polarization is computed from the YTA/NTA verdict counts, not from sentiment,
# so the title no longer calls it "Comment Sentiment".
plt.title("Verdict Polarization by Majority Verdict")
plt.xlabel("Polarization Score")
# The argument-less plt.xticks() call was removed: it only reads the current ticks.
plt.show()





Post Length vs Score 

In [None]:

import matplotlib_inline
import matplotlib.pyplot as plt

# Post length on x, score on y.
sns.scatterplot(x="post_len", y="score", data=df, alpha=0.3, color="tomato")
# The title promises a log scale that was never applied; "symlog" also copes with
# zero or negative scores, which a plain log axis would drop.
plt.yscale("symlog")
# Axis labels now match the plotted columns (they were swapped).
plt.xlabel("Post Length")
plt.ylabel("Score")
plt.title("Post Length vs Score (log scale)")
plt.show()

In [None]:
# Histogram (40 bins) of the polarization score with a KDE curve overlaid.
plt.figure(figsize=(7,4))
sns.histplot(data=dfv, x="polarization", kde=True, bins=40, color="purple")
plt.xlabel("Polarization Score")
plt.ylabel("Frequency")
plt.title("Distribution of Comment Polarity (Polarization)")
plt.grid(True, linestyle="--", alpha=0.5)
plt.show()

In [None]:
# Binary verdict plotted against post length.
sns.lineplot(
    data=vader_results_posts_soretd_num_comments,
    x="post_len",
    y="verdict_binary",
    alpha=0.3,
)

In [None]:
# Model A: every feature, including num_comments.
features_a = ["post_len", "score", "upvote_ratio", "polarization", "num_comments", "hate_prob"]

# Model B: the same features without num_comments (content only).
features_b = ["post_len", "score", "upvote_ratio", "polarization", "hate_prob"]


def train_and_plot(features, title_suffix):
    """Fit a scaled logistic regression on `dfm[features]` and plot its coefficients.

    Args:
        features: Column names of the global `dfm` used as model inputs.
        title_suffix: Text appended to the plot title.
    """
    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("model", LogisticRegression(max_iter=1000)),
    ])
    pipe.fit(dfm[features], dfm["verdict_binary"])

    weights = pipe.named_steps["model"].coef_[0]
    coef_df = pd.DataFrame({"Feature": features, "Coefficient": weights})
    coef_df = coef_df.sort_values("Coefficient", ascending=False)

    plt.figure(figsize=(7,4))
    sns.barplot(data=coef_df, x="Coefficient", y="Feature", palette="coolwarm")
    plt.axvline(0, color='black', linestyle='--')
    plt.title(f"Logistic Regression Coefficients {title_suffix}")
    plt.show()


for feature_set, suffix in (
    (features_a, "(Including num_comments)"),
    (features_b, "(No num_comments / content only)"),
):
    train_and_plot(feature_set, suffix)


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Combine the VADER-side columns with the RoBERTa-side score; the inner join keeps
# only the ids present in both frames.
key = "id"

cols_v = ["id","post_len","score","upvote_ratio","polarization","num_comments",
          "compound","verdict_binary"]
cols_m = ["id","hate_prob"]

vader_part = dfv[cols_v]
roberta_part = dfm[cols_m]
dfc = vader_part.merge(roberta_part, on=key, how="inner").copy()

features_a = ["post_len","score","upvote_ratio","polarization","num_comments","compound","hate_prob"]
features_b = ["post_len","score","upvote_ratio","polarization","compound","hate_prob"]

# Force the model columns to numeric and drop rows with gaps.
model_columns = features_a + ["verdict_binary"]
dfc[model_columns] = dfc[model_columns].apply(pd.to_numeric, errors="coerce")
dfc = dfc.dropna(subset=model_columns)


def train_and_plot(df, features, title_suffix):
    """Fit a scaled logistic regression on `df[features]` and plot its coefficients.

    Args:
        df: Frame holding the feature columns and "verdict_binary".
        features: Column names used as model inputs.
        title_suffix: Text appended to the plot title.
    """
    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("model", LogisticRegression(max_iter=1000)),
    ])
    pipe.fit(df[features], df["verdict_binary"])

    weights = pipe.named_steps["model"].coef_[0]
    coef_df = pd.DataFrame({"Feature": features, "Coefficient": weights})
    coef_df = coef_df.sort_values("Coefficient", ascending=False)

    plt.figure(figsize=(7,4))
    sns.barplot(data=coef_df, x="Coefficient", y="Feature", palette="coolwarm")
    plt.axvline(0, color='black', linestyle='--')
    plt.title(f"Logistic Regression Coefficients {title_suffix}")
    plt.show()


for feature_set, suffix in (
    (features_a, "(Including num_comments)"),
    (features_b, "(Content-only: No comments)"),
):
    train_and_plot(dfc, feature_set, suffix)



In [None]:
# Human-readable axis labels for the raw feature names.
# NOTE(review): the only transformer loaded in this notebook is a sentiment model
# ("cardiffnlp/twitter-roberta-base-sentiment"); confirm that "hate_prob" really holds
# a hate-speech probability before trusting its label.
nice_names = {
    "post_len": "Post Length (chars)",
    "score": "Score (Upvotes)",
    "upvote_ratio": "Upvote Ratio",
    "polarization": "Comment Polarity",
    "compound": "Sentiment (Compound)",
    "num_comments": "Number of Comments",
    "hate_prob": "Hate Speech Probability"
}


def train_and_plot(df, features, title_suffix):
    """Fit a scaled logistic regression on `df[features]` and plot labelled coefficients.

    Args:
        df: Frame holding the feature columns and "verdict_binary".
        features: Column names used as model inputs.
        title_suffix: Text appended to the plot title.
    """
    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("model", LogisticRegression(max_iter=1000)),
    ])
    pipe.fit(df[features], df["verdict_binary"])

    # Features without an entry in nice_names keep their raw name.
    display_names = [nice_names.get(name, name) for name in features]
    weights = pipe.named_steps["model"].coef_[0]
    coef_df = pd.DataFrame({"Feature": display_names, "Coefficient": weights})
    coef_df = coef_df.sort_values("Coefficient", ascending=False)

    plt.figure(figsize=(7,4))
    sns.barplot(data=coef_df, x="Coefficient", y="Feature", palette="coolwarm")
    plt.axvline(0, color='black', linestyle='--')
    plt.title(f"Logistic Regression Coefficients {title_suffix}")
    plt.xlabel("Coefficient")
    plt.ylabel("Feature")
    plt.show()


In [None]:
# Fit and plot both feature sets on the combined frame: first with num_comments, then without it.
train_and_plot(dfc, features_a, "(Including num_comments)")
train_and_plot(dfc, features_b, "(Content-only: No comments)")
