### Load data

In [None]:
import os
import pandas as pd
import numpy as np
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from datetime import datetime
import re
import nltk

# Fetch the VADER lexicon only when it is not already available locally.
# nltk.data.find raises LookupError when the resource cannot be located in
# any directory on NLTK's data search path; only then do we download it.
try:
    nltk.data.find("sentiment/vader_lexicon.zip")
except LookupError:
    nltk.download("vader_lexicon")


In [None]:
# Locations of the interim artifacts: the preprocessed CSV read here and the
# wrangled CSV written at the end of the notebook. Paths are relative to the
# current working directory.
INTERIM_DIR = "data/interim"
PREPROCESSED_FILE, WRANGLED_FILE = (
    os.path.join(INTERIM_DIR, filename)
    for filename in ("reddit_preprocessed.csv", "reddit_wrangled.csv")
)

# Read the preprocessed table, parsing the "created" column as datetimes so
# the .dt accessor can be used on it afterwards.
df = pd.read_csv(PREPROCESSED_FILE, parse_dates=["created"])
n_rows, n_cols = df.shape
print(f"Loaded preprocessed data: {n_rows} rows, {n_cols} columns")


### Temporal Features

In [None]:
# Break the "created" timestamp into calendar/clock components, one column
# per component. Each column is named after the .dt attribute it is read from.
created_dt = df["created"].dt
for part in ("year", "month", "day", "weekday", "hour"):
    df[part] = getattr(created_dt, part)

### Text Features

In [None]:
# Normalise the text column: missing values become "" and everything is str.
df["text"] = df["text"].fillna("").astype(str)
text_col = df["text"]

# Number of whitespace-separated tokens per row.
tokens_per_row = text_col.str.split()
df["word_count"] = tokens_per_row.apply(len)

# Total number of characters per row (whitespace included).
df["char_count"] = text_col.str.len()

# Characters per word. Rows with zero words divide by 1 instead of 0, so
# empty texts yield 0 rather than NaN/inf. Because char_count includes
# whitespace, this is slightly larger than the mean token length.
safe_word_count = df["word_count"].replace(0, 1)
df["avg_word_len"] = df["char_count"] / safe_word_count

# 1 if the substring "http" occurs anywhere in the text, else 0.
df["has_url"] = text_col.str.contains(r"http", regex=True).astype(int)

### Engagement Features

# TODO: no engagement features are derived yet.
# This section used to repeat the Text Features computation verbatim
# (word_count, char_count, avg_word_len, has_url). Those columns are already
# produced by the Text Features cell, so the redundant recomputation was
# removed. Add real engagement-derived columns here once the relevant input
# columns are confirmed.

### Author and Subreddit Stats

In [None]:
# Per-author and per-subreddit activity counts.
#
# Each count is computed on the subset of rows with the relevant "source"
# value and then looked up for every row via Series.map. Compared with a
# left merge this:
#   * keeps the cell idempotent (re-running overwrites the column instead of
#     creating *_x / *_y duplicates),
#   * leaves df's index and row order untouched,
#   * lets keys with no matching rows be filled with 0 instead of NaN, so the
#     columns stay integer-typed.

# Posts per author
author_post_count = (
    df[df["source"] == "post"].groupby("author").size().rename("author_post_count")
)
df["author_post_count"] = (
    df["author"].map(author_post_count).fillna(0).astype(int)
)

# Comments per author
author_comment_count = (
    df[df["source"] == "comment"]
    .groupby("author")
    .size()
    .rename("author_comment_count")
)
df["author_comment_count"] = (
    df["author"].map(author_comment_count).fillna(0).astype(int)
)

# Posts per subreddit
subreddit_post_count = (
    df[df["source"] == "post"]
    .groupby("subreddit")
    .size()
    .rename("subreddit_post_count")
)
df["subreddit_post_count"] = (
    df["subreddit"].map(subreddit_post_count).fillna(0).astype(int)
)


### Domain / URL Features

In [None]:
# Host part of the URL (everything between "http(s)://" and the first "/",
# "?", "#" or whitespace). A trailing slash is not required, so bare URLs
# such as "https://example.com" are captured too. Rows whose url does not
# match (or is missing) get NaN.
df["main_domain"] = df["url"].str.extract(r"https?://([^/?#\s]+)", expand=False)

### Sentiment Features

In [None]:
sia = SentimentIntensityAnalyzer()


def _compound_score(text):
    """Return the VADER "compound" score for one piece of text."""
    return sia.polarity_scores(str(text))["compound"]


def _polarity_label(score):
    """Map a compound score to a label using a strict zero cutoff."""
    # NOTE(review): VADER's documentation suggests +/-0.05 cutoffs; the strict
    # zero cutoff used here is kept as-is - confirm which is intended.
    if score > 0:
        return "positive"
    if score < 0:
        return "negative"
    return "neutral"


# Score every row's text, then bucket the score into a categorical label.
df["sentiment_compound"] = df["text"].apply(_compound_score)
df["sentiment_label"] = df["sentiment_compound"].apply(_polarity_label)

### Optional Keyword / Topic Placeholders

# Reserve all-NaN placeholder columns to be populated later:
#   topic  - can fill later using BERTopic or LDA
#   stance - optional stance labeling for misinformation analysis
for placeholder in ("topic", "stance"):
    df[placeholder] = np.nan


### Save Wrangled Dataset

In [None]:
# Persist the wrangled table (without the index column) and report its size.
df.to_csv(WRANGLED_FILE, index=False)
n_rows, n_cols = df.shape
print(
    f"Wrangled dataset saved: {WRANGLED_FILE}",
    f"Final shape: {n_rows} rows, {n_cols} columns",
    sep="\n",
)
# Last expression of the cell: rich preview of the first rows.
df.head()
