In [15]:
import os
import pandas as pd

# 📌 Local export location inside the project (created if it does not exist yet)
EXPORT_DIR = "../../data/raw"
os.makedirs(EXPORT_DIR, exist_ok=True)
EXPORT_PATH = os.path.join(EXPORT_DIR, "reddit_unlabeled_for_labeling.csv")

# 📌 Location of the original CSV files.
# The default is the path this notebook was written against; set the
# REDDIT_DATA_DIR environment variable to run it on a machine where the
# files live elsewhere, without editing the notebook.
DRIVE_PATH = os.environ.get("REDDIT_DATA_DIR", "G:/Meine Ablage/reddit/")
REDDIT_POSTS_CSV = os.path.join(DRIVE_PATH, "reddit_posts_original.csv")
REDDIT_COMMENTS_CSV = os.path.join(DRIVE_PATH, "reddit_comments_original.csv")

# 🔍 Load both files using "|" as the field separator.
# NOTE: on_bad_lines="skip" drops rows that cannot be parsed without raising
# or warning, so the loaded frames may contain fewer rows than the files.
df_posts_raw = pd.read_csv(
    REDDIT_POSTS_CSV, sep="|", encoding="utf-8", on_bad_lines="skip"
)
df_comments_raw = pd.read_csv(
    REDDIT_COMMENTS_CSV, sep="|", encoding="utf-8", on_bad_lines="skip"
)

In [16]:
# Print the column labels of each raw frame: comments first, then posts.
for raw_frame in (df_comments_raw, df_posts_raw):
    print(raw_frame.columns)

Index(['post_id', 'comment_id', 'author', 'date', 'time', 'score', 'selftext'], dtype='object')
Index(['post_id', 'crypto', 'search_term', 'subreddit', 'title', 'author',
       'date', 'time', 'score', 'num_comments', 'selftext'],
      dtype='object')


In [17]:
# Number of rows to draw from each source, and the seed that makes the draw
# reproducible across runs.
N_SAMPLES_PER_SOURCE = 200
RANDOM_STATE = 42

# 🧾 Unify both frames to the same three columns: id, text, source.
# For comments, "id" is the post_id column (comment_id is dropped), so it is
# not guaranteed to be unique per comment row.
# dropna() removes every row with a missing value in any of the kept columns.
df_comments_raw = (
    df_comments_raw
    .rename(columns={"post_id": "id", "selftext": "text"})
    .assign(source="comment")
    .loc[:, ["id", "text", "source"]]
    .dropna()
)

df_posts_raw = (
    df_posts_raw
    .rename(columns={"post_id": "id", "selftext": "text"})
    .assign(source="post")
    .loc[:, ["id", "text", "source"]]
    .dropna()
)

# 🔄 Draw up to N_SAMPLES_PER_SOURCE random rows from each source.
# DataFrame.sample raises ValueError when n exceeds the number of rows
# (sampling without replacement), so the size is clamped to what is available.
df_comments_sampled = df_comments_raw.sample(
    n=min(N_SAMPLES_PER_SOURCE, len(df_comments_raw)),
    random_state=RANDOM_STATE,
)
df_posts_sampled = df_posts_raw.sample(
    n=min(N_SAMPLES_PER_SOURCE, len(df_posts_raw)),
    random_state=RANDOM_STATE,
)

# 🧪 Combine (posts first, then comments) and add an empty label column
# to be filled in during labeling.
df_to_label = pd.concat([df_posts_sampled, df_comments_sampled], ignore_index=True)
df_to_label["label"] = ""

# 💾 Export
df_to_label.to_csv(EXPORT_PATH, index=False)

# Report the counts that were actually sampled rather than the requested ones.
print(
    f"✅ Exportiert: {len(df_to_label)} Texte "
    f"({len(df_posts_sampled)} Posts + {len(df_comments_sampled)} Comments) → {EXPORT_PATH}"
)

✅ Exportiert: 400 Texte (200 Posts + 200 Comments) → ../../data/raw\reddit_unlabeled_for_labeling.csv
