In [31]:
import os
import pandas as pd
import praw
import pandas as pd
from datetime import datetime, timedelta, timezone
import os
import psaw as ps
from dotenv import load_dotenv
import time
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch.nn.functional as F
import torch
from tqdm import tqdm
from psaw import PushshiftAPI
from praw.exceptions import APIException
from transformers import AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
import torch
from transformers import AutoTokenizer


# 📌 Paths to the raw input CSV files (read by the loading step of this cell)
DRIVE_PATH = "G:/Meine Ablage/reddit/"
POSTS_CSV = os.path.join(DRIVE_PATH, "reddit_posts_original.csv")
COMMENTS_CSV = os.path.join(DRIVE_PATH, "reddit_comments_original.csv")

# 🔹 **Funktion zum Laden der CSV-Dateien**
def load_csv(filepath):
    """Read a `|`-separated CSV file and print a short debugging summary.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        pandas.DataFrame: The parsed file, or an empty DataFrame when
        ``filepath`` does not exist. Malformed lines are skipped by the parser.
    """
    if os.path.exists(filepath):
        frame = pd.read_csv(filepath, sep="|", encoding="utf-8-sig", on_bad_lines="skip")

        # Debug summary: file name, column names, dtypes and the first rows.
        summary = (
            f"\n📌 Datei geladen: {filepath}",
            f"🔹 Spalten: {frame.columns.tolist()}",
            frame.dtypes,
            frame.head(),
        )
        for item in summary:
            print(item)

        return frame

    print(f"❌ Datei nicht gefunden: {filepath}")
    return pd.DataFrame()

# 📌 **Load the raw posts and comments**
df_posts = load_csv(POSTS_CSV)
df_comments = load_csv(COMMENTS_CSV)



📌 Datei geladen: G:/Meine Ablage/reddit/reddit_posts_original.csv
🔹 Spalten: ['post_id', 'crypto', 'search_term', 'subreddit', 'title', 'author', 'date', 'time', 'score', 'num_comments', 'selftext']
post_id         object
crypto          object
search_term     object
subreddit       object
title           object
author          object
date            object
time            object
score            int64
num_comments     int64
selftext        object
dtype: object
   post_id   crypto search_term       subreddit  \
0  1j0dz4r  Bitcoin     bitcoin  CryptoCurrency   
1  1j0dghm  Bitcoin     bitcoin  CryptoCurrency   
2  1j0cx0g  Bitcoin     bitcoin  CryptoCurrency   
3  1j0cnlh  Bitcoin     bitcoin  CryptoCurrency   
4  1j08i0y  Bitcoin     bitcoin  CryptoCurrency   

                                               title               author  \
0  BlackRock Adds Its Bitcoin ETF to Model Portfo...             diwalost   
1  x-post: As the free-float of coins is low, is ...        3fkgf9fmd980

In [32]:
def clean_data(df_posts, df_comments, comment_threshold=500, min_length=5):
    """Clean the raw post and comment frames.

    Steps: drop duplicate ids, replace missing ``selftext`` with an empty
    string, drop every comment written by an author with more than
    ``comment_threshold`` comments (treated as bots/spam), and drop comments
    whose text is shorter than ``min_length`` characters.

    Args:
        df_posts: Posts; must contain the columns ``post_id`` and ``selftext``.
        df_comments: Comments; must contain ``comment_id``, ``author`` and
            ``selftext``.
        comment_threshold: Authors with strictly more comments than this are
            removed entirely.
        min_length: Minimum comment length (inclusive) that is kept.

    Returns:
        tuple[pandas.DataFrame, pandas.DataFrame]: ``(posts, comments)`` as
        independent copies; the input frames are not modified.
    """
    # 1. Remove duplicates. `.copy()` detaches the result from the input frame so
    #    the column assignments below do not raise SettingWithCopyWarning.
    df_posts = df_posts.drop_duplicates(subset=["post_id"]).copy()
    df_comments = df_comments.drop_duplicates(subset=["comment_id"]).copy()

    # 2. Handle missing values: a missing post/comment body becomes "".
    df_posts["selftext"] = df_posts["selftext"].fillna("")
    df_comments["selftext"] = df_comments["selftext"].fillna("")

    # 3. Remove users (bots) with an excessive number of comments.
    comment_counts = df_comments["author"].value_counts()
    frequent_users = comment_counts[comment_counts > comment_threshold].index
    df_comments = df_comments[~df_comments["author"].isin(frequent_users)]

    # 4. Remove comments that are too short. The final `.copy()` lets callers add
    #    columns to the returned frame without SettingWithCopyWarning.
    df_comments = df_comments[df_comments["selftext"].str.len() >= min_length].copy()

    # The filter above is inclusive, so the message says ">=".
    print(f"✅ Daten bereinigt: {df_comments.shape[0]} Kommentare übrig (nach Spam-Filter & Länge >= {min_length}).")

    return df_posts, df_comments


In [33]:
# Clean the raw data (de-duplicate, fill missing text, drop heavy posters and short comments)
df_posts_clean, df_comments_clean = clean_data(df_posts, df_comments, comment_threshold=500, min_length=5)


# Check how many entries remain after cleaning
print(f"Bereinigte Posts: {len(df_posts_clean)}")
print(f"Bereinigte Kommentare: {len(df_comments_clean)}")

✅ Daten bereinigt: 312146 Kommentare übrig (nach Spam-Filter & Länge > 5).
Bereinigte Posts: 9175
Bereinigte Kommentare: 312146


In [34]:
# Run on the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"🚀 Verwende Gerät: {device}")

# 🔹 Load the CryptoBERT tokenizer and classification model
MODEL_NAME = "ElKulako/cryptobert"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
model.to(device)  # moves the module's parameters to `device` in place
model.eval()  # put the model into evaluation mode

# 🔹 Funktion zur Sentiment-Analyse (Optimiert für Batch-Prozesse)
def analyze_sentiment_batch(texts, batch_size=32):
    """Classify texts with CryptoBERT in batches.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        texts: Iterable of texts. Entries that are not strings, or that are
            empty / whitespace-only, are replaced by the literal text
            "neutral" before tokenization.
        batch_size: Number of texts sent through the model per forward pass.

    Returns:
        list[tuple[str, float]]: One ``(label, confidence)`` pair per input, in
        input order. ``label`` is "bearish", "neutral" or "bullish" and
        ``confidence`` is the softmax probability of that label.
    """
    # Position k names logit column k. Built once instead of once per batch.
    # NOTE(review): assumes the checkpoint orders its classes
    # bearish/neutral/bullish — confirm against model.config.id2label.
    labels = ["bearish", "neutral", "bullish"]
    results = []

    # Replace empty / non-string entries with the placeholder text "neutral"
    texts = [t if isinstance(t, str) and t.strip() != "" else "neutral" for t in texts]

    # Process batch by batch; `start` is the offset of the current batch.
    for start in tqdm(range(0, len(texts), batch_size), desc="🔍 Analysiere Sentiments"):
        batch_texts = texts[start : start + batch_size]

        # Tokenize the batch with truncation and padding so it forms one tensor
        inputs = tokenizer(batch_texts, return_tensors="pt", truncation=True, max_length=512, padding=True).to(device)

        # Inference only: no gradients needed
        with torch.no_grad():
            outputs = model(**inputs)
            scores = F.softmax(outputs.logits, dim=1)

        # Reduce the whole batch at once and transfer the results in one go,
        # instead of calling `.item()` twice per row.
        confidences, label_ids = scores.max(dim=1)
        results.extend(
            (labels[label_id], confidence)
            for label_id, confidence in zip(label_ids.tolist(), confidences.tolist())
        )

    return results

# 🔹 Compute sentiment for **posts**
tqdm.pandas()  # enable tqdm progress bars for pandas operations

# Build one text per post from title + body. Both parts are NaN-filled first:
# a NaN title would otherwise turn the whole concatenation into NaN, and the
# post would be scored as the placeholder text instead of its body.
# The frame is rebuilt with assign/drop rather than mutated in place, which also
# avoids SettingWithCopyWarning. 'title' and 'selftext' are dropped once they
# have been merged into 'full_text'.
df_posts_clean = df_posts_clean.assign(
    full_text=(
        df_posts_clean["title"].fillna("") + " " + df_posts_clean["selftext"].fillna("")
    ).str.strip()
).drop(columns=["title", "selftext"])

# Explicit column names keep the assignment valid even when there are no rows.
df_posts_clean[["sentiment", "sentiment_confidence"]] = pd.DataFrame(
    analyze_sentiment_batch(df_posts_clean["full_text"].tolist()),
    index=df_posts_clean.index,
    columns=["sentiment", "sentiment_confidence"],
)

# 🔹 Compute sentiment for **comments**
# 'selftext' becomes 'full_text' (NaN-filled) and the original column is dropped.
df_comments_clean = df_comments_clean.assign(
    full_text=df_comments_clean["selftext"].fillna("")
).drop(columns=["selftext"])

df_comments_clean[["sentiment", "sentiment_confidence"]] = pd.DataFrame(
    analyze_sentiment_batch(df_comments_clean["full_text"].tolist()),
    index=df_comments_clean.index,
    columns=["sentiment", "sentiment_confidence"],
)

# 🔹 Report the result
print(f"✅ Sentiment-Analyse abgeschlossen: {len(df_posts_clean)} Posts & {len(df_comments_clean)} Kommentare bewertet.")


🚀 Verwende Gerät: cuda


🔍 Analysiere Sentiments: 100%|██████████| 287/287 [02:41<00:00,  1.77it/s]
🔍 Analysiere Sentiments: 100%|██████████| 9755/9755 [36:15<00:00,  4.48it/s]  

✅ Sentiment-Analyse abgeschlossen: 9175 Posts & 312146 Kommentare bewertet.





In [35]:
# Copy the scored frames into the working names `posts` / `comments`
posts = df_posts_clean.copy()
comments = df_comments_clean.copy()

In [36]:
# 🔹 **Columns needed for the merge**
posts = posts[["post_id", "crypto", "search_term", "subreddit", "author", "date", "time", "score", "full_text", "sentiment", "sentiment_confidence"]]
comments = comments[["post_id", "comment_id", "author", "date", "time", "score", "full_text", "sentiment", "sentiment_confidence"]]

# 🔹 **Comments inherit `crypto`, `search_term` and `subreddit` from their post**
# The lookup uses `posts` (one row per post_id) rather than the raw `df_posts`:
# a duplicated post_id on the right-hand side would multiply the comment rows.
# `validate` makes the merge fail loudly if that assumption is ever violated.
comments = comments.merge(
    posts[["post_id", "crypto", "search_term", "subreddit"]],
    on="post_id",
    how="left",
    validate="many_to_one",
)

# 🔹 Add a `type` column to tell posts and comments apart.
# `.assign` returns new frames, so no column is written onto a slice.
posts = posts.assign(comment_id=None, type="post")  # posts have no comment_id
comments = comments.assign(type="comment")

# 🔹 **Shared column layout of the merged dataset**
common_columns = [
    "post_id", "comment_id", "type", "date", "time", "crypto",
    "search_term", "subreddit", "author","full_text","score", "sentiment", "sentiment_confidence",
]

# 🔹 **Stack posts and comments into one frame**
df_merged = pd.concat([posts[common_columns], comments[common_columns]], ignore_index=True)

# 🔍 Debugging: check the size
print(f"📌 Merged Dataset: {df_merged.shape[0]} Einträge (Posts + Kommentare)")

# 🔹 Inspect the first rows of the merged frame
print(df_merged.head())

📌 Merged Dataset: 321321 Einträge (Posts + Kommentare)
   post_id comment_id  type        date      time   crypto search_term  \
0  1j0dz4r       None  post  2025-02-28  18:01:02  Bitcoin     bitcoin   
1  1j0dghm       None  post  2025-02-28  17:39:22  Bitcoin     bitcoin   
2  1j0cx0g       None  post  2025-02-28  17:16:50  Bitcoin     bitcoin   
3  1j0cnlh       None  post  2025-02-28  17:06:18  Bitcoin     bitcoin   
4  1j08i0y       None  post  2025-02-28  14:09:15  Bitcoin     bitcoin   

        subreddit               author  \
0  CryptoCurrency             diwalost   
1  CryptoCurrency        3fkgf9fmd980e   
2  CryptoCurrency            kirtash93   
3  CryptoCurrency                KIG45   
4  CryptoCurrency  rizzobitcoinhistory   

                                           full_text  score sentiment  \
0  BlackRock Adds Its Bitcoin ETF to Model Portfo...     13   neutral   
1  x-post: As the free-float of coins is low, is ...      0   bullish   
2  Companies Building on the

In [37]:
# Preview the first rows of the scored comments
df_comments_clean.head()

Unnamed: 0,post_id,comment_id,author,date,time,score,full_text,sentiment,sentiment_confidence
2,1j0dz4r,mfb16ty,DonasAskan,2025-02-28,19:43:55,1,How does this only have 3 upvotes lol?,neutral,0.816071
3,1j0dz4r,mfakfdl,Deeujian,2025-02-28,18:26:05,-2,1-2% is nothing for Blackrock but they decided...,neutral,0.578827
4,1j0dz4r,mfag4zu,diwalost,2025-02-28,18:05:51,1,"They are not, retail is",neutral,0.691663
5,1j0dz4r,mfbrf7n,Bear-Bull-Pig,2025-02-28,21:46:12,1,People don't like Blackrock,neutral,0.493956
6,1j0dz4r,mfao0qa,diwalost,2025-02-28,18:42:51,1,They filled for ETF when we were in bear marke...,neutral,0.619266


In [38]:
# Set the path to the Google Drive folder and the export file locations.
# NOTE: POSTS_CSV and COMMENTS_CSV are rebound here; the loading cell earlier
# pointed them at the *_original.csv files, which are now ORIGINAL_*_CSV.
DRIVE_PATH = "G:/Meine Ablage/reddit/"
POSTS_CSV = os.path.join(DRIVE_PATH, "reddit_posts.csv")
COMMENTS_CSV = os.path.join(DRIVE_PATH, "reddit_comments.csv")
MERGED_CSV = os.path.join(DRIVE_PATH, "reddit_merged.csv")
ORIGINAL_POSTS_CSV = os.path.join(DRIVE_PATH, "reddit_posts_original.csv")
ORIGINAL_COMMENTS_CSV = os.path.join(DRIVE_PATH, "reddit_comments_original.csv")

In [39]:
def append_to_csv(df_new, filename, key_column):
    """Merge ``df_new`` into the `|`-separated CSV ``filename`` under ``DRIVE_PATH``.

    If the file already exists, its rows are loaded, ``df_new`` is appended and,
    for every key that occurs more than once, only the last occurrence is kept
    (so rows from ``df_new`` replace stored rows with the same key). If the file
    does not exist, ``df_new`` is written as is.

    Args:
        df_new: DataFrame with the rows to add.
        filename: File name relative to ``DRIVE_PATH``.
        key_column: Column name, or list/tuple of column names, that identifies
            a row. Rows whose key values are all missing count as duplicates of
            each other, so pick a key that is non-null for every row in at
            least one of its columns.

    Errors are caught and printed; nothing is raised.
    """
    file_path = os.path.join(DRIVE_PATH, filename)
    # Accept a single column name as well as several.
    key_columns = [key_column] if isinstance(key_column, str) else list(key_column)

    try:
        # If the file exists, read the stored rows first
        if os.path.exists(file_path):
            df_existing = pd.read_csv(file_path, sep="|", encoding="utf-8-sig", on_bad_lines="skip")

            # 🔹 Combine old and new rows; on a key clash keep the newer row
            df_combined = pd.concat([df_existing, df_new], ignore_index=True).drop_duplicates(subset=key_columns, keep="last")
        else:
            df_combined = df_new  # no file yet: write the new rows directly

        # 🔹 Write the CSV
        df_combined.to_csv(
            file_path,
            index=False,
            sep="|",
            encoding="utf-8-sig",
            lineterminator="\n"
        )
        print(f"✅ Datei erfolgreich aktualisiert: {file_path}")

    except Exception as e:
        print(f"Fehler beim Speichern der Datei {filename}: {e}")

def export_to_drive(df_posts_clean, df_comments_clean, df_merged,df_posts, df_comments):
    """Write posts, comments and the merged dataset to the drive folder.

    Each frame is merged into its CSV via ``append_to_csv`` with de-duplication
    on the frame's id column(s).

    Args:
        df_posts_clean: Scored posts, keyed by ``post_id``.
        df_comments_clean: Scored comments, keyed by ``comment_id``.
        df_merged: Posts and comments stacked; keyed by ``post_id`` +
            ``comment_id``.
        df_posts: Raw posts, keyed by ``post_id``.
        df_comments: Raw comments, keyed by ``comment_id``.
    """
    try:
        append_to_csv(df_posts_clean, "reddit_posts.csv", key_column="post_id")
        append_to_csv(df_comments_clean, "reddit_comments.csv", key_column="comment_id")
        # Post rows in the merged frame have no comment_id. Keyed on comment_id
        # alone they would all count as duplicates of each other and be dropped,
        # so the merged file is keyed on the (post_id, comment_id) pair.
        append_to_csv(df_merged, "reddit_merged.csv", key_column=["post_id", "comment_id"])
        append_to_csv(df_posts, "reddit_posts_original.csv", key_column="post_id")
        append_to_csv(df_comments, "reddit_comments_original.csv", key_column="comment_id")
    except Exception as e:
        print(f"Fehler beim Export: {e}")

In [None]:
# Run the export: writes the cleaned, merged and raw frames to the drive folder
export_to_drive(df_posts_clean, df_comments_clean, df_merged,df_posts, df_comments)

✅ Datei erfolgreich aktualisiert: G:/Meine Ablage/reddit/reddit_posts.csv
✅ Datei erfolgreich aktualisiert: G:/Meine Ablage/reddit/reddit_comments.csv
✅ Datei erfolgreich aktualisiert: G:/Meine Ablage/reddit/reddit_merged.csv
✅ Datei erfolgreich aktualisiert: G:/Meine Ablage/reddit/reddit_posts_original.csv
✅ Datei erfolgreich aktualisiert: G:/Meine Ablage/reddit/reddit_comments_original.csv


: 