# Reddit Scraper für vergangene Daten 

## Import 

In [57]:
import praw
import pandas as pd
from datetime import datetime, timedelta, timezone
import os
import psaw as ps
from dotenv import load_dotenv
import time
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch.nn.functional as F
import torch
from tqdm import tqdm
from psaw import PushshiftAPI

In [58]:
# Load the Reddit API credentials from the local .env file.
dotenv_loaded = load_dotenv("zugang_reddit.env")  # adjust if the file has a different name
# Print load_dotenv's return value to check whether the file was loaded.
print(f".env geladen? {dotenv_loaded}")


.env geladen? True


In [59]:
# Build the PRAW client from the CLIENT_ID, CLIENT_SECRET and USER_AGENT
# environment variables.
reddit = praw.Reddit(
    client_id=os.getenv("CLIENT_ID"),
    client_secret=os.getenv("CLIENT_SECRET"),
    user_agent=os.getenv("USER_AGENT")
)

# NOTE(review): this message is printed unconditionally once the object has
# been constructed; no request is issued in this cell — confirm that it really
# implies a working connection.
print("Reddit API erfolgreich verbunden!")


Reddit API erfolgreich verbunden!


In [60]:
# Smoke test: print title and creation timestamp of the five newest
# "Bitcoin" search hits in r/CryptoCurrency.
newest_bitcoin_hits = reddit.subreddit("CryptoCurrency").search("Bitcoin", sort="new", limit=5)
for submission in newest_bitcoin_hits:
    print(submission.title, submission.created_utc)


Banking Giant Barclays Owns $136,834,631 Worth of BlackRock’s Bitcoin Exchange-Traded Fund 1739645324.0
Abu Dhabi’s Sovereign Fund Invests $436 Million in BlackRock’s Bitcoin ETF 1739642215.0
Bitcoin’s Late Longs Liquidated: Is a Breakout Finally on the Horizon? 1739638521.0
Tipping system like Ko-fi or Buy Me a Coffee but with crypto ? 1739638493.0
Bitcoin is the boomer generation of crypto currency 1739637346.0


Cryptos und Subreddits 

In [61]:
# Maps each coin's display name to the lowercase search terms queried for it.
# NOTE(review): several terms are ordinary English words or very short tickers
# (e.g. "not", "near", "link", "test", "op"); presumably they match many
# unrelated posts — confirm this is intended. "TestCoin" also looks like a
# leftover test entry — verify.
crypto_terms = {
    # 🔹 Top Coins
    "Ethereum": ["ethereum", "eth", "ether", "ethereum 2.0", "eth 2.0"],
    "Wrapped Ethereum": ["wrapped ethereum", "weth"],
    "Solana": ["solana", "sol", "sol coin"],
    "Avalanche": ["avalanche", "avax"],
    "Polkadot": ["polkadot", "dot"],
    "Near Protocol": ["near protocol", "near"],
    "Polygon": ["polygon", "matic"],
    "XRP": ["xrp", "ripple"],
    "Cardano": ["cardano", "ada"],
    "Cronos": ["cronos", "cro"],
    "Vulcan Forged PYR": ["vulcan forged", "pyr"],
    "Chiliz": ["chiliz", "chz"],
    "Illuvium": ["illuvium", "ilv"],
    "Ronin": ["ronin", "ron"],
    "Band Protocol": ["band protocol", "band"],
    "Optimism": ["optimism", "op"],
    "Celestia": ["celestia", "tia"],
    "Numerai": ["numerai", "nmr"],
    "Aethir": ["aethir", "ath"],
    "Sui": ["sui"],
    "Hyperliquid": ["hyperliquid", "hyp"],
    "Robinhood Coin": ["robinhood", "hood"],
    "Trump Coin": ["trump coin"],
    "USD Coin": ["usd coin", "usdc"],
    "Binance Coin": ["binance", "bnb"],
    "Litecoin": ["litecoin", "ltc"],
    "Dogecoin": ["dogecoin", "doge"],
    "Tron": ["tron", "trx"],
    "Aave": ["aave"],
    "Hedera": ["hedera", "hbar"],
    "Filecoin": ["filecoin", "fil"],
    "Cosmos": ["cosmos", "atom"],
    "Gala": ["gala"],
    "The Sandbox": ["sandbox", "sand"],
    "Audius": ["audius", "audio"],
    "Render": ["render", "rndr"],
    "Kusama": ["kusama", "ksm"],
    "VeChain": ["vechain", "vet"],
    "Chainlink": ["chainlink", "link"],
    "Berachain": ["berachain", "bera"],
    "TestCoin": ["testcoin", "test"],

    # 🔹 Meme-Coins
    "Shiba Inu": ["shiba inu", "shib"],
    "Pepe": ["pepe"],
    "Floki Inu": ["floki inu", "floki"],
    "Bonk": ["bonk"],
    "Wojak": ["wojak"],
    "Mog Coin": ["mog"],
    "Doge Killer (Leash)": ["leash"],
    "Baby Doge Coin": ["baby doge", "babydoge"],
    "Degen": ["degen"],
    "Toshi": ["toshi"],
    "Fartcoin": ["fartcoin"],
    "Banana": ["banana"],
    "Kabosu": ["kabosu"],
    "Husky": ["husky"],
    "Samoyedcoin": ["samoyedcoin", "samo"],
    "Milkbag": ["milkbag"],

    # 🔹 New Coins
    "Arbitrum": ["arbitrum", "arb"],
    "Starknet": ["starknet", "strk"],
    "Injective Protocol": ["injective", "inj"],
    "Sei Network": ["sei"],
    "Aptos": ["aptos", "apt"],
    "EigenLayer": ["eigenlayer", "eigen"],
    "Mantle": ["mantle", "mnt"],
    "Immutable X": ["immutable x", "imx"],
    "Ondo Finance": ["ondo"],
    "Worldcoin": ["worldcoin", "wld"],
    "Aerodrome": ["aerodrome", "aero"],
    "Jupiter": ["jupiter", "jup"],
    "THORChain": ["thorchain", "rune"],
    "Pendle": ["pendle"],
    "Kujira": ["kujira", "kuji"],
    "Noble": ["noble"],
    "Stride": ["stride", "strd"],
    "Dymension": ["dymension", "dym"],
    "Seamless Protocol": ["seamless", "seam"],
    "Blast": ["blast"],
    "Merlin": ["merlin"],
    "Tapioca": ["tapioca"],
    "Arcadia Finance": ["arcadia"],
    "Notcoin": ["notcoin", "not"],
    "Omni Network": ["omni"],
    "LayerZero": ["layerzero", "lz"],
    "ZetaChain": ["zetachain", "zeta"],
    "Friend.tech": ["friendtech"]
}


In [62]:
# Subreddits that are searched for every term in crypto_terms.
subreddits = [
    "CryptoCurrency",  # general cryptocurrency discussion
    "CryptoMarkets",   # crypto market and price movements
    "CryptoTrading",   # trading strategies and analysis
    "Altcoin",         # altcoins (all cryptocurrencies other than Bitcoin)
    "DeFi",            # decentralized finance (DeFi) and projects
    "BitcoinBeginners",# for newcomers to crypto
    "cryptotechnology", # underlying blockchain technology
    "cryptocurrencies", # general cryptocurrency discussion
    "Satoshistreetsbets", # crypto bets and speculation
    "Binance",        # discussion about the Binance platform
    "Bitcoin",
    "ethtrader"
]

## Scraping 

Scraping Funktionen

In [63]:
# 🔹 Scraper-Funktion
def scrape_reddit(start_date, end_date, reddit_client=None, subreddit_names=None, terms=None):
    """Collect posts and their comments for every configured search term.

    Each subreddit is searched (newest first, up to 1000 results requested per
    query) for every term. A post is kept when its creation time lies inside
    the inclusive window ``[start_date, end_date]`` and its id has not been
    seen before; it is attributed to the first coin/term that returned it.
    Comments are fetched only for kept posts.

    Args:
        start_date: Start of the window; its ``timestamp()`` is compared
            against ``post.created_utc``.
        end_date: End of the window (inclusive).
        reddit_client: Object exposing ``subreddit(name)``. Defaults to the
            notebook-level ``reddit`` client.
        subreddit_names: Iterable of subreddit names. Defaults to the
            notebook-level ``subreddits`` list.
        terms: Mapping ``coin name -> list of search terms``. Defaults to the
            notebook-level ``crypto_terms`` dict.

    Returns:
        Tuple ``(df_posts, df_comments)`` of pandas DataFrames. Both are empty
        (no columns) when nothing was collected.
    """
    # Fall back to the notebook-level configuration when nothing is injected.
    client = reddit if reddit_client is None else reddit_client
    names = subreddits if subreddit_names is None else subreddit_names
    term_map = crypto_terms if terms is None else terms

    start_timestamp = int(start_date.timestamp())
    end_timestamp = int(end_date.timestamp())

    posts = []
    comments = []
    post_ids = set()

    for subreddit_name in names:
        subreddit = client.subreddit(subreddit_name)
        print(f"\n🔎 Scraping r/{subreddit_name}...")

        for crypto, search_terms in term_map.items():
            for term in search_terms:
                print(f"🔍 Suche nach: {term} in r/{subreddit_name}...")

                try:
                    results = subreddit.search(query=term, sort="new", time_filter="all", limit=1000)

                    for post in results:
                        in_window = start_timestamp <= post.created_utc <= end_timestamp
                        if not in_window or post.id in post_ids:
                            continue

                        post_ids.add(post.id)
                        posts.append({
                            "crypto": crypto,
                            "search_term": term,
                            "subreddit": subreddit_name,
                            "post_id": post.id,
                            "title": post.title.strip(),
                            "selftext": post.selftext.strip() if post.selftext else "",
                            "author": str(post.author),
                            "created_utc": _format_utc(post.created_utc),
                            "score": post.score,
                            "num_comments": post.num_comments,
                        })
                        print(f"✅ Post gefunden: {post.title} (Suchbegriff: {term})")

                        # A failure while loading comments must not abort the
                        # remaining search results, hence the inner try.
                        try:
                            comments.extend(_collect_comments(post))
                        except Exception as e:
                            print(f"⚠️ Fehler beim Abrufen von Kommentaren: {e}")
                            time.sleep(10)  # back off to respect API limits

                except Exception as e:
                    print(f"⚠️ Fehler beim Scrapen von {subreddit_name} mit Suchbegriff '{term}': {e}")
                    time.sleep(60)  # pause a minute in case an API limit was hit

    df_posts = pd.DataFrame(posts)
    df_comments = pd.DataFrame(comments)

    print(f"\n✅ Scrape abgeschlossen: {len(df_posts)} Posts und {len(df_comments)} Kommentare gespeichert.")
    return df_posts, df_comments


def _format_utc(epoch_seconds):
    """Format a Unix timestamp as a 'YYYY-MM-DD HH:MM:SS' string in UTC."""
    # datetime.utcfromtimestamp() is deprecated since Python 3.12; the
    # timezone-aware conversion below yields the same formatted string.
    return datetime.fromtimestamp(epoch_seconds, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S')


def _collect_comments(post):
    """Return one record per comment of *post* after expanding up to 5 'more' stubs."""
    post.comments.replace_more(limit=5)  # bounded so large threads stay cheap to scrape
    return [
        {
            "post_id": post.id,
            "comment_id": comment.id,
            "author": str(comment.author),
            "created_utc": _format_utc(comment.created_utc),
            "score": comment.score,
            "selftext": comment.body.strip() if comment.body else "",
        }
        for comment in post.comments.list()
    ]

# 🔹 Define the time window (1 Nov 2024 UTC up to the current moment)
start_of_period = datetime(2024, 11, 1, tzinfo=timezone.utc)
now = datetime.now(timezone.utc)

print(f"🚀 Starte Scraping-Prozess für {start_of_period.strftime('%Y-%m-%d')} bis {now.strftime('%Y-%m-%d')}...")

# 🔹 Run the scraper
df_posts, df_comments = scrape_reddit(start_of_period, now)

# 🔍 Debugging: print the first rows
print("\n🔍 ERSTE ZEILEN DER POSTS:")
print(df_posts.head())

print("\n🔍 ERSTE ZEILEN DER KOMMENTARE:")
print(df_comments.head())



🚀 Starte Scraping-Prozess für 2024-11-01 bis 2025-02-15...

🔎 Scraping r/CryptoCurrency...
🔍 Suche nach: ethereum in r/CryptoCurrency...
✅ Post gefunden: Tipping system like Ko-fi or Buy Me a Coffee but with crypto ? (Suchbegriff: ethereum)
✅ Post gefunden: What Big Companies Are Building on Ethereum (Suchbegriff: ethereum)
✅ Post gefunden: Tired of the same limitations on Ethereum? There's a “Movement” on the horizon! (Suchbegriff: ethereum)
✅ Post gefunden: Solana Apps Generate 10x More Revenue Than Ethereum: Research (Suchbegriff: ethereum)
✅ Post gefunden: Bitcoin ETFs maintain market lead as Ethereum ETFs see surge in institutional adoption, 13F filings show (Suchbegriff: ethereum)
✅ Post gefunden: XRP: Fomo or Truth? (Suchbegriff: ethereum)
✅ Post gefunden: Ethereum Pectra Upgrade Confirmed for April 2025 with Fusaka Next in Line (Suchbegriff: ethereum)
✅ Post gefunden: Why Vitalik Buterin’s ‘make communism great again’ quip triggered Ethereum investors (Suchbegriff: ethereum)
✅ 

## Clean

In [None]:
def normalize_reddit_data(df_posts, df_comments, comment_threshold=500):
    """Deduplicate, clean and reshape the scraped post and comment frames.

    Steps: drop duplicate ids, fill missing text with "", build ``full_text``,
    split ``created_utc`` into ``date`` and ``time``, cast ``score`` to int
    where possible, drop comments of authors with more than
    ``comment_threshold`` comments, and reduce both frames to a fixed column
    order. The input frames are not modified.

    Args:
        df_posts: Posts with at least the columns post_id, subreddit, crypto,
            search_term, title, selftext, author, score, created_utc.
        df_comments: Comments with at least the columns comment_id, post_id,
            author, score, selftext, created_utc.
        comment_threshold: Authors with strictly more comments than this are
            treated as spam and removed.

    Returns:
        Tuple ``(df_posts, df_comments)`` of normalized DataFrames.

    Raises:
        KeyError: If a required column is missing (including fully empty,
            column-less input frames).
    """
    # Author strings that stand for deleted/unknown accounts. They represent
    # many different people, so they must not be counted as one spam user.
    # "None" is what str(author) yields when the scraper finds no author.
    placeholder_authors = ["None", "[deleted]"]

    print("🚀 Starte Normalisierung der Daten...")

    # 1. Remove duplicates. The explicit copy guarantees that the column
    #    assignments below never write into (a slice of) the caller's frames.
    df_posts = df_posts.drop_duplicates(subset=["post_id"]).copy()
    df_comments = df_comments.drop_duplicates(subset=["comment_id"]).copy()

    # 2. Fill missing text values.
    for col in ("selftext", "title"):
        for frame in (df_posts, df_comments):
            if col in frame.columns:
                frame[col] = frame[col].fillna("")

    # 3. Build the full text used for NLP / sentiment analysis.
    df_posts["full_text"] = (df_posts["title"] + " " + df_posts["selftext"]).str.strip()
    df_comments["full_text"] = df_comments["selftext"].str.strip()

    # 4. Split the timestamp into date and time; parse once per frame.
    #    Unparseable values become NaT.
    for frame in (df_posts, df_comments):
        parsed = pd.to_datetime(frame["created_utc"], errors="coerce")
        frame["date"] = parsed.dt.date
        frame["time"] = parsed.dt.time

    # 5. Cast scores to int; values that cannot be cast are left untouched.
    df_posts["score"] = df_posts["score"].astype(int, errors="ignore")
    df_comments["score"] = df_comments["score"].astype(int, errors="ignore")

    # 6. Drop comments of authors with an extreme number of comments.
    if "author" in df_comments.columns:
        comment_counts = df_comments["author"].value_counts()
        comment_counts = comment_counts.drop(labels=placeholder_authors, errors="ignore")
        frequent_users = comment_counts[comment_counts > comment_threshold].index
        df_comments = df_comments[~df_comments["author"].isin(frequent_users)]

    # 7. Keep only the output columns, in a fixed order. This also removes
    #    the now redundant created_utc / num_comments columns.
    df_posts = df_posts[["post_id", "subreddit", "crypto", "search_term", "title", "selftext", "full_text", "author", "score", "date", "time"]]
    df_comments = df_comments[["comment_id", "post_id", "author", "score", "selftext", "full_text", "date", "time"]]

    print(f"✅ Normalisierung abgeschlossen: {df_posts.shape[0]} Posts, {df_comments.shape[0]} Kommentare übrig.")
    return df_posts, df_comments


In [None]:
# Clean the scraped data. The cell used to call `clean_data`, which is not
# defined anywhere in this notebook; `normalize_reddit_data` is the cleaning
# function that exists and produces the date/time/full_text columns the later
# cells rely on.
df_posts_clean, df_comments_clean = normalize_reddit_data(df_posts, df_comments)

# Check how many entries remain
print(f"Bereinigte Posts: {len(df_posts_clean)}")
print(f"Bereinigte Kommentare: {len(df_comments_clean)}")

🚀 Starte Datenbereinigung...
✅ Bereinigung abgeschlossen: 7670 Posts, 284500 Kommentare übrig.
Bereinigte Posts: 7670
Bereinigte Kommentare: 284500


## Modell für die Sentiment-Analyse

In [66]:
# 🔹 Use the GPU if one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🚀 Verwende Gerät: {device}")

# 🔹 Load the CryptoBERT tokenizer and classification model
MODEL_NAME = "ElKulako/cryptobert"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).to(device)
model.eval()  # switch the model to evaluation mode

# 🔹 Funktion zur Sentiment-Analyse (Optimiert für Batch-Prozesse)
def analyze_sentiment_batch(texts, batch_size=32):
    """Classify texts with the loaded model and return (label, confidence) pairs.

    Args:
        texts: Iterable of texts. Entries that are not strings or that are
            blank are replaced by the literal text "neutral" before
            classification.
        batch_size: Number of texts tokenized and scored per forward pass.

    Returns:
        List of ``(sentiment, confidence)`` tuples in input order, where
        ``sentiment`` is "bearish", "neutral" or "bullish" and ``confidence``
        is the softmax probability of that class. Empty input yields ``[]``.
    """
    # Class index -> label.
    # NOTE(review): assumed to match the class order of the loaded
    # checkpoint — confirm against model.config.id2label.
    labels = ["bearish", "neutral", "bullish"]
    results = []

    # Replace empty / non-string entries by "neutral".
    texts = [t if isinstance(t, str) and t.strip() != "" else "neutral" for t in texts]

    for start in tqdm(range(0, len(texts), batch_size), desc="🔍 Analysiere Sentiments"):
        batch_texts = texts[start : start + batch_size]

        # Tokenize with padding so the whole batch forms one tensor.
        inputs = tokenizer(batch_texts, return_tensors="pt", truncation=True, max_length=512, padding=True).to(device)

        with torch.no_grad():
            outputs = model(**inputs)

        scores = F.softmax(outputs.logits, dim=1)

        # One max over the class dimension gives both the winning class and
        # its probability for every row; tolist() moves them to the host in
        # two transfers instead of two per text.
        confidences, class_ids = scores.max(dim=1)
        results.extend(
            (labels[class_id], confidence)
            for class_id, confidence in zip(class_ids.tolist(), confidences.tolist())
        )

    return results

# 🔹 Compute sentiment for the **posts**
tqdm.pandas()  # enable tqdm's pandas integration
# full_text is rebuilt here from title and selftext before scoring.
df_posts_clean["full_text"] = df_posts_clean["title"] + " " + df_posts_clean["selftext"].fillna("")
df_posts_clean[["sentiment", "sentiment_confidence"]] = pd.DataFrame(
    analyze_sentiment_batch(df_posts_clean["full_text"].tolist()), index=df_posts_clean.index
)

# 🔹 Compute sentiment for the **comments** (scored on the comment body)
df_comments_clean[["sentiment", "sentiment_confidence"]] = pd.DataFrame(
    analyze_sentiment_batch(df_comments_clean["selftext"].tolist()), index=df_comments_clean.index
)

# 🔹 Report the result
print(f"✅ Sentiment-Analyse abgeschlossen: {len(df_posts_clean)} Posts & {len(df_comments_clean)} Kommentare bewertet.")


🚀 Verwende Gerät: cuda


🔍 Analysiere Sentiments: 100%|██████████| 240/240 [02:04<00:00,  1.93it/s]
🔍 Analysiere Sentiments: 100%|██████████| 8891/8891 [31:51<00:00,  4.65it/s]  


✅ Sentiment-Analyse abgeschlossen: 7670 Posts & 284500 Kommentare bewertet.


## Merge

In [88]:
def merge_reddit_data(df_posts, df_comments):
    """Stack posts and comments into one long DataFrame.

    Post rows get ``full_text = title + " " + selftext`` (stripped), an empty
    ``comment_id`` and ``type == "post"``; comment rows get
    ``full_text = selftext`` and ``type == "comment"``. The input frames are
    left untouched: all helper columns are added to copies, so they do not
    leak into the per-table exports.

    Args:
        df_posts: Posts with the columns post_id, title, author, score,
            selftext, sentiment, sentiment_confidence, date, time.
        df_comments: Comments with the columns post_id, comment_id, author,
            score, selftext, sentiment, sentiment_confidence, date, time.

    Returns:
        DataFrame with posts first, then comments, a fresh 0..n-1 index and
        the columns post_id, comment_id, author, score, selftext, full_text,
        sentiment, sentiment_confidence, type, date, time.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    print("🚀 Starte Merging der Posts & Kommentare...")

    # Columns shared by both row kinds in the merged output.
    common_columns = [
        "post_id", "comment_id", "author", "score", "selftext",
        "full_text", "sentiment", "sentiment_confidence", "type", "date", "time"
    ]

    # assign() returns new frames, so the caller's DataFrames are not mutated.
    post_text = (df_posts["title"].fillna("") + " " + df_posts["selftext"].fillna("")).str.strip()
    posts = df_posts.assign(full_text=post_text, comment_id=None, type="post")
    comments = df_comments.assign(full_text=df_comments["selftext"].fillna(""), type="comment")

    df_merged = pd.concat([posts[common_columns], comments[common_columns]], ignore_index=True)

    # Normalize dtypes; values that cannot be cast are left as they are.
    df_merged["score"] = df_merged["score"].astype(int, errors="ignore")
    df_merged["sentiment_confidence"] = df_merged["sentiment_confidence"].astype(float, errors="ignore")

    print(f"✅ Merging abgeschlossen: {df_merged.shape[0]} Einträge (Posts + Kommentare).")
    return df_merged


In [89]:
# Combine the cleaned, sentiment-scored posts and comments into one frame.
df_merged = merge_reddit_data(df_posts_clean, df_comments_clean)

🚀 Starte Merging der Posts & Kommentare...
✅ Merging abgeschlossen: 292170 Einträge (Posts + Kommentare).


## Export 

In [90]:
# Path to the Google Drive folder that receives the exported CSV files.
DRIVE_PATH = "G:/Meine Ablage/reddit/"
# NOTE(review): the three *_CSV constants are not referenced by the export
# functions visible in this notebook, which rebuild the paths from DRIVE_PATH
# and a file name — confirm whether they are still needed.
POSTS_CSV = os.path.join(DRIVE_PATH, "reddit_posts.csv")
COMMENTS_CSV = os.path.join(DRIVE_PATH, "reddit_comments.csv")
MERGED_CSV = os.path.join(DRIVE_PATH, "reddit_merged.csv")

Funktion zum Export 

In [91]:
def append_to_csv(df_new, filename, key_column):
    """Append rows to a '|'-separated CSV in DRIVE_PATH, dropping duplicate keys.

    If the file exists, its rows are loaded, the new rows are appended and,
    for every key, only the last (i.e. newest) row is kept. Otherwise
    ``df_new`` is written as is. Errors are reported on stdout instead of
    being raised, so a failed export does not abort the notebook.

    Args:
        df_new: DataFrame with the rows to add.
        filename: File name relative to ``DRIVE_PATH``.
        key_column: Column name, or list of column names, that together
            identify a row.
    """
    file_path = os.path.join(DRIVE_PATH, filename)
    key_columns = [key_column] if isinstance(key_column, str) else list(key_column)

    try:
        if os.path.exists(file_path):
            # Read the key columns as strings: an id column that happens to
            # contain only digits would otherwise be parsed as a number and
            # never compare equal to the string ids in df_new.
            # NOTE: on_bad_lines="skip" silently drops malformed rows, which
            # are then lost when the file is rewritten below.
            df_existing = pd.read_csv(
                file_path,
                sep="|",
                encoding="utf-8-sig",
                on_bad_lines="skip",
                dtype={column: str for column in key_columns},
            )

            # Merge and keep the newest row per key.
            df_combined = pd.concat([df_existing, df_new], ignore_index=True).drop_duplicates(
                subset=key_columns, keep="last"
            )
        else:
            df_combined = df_new  # no file yet: write the new rows directly

        df_combined.to_csv(
            file_path,
            index=False,
            sep="|",
            encoding="utf-8-sig",
            lineterminator="\n"
        )
        print(f"✅ Datei erfolgreich aktualisiert: {file_path}")

    except Exception as e:
        print(f"Fehler beim Speichern der Datei {filename}: {e}")

def export_to_drive(df_posts, df_comments, df_merged):
    """Write posts, comments and the merged frame to their CSV files.

    Args:
        df_posts: Posts, keyed by ``post_id``.
        df_comments: Comments, keyed by ``comment_id``.
        df_merged: Stacked posts and comments, keyed by the pair
            ``(post_id, comment_id)``.
    """
    try:
        append_to_csv(df_posts, "reddit_posts.csv", key_column="post_id")
        append_to_csv(df_comments, "reddit_comments.csv", key_column="comment_id")
        # Post rows in the merged frame have an empty comment_id. Deduplicating
        # on comment_id alone treats all of them as the same key and keeps a
        # single post, so the key must include post_id as well.
        append_to_csv(df_merged, "reddit_merged.csv", key_column=["post_id", "comment_id"])

    except Exception as e:
        print(f"Fehler beim Export: {e}")

In [92]:
# 🔹 Run the export for posts, comments and the merged frame
export_to_drive(df_posts_clean, df_comments_clean, df_merged)

✅ Datei erfolgreich aktualisiert: G:/Meine Ablage/reddit/reddit_posts.csv
✅ Datei erfolgreich aktualisiert: G:/Meine Ablage/reddit/reddit_comments.csv
✅ Datei erfolgreich aktualisiert: G:/Meine Ablage/reddit/reddit_merged.csv


In [93]:
# Preview the first rows of the merged frame.
df_merged.head()

Unnamed: 0,post_id,comment_id,author,score,selftext,full_text,sentiment,sentiment_confidence,type,date,time
0,1iq5mon,,Nayko93,9,Hi\n\nI'm looking for a way to accept small ti...,Tipping system like Ko-fi or Buy Me a Coffee b...,neutral,0.633046,post,2025-02-15,16:54:53
1,1iq4hwf,,MinimalGravitas,10,,What Big Companies Are Building on Ethereum,bearish,0.360482,post,2025-02-15,16:03:13
2,1iq2xh0,,NoahCJ,0,Are you a blockchain developer or enthusiast l...,Tired of the same limitations on Ethereum? The...,bullish,0.749526,post,2025-02-15,14:50:15
3,1iq1v1t,,Afonsoo99,0,,Solana Apps Generate 10x More Revenue Than Eth...,neutral,0.773272,post,2025-02-15,13:57:02
4,1ipm5nu,,partymsl,23,,Bitcoin ETFs maintain market lead as Ethereum ...,neutral,0.799403,post,2025-02-14,21:58:03
