# Reddit Scraper

## Import 

In [9]:
import os
import re
import time
from datetime import datetime, timedelta, timezone

import pandas as pd
import praw
import psaw as ps
import torch
import torch.nn.functional as F
from dotenv import load_dotenv
from praw.exceptions import APIException
from psaw import PushshiftAPI
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification


In [10]:
# Read the Reddit credentials from the local .env file (adjust the name if the
# file is called differently) and report whether it was found.
dotenv_loaded = load_dotenv(dotenv_path="zugang_reddit.env")
print(".env geladen? {}".format(dotenv_loaded))


.env geladen? True


In [11]:
# Collect the API credentials from the environment and fail early, naming the
# missing variables, instead of handing None values to praw.
reddit_credentials = {
    "client_id": os.getenv("CLIENT_ID"),
    "client_secret": os.getenv("CLIENT_SECRET"),
    "user_agent": os.getenv("USER_AGENT"),
}
missing_credentials = [name.upper() for name, value in reddit_credentials.items() if not value]
if missing_credentials:
    raise RuntimeError(
        f"Fehlende Umgebungsvariablen für die Reddit-API: {', '.join(missing_credentials)}"
    )

reddit = praw.Reddit(**reddit_credentials)

# NOTE(review): creating the client presumably does not contact Reddit yet, so
# this message only confirms that the object exists — confirm against praw docs.
print("Reddit API erfolgreich verbunden!")


Reddit API erfolgreich verbunden!


In [12]:
# Smoke test: print title and creation timestamp of the five newest r/CryptoCurrency
# posts that match "Bitcoin".
for post in reddit.subreddit("CryptoCurrency").search(
    "Bitcoin",
    sort="new",
    limit=5,
):
    print(f"{post.title} {post.created_utc}")


U.S. Crypto Task Force to Focus on Delivering National Bitcoin (BTC) Reserve 1739803032.0
A story in two pictures 1739802752.0
A curated list of Bitcoin payment processors 1739801652.0
best way to invest a bigger sum into bnitcoin long term 1739792662.0
Metaplanet Buys Another ¥4.0 Billion Worth of Bitcoin (BTC) 1739787689.0


Kryptowährungen und Subreddits

In [13]:
# Maps each coin's display name to the lowercase search terms (names and ticker
# symbols) that scrape_reddit joins into a search query and uses to label posts.
# Entry order matters: scrape_reddit de-duplicates posts by id, so a post is
# attributed to the first coin (in this order) whose search returns it.
# NOTE(review): several tickers are everyday English words ("not", "op", "link",
# "near", "dot", "band", "sand", "audio", ...) and will likely produce false
# positives — confirm whether they should stay.
crypto_terms = {
    # 🔹 Top Coins
    "Ethereum": ["ethereum", "eth", "ether", "ethereum 2.0", "eth 2.0"],
    "Wrapped Ethereum": ["wrapped ethereum", "weth"],
    "Solana": ["solana", "sol", "sol coin"],
    "Avalanche": ["avalanche", "avax"],
    "Polkadot": ["polkadot", "dot"],
    "Near Protocol": ["near protocol", "near"],
    "Polygon": ["polygon", "matic"],
    "XRP": ["xrp", "ripple"],
    "Cardano": ["cardano", "ada"],
    "Cronos": ["cronos", "cro"],
    "Vulcan Forged PYR": ["vulcan forged", "pyr"],
    "Chiliz": ["chiliz", "chz"],
    "Illuvium": ["illuvium", "ilv"],
    "Ronin": ["ronin", "ron"],
    "Band Protocol": ["band protocol", "band"],
    "Optimism": ["optimism", "op"],
    "Celestia": ["celestia", "tia"],
    "Numerai": ["numerai", "nmr"],
    "Aethir": ["aethir", "ath"],
    "Sui": ["sui"],
    "Hyperliquid": ["hyperliquid", "hyp"],
    "Robinhood Coin": ["robinhood", "hood"],
    "Trump Coin": ["trump coin"],
    "USD Coin": ["usd coin", "usdc"],
    "Binance Coin": ["binance", "bnb"],
    "Litecoin": ["litecoin", "ltc"],
    "Dogecoin": ["dogecoin", "doge"],
    "Tron": ["tron", "trx"],
    "Aave": ["aave"],
    "Hedera": ["hedera", "hbar"],
    "Filecoin": ["filecoin", "fil"],
    "Cosmos": ["cosmos", "atom"],
    "Gala": ["gala"],
    "The Sandbox": ["sandbox", "sand"],
    "Audius": ["audius", "audio"],
    "Render": ["render", "rndr"],
    "Kusama": ["kusama", "ksm"],
    "VeChain": ["vechain", "vet"],
    "Chainlink": ["chainlink", "link"],
    "Berachain": ["berachain", "bera"],
    # NOTE(review): looks like a placeholder/debug entry — confirm it is intended.
    "TestCoin": ["testcoin", "test"],

    # 🔹 Meme-Coins
    "Shiba Inu": ["shiba inu", "shib"],
    "Pepe": ["pepe"],
    "Floki Inu": ["floki inu", "floki"],
    "Bonk": ["bonk"],
    "Wojak": ["wojak"],
    "Mog Coin": ["mog"],
    "Doge Killer (Leash)": ["leash"],
    "Baby Doge Coin": ["baby doge", "babydoge"],
    "Degen": ["degen"],
    "Toshi": ["toshi"],
    "Fartcoin": ["fartcoin"],
    "Banana": ["banana"],
    "Kabosu": ["kabosu"],
    "Husky": ["husky"],
    "Samoyedcoin": ["samoyedcoin", "samo"],
    "Milkbag": ["milkbag"],

    # 🔹 New Coins
    "Arbitrum": ["arbitrum", "arb"],
    "Starknet": ["starknet", "strk"],
    "Injective Protocol": ["injective", "inj"],
    "Sei Network": ["sei"],
    "Aptos": ["aptos", "apt"],
    "EigenLayer": ["eigenlayer", "eigen"],
    "Mantle": ["mantle", "mnt"],
    "Immutable X": ["immutable x", "imx"],
    "Ondo Finance": ["ondo"],
    "Worldcoin": ["worldcoin", "wld"],
    "Aerodrome": ["aerodrome", "aero"],
    "Jupiter": ["jupiter", "jup"],
    "THORChain": ["thorchain", "rune"],
    "Pendle": ["pendle"],
    "Kujira": ["kujira", "kuji"],
    "Noble": ["noble"],
    "Stride": ["stride", "strd"],
    "Dymension": ["dymension", "dym"],
    "Seamless Protocol": ["seamless", "seam"],
    "Blast": ["blast"],
    "Merlin": ["merlin"],
    "Tapioca": ["tapioca"],
    "Arcadia Finance": ["arcadia"],
    "Notcoin": ["notcoin", "not"],
    "Omni Network": ["omni"],
    "LayerZero": ["layerzero", "lz"],
    "ZetaChain": ["zetachain", "zeta"],
    "Friend.tech": ["friendtech"]
}


In [14]:
# Subreddits that scrape_reddit searches for every coin in crypto_terms.
subreddits = [
    "CryptoCurrency",  # General cryptocurrency discussion
    "CryptoMarkets",   # Crypto market and price-movement discussion
    "CryptoTrading",   # Trading strategies and analysis
    "Altcoin",         # Altcoins (all cryptocurrencies other than Bitcoin)
    "DeFi",            # Decentralized finance (DeFi) and its projects
    "BitcoinBeginners",# For newcomers to the crypto world
    "cryptotechnology", # The underlying blockchain technology
    "cryptocurrencies", # General cryptocurrency discussion
    "Satoshistreetsbets", # Crypto bets and speculation
    "Binance",        # Discussion of the Binance platform
    "Bitcoin",
    "ethtrader"
]

## Scraping 

Scraping-Funktionen

In [15]:

# Module-level Pushshift client (psaw); scrape_reddit queries it for historical
# submissions before falling back to the Reddit API.
# NOTE(review): public access to Pushshift was presumably restricted in 2023 —
# confirm that this constructor and its searches still work in your environment.
api = PushshiftAPI()

# 🔹 Scraper for historical Reddit posts together with the comments of every post

# Column layouts of the returned frames; passing them to pd.DataFrame keeps the
# schema stable even when nothing was found.
_POST_COLUMNS = ['post_id', 'crypto', 'search_term', 'subreddit', 'title', 'author',
                 'date', 'time', 'score', 'num_comments', 'selftext']
_COMMENT_COLUMNS = ['post_id', 'comment_id', 'author', 'date', 'time', 'score', 'body']


def _utc_date_and_time(timestamp):
    """Split a Unix timestamp into ISO-formatted UTC date and time strings."""
    moment = datetime.fromtimestamp(timestamp, tz=timezone.utc)
    # .time() drops the tzinfo, so the string has no "+00:00" suffix.
    return moment.date().isoformat(), moment.time().isoformat()


def _first_matching_term(search_terms, *texts):
    """Return the first search term that occurs as a whole word in any text, else None."""
    # Join with a newline so a multi-word term cannot match across two texts.
    haystack = "\n".join(text.lower() for text in texts if isinstance(text, str))
    for term in search_terms:
        # Whole-word match: a plain substring test would find "op" inside "stop".
        if re.search(rf"(?<!\w){re.escape(term)}(?!\w)", haystack):
            return term
    return None


def _search_submissions(subreddit_name, query, start_timestamp, end_timestamp):
    """Return submissions matching `query` in the subreddit within the time window.

    Pushshift is tried first; if it fails or returns nothing, the Reddit search
    (via praw) is used and its results are filtered to the same time window.
    """
    submissions = []
    try:
        submissions = list(api.search_submissions(
            after=start_timestamp,
            before=end_timestamp,
            subreddit=subreddit_name,
            q=query,
            filter=["id", "title", "selftext", "author", "created_utc", "score", "num_comments"],
            limit=5000
        ))
    except Exception as e:
        # A Pushshift failure must not prevent the praw fallback below.
        print(f"⚠️ Pushshift-Fehler für r/{subreddit_name}: {e}")

    if not submissions:
        print(f"⚠️ Keine Pushshift-Daten für r/{subreddit_name}, verwende `praw.search()`...")
        subreddit = reddit.subreddit(subreddit_name)
        # NOTE(review): the query joins terms with "|"; confirm that Reddit's
        # search treats this as OR.
        submissions = [
            post
            for post in subreddit.search(query, sort="new", time_filter="all", limit=500)
            # The Reddit search has no date range, so enforce the window here.
            if start_timestamp <= post.created_utc <= end_timestamp
        ]
    return submissions


def _fetch_comments(post_id):
    """Return one record per comment that is available for the given post."""
    submission = reddit.submission(id=post_id)
    # limit=0 discards the "load more comments" placeholders without fetching
    # them, so deeply nested or collapsed comments are not included.
    submission.comments.replace_more(limit=0)

    records = []
    for comment in submission.comments.list():
        date_iso, time_iso = _utc_date_and_time(comment.created_utc)
        records.append({
            'post_id': post_id,
            'comment_id': comment.id,
            'author': str(comment.author),
            'date': date_iso,
            'time': time_iso,
            'score': comment.score,
            # Stored as 'body': the cleaning, sentiment and merge steps read this column.
            'body': comment.body
        })
    return records


def scrape_reddit(start_date, end_date, mode="initial", retry_wait=60, max_attempts=2):
    """Collect posts and comments for every coin/subreddit combination.

    Args:
        start_date: datetime marking the start of the search window.
        end_date: datetime marking the end of the search window.
        mode: currently unused; kept so existing callers keep working.
        retry_wait: seconds to wait before retrying a failed search.
        max_attempts: how often a search is attempted before the
            coin/subreddit combination is skipped.

    Returns:
        (df_posts, df_comments): two DataFrames. Each post appears once and is
        attributed to the first coin whose search returned it.
    """
    start_timestamp = int(start_date.timestamp())
    end_timestamp = int(end_date.timestamp())

    posts = []
    comments = []
    post_ids = set()

    for crypto_name, search_terms in crypto_terms.items():
        query = "|".join(search_terms)

        for subreddit_name in subreddits:
            print(f"🔍 Suche nach {crypto_name} in r/{subreddit_name}...")

            submissions = None
            for attempt in range(1, max_attempts + 1):
                try:
                    submissions = _search_submissions(
                        subreddit_name, query, start_timestamp, end_timestamp
                    )
                    break
                except Exception as e:
                    if attempt < max_attempts:
                        print(f"❌ API-Fehler: {e}. Warte {retry_wait} Sekunden und versuche es erneut...")
                        time.sleep(retry_wait)
                    else:
                        print(f"❌ API-Fehler: {e}. Überspringe r/{subreddit_name}.")
            if submissions is None:
                continue

            for post in submissions:
                try:
                    post_id = post.id
                    if post_id in post_ids:
                        continue
                    post_ids.add(post_id)

                    # Pushshift records may lack optional fields, hence getattr.
                    title = getattr(post, "title", "") or ""
                    selftext = getattr(post, "selftext", "") or ""
                    date_iso, time_iso = _utc_date_and_time(post.created_utc)

                    posts.append({
                        'post_id': post_id,
                        'crypto': crypto_name,
                        'search_term': _first_matching_term(search_terms, title, selftext),
                        'subreddit': subreddit_name,
                        'title': title,
                        'author': str(getattr(post, "author", None)),
                        'date': date_iso,
                        'time': time_iso,
                        'score': getattr(post, "score", None),
                        'num_comments': getattr(post, "num_comments", None),
                        'selftext': selftext
                    })
                    print(f"✅ Post gefunden: {title}")
                except Exception as e:
                    # One malformed record must not discard the rest of the batch.
                    print(f"⚠️ Post übersprungen: {e}")
                    continue

                # 🟢 Comments come from praw. Any failure (not only APIException)
                # is reported and skipped so the remaining posts are still processed.
                try:
                    comments.extend(_fetch_comments(post_id))
                except Exception as e:
                    print(f"⚠️ Fehler beim Abrufen der Kommentare: {e}")

    df_posts = pd.DataFrame(posts, columns=_POST_COLUMNS)
    df_comments = pd.DataFrame(comments, columns=_COMMENT_COLUMNS)

    print(f"✅ Scrape abgeschlossen: {len(df_posts)} Posts & {len(df_comments)} Kommentare gefunden.")
    return df_posts, df_comments

# 🔹 Run the scraper from the fixed start date (1 Nov 2024, UTC) up to now.
# The window is not a rolling "last 3 months"; adjust the start date as needed.
start_of_period = datetime(2024, 11, 1, tzinfo=timezone.utc)
now = datetime.now(timezone.utc)
df_posts, df_comments = scrape_reddit(start_of_period, now)

# Nothing is written to disk in this cell (that happens in the export section),
# so the message reports loading rather than saving.
print("✅ Daten erfolgreich geladen & bereit für weitere Analysen.")


KeyboardInterrupt: 

## Clean

In [None]:
def clean_data(df_posts, df_comments, comment_threshold=500, ignore_authors=("None", "[deleted]")):
    """Deduplicate posts and comments, fill missing text and drop spam-like authors.

    Args:
        df_posts: posts frame with at least 'post_id' and 'selftext'.
        df_comments: comments frame with at least 'comment_id', 'author' and
            the comment text in 'body' (a legacy 'selftext' column is accepted
            and renamed to 'body').
        comment_threshold: authors with more comments than this are removed.
        ignore_authors: author names that are never treated as spammers. The
            defaults are placeholders for deleted accounts, which would
            otherwise be counted as one very active user.

    Returns:
        (df_posts, df_comments): cleaned copies; the inputs are not modified.
    """
    # 1. Remove duplicates. The explicit copies keep the column assignments
    #    below from writing into a slice of the caller's frame.
    df_posts = df_posts.drop_duplicates(subset=["post_id"]).copy()
    df_comments = df_comments.drop_duplicates(subset=["comment_id"]).copy()

    # The scraper has stored comment text under 'selftext'; the rest of the
    # pipeline expects 'body'.
    if "body" not in df_comments.columns and "selftext" in df_comments.columns:
        df_comments = df_comments.rename(columns={"selftext": "body"})

    # 2. Fill missing text with empty strings
    df_posts['selftext'] = df_posts['selftext'].fillna('')
    df_comments['body'] = df_comments['body'].fillna('')

    # 3. Remove authors with an excessive number of comments
    comment_counts = df_comments["author"].value_counts()
    frequent_users = comment_counts[comment_counts > comment_threshold].index
    frequent_users = frequent_users.difference(list(ignore_authors))
    df_comments = df_comments[~df_comments["author"].isin(frequent_users)].copy()

    print(f"✅ Daten bereinigt: {df_comments.shape[0]} Kommentare übrig (nach Spam-Filter).")

    return df_posts, df_comments


In [None]:
# Clean the scraped frames; authors with more than 500 comments count as spam.
df_posts_clean, df_comments_clean = clean_data(
    df_posts,
    df_comments,
    comment_threshold=500,
)

## Sentiment

In [None]:
# 🔹 Run on the GPU when CUDA is available, otherwise on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("🚀 Verwende Gerät: {}".format(device))

# 🔹 Load the CryptoBERT tokenizer and classifier, then move the model to the device
MODEL_NAME = "ElKulako/cryptobert"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
model = model.to(device)
model.eval()  # evaluation mode

# 🔹 Batched sentiment analysis
def analyze_sentiment_batch(texts, batch_size=32):
    """Classify a sequence of texts with the loaded CryptoBERT model.

    Args:
        texts: iterable of texts; empty, whitespace-only and non-string entries
            are replaced by the literal text "neutral" before classification.
        batch_size: number of texts sent through the model at once.

    Returns:
        A list with one (sentiment, confidence) tuple per input text, where
        sentiment is "bearish", "neutral" or "bullish" and confidence is the
        softmax probability of the chosen class.
    """
    # NOTE(review): assumes the model's class indices are ordered bearish,
    # neutral, bullish — confirm against model.config.id2label.
    labels = ["bearish", "neutral", "bullish"]

    # Replace unusable entries with the placeholder text "neutral"
    cleaned = [t if isinstance(t, str) and t.strip() != "" else "neutral" for t in texts]

    results = []
    for start in tqdm(range(0, len(cleaned), batch_size), desc="🔍 Analysiere Sentiments"):
        batch_texts = cleaned[start : start + batch_size]

        # Tokenize with padding so the batch forms one rectangular tensor
        inputs = tokenizer(batch_texts, return_tensors="pt", truncation=True, max_length=512, padding=True).to(device)

        with torch.no_grad():
            outputs = model(**inputs)

        scores = F.softmax(outputs.logits, dim=1)

        # One max over the class dimension gives both the winning class and its
        # probability for the whole batch, replacing two .item() calls per row.
        confidences, class_ids = scores.max(dim=1)
        results.extend(
            (labels[class_id], confidence)
            for class_id, confidence in zip(class_ids.tolist(), confidences.tolist())
        )

    return results

# 🔹 Sentiment for posts: classify title and body together
tqdm.pandas()  # enable the pandas progress integration

# Work on copies so the new columns are not written into a slice of another frame.
df_posts_clean = df_posts_clean.copy()
df_posts_clean["full_text"] = (
    df_posts_clean["title"].fillna("") + " " + df_posts_clean["selftext"].fillna("")
)
# Naming the result columns keeps the assignment valid for an empty frame.
df_posts_clean[["sentiment", "sentiment_confidence"]] = pd.DataFrame(
    analyze_sentiment_batch(df_posts_clean["full_text"].tolist()),
    columns=["sentiment", "sentiment_confidence"],
    index=df_posts_clean.index,
)

# 🔹 Sentiment for comments
df_comments_clean = df_comments_clean.copy()
df_comments_clean["full_text"] = df_comments_clean["body"].fillna("")
df_comments_clean[["sentiment", "sentiment_confidence"]] = pd.DataFrame(
    analyze_sentiment_batch(df_comments_clean["full_text"].tolist()),
    columns=["sentiment", "sentiment_confidence"],
    index=df_comments_clean.index,
)

# 🔹 Report how many rows were scored
print(f"✅ Sentiment-Analyse abgeschlossen: {len(df_posts_clean)} Posts & {len(df_comments_clean)} Kommentare bewertet.")


## Merge

In [None]:
# 🔹 Left-join comments onto posts; columns present in both frames get the
# "_post" / "_comment" suffixes.
df_merged = pd.merge(
    df_posts_clean,
    df_comments_clean,
    how="left",
    on="post_id",
    suffixes=("_post", "_comment"),
)

# 🔹 Report the size of the joined frame
print(f"✅ DataFrames zusammengeführt: {len(df_merged)} Zeilen insgesamt.")

# 🔹 Posts without comments have missing comment columns; fill them with neutral defaults
df_merged = df_merged.fillna(
    {
        "comment_id": "",
        "author_comment": "",
        "date_comment": "",
        "time_comment": "",
        "score_comment": 0,
        "body": "",
        "sentiment_comment": "",
        "sentiment_confidence_comment": 0.0,
    }
)


## Export 

In [None]:
# Export directory. The default is the original Google Drive folder; set the
# REDDIT_EXPORT_DIR environment variable to export elsewhere without editing
# this cell.
DRIVE_PATH = os.getenv("REDDIT_EXPORT_DIR", "G:/Meine Ablage/reddit/")
POSTS_CSV = os.path.join(DRIVE_PATH, "reddit_posts.csv")
COMMENTS_CSV = os.path.join(DRIVE_PATH, "reddit_comments.csv")
MERGED_CSV = os.path.join(DRIVE_PATH, "reddit_merged.csv")

Funktionen zum Export

In [None]:
def append_to_csv(df_new, filename, key_column):
    """Merge new rows into a CSV in DRIVE_PATH, keeping the newest row per key.

    Args:
        df_new: DataFrame with the rows to add.
        filename: name of the CSV file inside DRIVE_PATH.
        key_column: column name, or list of column names, that identifies a
            row. For rows sharing a key the last one (i.e. the new data) wins.

    Errors are reported with a printed message instead of being raised.
    """
    file_path = os.path.join(DRIVE_PATH, filename)
    key_columns = [key_column] if isinstance(key_column, str) else list(key_column)

    try:
        if os.path.exists(file_path):
            # NOTE(review): on_bad_lines="skip" silently drops malformed rows,
            # and the file is overwritten below, so such rows are lost.
            df_existing = pd.read_csv(file_path, sep="|", encoding="utf-8-sig", on_bad_lines="skip")
            df_combined = pd.concat([df_existing, df_new], ignore_index=True)
        else:
            df_combined = df_new

        # Compare keys as strings with missing values mapped to "": an empty
        # key is written as "" but read back from CSV as NaN, and the two must
        # count as the same key.
        normalized_keys = df_combined[key_columns].astype(object).fillna("").astype(str)
        df_combined = df_combined[~normalized_keys.duplicated(keep="last")]

        df_combined.to_csv(
            file_path,
            index=False,
            sep="|",
            encoding="utf-8-sig",
            lineterminator="\n"
        )
        print(f"✅ Datei erfolgreich aktualisiert: {file_path}")

    except Exception as e:
        print(f"Fehler beim Speichern der Datei {filename}: {e}")

def export_to_drive(df_posts, df_comments, df_merged):
    """Write posts, comments and the merged frame to their CSV files."""
    try:
        append_to_csv(df_posts, "reddit_posts.csv", key_column="post_id")
        append_to_csv(df_comments, "reddit_comments.csv", key_column="comment_id")
        # Posts without comments all share an empty comment_id; keying on the
        # (post_id, comment_id) pair keeps them from collapsing into one row.
        append_to_csv(df_merged, "reddit_merged.csv", key_column=["post_id", "comment_id"])

    except Exception as e:
        print(f"Fehler beim Export: {e}")

In [None]:
# 🔹 Write the cleaned posts, the cleaned comments and the merged frame to the export folder
export_to_drive(
    df_posts=df_posts_clean,
    df_comments=df_comments_clean,
    df_merged=df_merged,
)

✅ Datei erfolgreich aktualisiert: G:/Meine Ablage/reddit/reddit_posts.csv
✅ Datei erfolgreich aktualisiert: G:/Meine Ablage/reddit/reddit_comments.csv
✅ Datei erfolgreich aktualisiert: G:/Meine Ablage/reddit/reddit_merged.csv


In [None]:
# The merge suffixes the overlapping columns, so df_merged has "sentiment_post"
# and "sentiment_comment" but no plain "sentiment". Show the distribution of the
# comment sentiment; posts without comments appear under the empty label.
df_merged["sentiment_comment"].value_counts()

NameError: name 'df_merged' is not defined