# Reddit Scraper für vergangene Daten 

## Import 

In [1]:
import praw
import pandas as pd
from datetime import datetime, timedelta
import os
import psaw as ps
from dotenv import load_dotenv
import time
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch.nn.functional as F
import torch


In [2]:
# Load the Reddit credentials from the local env file
# (change the file name here if yours is called something else).
dotenv_loaded = load_dotenv(dotenv_path="zugang_reddit.env")

# Show the loader's return value so a missing file is noticed right away.
status_line = f".env geladen? {dotenv_loaded}"
print(status_line)


.env geladen? True


In [3]:
# Build the PRAW client; each option is read from the environment variable
# carrying the same name in upper case (CLIENT_ID, CLIENT_SECRET, USER_AGENT).
reddit = praw.Reddit(
    **{
        option: os.environ.get(option.upper())
        for option in ("client_id", "client_secret", "user_agent")
    }
)

print("Reddit API erfolgreich verbunden!")


Reddit API erfolgreich verbunden!


In [4]:
# Smoke test: print title and score of a few hot posts from one subreddit.
# Any failure is reported on stdout instead of being raised.
try:
    subreddit = reddit.subreddit("CryptoCurrency")
    hot_posts = subreddit.hot(limit=5)
    for post in hot_posts:
        summary = f"Title: {post.title}, Score: {post.score}"
        print(summary)
except Exception as error:
    print(f"Fehler beim Abrufen der Subreddit-Daten: {error}")


Title: Moon Week 58, Score: 7
Title: Daily Crypto Discussion - February 10, 2025 (GMT+0), Score: 17
Title: Just give it a minute, Score: 2142
Title: Lost Fortune: Landfill Containing $750M in Bitcoin to Be Sealed Forever, Score: 888
Title: Odds of Kanye West Launching Token Plummet After He Says ‘Coins Prey on Fans’, Score: 175


Cryptos und Subreddits 

In [None]:
# (name, symbol) pairs the scraper searches for. Each pair must appear only
# once: the scraper runs one search per name and per symbol for every entry,
# so a repeated pair only repeats the same requests.
cryptos = [
    # Top coins
    ("Bitcoin", "BTC"),
    ("Ethereum", "ETH"),
    ("Wrapped Ethereum", "WETH"),
    ("Solana", "SOL"),
    ("Avalanche", "AVAX"),
    ("Polkadot", "DOT"),
    ("Near Protocol", "NEAR"),
    ("Polygon", "MATIC"),
    ("XRP", "XRP"),
    ("Cardano", "ADA"),
    ("Cronos", "CRO"),
    ("Vulcan Forged PYR", "PYR"),
    ("Chiliz", "CHZ"),
    ("Illuvium", "ILV"),
    ("Ronin", "RON"),
    ("Band Protocol", "BAND"),
    ("Optimism", "OP"),
    ("Celestia", "TIA"),
    ("Numerai", "NMR"),
    ("Aethir", "ATH"),
    ("Sui", "SUI"),
    ("Hyperliquid", "HYP"),
    ("Robinhood Coin", "HOOD"),
    ("Trump Coin", "TRUMP"),
    ("USD Coin", "USDC"),
    ("Binance Coin", "BNB"),
    ("Litecoin", "LTC"),
    ("Dogecoin", "DOGE"),
    ("Tron", "TRX"),
    ("Aave", "AAVE"),
    ("Hedera", "HBAR"),
    ("Filecoin", "FIL"),
    ("Cosmos", "ATOM"),
    ("Gala", "GALA"),
    ("The Sandbox", "SAND"),
    ("Audius", "AUDIO"),
    ("Render", "RNDR"),
    ("Kusama", "KSM"),
    ("VeChain", "VET"),
    ("Chainlink", "LINK"),
    ("Berachain", "BERA"),
    # NOTE(review): looks like a placeholder entry; its symbol makes the
    # scraper search for "test" - confirm that this is intended.
    ("TestCoin", "TEST"),

    # Meme coins (Dogecoin is already listed above and is not repeated here)
    ("Shiba Inu", "SHIB"),
    ("Pepe", "PEPE"),
    ("Floki Inu", "FLOKI"),
    ("Bonk", "BONK"),
    ("Wojak", "WOJAK"),
    ("Mog Coin", "MOG"),
    ("Doge Killer (Leash)", "LEASH"),
    ("Baby Doge Coin", "BABYDOGE"),
    ("Degen", "DEGEN"),
    ("Toshi", "TOSHI"),
    ("Fartcoin", "FART"),
    ("Banana", "BANANA"),
    ("Kabosu", "KABOSU"),
    ("Husky", "HUSKY"),
    ("Samoyedcoin", "SAMO"),
    ("Milkbag", "MILKBAG"),
]



In [9]:
# Subreddits that are searched for every coin, in this order.
subreddits = [
    # General cryptocurrency discussion
    "CryptoCurrency",
    # Crypto market and price movements
    "CryptoMarkets",
    # Trading strategies and analysis
    "CryptoTrading",
    # Altcoins (every cryptocurrency except Bitcoin)
    "Altcoin",
    # Decentralized finance (DeFi) and its projects
    "DeFi",
    # Newcomers to crypto
    "BitcoinBeginners",
    # The underlying blockchain technology
    "cryptotechnology",
    # General cryptocurrency discussion
    "cryptocurrencies",
    # Crypto bets and speculation
    "Satoshistreetsbets",
    # The Binance platform
    "Binance",
]

## Scraping 

Scraping-Funktionen

In [10]:
# Scrape posts and comments that mention the configured cryptocurrencies.

# Column layout of the two result frames. Passing it explicitly keeps the
# columns present even when nothing was found, so later cleaning steps can
# still select them.
_POST_COLUMNS = [
    'crypto', 'search_term', 'subreddit', 'post_id', 'title', 'author',
    'created_utc', 'score', 'num_comments', 'selftext',
]
_COMMENT_COLUMNS = ['post_id', 'comment_id', 'author', 'created_utc', 'score', 'body']


def _format_utc(epoch_seconds):
    """Format a Unix timestamp as a 'YYYY-MM-DD HH:MM:SS' string in UTC."""
    # Local import: the notebook's import cell only brings in datetime/timedelta.
    from datetime import timezone

    # Timezone-aware replacement for the deprecated datetime.utcfromtimestamp();
    # the formatted text is the same.
    return datetime.fromtimestamp(epoch_seconds, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S')


def scrape_reddit(start_date, end_date, mode="initial"):
    """Search the configured subreddits for every coin and collect posts and comments.

    For each (name, symbol) pair in the module-level ``cryptos`` list and each
    entry of ``subreddits``, the subreddit search is queried (sorted by
    "new") with the lower-cased name and the lower-cased symbol. A post is
    kept when its ``created_utc`` lies within the given window and its id has
    not been seen before, so a post is recorded only for the first coin and
    search term that returned it. For every kept post the comments available
    after ``replace_more(limit=0)`` are collected as well.

    Titles, self texts and comment bodies are stored lower-cased.

    Args:
        start_date: ``datetime`` marking the start of the window (inclusive);
            converted to Unix time with ``.timestamp()``.
        end_date: ``datetime`` marking the end of the window (inclusive).
        mode: Not used by this function; kept so existing calls keep working.

    Returns:
        Tuple ``(df_posts, df_comments)`` of pandas DataFrames. Both frames
        always carry their full set of columns, even when they have no rows.
    """
    start_timestamp = int(start_date.timestamp())  # window start as Unix time
    end_timestamp = int(end_date.timestamp())  # window end as Unix time

    posts = []
    comments = []
    post_ids = set()  # ids already collected, to avoid duplicate posts

    for crypto_name, crypto_symbol in cryptos:
        # Lower-cased search terms; dict.fromkeys drops the second term when
        # name and symbol are identical (e.g. "XRP"), which would otherwise
        # run the very same search twice.
        search_terms = list(dict.fromkeys([crypto_name.lower(), crypto_symbol.lower()]))

        for subreddit_name in subreddits:
            subreddit = reddit.subreddit(subreddit_name)
            print(f"Suche nach {crypto_name} ({crypto_symbol}) in r/{subreddit_name}...")

            for search_term in search_terms:
                for post in subreddit.search(query=search_term, sort="new", limit=None):
                    in_window = start_timestamp <= post.created_utc <= end_timestamp
                    if not in_window or post.id in post_ids:
                        continue
                    post_ids.add(post.id)

                    # Title and self text are stored lower-cased.
                    post_title = post.title.lower()
                    post_selftext = (post.selftext or "").lower()

                    posts.append({
                        'crypto': crypto_name,
                        'search_term': search_term.upper(),
                        'subreddit': subreddit_name,
                        'post_id': post.id,
                        'title': post_title,
                        'author': str(post.author),
                        'created_utc': _format_utc(post.created_utc),
                        'score': post.score,
                        'num_comments': post.num_comments,
                        'selftext': post_selftext
                    })

                    print(f"Post gefunden: {post_title} (Suchbegriff: {search_term})")

                    # Collect the comments; limit=0 discards the "load more"
                    # placeholders instead of resolving them.
                    post.comments.replace_more(limit=0)
                    for comment in post.comments.list():
                        comments.append({
                            'post_id': post.id,
                            'comment_id': comment.id,
                            'author': str(comment.author),
                            'created_utc': _format_utc(comment.created_utc),
                            'score': comment.score,
                            'body': (comment.body or "").lower()  # stored lower-cased
                        })

    # Convert to DataFrames with a fixed column layout.
    df_posts = pd.DataFrame(posts, columns=_POST_COLUMNS)
    df_comments = pd.DataFrame(comments, columns=_COMMENT_COLUMNS)

    print(f"Scrape abgeschlossen: {len(df_posts)} Posts und {len(df_comments)} Kommentare gefunden.")
    return df_posts, df_comments

# One-off backfill: scrape everything from the fixed start date up to now.
start_of_period = datetime(year=2024, month=11, day=1)  # 1 November 2024
now = datetime.now()  # current time as a naive datetime
print("Starte den einmaligen Scrape für die letzten 3 Monate...")
df_posts_initial, df_comments_initial = scrape_reddit(
    start_date=start_of_period,
    end_date=now,
    mode="initial",
)

print("Daten können jetzt in der Pipeline bereinigt werden...")


Starte den einmaligen Scrape für die letzten 3 Monate...
Suche nach Solana (SOL) in r/CryptoCurrency...
Post gefunden: crypto beginner advice (Suchbegriff: solana)
Post gefunden: gemini “security hold” hostage negotiations - receipts (Suchbegriff: solana)
Post gefunden: sec reviews grayscale’s solana etf filing, indicating possible shift in crypto regulation (Suchbegriff: solana)
Post gefunden: solana’s could hit $520 by 2025-end, vaneck says (Suchbegriff: solana)
Post gefunden: best crypto based on priority (Suchbegriff: solana)
Post gefunden: small trading website for photon ( (Suchbegriff: solana)
Post gefunden: bull market over? (Suchbegriff: solana)
Post gefunden: best crypto coins to hold for 10 years? (Suchbegriff: solana)
Post gefunden: solana celebrates a year without network failures as ecosystem thrives (Suchbegriff: solana)
Post gefunden: solana seems too volatile to consider holding long term (Suchbegriff: solana)
Post gefunden: if alternative cryptocurrency coins and toke

## Clean

In [11]:
def _prepare_frame(df, id_column, text_column):
    """Apply the cleaning steps shared by the posts and the comments frame."""
    # Work on an explicit copy: the column assignments below must neither
    # touch the caller's frame nor write into a filtered slice.
    df = df.drop_duplicates(subset=[id_column]).copy()

    # Missing texts become empty strings.
    df[text_column] = df[text_column].fillna('')

    # Split the timestamp into separate date (YYYY-MM-DD) and time (HH:MM:SS)
    # columns and drop the combined column.
    created = pd.to_datetime(df['created_utc'])
    df["date"] = created.dt.date
    df["time"] = created.dt.time
    df = df.drop(columns=["created_utc"])

    # Quality filter: keep only rows with a strictly positive score, i.e.
    # rows with a score of zero are removed as well as negative ones.
    return df[df['score'] > 0].copy()


def clean_data(df_posts, df_comments, comment_threshold=500):
    """Clean the scraped posts and comments.

    Both frames are deduplicated by id, get missing texts replaced with
    empty strings, have ``created_utc`` split into ``date`` and ``time``
    columns (``created_utc`` itself is dropped) and lose every row whose
    score is not greater than zero. Comments written by a fixed list of bot
    accounts, or by any author with more than ``comment_threshold`` remaining
    comments, are removed in addition.

    The input frames are not modified; the returned frames are independent
    copies, so new columns can be added to them directly.

    Args:
        df_posts: Posts frame with at least the columns ``post_id``,
            ``selftext``, ``created_utc`` and ``score``.
        df_comments: Comments frame with at least the columns ``comment_id``,
            ``body``, ``created_utc``, ``score`` and ``author``.
        comment_threshold: Maximum number of comments an author may have
            before all of that author's comments are discarded.

    Returns:
        Tuple ``(df_posts, df_comments)`` with the cleaned frames.
    """
    df_posts = _prepare_frame(df_posts, id_column="post_id", text_column="selftext")
    df_comments = _prepare_frame(df_comments, id_column="comment_id", text_column="body")

    # Remove known bot accounts.
    bot_accounts = ["AutoModerator", "coinfeeds-bot", "devCheckingIn"]
    df_comments = df_comments[~df_comments["author"].isin(bot_accounts)]

    # Remove authors with an excessive number of comments.
    comment_counts = df_comments["author"].value_counts()
    frequent_users = comment_counts[comment_counts > comment_threshold].index
    df_comments = df_comments[~df_comments["author"].isin(frequent_users)].copy()

    print(f"Daten bereinigt: {df_comments.shape[0]} Kommentare übrig (nach Spam-Filter).")

    return df_posts, df_comments


In [12]:
# Clean the scraped data; the per-author comment limit can be tuned here.
df_posts_clean, df_comments_clean = clean_data(
    df_posts=df_posts_initial,
    df_comments=df_comments_initial,
    comment_threshold=300,
)

# Report how many rows are left after cleaning.
print(f"Bereinigte Posts: {len(df_posts_clean)}")
print(f"Bereinigte Kommentare: {len(df_comments_clean)}")

Daten bereinigt: 56187 Kommentare übrig (nach Spam-Filter).
Bereinigte Posts: 1051
Bereinigte Kommentare: 56187


## Modell für das Sentiment 

In [14]:
# Load the CryptoBERT tokenizer and sequence-classification model.
MODEL_NAME = "ElKulako/cryptobert"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

# Stop here when the cleaned comments have no "body" column to analyse.
has_body_column = "body" in df_comments_clean.columns
if not has_body_column:
    raise ValueError("Fehler: Die CSV-Datei enthält keine 'body'-Spalte mit Kommentaren!")

# Sentiment classification of a single text with CryptoBERT
def analyze_sentiment(text):
    """Classify ``text`` as bearish, neutral or bullish.

    Uses the module-level ``tokenizer`` and ``model``. Input is truncated to
    at most 512 tokens.

    Args:
        text: The text to classify.

    Returns:
        Tuple ``(label, confidence)``; ``confidence`` is the softmax
        probability of the winning class. Anything that is not a string, and
        any string that is empty or whitespace only, gives
        ``("neutral", 0.0)`` without running the model.
    """
    is_blank = not isinstance(text, str) or not text.strip()
    if is_blank:
        return "neutral", 0.0  # nothing to analyse counts as neutral

    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        logits = model(**encoded).logits

    # Class probabilities of the single input sequence.
    probabilities = F.softmax(logits, dim=1)[0]
    best_index = torch.argmax(probabilities).item()

    # Label order assumed for the model's output classes.
    label_names = ("bearish", "neutral", "bullish")
    return label_names[best_index], probabilities[best_index].item()

# Compute the sentiment for every comment. The (label, confidence) pairs are
# gathered in a list first, so an empty comment frame simply yields two empty
# columns instead of failing while unpacking zip(*...) of nothing.
sentiment_results = [analyze_sentiment(body) for body in df_comments_clean["body"]]
df_comments_clean["sentiment"] = [label for label, _ in sentiment_results]
df_comments_clean["sentiment_confidence"] = [confidence for _, confidence in sentiment_results]

# Debug output: show the first five results as a sanity check.
print(df_comments_clean[["body", "sentiment", "sentiment_confidence"]].head())


tokenizer_config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/932 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

                                                body sentiment  \
0  “trying to replicate even half of that return ...   neutral   
1  got in at 3 and out ar 40? you don't sound lik...   bullish   
2  don't bother with crypto. it's a waste of time...   neutral   
3  don't waste time trying to get rich.  getting ...   bearish   
4  i would exit the market at this situation. try...   neutral   

   sentiment_confidence  
0              0.680237  
1              0.595430  
2              0.813621  
3              0.865439  
4              0.738003  


## Merge 

In [15]:
# Attach the post columns to every cleaned comment (left join on post_id).
df_merged = pd.merge(df_comments_clean, df_posts_clean, on="post_id", how="left")

# Drop every row that still has a missing value in any column; this also
# removes comments for which no matching post row exists.
df_merged = df_merged.dropna()

In [21]:
# Preview the first rows of the merged comment/post table.
df_merged.head()

Unnamed: 0,post_id,comment_id,author_x,score_x,body,date_x,time_x,sentiment,sentiment_confidence,crypto,search_term,subreddit,title,author_y,score_y,num_comments,selftext,date_y,time_y
17,1ijw8gh,mbhi8gf,partymsl,1,"institutions also want to buy this dip, deeply...",2025-02-07,14:53:24,bullish,0.690653,Solana,SOLANA,CryptoCurrency,"sec reviews grayscale’s solana etf filing, ind...",Afonsoo99,7.0,3.0,,2025-02-07,14:29:29
26,1ijo3v7,mbghe3b,AnyMeaning1888,11,thanks chatgpt,2025-02-07,10:38:23,neutral,0.660872,Solana,SOLANA,CryptoCurrency,best crypto based on priority,TegridyWackyTobaccy,6.0,21.0,the “best” cryptocurrency depends on your goal...,2025-02-07,05:48:10
27,1ijo3v7,mbh0lqb,severin_dfinity,5,happy to answer questions about icp. i work at...,2025-02-07,13:12:02,bullish,0.734091,Solana,SOLANA,CryptoCurrency,best crypto based on priority,TegridyWackyTobaccy,6.0,21.0,the “best” cryptocurrency depends on your goal...,2025-02-07,05:48:10
28,1ijo3v7,mbfns29,Gator222222,10,bitcoin will continue to rise simply because i...,2025-02-07,05:52:06,bullish,0.634316,Solana,SOLANA,CryptoCurrency,best crypto based on priority,TegridyWackyTobaccy,6.0,21.0,the “best” cryptocurrency depends on your goal...,2025-02-07,05:48:10
29,1ijo3v7,mbfqc7k,aaaanoon,7,"eth is the most popular for smart contracts, n...",2025-02-07,06:12:52,neutral,0.858543,Solana,SOLANA,CryptoCurrency,best crypto based on priority,TegridyWackyTobaccy,6.0,21.0,the “best” cryptocurrency depends on your goal...,2025-02-07,05:48:10


## Export 

In [23]:
# Set this to the path of your Google Drive folder.
DRIVE_PATH = "G:/Meine Ablage/reddit/"

# Full target paths of the three export files inside that folder.
POSTS_CSV, COMMENTS_CSV, MERGED_CSV = (
    os.path.join(DRIVE_PATH, file_name)
    for file_name in ("reddit_posts.csv", "reddit_comments.csv", "reddit_merged.csv")
)

Funktion zum Export 

In [24]:
def save_initial_csv(df_new, filename):
    """Write ``df_new`` as a CSV file named ``filename`` inside ``DRIVE_PATH``.

    The file is written from scratch (no appending, no duplicate check), so
    an existing file with the same name is overwritten. Fields are separated
    by "|", the encoding is UTF-8 with BOM, lines end with "\\n" and the
    index is not written.

    Errors are not raised: success or failure is reported on stdout.

    Args:
        df_new: DataFrame to export.
        filename: File name relative to ``DRIVE_PATH``.
    """
    target = os.path.join(DRIVE_PATH, filename)
    csv_options = {
        "index": False,
        "sep": "|",
        "encoding": "utf-8-sig",
        "lineterminator": "\n",
    }

    try:
        df_new.to_csv(target, **csv_options)
        print(f"✅ Datei erfolgreich gespeichert: {target}")
    except Exception as error:
        print(f"Fehler beim Speichern der Datei {filename}: {error}")

def export_initial_data(df_posts, df_comments, df_merged):
    """Export the initial posts, comments and merged data as CSV files.

    Each frame is handed to ``save_initial_csv`` under its fixed file name,
    in the order posts, comments, merged. An exception that escapes one of
    the saves is printed and stops the remaining exports.

    Args:
        df_posts: Frame written to "reddit_posts.csv".
        df_comments: Frame written to "reddit_comments.csv".
        df_merged: Frame written to "reddit_merged.csv".
    """
    exports = (
        (df_posts, "reddit_posts.csv"),
        (df_comments, "reddit_comments.csv"),
        (df_merged, "reddit_merged.csv"),
    )

    try:
        for frame, csv_name in exports:
            save_initial_csv(frame, csv_name)
    except Exception as error:
        print(f"Fehler beim Export: {error}")

In [25]:
# Export the results of the first scrape: cleaned posts, cleaned comments
# and the merged table.
export_initial_data(
    df_posts=df_posts_clean,
    df_comments=df_comments_clean,
    df_merged=df_merged,
)

✅ Datei erfolgreich gespeichert: G:/Meine Ablage/reddit/reddit_posts.csv
✅ Datei erfolgreich gespeichert: G:/Meine Ablage/reddit/reddit_comments.csv
✅ Datei erfolgreich gespeichert: G:/Meine Ablage/reddit/reddit_merged.csv
