# Reddit Scraper für vergangene Daten 

## Import 

In [1]:
import praw
import pandas as pd
from datetime import datetime, timedelta
import os
import psaw as ps
from dotenv import load_dotenv
import time
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch.nn.functional as F
import torch


In [2]:
# Lade die .env-Datei
dotenv_loaded = load_dotenv("zugang_reddit.env")  # Falls die Datei anders heißt, anpassen
# Prüfe, ob die Datei geladen wurde
print(f".env geladen? {dotenv_loaded}")


.env geladen? True


In [3]:
reddit = praw.Reddit(
    client_id=os.getenv("CLIENT_ID"),
    client_secret=os.getenv("CLIENT_SECRET"),
    user_agent=os.getenv("USER_AGENT")
)

print("Reddit API erfolgreich verbunden!")


Reddit API erfolgreich verbunden!


In [4]:
try:
    subreddit = reddit.subreddit("CryptoCurrency")
    for post in subreddit.hot(limit=5):
        print(f"Title: {post.title}, Score: {post.score}")
except Exception as e:
    print(f"Fehler beim Abrufen der Subreddit-Daten: {e}")


Title: [AMA] Giveaway with Portal to Bitcoin: Making bridges, wrapped coins, and external custody obsolete - Feb 12, Score: 17
Title: Moon Week 58, Score: 9
Title: More Than 800K Have Lost $2B on Trump’s Meme Coin, Score: 685
Title: Trump and His Family Earned Millions From Trump Coin While 810,000 Others Lost Money: Report, Score: 6394
Title: Lummis apparently called XRP a scam in a call tonight, Score: 333


Cryptos und Subreddits 

In [5]:
cryptos = [
    # Top Coins
    ("Bitcoin", "BTC"),
    ("Ethereum", "ETH"),
    ("Wrapped Ethereum", "WETH"),
    ("Solana", "SOL"),
    ("Avalanche", "AVAX"),
    ("Polkadot", "DOT"),
    ("Near Protocol", "NEAR"),
    ("Polygon", "MATIC"),
    ("XRP", "XRP"),
    ("Cardano", "ADA"),
    ("Cronos", "CRO"),
    ("Vulcan Forged PYR", "PYR"),
    ("Chiliz", "CHZ"),
    ("Illuvium", "ILV"),
    ("Ronin", "RON"),
    ("Band Protocol", "BAND"),
    ("Optimism", "OP"),
    ("Celestia", "TIA"),
    ("Numerai", "NMR"),
    ("Aethir", "ATH"),
    ("Sui", "SUI"),
    ("Hyperliquid", "HYP"),
    ("Robinhood Coin", "HOOD"),
    ("Trump Coin", "TRUMP"),
    ("USD Coin", "USDC"),
    ("Binance Coin", "BNB"),
    ("Litecoin", "LTC"),
    ("Dogecoin", "DOGE"),
    ("Tron", "TRX"),
    ("Aave", "AAVE"),
    ("Hedera", "HBAR"),
    ("Filecoin", "FIL"),
    ("Cosmos", "ATOM"),
    ("Gala", "GALA"),
    ("The Sandbox", "SAND"),
    ("Audius", "AUDIO"),
    ("Render", "RNDR"),
    ("Kusama", "KSM"),
    ("VeChain", "VET"),
    ("Chainlink", "LINK"),
    ("Berachain", "BERA"),
    ("TestCoin", "TEST"),

    # Meme-Coins
    ("Dogecoin", "DOGE"),
    ("Shiba Inu", "SHIB"),
    ("Pepe", "PEPE"),
    ("Floki Inu", "FLOKI"),
    ("Bonk", "BONK"),
    ("Wojak", "WOJAK"),
    ("Mog Coin", "MOG"),
    ("Doge Killer (Leash)", "LEASH"),
    ("Baby Doge Coin", "BABYDOGE"),
    ("Degen", "DEGEN"),
    ("Toshi", "TOSHI"),
    ("Fartcoin", "FART"),
    ("Banana", "BANANA"),
    ("Kabosu", "KABOSU"),
    ("Husky", "HUSKY"),
    ("Samoyedcoin", "SAMO"),
    ("Milkbag", "MILKBAG")
]



In [6]:
subreddits = [
    "CryptoCurrency",  # Allgemeine Diskussionen über Kryptowährungen
    "CryptoMarkets",   # Diskussionen über den Kryptomarkt und Preisbewegungen
    "CryptoTrading",   # Fokus auf Trading-Strategien und Analysen
    "Altcoin",         # Diskussionen über Altcoins (alle Kryptowährungen außer Bitcoin)
    "DeFi",            # Decentralized Finance (DeFi) und Projekte
    "BitcoinBeginners",# Für Anfänger in der Krypto-Welt
    "cryptotechnology", # Fokus auf die zugrunde liegende Blockchain-Technologie
    "cryptocurrencies", # Allgemeine Diskussionen über Kryptowährungen
    "Satoshistreetsbets", # Krypto-Wetten und Spekulationen
    "Binance"        # Diskussionen über die Binance-Plattform  
]

## Scraping 

Scraping Funktionen

In [7]:
# Funktion, um Posts und Kommentare gezielt zu Kryptowährungen zu scrapen
def scrape_reddit(start_date, end_date, mode="initial"):
    start_timestamp = int(start_date.timestamp())  # Umwandlung in Unix-Zeit
    end_timestamp = int(end_date.timestamp())  # Umwandlung in Unix-Zeit

    posts = []
    comments = []
    post_ids = set()  # Vermeidung doppelter Post-IDs

    for crypto_name, crypto_symbol in cryptos:
        for subreddit_name in subreddits:
            subreddit = reddit.subreddit(subreddit_name)
            print(f"Suche nach {crypto_name} ({crypto_symbol}) in r/{subreddit_name}...")

            # 🔹 Suchbegriffe in Kleinbuchstaben umwandeln
            search_terms = [crypto_name.lower(), crypto_symbol.lower()]

            for search_term in search_terms:
                for post in subreddit.search(query=search_term, sort="new", limit=None):
                    if start_timestamp <= post.created_utc <= end_timestamp and post.id not in post_ids:
                        post_ids.add(post.id)

                        # 🔹 Titel & Selbsttext in Kleinbuchstaben für case-insensitive Vergleich
                        post_title = post.title.lower()
                        post_selftext = (post.selftext or "").lower()

                        posts.append({
                            'crypto': crypto_name,
                            'search_term': search_term.upper(),
                            'subreddit': subreddit_name,
                            'post_id': post.id,
                            'title': post_title,
                            'author': str(post.author),
                            'created_utc': datetime.utcfromtimestamp(post.created_utc).strftime('%Y-%m-%d %H:%M:%S'),
                            'score': post.score,
                            'num_comments': post.num_comments,
                            'selftext': post_selftext
                        })

                        print(f"Post gefunden: {post_title} (Suchbegriff: {search_term})")

                        # 🔹 Kommentare sammeln
                        post.comments.replace_more(limit=0)
                        for comment in post.comments.list():
                            comments.append({
                                'post_id': post.id,
                                'comment_id': comment.id,
                                'author': str(comment.author),
                                'created_utc': datetime.utcfromtimestamp(comment.created_utc).strftime('%Y-%m-%d %H:%M:%S'),
                                'score': comment.score,
                                'body': (comment.body or "").lower()  # 🔹 Case-Insensitive Kommentartext
                            })

    # In DataFrames umwandeln
    df_posts = pd.DataFrame(posts)
    df_comments = pd.DataFrame(comments)

    print(f"Scrape abgeschlossen: {len(df_posts)} Posts und {len(df_comments)} Kommentare gefunden.")
    return df_posts, df_comments

# Einmaliger Scrape für die letzten 3 Monate
start_of_period = datetime(2024, 11, 1)  # 1. November 2024 als `datetime`
now = datetime.now()  # Aktuelle Zeit als `datetime`
print("Starte den einmaligen Scrape für die letzten 3 Monate...")
df_posts_initial, df_comments_initial = scrape_reddit(start_of_period, now, mode="initial")

print("Daten können jetzt in der Pipeline bereinigt werden...")


Starte den einmaligen Scrape für die letzten 3 Monate...
Suche nach Bitcoin (BTC) in r/CryptoCurrency...
Post gefunden: north carolina approves 10% bitcoin investment bill (Suchbegriff: bitcoin)
Post gefunden: bitcoin’s diamond hands double down, targets all-time high (Suchbegriff: bitcoin)
Post gefunden: tesla made $600 million in bitcoin profits during q4 of 2024 (Suchbegriff: bitcoin)
Post gefunden: 27 u.s. states push for bitcoin and digital asset legislation (Suchbegriff: bitcoin)
Post gefunden: [ama] giveaway with portal to bitcoin: making bridges, wrapped coins, and external custody obsolete - feb 12 (Suchbegriff: bitcoin)
Post gefunden: why everything positive you've heard about crypto is a trick (Suchbegriff: bitcoin)
Post gefunden: bitcoin teases gains as traders say $100k now key support reclaim (Suchbegriff: bitcoin)
Post gefunden: ethereum outpaces bitcoin as digital asset fund inflows hit $1.3 billion (Suchbegriff: bitcoin)
Post gefunden: nasdaq files to list xrp etf, sta

## Clean

In [8]:
def clean_data(df_posts, df_comments, comment_threshold=500):# Anpassbarer Schwellenwert für Kommentare pro Nutzer
    # 1. Duplikate entfernen
    df_posts = df_posts.drop_duplicates(subset=["post_id"])
    df_comments = df_comments.drop_duplicates(subset=["comment_id"])
    
    # 2. Fehlende Werte behandeln
    df_posts['selftext'] = df_posts['selftext'].fillna('')  # Fehlende Posttexte auffüllen
    df_comments['body'] = df_comments['body'].fillna('')  # Fehlende Kommentare auffüllen
    
    # 3. Zeitstempel konvertieren
    df_posts['created_utc'] = pd.to_datetime(df_posts['created_utc'])
    df_comments['created_utc'] = pd.to_datetime(df_comments['created_utc'])

    # 4. Datum & Uhrzeit in separate Spalten aufteilen (Daten normalisieren)
    df_posts["date"] = df_posts["created_utc"].dt.date  # YYYY-MM-DD
    df_posts["time"] = df_posts["created_utc"].dt.time  # HH:MM:SS

    df_comments["date"] = df_comments["created_utc"].dt.date
    df_comments["time"] = df_comments["created_utc"].dt.time

    # 5. Original `created_utc`-Spalte entfernen
    df_posts.drop(columns=["created_utc"], inplace=True)
    df_comments.drop(columns=["created_utc"], inplace=True)

    # 6. Filterung nach Qualität (Spam oder irrelevante Daten entfernen)
    df_posts = df_posts[df_posts['score'] > 0]  # Posts mit negativem Score entfernen
    df_comments = df_comments[df_comments['score'] > 0]  # Kommentare mit negativem Score entfernen

    # 7. Entferne bekannte Bot-Accounts
    bot_accounts = ["AutoModerator", "coinfeeds-bot", "devCheckingIn"]
    df_comments = df_comments[~df_comments["author"].isin(bot_accounts)]

    # 8. Entferne Nutzer mit übermäßigen Kommentaren
    comment_counts = df_comments["author"].value_counts()
    frequent_users = comment_counts[comment_counts > comment_threshold].index  # Nutzer über Grenze
    df_comments = df_comments[~df_comments["author"].isin(frequent_users)]

    print(f"Daten bereinigt: {df_comments.shape[0]} Kommentare übrig (nach Spam-Filter).")

    return df_posts, df_comments


In [9]:
# Bereinigen der Daten
df_posts_clean, df_comments_clean = clean_data(df_posts_initial, df_comments_initial, comment_threshold=300) # Anpassbarer Schwellenwert für Kommentare pro Nutzer


# Überprüfen, wie viele Einträge übrig sind
print(f"Bereinigte Posts: {len(df_posts_clean)}")
print(f"Bereinigte Kommentare: {len(df_comments_clean)}")

Daten bereinigt: 187511 Kommentare übrig (nach Spam-Filter).
Bereinigte Posts: 3580
Bereinigte Kommentare: 187511


## Model fuer das Sentiment 

In [10]:
# CryptoBERT-Modell laden
MODEL_NAME = "ElKulako/cryptobert"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

# 🔹 Sicherstellen, dass die Spalte "body" existiert
if "body" not in df_comments_clean.columns:
    raise ValueError("Fehler: Die CSV-Datei enthält keine 'body'-Spalte mit Kommentaren!")

# 🔹 Funktion zur Sentiment-Analyse mit CryptoBERT
def analyze_sentiment(text):
    if not isinstance(text, str) or text.strip() == "":
        return "neutral", 0.0  # Leere Kommentare sind neutral

    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)

    scores = F.softmax(outputs.logits, dim=1)[0]
    labels = ["bearish", "neutral", "bullish"]  # CryptoBERT nutzt diese Labels
    sentiment = labels[torch.argmax(scores).item()]
    confidence = scores.max().item()

    return sentiment, confidence

# 🔹 Sentiment für alle Kommentare berechnen
df_comments_clean["sentiment"], df_comments_clean["sentiment_confidence"] = zip(*df_comments_clean["body"].map(analyze_sentiment))

# 🔹 Debug-Ausgabe: Zeigt die ersten 5 Ergebnisse zur Überprüfung
print(df_comments_clean[["body", "sentiment", "sentiment_confidence"]].head())


                                                 body sentiment  \
0   https://www.ncleg.gov/billlookup/2025/h92\n\n1...   neutral   
3                                           hodl it 🚀   bullish   
4   *on paper gains no realized profits\n\nthey di...   neutral   
10  i'm curious how this works out for them going ...   bullish   
11                              yeah profit on papers   neutral   

    sentiment_confidence  
0               0.660037  
3               0.758048  
4               0.592565  
10              0.622362  
11              0.595063  


## Merge 

In [11]:
# 🔹 Merging der Posts & Kommentare direkt nach der Bereinigung
df_merged = df_comments_clean.merge(df_posts_clean, on="post_id", how="left")

# 🔹 Fehlende Werte entfernen (optional)
df_merged.dropna(inplace=True)

In [12]:
df_merged.head()

Unnamed: 0,post_id,comment_id,author_x,score_x,body,date_x,time_x,sentiment,sentiment_confidence,crypto,search_term,subreddit,title,author_y,score_y,num_comments,selftext,date_y,time_y
0,1in1c89,mc75tqs,HSuke,1,https://www.ncleg.gov/billlookup/2025/h92\n\n1...,2025-02-11,15:45:48,neutral,0.660037,Bitcoin,BITCOIN,CryptoCurrency,north carolina approves 10% bitcoin investment...,Brodie266,1.0,1.0,,2025-02-11,15:36:14
1,1imzwht,mc735bo,Brodie266,1,hodl it 🚀,2025-02-11,15:32:45,bullish,0.758048,Bitcoin,BITCOIN,CryptoCurrency,"bitcoin’s diamond hands double down, targets a...",Illperformance6969,2.0,3.0,,2025-02-11,14:33:28
2,1imz7uj,mc6mm04,inShambles3749,6,*on paper gains no realized profits\n\nthey di...,2025-02-11,14:04:47,neutral,0.592565,Bitcoin,BITCOIN,CryptoCurrency,tesla made $600 million in bitcoin profits dur...,KIG45,1.0,9.0,,2025-02-11,14:01:24
3,1imz7uj,mc6q86g,Mikerk,1,i'm curious how this works out for them going ...,2025-02-11,14:25:19,bullish,0.622362,Bitcoin,BITCOIN,CryptoCurrency,tesla made $600 million in bitcoin profits dur...,KIG45,1.0,9.0,,2025-02-11,14:01:24
4,1imz7uj,mc73c7k,Brodie266,1,yeah profit on papers,2025-02-11,15:33:42,neutral,0.595063,Bitcoin,BITCOIN,CryptoCurrency,tesla made $600 million in bitcoin profits dur...,KIG45,1.0,9.0,,2025-02-11,14:01:24


## Export 

In [13]:
# Setze den Pfad zu deinem Google Drive Ordner
DRIVE_PATH = "G:/Meine Ablage/reddit/"
POSTS_CSV = os.path.join(DRIVE_PATH, "reddit_posts.csv")
COMMENTS_CSV = os.path.join(DRIVE_PATH, "reddit_comments.csv")
MERGED_CSV = os.path.join(DRIVE_PATH, "reddit_merged.csv")

Funktion zum Export 

In [14]:
def save_initial_csv(df_new, filename):
    """Speichert die erste CSV-Datei ohne Anhängen oder Duplikat-Prüfung."""
    file_path = os.path.join(DRIVE_PATH, filename)

    try:
        # 🔹 Falls die Datei existiert, sie komplett überschreiben
        df_new.to_csv(
            file_path,
            index=False,
            sep="|",
            encoding="utf-8-sig",
            lineterminator="\n"
        )
        print(f"✅ Datei erfolgreich gespeichert: {file_path}")

    except Exception as e:
        print(f"Fehler beim Speichern der Datei {filename}: {e}")

def export_initial_data(df_posts, df_comments, df_merged):
    """Speichert die initialen Posts, Kommentare & gemergten Daten."""
    try:
        save_initial_csv(df_posts, "reddit_posts.csv")
        save_initial_csv(df_comments, "reddit_comments.csv")
        save_initial_csv(df_merged, "reddit_merged.csv")

    except Exception as e:
        print(f"Fehler beim Export: {e}")

In [15]:
# 🔹 Export-Funktion für den ersten Scrape aufrufen
export_initial_data(df_posts_clean, df_comments_clean, df_merged)

✅ Datei erfolgreich gespeichert: G:/Meine Ablage/reddit/reddit_posts.csv
✅ Datei erfolgreich gespeichert: G:/Meine Ablage/reddit/reddit_comments.csv
✅ Datei erfolgreich gespeichert: G:/Meine Ablage/reddit/reddit_merged.csv


In [16]:
df_comments_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 187511 entries, 0 to 240563
Data columns (total 9 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   post_id               187511 non-null  object 
 1   comment_id            187511 non-null  object 
 2   author                187511 non-null  object 
 3   score                 187511 non-null  int64  
 4   body                  187511 non-null  object 
 5   date                  187511 non-null  object 
 6   time                  187511 non-null  object 
 7   sentiment             187511 non-null  object 
 8   sentiment_confidence  187511 non-null  float64
dtypes: float64(1), int64(1), object(7)
memory usage: 14.3+ MB


In [17]:
df_posts_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3580 entries, 0 to 4896
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   crypto        3580 non-null   object
 1   search_term   3580 non-null   object
 2   subreddit     3580 non-null   object
 3   post_id       3580 non-null   object
 4   title         3580 non-null   object
 5   author        3580 non-null   object
 6   score         3580 non-null   int64 
 7   num_comments  3580 non-null   int64 
 8   selftext      3580 non-null   object
 9   date          3580 non-null   object
 10  time          3580 non-null   object
dtypes: int64(2), object(9)
memory usage: 335.6+ KB


In [18]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Index: 169708 entries, 0 to 187507
Data columns (total 19 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   post_id               169708 non-null  object 
 1   comment_id            169708 non-null  object 
 2   author_x              169708 non-null  object 
 3   score_x               169708 non-null  int64  
 4   body                  169708 non-null  object 
 5   date_x                169708 non-null  object 
 6   time_x                169708 non-null  object 
 7   sentiment             169708 non-null  object 
 8   sentiment_confidence  169708 non-null  float64
 9   crypto                169708 non-null  object 
 10  search_term           169708 non-null  object 
 11  subreddit             169708 non-null  object 
 12  title                 169708 non-null  object 
 13  author_y              169708 non-null  object 
 14  score_y               169708 non-null  float64
 15  num_c