# Reddit Scraper

## Import 

In [1]:
import praw
import pandas as pd
from datetime import datetime, timedelta, timezone
import os
import psaw as ps
from dotenv import load_dotenv
import time
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch.nn.functional as F
import torch
from tqdm import tqdm
from psaw import PushshiftAPI
from praw.exceptions import APIException


In [2]:
# Lade die .env-Datei
dotenv_loaded = load_dotenv("zugang_reddit.env")  # Falls die Datei anders heißt, anpassen
# Prüfe, ob die Datei geladen wurde
print(f".env geladen? {dotenv_loaded}")


.env geladen? True


In [3]:
reddit = praw.Reddit(
    client_id=os.getenv("CLIENT_ID"),
    client_secret=os.getenv("CLIENT_SECRET"),
    user_agent=os.getenv("USER_AGENT")
)

print("Reddit API erfolgreich verbunden!")


Reddit API erfolgreich verbunden!


In [4]:
for post in reddit.subreddit("CryptoCurrency").search("Bitcoin", sort="new", limit=5):
    print(post.title, post.created_utc)


This 12-year-old Kid “Erik Finman” in Idaho bought 83 BTC back in 2011, Instead of spending his $1,000 gift from his grandmother on video games or toys, he leaped into the unknown and bought Bitcoin. 1740064107.0
Google plans to integrate Bitcoin into its ecosystem via Bitcoin wallet 1740060914.0
Trump Ties Bitcoin’s Surge to His Policies, Promises U.S. Crypto Dominance 1740053378.0
Report shows uptick in startups using Bitcoin in their balance sheet 1740042298.0
Saylor’s Strategy Plans to Raise $2B to Buy More Bitcoin 1740038971.0


Cryptos und Subreddits 

In [5]:
crypto_terms = {
    # 🔹 Top Coins
    "Wrapped Ethereum": ["wrapped ethereum", "weth"],
    "Solana": ["solana", "sol", "sol coin"],
    "Avalanche": ["avalanche", "avax"],
    "Polkadot": ["polkadot", "dot"],
    "Near Protocol": ["near protocol", "near"],
    "Polygon": ["polygon", "matic"],
    "XRP": ["xrp", "ripple"],
    "Cardano": ["cardano", "ada"],
    "Cronos": ["cronos", "cro"],
    "Vulcan Forged PYR": ["vulcan forged", "pyr"],
    "Chiliz": ["chiliz", "chz"],
    "Illuvium": ["illuvium", "ilv"],
    "Ronin": ["ronin", "ron"],
}


In [6]:
subreddits = [
    "CryptoCurrency",  # Allgemeine Diskussionen über Kryptowährungen
    "CryptoMarkets",   # Diskussionen über den Kryptomarkt und Preisbewegungen
    "CryptoTrading",   # Fokus auf Trading-Strategien und Analysen 
]

## Scraping 

Scraping Funktionen

In [7]:
  # 🔹 Funktion zum Scrapen von Posts und Kommentaren mit Backoff
def scrape_reddit(start_date, end_date, mode="initial"):
    start_timestamp = int(start_date.timestamp())
    end_timestamp = int(end_date.timestamp())

    posts = []
    comments = []
    post_ids = set()
    request_count = 0  # Zählt die Anzahl der Requests

    for crypto_name, search_terms in crypto_terms.items():
        for subreddit_name in subreddits:
            subreddit = reddit.subreddit(subreddit_name)
            print(f"🔍 Suche nach {crypto_name} in r/{subreddit_name}...")

            # 🟢 Suche nach allen relevanten Begriffen mit `.search()`
            for search_term in search_terms:
                for post in subreddit.search(query=search_term, sort="new", limit=None):
                    if start_timestamp <= post.created_utc <= end_timestamp and post.id not in post_ids:
                        post_ids.add(post.id)
                        created_dt = datetime.utcfromtimestamp(post.created_utc)

                        posts.append({
                            'post_id': post.id,
                            'crypto': crypto_name,
                            'search_term': search_term,
                            'subreddit': subreddit_name,
                            'title': post.title,
                            'author': str(post.author),
                            'date': created_dt.date().isoformat(),
                            'time': created_dt.time().isoformat(),
                            'score': post.score,
                            'num_comments': post.num_comments,
                            'selftext': post.selftext
                        })
                        print(f"✅ Post gefunden: {post.title} (Suchbegriff: {search_term})")

                        # 🟢 Kommentare sammeln (mit Rate-Limit-Schutz)
                        try:
                            post.comments.replace_more(limit=0)
                            for comment in post.comments.list():
                                created_dt = datetime.utcfromtimestamp(comment.created_utc)
                                comments.append({
                                    'post_id': post.id,
                                    'comment_id': comment.id,
                                    'author': str(comment.author),
                                    'date': created_dt.date().isoformat(),
                                    'time': created_dt.time().isoformat(),
                                    'score': comment.score,
                                    'selftext': comment.body
                                })
                        except praw.exceptions.APIException as e:
                            if "RATELIMIT" in str(e):
                                print(f"⚠️ Reddit API-Limit erreicht. Warte 60 Sekunden...")
                                time.sleep(60)  # Wartezeit erhöhen
                            else:
                                print(f"⚠️ Fehler beim Abrufen der Kommentare: {e}")

                    # 🔄 Nach jeder `post.comments.list()` Anfrage prüfen, ob eine Pause nötig ist
                    request_count += 1
                    if request_count % 50 == 0:  # Nach 50 Requests eine kurze Pause
                        wait_time = 10  # Standard-Wartezeit
                        print(f"⏳ Warte {wait_time} Sekunden, um Rate-Limit zu vermeiden...")
                        time.sleep(wait_time)

    # In DataFrames umwandeln
    df_posts = pd.DataFrame(posts)
    df_comments = pd.DataFrame(comments)

    print(f"✅ Scrape abgeschlossen: {len(df_posts)} Posts & {len(df_comments)} Kommentare gefunden.")
    return df_posts, df_comments

# 🔹 Starte den Scraper für die letzten 3 Monate
start_of_period = datetime(2024, 11, 1)  # Startzeitpunkt
now = datetime.now()  # Aktueller Zeitpunkt
print("🚀 Starte den Scraper für die letzten 3 Monate...")
df_posts, df_comments = scrape_reddit(start_of_period, now)

print("✅ Daten erfolgreich gespeichert & bereit für weitere Analysen.")


🚀 Starte den Scraper für die letzten 3 Monate...
🔍 Suche nach Wrapped Ethereum in r/CryptoCurrency...
✅ Post gefunden: World Liberty Financial "Secretly" Accumulated $47M in Bitcoin (BTC) and $100M+ in Ethereum (ETH) (Suchbegriff: wrapped ethereum)
✅ Post gefunden: Ergo's 'Rosenbridge' to be connected to Nervos CKB (Suchbegriff: wrapped ethereum)
✅ Post gefunden: Question about old LUNA (Suchbegriff: wrapped ethereum)
✅ Post gefunden: Is it possible to recover unsupported crypto sent to an exchange? (Suchbegriff: wrapped ethereum)
⏳ Warte 10 Sekunden, um Rate-Limit zu vermeiden...
⏳ Warte 10 Sekunden, um Rate-Limit zu vermeiden...
⏳ Warte 10 Sekunden, um Rate-Limit zu vermeiden...
⏳ Warte 10 Sekunden, um Rate-Limit zu vermeiden...
✅ Post gefunden: Metamask compromised? (Suchbegriff: weth)
✅ Post gefunden: $3.317M in 3 hours on $VVV using Banana Gun (Suchbegriff: weth)
✅ Post gefunden: For the EIGHTH STRAIGHT YEAR: Announcing The Top Ten Crypto Index Fund Experiment for 2025! (Suchbegri

## Sentiment

In [8]:
# 🔹 GPU nutzen, falls verfügbar
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🚀 Verwende Gerät: {device}")

# 🔹 CryptoBERT-Modell laden
MODEL_NAME = "ElKulako/cryptobert"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).to(device)
model.eval()  # Setzt das Modell in den Evaluationsmodus

# 🔹 Funktion zur Sentiment-Analyse (Optimiert für Batch-Prozesse)
def analyze_sentiment_batch(texts, batch_size=32):
    """Effiziente GPU-gestützte Sentiment-Analyse mit CryptoBERT für eine Liste von Texten."""
    results = []

    # Ersetze leere Einträge durch "neutral"
    texts = [t if isinstance(t, str) and t.strip() != "" else "neutral" for t in texts]

    # Batchweise Verarbeitung
    for i in tqdm(range(0, len(texts), batch_size), desc="🔍 Analysiere Sentiments"):
        batch_texts = texts[i : i + batch_size]

        # Tokenisierung (mit Padding für Performance)
        inputs = tokenizer(batch_texts, return_tensors="pt", truncation=True, max_length=512, padding=True).to(device)

        # Vorhersage mit CryptoBERT
        with torch.no_grad():
            outputs = model(**inputs)

        scores = F.softmax(outputs.logits, dim=1)
        labels = ["bearish", "neutral", "bullish"] 

        # Ergebnisse speichern
        for i in range(len(batch_texts)):
            sentiment = labels[torch.argmax(scores[i]).item()]
            confidence = scores[i].max().item()
            results.append((sentiment, confidence))

    return results

# 🔹 Sentiment für **Posts** berechnen
tqdm.pandas()  # Fortschrittsanzeige aktivieren
df_posts["full_text"] = df_posts["title"] + " " + df_posts["selftext"].fillna("")
df_posts[["sentiment", "sentiment_confidence"]] = pd.DataFrame(
    analyze_sentiment_batch(df_posts["full_text"].tolist()), index=df_posts.index
)

# 🔹 Sentiment für **Kommentare** berechnen
df_comments["full_text"] = df_comments["selftext"].fillna("")
df_comments[["sentiment", "sentiment_confidence"]] = pd.DataFrame(
    analyze_sentiment_batch(df_comments["full_text"].tolist()), index=df_comments.index
)

# 🔹 Ergebnisse anzeigen
print(f"✅ Sentiment-Analyse abgeschlossen: {len(df_posts)} Posts & {len(df_comments)} Kommentare bewertet.")


🚀 Verwende Gerät: cuda


🔍 Analysiere Sentiments: 100%|██████████| 49/49 [00:24<00:00,  1.99it/s]
🔍 Analysiere Sentiments: 100%|██████████| 2436/2436 [09:55<00:00,  4.09it/s]

✅ Sentiment-Analyse abgeschlossen: 1544 Posts & 77929 Kommentare bewertet.





## Clean

In [18]:
def clean_data(df_posts, df_comments, comment_threshold=500):
    # 1. Duplikate entfernen
    df_posts = df_posts.drop_duplicates(subset=["post_id"])
    df_comments = df_comments.drop_duplicates(subset=["comment_id"])

    # 2. Fehlende Werte behandeln
    df_posts = df_posts.fillna('')
    df_comments = df_comments.fillna('')

    # 3. Entferne unnötige Spalten (title & selftext)
    df_posts = df_posts.drop(columns=["title", "selftext"], errors="ignore")
    df_comments = df_comments.drop(columns=["selftext"], errors="ignore")

    # 4. Entferne Nutzer mit übermäßigen Kommentaren
    comment_counts = df_comments["author"].value_counts()
    frequent_users = comment_counts[comment_counts > comment_threshold].index
    df_comments = df_comments[~df_comments["author"].isin(frequent_users)]

    print(f"✅ Daten bereinigt: {df_comments.shape[0]} Kommentare übrig (nach Spam-Filter).")

    return df_posts, df_comments


In [19]:
df_posts_clean, df_comments_clean = clean_data(df_posts , df_comments, comment_threshold=500)

✅ Daten bereinigt: 72736 Kommentare übrig (nach Spam-Filter).


## Merge

In [20]:
posts = df_posts_clean.copy()
comments = df_comments_clean.copy()

In [32]:
# 🔹 **Relevante Spalten für den Merge**
posts = posts[["post_id", "crypto", "search_term", "subreddit", "author", "date", "time", "score", "full_text", "sentiment", "sentiment_confidence"]]
comments = comments[["post_id", "comment_id", "author", "date", "time", "score", "full_text", "sentiment", "sentiment_confidence"]]

# 🔹 **Kommentare erben `crypto`, `search_term` und `subreddit` vom Post**
comments = comments.merge(df_posts[["post_id", "crypto", "search_term", "subreddit"]], on="post_id", how="left")

# 🔹 `type`-Spalte für Unterscheidung hinzufügen
posts["comment_id"] = None  # Posts haben keine comment_id
posts["type"] = "post"
comments["type"] = "comment"

# 🔹 **Gemeinsame Spalten für den Merge**
common_columns = [
    "post_id", "comment_id", "type", "date", "time", "crypto",
    "search_term", "subreddit", "author","full_text","score", "sentiment", "sentiment_confidence",
]

# 🔹 **Merging der Daten (Posts + Kommentare)**
df_merged = pd.concat([posts[common_columns], comments[common_columns]], ignore_index=True)

# 🔍 Debugging: Überprüfung der Größe
print(f"📌 Merged Dataset: {df_merged.shape[0]} Einträge (Posts + Kommentare)")

# 🔹 Überprüfen, ob alles korrekt normalisiert wurde
print(df_merged.head())

📌 Merged Dataset: 74280 Einträge (Posts + Kommentare)
   post_id comment_id  type        date      time            crypto  \
0  1i6072q       None  post  2025-01-20  20:30:24  Wrapped Ethereum   
1  1hgrkfc       None  post  2024-12-18  02:28:16  Wrapped Ethereum   
2  1hdkcxf       None  post  2024-12-13  19:37:53  Wrapped Ethereum   
3  1gvrhee       None  post  2024-11-20  15:03:02  Wrapped Ethereum   
4  1il9uwg       None  post  2025-02-09  07:58:21  Wrapped Ethereum   

        search_term       subreddit         author  \
0  wrapped ethereum  CryptoCurrency      kirtash93   
1  wrapped ethereum  CryptoCurrency   Archipelag0h   
2  wrapped ethereum  CryptoCurrency        tawhuac   
3  wrapped ethereum  CryptoCurrency  keepitahunned   
4              weth  CryptoCurrency    jcoffin1981   

                                           full_text  score sentiment  \
0  World Liberty Financial "Secretly" Accumulated...    526   bullish   
1  Ergo's 'Rosenbridge' to be connected to Nervo

In [26]:
df_posts_clean.head()

Unnamed: 0,post_id,crypto,search_term,subreddit,author,date,time,score,num_comments,full_text,sentiment,sentiment_confidence
0,1i6072q,Wrapped Ethereum,wrapped ethereum,CryptoCurrency,kirtash93,2025-01-20,20:30:24,526,142,"World Liberty Financial ""Secretly"" Accumulated...",bullish,0.502826
1,1hgrkfc,Wrapped Ethereum,wrapped ethereum,CryptoCurrency,Archipelag0h,2024-12-18,02:28:16,32,13,Ergo's 'Rosenbridge' to be connected to Nervos...,bullish,0.747164
2,1hdkcxf,Wrapped Ethereum,wrapped ethereum,CryptoCurrency,tawhuac,2024-12-13,19:37:53,0,1,Question about old LUNA In any case - I don't ...,bearish,0.651479
3,1gvrhee,Wrapped Ethereum,wrapped ethereum,CryptoCurrency,keepitahunned,2024-11-20,15:03:02,5,19,Is it possible to recover unsupported crypto s...,bullish,0.567725
4,1il9uwg,Wrapped Ethereum,weth,CryptoCurrency,jcoffin1981,2025-02-09,07:58:21,0,16,Metamask compromised? I staked Sushi/Weth abou...,bullish,0.558642


In [23]:
df_comments_clean.head()

Unnamed: 0,post_id,comment_id,author,date,time,score,full_text,sentiment,sentiment_confidence
0,1i6072q,m88752a,sadiq_238,2025-01-20,20:50:42,85,He also has the ETH he got from the ridiculous...,bullish,0.544401
2,1i6072q,m88iwiy,Klugenshmirtz,2025-01-20,21:42:29,47,I don't get how you can be the most powerful m...,neutral,0.544069
3,1i6072q,m883inr,Main_Law361,2025-01-20,20:34:35,35,Excuse me for my ignorance - why doesn’t the p...,neutral,0.805564
4,1i6072q,m88900e,SuccotashComplete,2025-01-20,20:58:45,39,$47 mil to represent the 47th president. Wasn’...,bullish,0.619801
5,1i6072q,m89ad23,GreedVault,2025-01-20,23:56:09,8,They sell the shitcoin they created and buy a ...,neutral,0.606686


In [33]:
df_merged.head()

Unnamed: 0,post_id,comment_id,type,date,time,crypto,search_term,subreddit,author,full_text,score,sentiment,sentiment_confidence
0,1i6072q,,post,2025-01-20,20:30:24,Wrapped Ethereum,wrapped ethereum,CryptoCurrency,kirtash93,"World Liberty Financial ""Secretly"" Accumulated...",526,bullish,0.502826
1,1hgrkfc,,post,2024-12-18,02:28:16,Wrapped Ethereum,wrapped ethereum,CryptoCurrency,Archipelag0h,Ergo's 'Rosenbridge' to be connected to Nervos...,32,bullish,0.747164
2,1hdkcxf,,post,2024-12-13,19:37:53,Wrapped Ethereum,wrapped ethereum,CryptoCurrency,tawhuac,Question about old LUNA In any case - I don't ...,0,bearish,0.651479
3,1gvrhee,,post,2024-11-20,15:03:02,Wrapped Ethereum,wrapped ethereum,CryptoCurrency,keepitahunned,Is it possible to recover unsupported crypto s...,5,bullish,0.567725
4,1il9uwg,,post,2025-02-09,07:58:21,Wrapped Ethereum,weth,CryptoCurrency,jcoffin1981,Metamask compromised? I staked Sushi/Weth abou...,0,bullish,0.558642


## Export 

In [13]:
# # Setze den Pfad zu deinem Google Drive Ordner
# DRIVE_PATH = "G:/Meine Ablage/reddit/"
# POSTS_CSV = os.path.join(DRIVE_PATH, "reddit_posts.csv")
# COMMENTS_CSV = os.path.join(DRIVE_PATH, "reddit_comments.csv")
# MERGED_CSV = os.path.join(DRIVE_PATH, "reddit_merged.csv")
# ORIGINAL_POSTS_CSV = os.path.join(DRIVE_PATH, "reddit_posts_original.csv")
# ORIGINAL_COMMENTS_CSV = os.path.join(DRIVE_PATH, "reddit_comments_original.csv")

Funktion zum Export 

In [14]:
def append_to_csv(df_new, filename, key_column):
    """Hängt neue Daten an eine bestehende CSV an & entfernt Duplikate."""
    file_path = os.path.join(DRIVE_PATH, filename)

    try:
        # Falls Datei existiert, alte Daten einlesen
        if os.path.exists(file_path):
            df_existing = pd.read_csv(file_path, sep="|", encoding="utf-8-sig", on_bad_lines="skip")
            
            # 🔹 Daten zusammenführen & Duplikate nach `key_column` entfernen (neuere Werte behalten)
            df_combined = pd.concat([df_existing, df_new], ignore_index=True).drop_duplicates(subset=[key_column], keep="last")
        else:
            df_combined = df_new  # Falls keine Datei existiert, neue Daten direkt nutzen

        # 🔹 CSV speichern
        df_combined.to_csv(
            file_path,
            index=False,
            sep="|",
            encoding="utf-8-sig",
            lineterminator="\n"
        )
        print(f"✅ Datei erfolgreich aktualisiert: {file_path}")

    except Exception as e:
        print(f"Fehler beim Speichern der Datei {filename}: {e}")

def export_to_drive(df_posts_clean, df_comments_clean, df_merged,df_posts, df_comments):
    """Speichert Posts, Kommentare & die gemergte Datei mit Duplikat-Prüfung."""
    try:
        append_to_csv(df_posts_clean, "reddit_posts.csv", key_column="post_id")
        append_to_csv(df_comments_clean, "reddit_comments.csv", key_column="comment_id")
        append_to_csv(df_merged, "reddit_merged.csv", key_column="comment_id")  # Falls Kommentare entscheidend sind
        append_to_csv(df_posts, "reddit_posts_original.csv", key_column="post_id")
        append_to_csv(df_comments, "reddit_comments_original.csv", key_column="comment_id")
    except Exception as e:
        print(f"Fehler beim Export: {e}")

In [15]:
# # 🔹 Export-Funktion aufrufen
# export_to_drive(df_posts_clean, df_comments_clean, df_merged,df_posts, df_comments)