# Reddit Scraper für vergangene Daten

## Import 

In [2]:
import os
import time
from datetime import datetime, timedelta, timezone

import pandas as pd
import praw
import psaw as ps
import torch
import torch.nn.functional as F
from dotenv import load_dotenv
from transformers import AutoModelForSequenceClassification, AutoTokenizer


In [3]:
# Read the local env file (adjust the path if the file is named differently)
# and print the value load_dotenv returned for it.
# presumably this file supplies CLIENT_ID / CLIENT_SECRET / USER_AGENT, which
# the next cell reads via os.getenv — verify against the file itself.
dotenv_loaded = load_dotenv(dotenv_path="zugang_reddit.env")
print(f".env geladen? {dotenv_loaded}")


.env geladen? True


In [4]:
# Build the PRAW client from environment variables; every praw.Reddit keyword
# is the lower-cased name of the variable its value is read from.
reddit = praw.Reddit(
    **{
        env_name.lower(): os.getenv(env_name)
        for env_name in ("CLIENT_ID", "CLIENT_SECRET", "USER_AGENT")
    }
)

# NOTE(review): this cell only constructs the client object, so the message
# below does not by itself prove that the credentials were accepted.
print("Reddit API erfolgreich verbunden!")


Reddit API erfolgreich verbunden!


In [5]:
# Smoke test: print title and score of five "hot" submissions of one subreddit.
# Any failure is reported as a one-line message instead of a traceback.
try:
    subreddit = reddit.subreddit("CryptoCurrency")
    for post in subreddit.hot(limit=5):
        print(f"Title: {post.title}, Score: {post.score}")
except Exception as exc:
    print(f"Fehler beim Abrufen der Subreddit-Daten: {exc}")


Title: Daily Crypto Discussion - February 7, 2025 (GMT+0), Score: 26
Title: Class of 2025: My Portfolio is Down 50% | Class of 2017: Your Portfolio is Down 50% so Far, Score: 632
Title: This 20-year-old scammed someone of 4,100 BTC ($402M) and then bought 31 supercars, $2M watch, spent $569k in one night at a club, also gave away 5 Hermes Birkin bags to random ladies at the club., Score: 6359
Title: Trader's life..., Score: 58
Title: Finding Out About Crypto Ruined My Life, Score: 864


Cryptos und Subreddits 

In [6]:
# (name, ticker symbol) pairs. scrape_reddit searches every subreddit for both
# the lower-cased name and the lower-cased symbol of each pair.
# NOTE(review): Polygon's token was reportedly rebranded from MATIC to POL in
# 2024 — confirm whether "POL" should be searched as well.
cryptos = [
    ("Ethereum", "ETH"),
    ("Solana", "SOL"),
    ("Avalanche", "AVAX"),
    ("Polkadot", "DOT"),
    ("Near Protocol", "NEAR"),
    ("Polygon", "MATIC"),
    ("XRP", "XRP"),
    ("Cardano", "ADA"),
    ("Chainlink", "LINK")
]


In [7]:
# Subreddit names that scrape_reddit iterates over for every cryptocurrency.
subreddits = [
    "CryptoCurrency",  # General discussion about cryptocurrencies
    "CryptoMarkets",   # Discussion of the crypto market and price movements
    "CryptoTrading",   # Focus on trading strategies and analysis
    "Altcoin",         # Discussion of altcoins (all cryptocurrencies except Bitcoin)
    "DeFi",            # Decentralized finance (DeFi) and related projects
    "BitcoinBeginners",# For beginners in the crypto world
    "cryptotechnology", # Focus on the underlying blockchain technology
    "cryptocurrencies", # General discussion about cryptocurrencies
    "Satoshistreetsbets", # Crypto bets and speculation
    "Binance"        # Discussion of the Binance platform
]

## Scraping 

Scraping Funktionen

In [None]:
# Scrape posts and their comments that mention the configured cryptocurrencies
def scrape_reddit(start_date, end_date, mode="initial"):
    """Collect posts and comments mentioning the configured cryptocurrencies.

    For every ``(name, symbol)`` pair in the module-level ``cryptos`` list and
    every entry of ``subreddits``, the subreddit is searched (``sort="new"``)
    for the lower-cased name and the lower-cased symbol. A post is kept when
    its ``created_utc`` lies inside ``[start_date, end_date]`` and its id has
    not been kept before; it is therefore attributed to the first
    crypto / search term that finds it. Title, selftext and comment bodies are
    stored lower-cased.

    A failure during one search (or while loading one post's comments) is
    printed and the scrape continues with the next search term, so data that
    was already collected is not lost. A post is only recorded after its
    comments were loaded, so a failed post can still be picked up by a later
    search.

    NOTE(review): the time window is applied client-side to whatever the
    search returns; Reddit listings are reportedly capped at roughly 1000
    items, so older posts may be unreachable — confirm.

    Args:
        start_date: ``datetime`` lower bound (inclusive), converted with
            ``.timestamp()``.
        end_date: ``datetime`` upper bound (inclusive), converted with
            ``.timestamp()``.
        mode: Accepted for compatibility with existing callers; the function
            does not use it.

    Returns:
        ``(df_posts, df_comments)`` as pandas DataFrames. Both always carry
        their full set of columns, even when nothing was found.
    """
    post_columns = [
        'crypto', 'search_term', 'subreddit', 'post_id', 'title', 'author',
        'created_utc', 'score', 'num_comments', 'selftext',
    ]
    comment_columns = ['post_id', 'comment_id', 'author', 'created_utc', 'score', 'body']

    def to_utc_string(epoch_seconds):
        # Timezone-aware replacement for the deprecated datetime.utcfromtimestamp();
        # the formatted text is the same UTC wall-clock time.
        moment = datetime.fromtimestamp(epoch_seconds, tz=timezone.utc)
        return moment.strftime('%Y-%m-%d %H:%M:%S')

    start_timestamp = int(start_date.timestamp())  # Unix time
    end_timestamp = int(end_date.timestamp())  # Unix time

    posts = []
    comments = []
    post_ids = set()  # ids of posts already recorded, to avoid duplicates

    for crypto_name, crypto_symbol in cryptos:
        for subreddit_name in subreddits:
            subreddit = reddit.subreddit(subreddit_name)
            print(f"Suche nach {crypto_name} ({crypto_symbol}) in r/{subreddit_name}...")

            # Lower-cased search terms; dict.fromkeys drops the repeat when name
            # and symbol are identical (e.g. XRP) while keeping the order.
            search_terms = list(dict.fromkeys([crypto_name.lower(), crypto_symbol.lower()]))

            for search_term in search_terms:
                try:
                    for post in subreddit.search(query=search_term, sort="new", limit=None):
                        in_window = start_timestamp <= post.created_utc <= end_timestamp
                        if not in_window or post.id in post_ids:
                            continue

                        post_title = post.title.lower()
                        post_selftext = (post.selftext or "").lower()

                        # Load the comments before recording the post so that a
                        # failure here does not leave a post without comments.
                        # replace_more(limit=0) is called so that the list below
                        # holds only comment objects.
                        post.comments.replace_more(limit=0)
                        post_comments = [
                            {
                                'post_id': post.id,
                                'comment_id': comment.id,
                                'author': str(comment.author),
                                'created_utc': to_utc_string(comment.created_utc),
                                'score': comment.score,
                                'body': (comment.body or "").lower(),
                            }
                            for comment in post.comments.list()
                        ]

                        post_ids.add(post.id)
                        posts.append({
                            'crypto': crypto_name,
                            'search_term': search_term,
                            'subreddit': subreddit_name,
                            'post_id': post.id,
                            'title': post_title,
                            'author': str(post.author),
                            'created_utc': to_utc_string(post.created_utc),
                            'score': post.score,
                            'num_comments': post.num_comments,
                            'selftext': post_selftext,
                        })
                        comments.extend(post_comments)

                        print(f"Post gefunden: {post_title} (Suchbegriff: {search_term})")
                except Exception as exc:
                    # Keep going: one unreachable subreddit or failed request must
                    # not throw away everything gathered so far.
                    print(f"Fehler bei der Suche nach '{search_term}' in r/{subreddit_name}: {exc}")

    # Explicit columns keep the schema stable when no post/comment was found,
    # so downstream column lookups do not fail on an empty scrape.
    df_posts = pd.DataFrame(posts, columns=post_columns)
    df_comments = pd.DataFrame(comments, columns=comment_columns)

    print(f"Scrape abgeschlossen: {len(df_posts)} Posts und {len(df_comments)} Kommentare gefunden.")
    return df_posts, df_comments

# One-off backfill: scrape everything from 1 November 2024 (naive datetime)
# up to the moment this cell is executed.
start_of_period = datetime(year=2024, month=11, day=1)
now = datetime.now()
print("Starte den einmaligen Scrape f√ºr die letzten 3 Monate...")
df_posts_initial, df_comments_initial = scrape_reddit(
    start_date=start_of_period,
    end_date=now,
    mode="initial",
)

print("Daten k√∂nnen jetzt in der Pipeline bereinigt werden...")


Starte den einmaligen Scrape f√ºr die letzten 3 Monate...
Suche nach Ethereum (ETH) in r/CryptoCurrency...
Post gefunden: Ethereum‚Äôs Pectra Upgrade Testing Begins in February (Suchbegriff: Ethereum)
Post gefunden: Ethereum Faces 'Intense' Competition From Other Networks: JPMorgan (Suchbegriff: Ethereum)
Post gefunden: Best Crypto Based on Priority (Suchbegriff: Ethereum)
Post gefunden: best crypto coins to hold for 10 years? (Suchbegriff: Ethereum)
Post gefunden: Galaxy‚Äôs Novogratz calls Ethereum sentiment ‚Äòunbelievably bearish‚Äô as ETH price struggles to keep pace (Suchbegriff: Ethereum)
Post gefunden: I sent ETH to wrong network, am I screwed? (Suchbegriff: Ethereum)
Post gefunden: JPMorgan says Ethereum likely to keep facing 'intense competition' from other networks (Suchbegriff: Ethereum)
Post gefunden: Ethereum lied on the resume but still got the job (Suchbegriff: Ethereum)
Post gefunden: Grayscale Ethereum trust sees US$7.2 million outflow, impacting Ethereum prices (Such

## Clean

In [9]:
def clean_data(df_posts, df_comments, comment_threshold=500):
    """Clean the scraped posts and comments and return new DataFrames.

    Steps applied to both tables:
      1. drop duplicate rows by id (``post_id`` / ``comment_id``),
      2. replace missing text (``selftext`` / ``body``) with ``''``,
      3. parse ``created_utc`` and replace it with separate ``date`` and
         ``time`` columns,
      4. keep only rows with ``score > 0`` (zero and negative scores are
         removed).
    Comments are additionally filtered by author: a fixed list of bot accounts
    is removed, and so is every author with more than ``comment_threshold``
    remaining comments.

    The inputs are not modified. Each result is an independent copy, so
    callers can add columns to it without pandas chained-assignment warnings.

    Args:
        df_posts: DataFrame with at least ``post_id``, ``selftext``,
            ``created_utc`` and ``score``.
        df_comments: DataFrame with at least ``comment_id``, ``body``,
            ``author``, ``created_utc`` and ``score``.
        comment_threshold: Maximum number of comments one author may have
            before all of that author's comments are dropped.

    Returns:
        ``(df_posts, df_comments)`` — the cleaned tables.
    """
    bot_accounts = ["AutoModerator", "coinfeeds-bot", "devCheckingIn"]

    def normalise(frame, id_column, text_column):
        # .copy() detaches the de-duplicated rows from the caller's frame before
        # any column is written, which is what avoids SettingWithCopyWarning.
        frame = frame.drop_duplicates(subset=[id_column]).copy()
        frame[text_column] = frame[text_column].fillna('')

        created = pd.to_datetime(frame['created_utc'])
        frame['date'] = created.dt.date  # YYYY-MM-DD
        frame['time'] = created.dt.time  # HH:MM:SS
        frame = frame.drop(columns=['created_utc'])

        # Quality filter: only strictly positive scores survive.
        return frame[frame['score'] > 0].copy()

    df_posts = normalise(df_posts, 'post_id', 'selftext')
    df_comments = normalise(df_comments, 'comment_id', 'body')

    # Remove known bot accounts.
    df_comments = df_comments[~df_comments["author"].isin(bot_accounts)]

    # Remove authors whose comment count exceeds the threshold; counted after
    # the score and bot filters, as before.
    comment_counts = df_comments["author"].value_counts()
    frequent_users = comment_counts[comment_counts > comment_threshold].index
    df_comments = df_comments[~df_comments["author"].isin(frequent_users)].copy()

    print(f"Daten bereinigt: {df_comments.shape[0]} Kommentare √ºbrig (nach Spam-Filter).")

    return df_posts, df_comments


In [12]:
# Clean the raw scrape, using a lower per-author comment cap (300) than the
# function's default.
df_posts_clean, df_comments_clean = clean_data(
    df_posts=df_posts_initial,
    df_comments=df_comments_initial,
    comment_threshold=300,
)


# Report how many rows are left after cleaning
print(f"Bereinigte Posts: {len(df_posts_clean)}")
print(f"Bereinigte Kommentare: {len(df_comments_clean)}")

Daten bereinigt: 105409 Kommentare √ºbrig (nach Spam-Filter).
Bereinigte Posts: 1673
Bereinigte Kommentare: 105409


## Model fuer das Sentiment 

In [13]:
# Load the pretrained sentiment model and its tokenizer identified by MODEL_NAME.
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment"
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Fail fast when the cleaned comments lack the text column that gets classified.
if not ("body" in df_comments_clean.columns):
    raise ValueError("Fehler: Die CSV-Datei enth√§lt keine 'body'-Spalte mit Kommentaren!")

# Sentiment classification for a single text
def analyze_sentiment(text):
    """Classify one text as ``"negative"``, ``"neutral"`` or ``"positive"``.

    The text is tokenized (truncated to at most 512 tokens) and passed through
    the module-level ``model`` without gradient tracking; the logits are
    turned into probabilities with a softmax.

    Args:
        text: The text to classify.

    Returns:
        ``(label, confidence)`` where ``confidence`` is the probability of the
        winning class. Anything that is not a string, or a string that is
        empty after stripping whitespace, yields ``("neutral", 0.0)`` without
        running the model.
    """
    is_blank = not isinstance(text, str) or not text.strip()
    if is_blank:
        return "neutral", 0.0

    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        logits = model(**encoded).logits

    # Probabilities of the single input row; index order matches label_names.
    probabilities = F.softmax(logits, dim=1)[0]
    label_names = ("negative", "neutral", "positive")
    best_index = torch.argmax(probabilities).item()

    return label_names[best_index], probabilities[best_index].item()

# Compute the sentiment of every cleaned comment. The (label, confidence) pairs
# are collected in a list first, so an empty comment table produces two empty
# columns instead of failing while unpacking zip(*...). assign() builds a new
# frame rather than writing columns into the frame returned by clean_data.
_sentiment_results = [analyze_sentiment(body) for body in df_comments_clean["body"]]
df_comments_clean = df_comments_clean.assign(
    sentiment=[result[0] for result in _sentiment_results],
    sentiment_confidence=[result[1] for result in _sentiment_results],
)

## Merge 

In [14]:
# Attach the parent post's columns to every comment (left join on post_id),
# then discard each row that has a missing value in any column — this includes
# comments whose post has no match in df_posts_clean.
df_merged = (
    df_comments_clean
    .merge(df_posts_clean, how="left", on="post_id")
    .dropna()
)

In [15]:
# Preview the first rows of the merged comment/post table.
df_merged.head()

Unnamed: 0,post_id,comment_id,author_x,score_x,body,date_x,time_x,sentiment,sentiment_confidence,crypto,search_term,subreddit,title,author_y,score_y,num_comments,selftext,date_y,time_y
0,1ijtwva,mbh1jiy,Clear_Item_922,1,Will it actually make Etherium go up?,2025-02-07,13:17:55,neutral,0.865525,Ethereum,Ethereum,CryptoCurrency,Ethereum‚Äôs Pectra Upgrade Testing Begins in Fe...,Afonsoo99,7.0,4.0,,2025-02-07,12:31:01
1,1ijtwva,mbh1ln2,galacticwyandotte,1,Thank god!,2025-02-07,13:18:17,positive,0.928441,Ethereum,Ethereum,CryptoCurrency,Ethereum‚Äôs Pectra Upgrade Testing Begins in Fe...,Afonsoo99,7.0,4.0,,2025-02-07,12:31:01
2,1ijsxbf,mbgscjz,apairofjacks,5,No shit,2025-02-07,12:14:30,negative,0.712431,Ethereum,Ethereum,CryptoCurrency,Ethereum Faces 'Intense' Competition From Othe...,Every_Hunt_160,19.0,22.0,,2025-02-07,11:30:14
3,1ijsxbf,mbgwxg1,Puddingbuks26,2,no shit!,2025-02-07,12:47:36,negative,0.749225,Ethereum,Ethereum,CryptoCurrency,Ethereum Faces 'Intense' Competition From Othe...,Every_Hunt_160,19.0,22.0,,2025-02-07,11:30:14
4,1ijsxbf,mbhdnth,Brodie266,1,They have to find a way to lower the gas fee,2025-02-07,14:28:39,neutral,0.7923,Ethereum,Ethereum,CryptoCurrency,Ethereum Faces 'Intense' Competition From Othe...,Every_Hunt_160,19.0,22.0,,2025-02-07,11:30:14


## Export 

In [16]:
# Export directory (set this to your Google Drive folder) and the full paths of
# the three CSV files inside it.
DRIVE_PATH = "G:/Meine Ablage/reddit/"
POSTS_CSV, COMMENTS_CSV, MERGED_CSV = (
    os.path.join(DRIVE_PATH, csv_name)
    for csv_name in ("reddit_posts.csv", "reddit_comments.csv", "reddit_merged.csv")
)

Funktion zum Export 

In [17]:
def save_initial_csv(df_new, filename):
    """Write ``df_new`` as a CSV file named ``filename`` inside ``DRIVE_PATH``.

    The file is written from scratch (no appending, no duplicate check), so an
    existing file of the same name is overwritten. Format: no index column,
    ``|`` as separator, ``utf-8-sig`` encoding, LF line endings.

    Any exception is caught and reported with ``print``; the function neither
    raises nor returns a value.
    """
    file_path = os.path.join(DRIVE_PATH, filename)
    csv_options = {
        "index": False,
        "sep": "|",
        "encoding": "utf-8-sig",
        "lineterminator": "\n",
    }

    try:
        df_new.to_csv(file_path, **csv_options)
        print(f"‚úÖ Datei erfolgreich gespeichert: {file_path}")
    except Exception as exc:
        print(f"Fehler beim Speichern der Datei {filename}: {exc}")

def export_initial_data(df_posts, df_comments, df_merged):
    """Save the initial posts, comments and merged table as three CSV files.

    Each frame is handed to ``save_initial_csv`` together with its fixed file
    name, in the order posts, comments, merged. An exception escaping one of
    those calls is printed and stops the remaining exports.
    """
    exports = (
        (df_posts, "reddit_posts.csv"),
        (df_comments, "reddit_comments.csv"),
        (df_merged, "reddit_merged.csv"),
    )

    try:
        for frame, csv_name in exports:
            save_initial_csv(frame, csv_name)
    except Exception as exc:
        print(f"Fehler beim Export: {exc}")

In [18]:
# Write the cleaned posts, the cleaned comments and the merged table of the
# first scrape to disk.
export_initial_data(
    df_posts=df_posts_clean,
    df_comments=df_comments_clean,
    df_merged=df_merged,
)

‚úÖ Datei erfolgreich gespeichert: G:/Meine Ablage/reddit/reddit_posts.csv
‚úÖ Datei erfolgreich gespeichert: G:/Meine Ablage/reddit/reddit_comments.csv
‚úÖ Datei erfolgreich gespeichert: G:/Meine Ablage/reddit/reddit_merged.csv
