In [19]:
import os
import pandas as pd
import praw
import pandas as pd
from datetime import datetime, timedelta, timezone
import os
import psaw as ps
from dotenv import load_dotenv
import time
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch.nn.functional as F
import torch
from tqdm import tqdm
from psaw import PushshiftAPI
from praw.exceptions import APIException

# Paths to the CSV files.
# The data directory can be overridden with the REDDIT_DATA_DIR environment
# variable so the notebook is not tied to one machine's absolute path; the
# original location remains the default when the variable is unset.
DRIVE_PATH = os.environ.get("REDDIT_DATA_DIR", "G:/Meine Ablage/reddit/")
POSTS_CSV = os.path.join(DRIVE_PATH, "reddit_posts_original.csv")
COMMENTS_CSV = os.path.join(DRIVE_PATH, "reddit_comments_original.csv")

# Helper for reading the exported CSV files
def load_csv(filepath):
    """Read a `|`-separated CSV file and print a short debugging summary.

    When `filepath` does not exist, a notice is printed and an empty
    DataFrame is returned. Lines the parser cannot handle are skipped
    (`on_bad_lines="skip"`).
    """
    if os.path.exists(filepath):
        frame = pd.read_csv(
            filepath,
            sep="|",
            encoding="utf-8-sig",
            on_bad_lines="skip",
        )

        # Debug summary: source path, column names, dtypes and the first rows.
        summary = (
            f"\n📌 Datei geladen: {filepath}",
            f"🔹 Spalten: {frame.columns.tolist()}",
            frame.dtypes,
            frame.head(),
        )
        for item in summary:
            print(item)

        return frame

    print(f"❌ Datei nicht gefunden: {filepath}")
    return pd.DataFrame()

# Load both exports (posts first, then comments)
df_posts, df_comments = (load_csv(path) for path in (POSTS_CSV, COMMENTS_CSV))



📌 Datei geladen: G:/Meine Ablage/reddit/reddit_posts_original.csv
🔹 Spalten: ['post_id', 'crypto', 'search_term', 'subreddit', 'title', 'author', 'date', 'time', 'score', 'num_comments', 'selftext']
post_id         object
crypto          object
search_term     object
subreddit       object
title           object
author          object
date            object
time            object
score            int64
num_comments     int64
selftext        object
dtype: object
   post_id    crypto search_term       subreddit  \
0  1irsm07  Ethereum    ethereum  CryptoCurrency   
1  1irr9h8  Ethereum    ethereum  CryptoCurrency   
2  1ir3avz  Ethereum    ethereum  CryptoCurrency   
3  1ir2i1r  Ethereum    ethereum  CryptoCurrency   
4  1ir0uz0  Ethereum    ethereum  CryptoCurrency   

                                               title                author  \
0  Urgent Ethereum Geth patch addresses Merge ove...             Afonsoo99   
1  Banking Giant JPMorgan Chase Holds $1,016,728 ...            

In [20]:

# Build the "full_text" column for posts & comments.
# Both parts are NaN-filled before concatenation: string + NaN yields NaN, so a
# post with a missing title would otherwise get a NaN full_text and be silently
# removed by the length filter below, even when its selftext is long enough.
df_posts["full_text"] = df_posts["title"].fillna("") + " " + df_posts["selftext"].fillna("")
df_comments["full_text"] = df_comments["selftext"].fillna("")

# Keep only texts longer than 10 characters and drop rows with duplicate text
df_posts = df_posts[df_posts["full_text"].str.len() > 10].drop_duplicates(subset=["full_text"])
df_comments = df_comments[df_comments["full_text"].str.len() > 10].drop_duplicates(subset=["full_text"])

print(f"✅ Daten aufbereitet: {len(df_posts)} Posts & {len(df_comments)} Kommentare übrig.")


✅ Daten aufbereitet: 6754 Posts & 264029 Kommentare übrig.


In [21]:

# Use the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"🚀 Verwende Gerät: {device}")

# Load the CryptoBERT tokenizer and sequence-classification model
MODEL_NAME = "ElKulako/cryptobert"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
model = model.to(device)
model.eval()  # switch the model to evaluation mode

# Sentiment analysis helper (processes the texts in batches)
def analyze_sentiment_batch(texts, batch_size=32):
    """Classify a list of texts with the module-level CryptoBERT model.

    Args:
        texts: Iterable of texts. Entries that are not strings, or that are
            empty / whitespace-only, are replaced by the placeholder text
            "neutral" before tokenization.
        batch_size: Number of texts sent through the model per forward pass.

    Returns:
        A list with one ``(sentiment, confidence)`` tuple per input text, in
        input order. ``sentiment`` is "bearish", "neutral" or "bullish";
        ``confidence`` is the softmax probability of the predicted class as a
        Python float.

    Relies on the globals ``tokenizer``, ``model`` and ``device``.
    """
    # Class index -> label name; built once instead of once per batch.
    # NOTE(review): assumed to match the class order of the loaded model —
    # confirm against model.config.id2label.
    labels = ("bearish", "neutral", "bullish")
    results = []

    # Replace unusable entries with the placeholder text "neutral"
    texts = [t if isinstance(t, str) and t.strip() != "" else "neutral" for t in texts]

    # `start` is the offset of the current batch; it is deliberately not reused
    # inside the loop body (the previous version shadowed it with a row index).
    for start in tqdm(range(0, len(texts), batch_size), desc="🔍 Analysiere Sentiments"):
        batch_texts = texts[start : start + batch_size]

        # Tokenize with padding so the batch forms one rectangular tensor;
        # inputs longer than 512 tokens are truncated.
        inputs = tokenizer(batch_texts, return_tensors="pt", truncation=True, max_length=512, padding=True).to(device)

        # Inference only, no gradients needed
        with torch.no_grad():
            outputs = model(**inputs)

        scores = F.softmax(outputs.logits, dim=1)

        # Reduce the whole batch at once and fetch the results with two bulk
        # transfers instead of two .item() calls per text.
        confidences, predicted = scores.max(dim=1)
        results.extend(
            (labels[class_idx], confidence)
            for class_idx, confidence in zip(predicted.tolist(), confidences.tolist())
        )

    return results




🚀 Verwende Gerät: cuda


In [22]:
# Run the sentiment analysis for the posts first, then for the comments, and
# attach the predicted label and its confidence as two new columns per frame.
for frame in (df_posts, df_comments):
    predictions = analyze_sentiment_batch(frame["full_text"].tolist())
    frame[["sentiment", "sentiment_confidence"]] = pd.DataFrame(predictions, index=frame.index)

print("✅ Sentiment-Analyse abgeschlossen!")


🔍 Analysiere Sentiments: 100%|██████████| 212/212 [01:54<00:00,  1.85it/s]
🔍 Analysiere Sentiments: 100%|██████████| 8251/8251 [42:59<00:00,  3.20it/s]  

✅ Sentiment-Analyse abgeschlossen!





In [9]:
import torch 

# First check: does PyTorch report CUDA as usable?
print("GPU available" if torch.cuda.is_available() else "GPU not available")

# Second check: is at least one CUDA device visible? If so, show the name of device 0.
if torch.cuda.device_count() <= 0:
    print("GPU not available")
else:
    print("GPU is available:", torch.cuda.get_device_name(0))

GPU available
GPU is available: NVIDIA RTX A5000 Laptop GPU
