In [19]:
import os
import pandas as pd
import praw
import pandas as pd
from datetime import datetime, timedelta, timezone
import os
import psaw as ps
from dotenv import load_dotenv
import time
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch.nn.functional as F
import torch
from tqdm import tqdm
from psaw import PushshiftAPI
from praw.exceptions import APIException

# 📌 Pfade zu den CSV-Dateien
# Base directory of the exported Reddit CSV files. The default is the original
# Google Drive location; set the REDDIT_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
DRIVE_PATH = os.environ.get("REDDIT_DATA_DIR", "G:/Meine Ablage/reddit/")
POSTS_CSV = os.path.join(DRIVE_PATH, "reddit_posts_original.csv")
COMMENTS_CSV = os.path.join(DRIVE_PATH, "reddit_comments_original.csv")

# 🔹 **Funktion zum Laden der CSV-Dateien**
def load_csv(filepath):
    """Load a `|`-separated CSV file and print a short debugging summary.

    Args:
        filepath: Path of the CSV file. It is decoded as UTF-8 and a leading
            byte-order mark is tolerated (`utf-8-sig`).

    Returns:
        The parsed DataFrame, or an empty DataFrame when `filepath` is not an
        existing regular file.
    """
    # isfile instead of exists: a directory path is reported as missing here
    # rather than failing later inside read_csv.
    if not os.path.isfile(filepath):
        print(f"❌ Datei nicht gefunden: {filepath}")
        return pd.DataFrame()

    # Malformed rows are still dropped, but "warn" reports them instead of
    # discarding data without any trace.
    df = pd.read_csv(filepath, sep="|", encoding="utf-8-sig", on_bad_lines="warn")

    # Debugging summary: source path, column names, dtypes and the first rows.
    print(f"\n📌 Datei geladen: {filepath}")
    print(f"🔹 Spalten: {df.columns.tolist()}")
    print(df.dtypes)
    print(df.head())

    return df

# 📌 **Daten laden**
# Read the posts file first, then the comments file; each call prints its own summary.
df_posts, df_comments = [load_csv(csv_path) for csv_path in (POSTS_CSV, COMMENTS_CSV)]



📌 Datei geladen: G:/Meine Ablage/reddit/reddit_posts_original.csv
🔹 Spalten: ['post_id', 'crypto', 'search_term', 'subreddit', 'title', 'author', 'date', 'time', 'score', 'num_comments', 'selftext']
post_id         object
crypto          object
search_term     object
subreddit       object
title           object
author          object
date            object
time            object
score            int64
num_comments     int64
selftext        object
dtype: object
   post_id    crypto search_term       subreddit  \
0  1irsm07  Ethereum    ethereum  CryptoCurrency   
1  1irr9h8  Ethereum    ethereum  CryptoCurrency   
2  1ir3avz  Ethereum    ethereum  CryptoCurrency   
3  1ir2i1r  Ethereum    ethereum  CryptoCurrency   
4  1ir0uz0  Ethereum    ethereum  CryptoCurrency   

                                               title                author  \
0  Urgent Ethereum Geth patch addresses Merge ove...             Afonsoo99   
1  Banking Giant JPMorgan Chase Holds $1,016,728 ...            

In [20]:

# Build the "full_text" column for posts and comments.
# Both parts are filled before concatenating: a missing title would otherwise
# turn the whole string into NaN and the post would be dropped by the length filter.
df_posts["full_text"] = df_posts["title"].fillna("") + " " + df_posts["selftext"].fillna("")
df_comments["full_text"] = df_comments["selftext"].fillna("")

# Keep only texts longer than 10 characters and drop duplicate texts.
# .copy() makes the results independent frames, so adding columns to them later
# does not trigger pandas' SettingWithCopyWarning.
df_posts = df_posts[df_posts["full_text"].str.len() > 10].drop_duplicates(subset=["full_text"]).copy()
df_comments = df_comments[df_comments["full_text"].str.len() > 10].drop_duplicates(subset=["full_text"]).copy()

print(f"✅ Daten aufbereitet: {len(df_posts)} Posts & {len(df_comments)} Kommentare übrig.")


✅ Daten aufbereitet: 6754 Posts & 264029 Kommentare übrig.


In [21]:

# 🔹 GPU nutzen, falls verfügbar
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print(f"🚀 Verwende Gerät: {device}")

# Load the CryptoBERT tokenizer and sequence-classification model.
MODEL_NAME = "ElKulako/cryptobert"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
model = model.to(device)
model.eval()  # switch the model to evaluation mode

# 🔹 Funktion zur Sentiment-Analyse (Optimiert für Batch-Prozesse)
def analyze_sentiment_batch(texts, batch_size=32):
    """Classify a list of texts with CryptoBERT, one batch at a time.

    Relies on the module-level `tokenizer`, `model` and `device`.

    Args:
        texts: Iterable of texts. Entries that are not strings, or that are
            blank after stripping, are replaced by the placeholder text
            "neutral" before tokenization (the placeholder is then classified
            like any other text).
        batch_size: Number of texts per forward pass.

    Returns:
        A list with one `(sentiment, confidence)` tuple per input text, in
        input order. `sentiment` is "bearish", "neutral" or "bullish";
        `confidence` is the softmax probability of the predicted class.
    """
    # Class index -> label name. Hoisted out of the loop because it never changes.
    # NOTE(review): the order is assumed to match the model's logit order —
    # confirm against model.config.id2label.
    labels = ["bearish", "neutral", "bullish"]
    results = []

    # Replace empty / non-string entries with the placeholder text.
    texts = [t if isinstance(t, str) and t.strip() != "" else "neutral" for t in texts]

    for start in tqdm(range(0, len(texts), batch_size), desc="🔍 Analysiere Sentiments"):
        batch_texts = texts[start : start + batch_size]

        # Tokenize the batch; inputs are truncated to 512 tokens and padded to a common length.
        inputs = tokenizer(batch_texts, return_tensors="pt", truncation=True, max_length=512, padding=True).to(device)

        # Forward pass without gradient tracking.
        with torch.no_grad():
            outputs = model(**inputs)

        scores = F.softmax(outputs.logits, dim=1)

        # One row-wise max yields both the winning probability and its class
        # index, so the results are fetched per batch rather than with two
        # .item() calls per row (and no inner loop reuses the batch offset name).
        confidences, predicted = scores.max(dim=1)
        results.extend(
            (labels[label_idx], confidence)
            for label_idx, confidence in zip(predicted.tolist(), confidences.tolist())
        )

    return results




🚀 Verwende Gerät: cuda


In [22]:
# Score the posts first, then the comments. The predictions are attached as two
# new columns, aligned on each frame's own index.
sentiment_cols = ["sentiment", "sentiment_confidence"]
for frame in (df_posts, df_comments):
    predictions = analyze_sentiment_batch(frame["full_text"].tolist())
    frame[sentiment_cols] = pd.DataFrame(predictions, index=frame.index)

print("✅ Sentiment-Analyse abgeschlossen!")


🔍 Analysiere Sentiments: 100%|██████████| 212/212 [01:54<00:00,  1.85it/s]
🔍 Analysiere Sentiments: 100%|██████████| 8251/8251 [42:59<00:00,  3.20it/s]  

✅ Sentiment-Analyse abgeschlossen!





In [23]:
# Minimum softmax confidence a prediction needs in order to be kept.
MIN_CONFIDENCE = 0.6

# .copy() turns the filtered slices into independent frames, so assigning new
# columns to them afterwards does not trigger pandas' SettingWithCopyWarning.
df_posts = df_posts[df_posts["sentiment_confidence"] >= MIN_CONFIDENCE].copy()
df_comments = df_comments[df_comments["sentiment_confidence"] >= MIN_CONFIDENCE].copy()

print(f"✅ Bereinigung abgeschlossen: {len(df_posts)} Posts & {len(df_comments)} Kommentare nach Filtering.")


✅ Bereinigung abgeschlossen: 3884 Posts & 156345 Kommentare nach Filtering.


In [28]:
from sklearn.model_selection import train_test_split
import torch
from transformers import AutoTokenizer

# Convert the sentiment names into integer class ids.
label_mapping = {"bearish": 0, "neutral": 1, "bullish": 2}
# .assign returns new frames, so no column is written into a filtered slice.
df_posts = df_posts.assign(label=df_posts["sentiment"].map(label_mapping))
df_comments = df_comments.assign(label=df_comments["sentiment"].map(label_mapping))

# A sentiment that is not a key of label_mapping maps to NaN; fail here with a
# clear message instead of passing NaN labels on to the training split.
assert df_posts["label"].notna().all(), "df_posts contains sentiments missing from label_mapping"

# Train/validation split on the posts (80% train, 20% validation).
# stratify keeps the class proportions the same in both parts.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_posts["full_text"].tolist(),
    df_posts["label"].tolist(),
    test_size=0.2,
    random_state=42,
    stratify=df_posts["label"],
)

# Load the tokenizer.
tokenizer = AutoTokenizer.from_pretrained("ElKulako/cryptobert")

# Tokenize both splits; inputs are truncated to 512 tokens and padded.
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)

print("✅ Daten erfolgreich vorbereitet und tokenisiert!")


✅ Daten erfolgreich vorbereitet und tokenisiert!


In [29]:
class CryptoDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with one label per example.

    `encodings` is a mapping from field name to a per-example sequence and
    `labels` is an indexable sequence of labels; `len(labels)` defines the size.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The dataset has one example per label.
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert every encoding field of example `idx` to a tensor,
        # then add its label under the "labels" key.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Wrap the tokenized training and validation splits together with their labels.
train_dataset, val_dataset = (
    CryptoDataset(train_encodings, train_labels),
    CryptoDataset(val_encodings, val_labels),
)

print("✅ PyTorch Dataset erfolgreich erstellt!")


✅ PyTorch Dataset erfolgreich erstellt!


In [36]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# Load CryptoBERT with three classification labels.
# The model goes to the `device` selected earlier in the notebook instead of a
# hard-coded "cuda", so this cell also runs on a machine without a GPU.
model = AutoModelForSequenceClassification.from_pretrained("ElKulako/cryptobert", num_labels=3).to(device)

# Training parameters.
# Requires `accelerate>=0.26.0` in the kernel's environment; without it this
# cell raises the ImportError shown in its output.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2
)

print("✅ Modell und Trainingsparameter gesetzt!")


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`

In [33]:
import torch 

# First check: can CUDA be used at all?
gpu_status = "GPU available" if torch.cuda.is_available() else "GPU not available"
print(gpu_status)

# Second check: print the name of device 0 when at least one CUDA device is visible.
if not torch.cuda.device_count() > 0:
    print("GPU not available")
else:
    print("GPU is available:", torch.cuda.get_device_name(0))

GPU available
GPU is available: NVIDIA RTX A5000 Laptop GPU


In [35]:
import torch
# Report whether CUDA is usable and which CUDA version torch reports.
cuda_ready = torch.cuda.is_available()
print("CUDA verfügbar:", cuda_ready)
cuda_version = torch.version.cuda
print("CUDA Version:", cuda_version)


CUDA verfügbar: True
CUDA Version: 12.1
