In [1]:
import os
os.environ["TRANSFORMERS_NO_TF_IMPORT"] = "1"
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from transformers import BertTokenizer, BertForMaskedLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Cuda pruefen
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using {} device".format(device))

Using cuda device


Model

In [3]:
# üìå Pfade zu den CSV-Dateien
DRIVE_PATH = "G:/Meine Ablage/reddit/"
POSTS_CSV = os.path.join(DRIVE_PATH, "reddit_posts_rohdaten.csv")
COMMENTS_CSV = os.path.join(DRIVE_PATH, "reddit_comments_rohdaten.csv")

# üîπ **Funktion zum Laden der CSV-Dateien**
def load_csv(filepath):
    """L√§dt eine CSV-Datei mit `|` als Trennzeichen und Debugging-Infos"""
    if not os.path.exists(filepath):
        print(f"‚ùå Datei nicht gefunden: {filepath}")
        return pd.DataFrame()

    df = pd.read_csv(filepath, sep="|", encoding="utf-8-sig", on_bad_lines="skip")

    print(f"\nüìå Datei geladen: {filepath}")
    print(f"üîπ Spalten: {df.columns.tolist()}")
    print(df.dtypes)
    print(df.head())

    return df

# üìå **Daten laden**
df_posts = load_csv(POSTS_CSV)
df_comments = load_csv(COMMENTS_CSV)


üìå Datei geladen: G:/Meine Ablage/reddit/reddit_posts_rohdaten.csv
üîπ Spalten: ['post_id', 'crypto', 'search_term', 'subreddit', 'title', 'author', 'date', 'time', 'score', 'num_comments', 'selftext']
post_id         object
crypto          object
search_term     object
subreddit       object
title           object
author          object
date            object
time            object
score            int64
num_comments     int64
selftext        object
dtype: object
   post_id   crypto search_term       subreddit  \
0  1j0dz4r  Bitcoin     bitcoin  CryptoCurrency   
1  1j0dghm  Bitcoin     bitcoin  CryptoCurrency   
2  1j0cx0g  Bitcoin     bitcoin  CryptoCurrency   
3  1j0cnlh  Bitcoin     bitcoin  CryptoCurrency   
4  1j08i0y  Bitcoin     bitcoin  CryptoCurrency   

                                               title               author  \
0  BlackRock Adds Its Bitcoin ETF to Model Portfo...             diwalost   
1  x-post: As the free-float of coins is low, is ...        3fkgf9

In [4]:
df_posts.date.max(), df_posts.date.min()

('2025-02-28', '2024-11-01')

In [10]:
# üîÑ 2Ô∏è‚É£ Relevante Spalten extrahieren
df_posts["full_text"] = df_posts["title"].fillna("") + " " + df_posts["selftext"].fillna("")
df_comments["full_text"] = df_comments["selftext"].fillna("")

# ‚ùå Entferne leere Zeilen
df_posts = df_posts[df_posts["full_text"].str.strip() != ""]
df_comments = df_comments[df_comments["full_text"].str.strip() != ""]

# Entferne sehr kurze Texte (< 10 Zeichen) und doppelte Eintr√§ge
df_posts = df_posts[df_posts["full_text"].str.len() > 10].drop_duplicates(subset=["full_text"])
df_comments = df_comments[df_comments["full_text"].str.len() > 10].drop_duplicates(subset=["full_text"])

# üîÑ 3Ô∏è‚É£ Posts & Kommentare kombinieren
df = pd.concat([df_posts[["full_text"]], df_comments[["full_text"]]], ignore_index=True)

print(f"‚úÖ Bereinigung abgeschlossen. {len(df)} Eintr√§ge gespeichert in reddit_cleaned.csv")


‚úÖ Bereinigung abgeschlossen. 334346 Eintr√§ge gespeichert in reddit_cleaned.csv


In [11]:
# üîπ Stichprobe ziehen (z. B. 20.000 Eintr√§ge)
df_sample = df.sample(n=20000, random_state=42)

# üîπ In Hugging Face Dataset umwandeln
dataset = Dataset.from_pandas(df_sample)

In [12]:
# üî† 2Ô∏è‚É£ Tokenizer laden (BERT als Basis f√ºr CryptoBERT)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenisierung mit Padding & Truncation
def tokenize_function(examples):
    return tokenizer(examples["full_text"], padding="max_length", truncation=True, max_length=128)

tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["full_text"])

# üîπ 3Ô∏è‚É£ Masked Language Modeling (MLM) vorbereiten
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

# üîπ 4Ô∏è‚É£ CryptoBERT-Modell laden (auf BERT-Basis)
model = BertForMaskedLM.from_pretrained("bert-base-uncased").to(device)

# üîπ 5Ô∏è‚É£ Trainingseinstellungen
training_args = TrainingArguments(
    output_dir="./cryptoBERT-posttrained",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=500,
    save_total_limit=2,
    push_to_hub=False,
    report_to="none"
)

# üîπ 6Ô∏è‚É£ Trainer definieren
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_datasets,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# üî• 7Ô∏è‚É£ Training starten
trainer.train()

# üìå 8Ô∏è‚É£ Modell speichern (nach jeder Epoche wird es automatisch gespeichert)
trainer.save_model("./cryptoBERT-posttrained-final")
print("‚úÖ Training abgeschlossen. Modell gespeichert!")

Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20000/20000 [00:10<00:00, 1847.91 examples/s]
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,2.653,2.427955
2,2.3926,2.259322


‚úÖ Training abgeschlossen. Modell gespeichert!


In [15]:
import torch
import math
from transformers import BertTokenizer, BertForMaskedLM

# üìå Lade das trainierte CryptoBERT-Modell
crypto_model_path = "cryptoBERT-posttrained-final"
crypto_model = BertForMaskedLM.from_pretrained(crypto_model_path).eval()

# üìå Lade das urspr√ºngliche BERT-Modell zum Vergleich
base_model = BertForMaskedLM.from_pretrained("bert-base-uncased").eval()

# üìå Tokenizer laden
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# üìå Beispieltext aus den Reddit-Daten
text = "Bitcoin is pumping üöÄ. The market is looking bullish today."

# üî† Tokenisierung
inputs = tokenizer(text, return_tensors="pt")
input_ids = inputs["input_ids"]

# üî• Perplexity-Funktion
def calculate_perplexity(model, input_ids):
    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids)
        loss = outputs.loss
        ppl = math.exp(loss.item())
        return ppl

# üìä Berechnung des PPL-Scores f√ºr beide Modelle
crypto_ppl = calculate_perplexity(crypto_model, input_ids)
base_ppl = calculate_perplexity(base_model, input_ids)

print(f"üìä Perplexity Score (Original BERT): {base_ppl:.2f}")
print(f"üöÄ Perplexity Score (CryptoBERT trainiert): {crypto_ppl:.2f}")


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


üìä Perplexity Score (Original BERT): 8.73
üöÄ Perplexity Score (CryptoBERT trainiert): 6.86


In [20]:
import torch
import math
from transformers import AutoTokenizer, AutoModelForMaskedLM

# üìå Modelle direkt von Hugging Face laden
base_model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased").eval()
crypto_bert_base = AutoModelForMaskedLM.from_pretrained("ElKulako/cryptobert").eval()
crypto_trained_model = AutoModelForMaskedLM.from_pretrained("cryptoBERT-posttrained-final").eval()

# üìå Tokenizer laden
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# üìå Beispieltext aus Reddit (Krypto-Sprache)
text = "Bitcoin is pumping üöÄ. The market is looking bullish today."

# üî† Tokenisierung
inputs = tokenizer(text, return_tensors="pt")
input_ids = inputs["input_ids"]

# üî• Perplexity-Funktion
def calculate_perplexity(model, input_ids):
    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids)
        loss = outputs.loss
        ppl = math.exp(loss.item())
        return ppl

# üìä Berechnung des PPL-Scores f√ºr alle Modelle
ppl_base = calculate_perplexity(base_model, input_ids)
ppl_crypto_bert = calculate_perplexity(crypto_bert_base, input_ids)
ppl_trained = calculate_perplexity(crypto_trained_model, input_ids)

print(f"üìä Perplexity Score (Original BERT): {ppl_base:.2f}")
print(f"üìä Perplexity Score (CryptoBERT von Hugging Face - ElKulako): {ppl_crypto_bert:.2f}")
print(f"üöÄ Perplexity Score (Unser trainiertes CryptoBERT): {ppl_trained:.2f}")


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at ElKulako/cryptobert and are newly initialized: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
You should probably TRAIN this model o

üìä Perplexity Score (Original BERT): 8.73
üìä Perplexity Score (CryptoBERT von Hugging Face - ElKulako): 2166877.11
üöÄ Perplexity Score (Unser trainiertes CryptoBERT): 6.86


In [21]:
from transformers import AutoModel, AutoTokenizer

# üìå Modell & Tokenizer laden
MODEL_NAME = "ElKulako/cryptobert"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# üìå Modellarchitektur √ºberpr√ºfen
print(f"‚úÖ Modell erfolgreich geladen: {MODEL_NAME}")
print(f"üîç Modellarchitektur: {model.config.architectures}")
print(f"üî¢ Anzahl der Modellparameter: {model.num_parameters()}")


Some weights of RobertaModel were not initialized from the model checkpoint at ElKulako/cryptobert and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


‚úÖ Modell erfolgreich geladen: ElKulako/cryptobert
üîç Modellarchitektur: ['RobertaForSequenceClassification']
üî¢ Anzahl der Modellparameter: 124645632
