In [8]:
# Name of the MongoDB collection that gets exported.
COLLECTION_NAME = "Schadstoff"
# Destination path of the CSV file written by the export.
OUT_CSV_PATH = "./export/ExportSchadstoff.csv"

In [9]:
import os
from dotenv import load_dotenv
from pymongo import MongoClient
import pandas as pd
from tqdm import tqdm

In [10]:
# Load connection settings from a .env file into the process environment.
load_dotenv()

mongo_user = os.getenv("MONGO_USER")
mongo_pass = os.getenv("MONGO_PASS")
mongo_host = os.getenv("MONGO_HOST", "localhost")
mongo_port = os.getenv("MONGO_PORT", "27017")
mongo_db   = os.getenv("MONGO_DB")

# Fail early with a readable message instead of an opaque driver error later.
if not all([mongo_user, mongo_pass, mongo_host, mongo_port, mongo_db]):
    raise ValueError("Mongo ENV Variablen unvollständig. Bitte .env prüfen: MONGO_USER, MONGO_PASS, MONGO_HOST, MONGO_PORT, MONGO_DB")

# Credentials are passed as keyword arguments instead of being interpolated
# into the URI: inside a MongoDB URI they would have to be percent-escaped,
# so a password containing '@', ':', '/' or '%' would otherwise break parsing.
# This also keeps the secret out of the `uri` string.
uri = f"mongodb://{mongo_host}:{mongo_port}/?authSource=admin"
client = MongoClient(uri, username=mongo_user, password=mongo_pass)
db = client[mongo_db]

# Round-trip to the server so connection/auth problems surface in this cell.
db.command("ping")
print("MongoDB Verbindung OK:", mongo_host, mongo_port, "DB:", mongo_db)

MongoDB Verbindung OK: localhost 27017 DB: LuftqualitaetDB


In [11]:
from typing import Optional, Dict, Any, List

def collection_to_df(
    collection_name: str,
    query: Optional[Dict[str, Any]] = None,
    projection: Optional[Dict[str, int]] = None,
    batch_size: int = 5000,
    limit: Optional[int] = None
) -> pd.DataFrame:
    """Read documents of a MongoDB collection into a pandas DataFrame.

    The collection is looked up on the module-level ``db`` handle.

    Args:
        collection_name: Name of the collection to read from.
        query: Filter document passed to ``find``; ``None`` or an empty dict
            is sent as ``{}``.
        projection: Projection passed to ``find`` unchanged.
        batch_size: Passed to ``find`` as ``batch_size``.
        limit: If not ``None``, applied to the cursor via ``limit(int(limit))``.

    Returns:
        A DataFrame built from the list of fetched documents. A top-level
        ``_id`` value, where present, is converted to ``str``.
    """
    col = db[collection_name]
    query = query or {}

    cursor = col.find(query, projection=projection, batch_size=batch_size)
    rows: List[dict] = []
    try:
        if limit is not None:
            cursor = cursor.limit(int(limit))

        for doc in tqdm(cursor, desc=f"Read -> {collection_name}"):
            # Mongo's _id is an ObjectId; keep it as a string for the CSV export.
            if "_id" in doc:
                doc["_id"] = str(doc["_id"])
            rows.append(doc)
    finally:
        # Release the cursor even when iteration is interrupted or raises
        # (e.g. a kernel interrupt in the middle of a long export).
        cursor.close()

    return pd.DataFrame(rows)

In [12]:
def save_df_as_csv(
    df: pd.DataFrame,
    out_path: str,
    sep: str = ";",
    decimal: str = ",",
    encoding: str = "utf-8"
) -> None:
    """Write ``df`` to ``out_path`` as CSV without the index column.

    Args:
        df: Frame to export.
        out_path: Target file path.
        sep: Field separator written between columns.
        decimal: Character used as the decimal separator.
        encoding: Text encoding of the output file.
    """
    # pandas writes numeric values without thousands separators; only the
    # decimal character is configurable here.
    csv_options = {
        "index": False,
        "sep": sep,
        "decimal": decimal,
        "encoding": encoding,
    }
    df.to_csv(out_path, **csv_options)

In [13]:
# Export parameters for this run.
BATCH_SIZE = 5000    # forwarded to collection_to_df as batch_size
QUERY = {}           # optional filter, e.g. {"station_id": 123}; leave empty for no filter
PROJECTION = None    # optional field selection, e.g. {"_id": 1, "timestamp": 1, "pm10": 1}
LIMIT = None         # optional row cap, e.g. 10000 for a test export

df = collection_to_df(
    COLLECTION_NAME, query=QUERY, projection=PROJECTION, batch_size=BATCH_SIZE, limit=LIMIT
)

# Quick sanity check of what was loaded before exporting.
print("Daten geladen. Shape:", df.shape)
display(df.head())

Read -> Schadstoff: 70520it [00:00, 84157.10it/s]


Daten geladen. Shape: (70520, 11)


Unnamed: 0,_id,Region,Schadstoff,Einheit,NFR_Code,Trendbericht_Sektor,Quelle,Datenstand,Jahr,Werte,_imported_at
0,695f9412440359c0b02efb97,AT,NOX,t,0,Gesamt,OLI 2024 (1990-2023),45703,1990,"215.855,04",2026-01-08 11:25:05.978
1,695f9412440359c0b02efb98,AT,NOX,t,0,Gesamt,OLI 2024 (1990-2023),45703,1991,"225.617,42",2026-01-08 11:25:05.978
2,695f9412440359c0b02efb99,AT,NOX,t,0,Gesamt,OLI 2024 (1990-2023),45703,1992,"213.983,87",2026-01-08 11:25:05.978
3,695f9412440359c0b02efb9a,AT,NOX,t,0,Gesamt,OLI 2024 (1990-2023),45703,1993,"208.070,23",2026-01-08 11:25:05.978
4,695f9412440359c0b02efb9b,AT,NOX,t,0,Gesamt,OLI 2024 (1990-2023),45703,1994,"199.885,73",2026-01-08 11:25:05.978


In [14]:
import os
# Make sure the target directory exists. When OUT_CSV_PATH has no directory
# component, os.path.dirname returns "" and os.makedirs("") raises
# FileNotFoundError, so the call is skipped in that case.
out_dir = os.path.dirname(OUT_CSV_PATH)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

save_df_as_csv(df, OUT_CSV_PATH, sep=";", decimal=",")
print("Export gespeichert unter:", OUT_CSV_PATH)

Export gespeichert unter: ./export/ExportSchadstoff.csv
