### Luftqualität Database upload

In [1]:
# Source file to import (path is relative to the notebook's working directory).
CSV_PATH = "../DataSourceLayer/Data/Schadstoff.csv"
# MongoDB collection that receives the imported rows.
COLLECTION_NAME = "Schadstoff"

In [2]:
import os
import re
from datetime import datetime, timezone
from urllib.parse import quote_plus

import pandas as pd
from dotenv import load_dotenv
from pymongo import MongoClient, ReplaceOne
from pymongo.errors import BulkWriteError
from tqdm import tqdm

In [3]:
# Load connection settings from a .env file / the process environment.
load_dotenv()

mongo_user = os.getenv("MONGO_USER")
mongo_pass = os.getenv("MONGO_PASS")
mongo_host = os.getenv("MONGO_HOST", "localhost")
mongo_port = os.getenv("MONGO_PORT", "27017")
mongo_db   = os.getenv("MONGO_DB")

# Fail early with a readable message instead of an obscure connection error.
if not all([mongo_user, mongo_pass, mongo_host, mongo_port, mongo_db]):
    raise ValueError("Mongo ENV Variablen unvollständig. Bitte .env prüfen: MONGO_USER, MONGO_PASS, MONGO_HOST, MONGO_PORT, MONGO_DB")

# User name and password must be percent-encoded before they are placed in a
# MongoDB URI; otherwise reserved characters such as '@', ':', '/' or '%' in
# the credentials break (or silently change) how the URI is parsed.
uri = (
    f"mongodb://{quote_plus(mongo_user)}:{quote_plus(mongo_pass)}"
    f"@{mongo_host}:{mongo_port}/?authSource=admin"
)
client = MongoClient(uri)
db = client[mongo_db]

# MongoClient connects lazily, so issue a ping to verify the connection now.
db.command("ping")
print("MongoDB Verbindung OK:", mongo_host, mongo_port, "DB:", mongo_db)

MongoDB Verbindung OK: localhost 27017 DB: LuftqualitaetDB


In [4]:
def load_csv(path: str) -> pd.DataFrame:
    """Read a semicolon-separated, UTF-8 encoded CSV file into a DataFrame.

    The first row is used as the header. Numbers are parsed with ',' as the
    decimal mark and '.' as the thousands separator.

    Args:
        path: Location of the CSV file.

    Returns:
        The parsed DataFrame.
    """
    read_options = {
        "sep": ";",
        "header": 0,
        "encoding": "utf-8",
        "decimal": ",",
        "thousands": ".",
        # Read the file in one pass so column dtypes are inferred consistently.
        "low_memory": False,
    }
    return pd.read_csv(path, **read_options)


In [5]:
def insert_df(collection_name: str, df: pd.DataFrame, batch_size: int = 5000, add_meta: bool = True) -> int:
    """Insert every row of ``df`` as one document into a MongoDB collection.

    Rows are converted with ``DataFrame.to_dict(orient="records")`` and written
    to the module-level ``db`` in batches, using unordered ``insert_many``
    calls. Missing values (NaN) are passed through unchanged. Exceptions raised
    by ``insert_many`` propagate to the caller.

    Args:
        collection_name: Name of the target collection in ``db``.
        df: Frame whose rows become documents (column names become keys).
        batch_size: Maximum number of documents per ``insert_many`` call;
            must be at least 1.
        add_meta: If true, every document additionally gets an
            ``_imported_at`` field holding one shared UTC timestamp taken
            when this function is called.

    Returns:
        Total number of inserted ids reported across all batches.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A negative step would make range() empty and silently insert nothing;
    # zero would fail with an unrelated range() error. Reject both up front.
    if batch_size < 1:
        raise ValueError(f"batch_size muss >= 1 sein, erhalten: {batch_size}")

    col = db[collection_name]

    # Optional per-document metadata. A timezone-aware UTC timestamp replaces
    # the deprecated, naive datetime.utcnow().
    meta = {"_imported_at": datetime.now(timezone.utc)} if add_meta else {}

    records = [{**row, **meta} for row in df.to_dict(orient="records")]

    inserted = 0
    for start in tqdm(range(0, len(records), batch_size), desc=f"Insert -> {collection_name}"):
        # Every start index is < len(records), so the slice is never empty.
        res = col.insert_many(records[start:start + batch_size], ordered=False)
        inserted += len(res.inserted_ids)

    return inserted

In [6]:

     
# Number of documents sent to MongoDB per insert_many call.
BATCH_SIZE = 5_000

# Parse the source CSV and show its dimensions plus the first rows as a sanity check.
df = load_csv(CSV_PATH)
print("CSV geladen. Shape:", df.shape)
display(df.head(5))

CSV geladen. Shape: (17630, 9)


Unnamed: 0,Region,Schadstoff,Einheit,NFR_Code,Trendbericht_Sektor,Quelle,Datenstand,Jahr,Werte
0,AT,NOX,t,0,Gesamt,OLI 2024 (1990-2023),45703,1990,"215.855,04"
1,AT,NOX,t,0,Gesamt,OLI 2024 (1990-2023),45703,1991,"225.617,42"
2,AT,NOX,t,0,Gesamt,OLI 2024 (1990-2023),45703,1992,"213.983,87"
3,AT,NOX,t,0,Gesamt,OLI 2024 (1990-2023),45703,1993,"208.070,23"
4,AT,NOX,t,0,Gesamt,OLI 2024 (1990-2023),45703,1994,"199.885,73"


In [7]:
# Write all rows to MongoDB. Each run inserts new documents, so executing this
# cell again appends the same rows a second time.
n = insert_df(
    collection_name=COLLECTION_NAME,
    df=df,
    batch_size=BATCH_SIZE,
    add_meta=True,
)
print(f"Insgesamt {n} Dokumente in Collection '{COLLECTION_NAME}' eingefügt.")

Insert -> Schadstoff: 100%|██████████| 4/4 [00:00<00:00,  9.86it/s]

Insgesamt 17630 Dokumente in Collection 'Schadstoff' eingefügt.



