### Luftqualität Database upload

In [8]:
# Source file to import, given relative to the notebook's working directory.
CSV_PATH = "../DataSourceLayer/Data/VerkehrÖsterreich.csv"
# Name of the MongoDB collection that receives the rows of the CSV.
COLLECTION_NAME = "Verkehr2"

In [9]:
import os
import re
from datetime import datetime, timezone
from urllib.parse import quote_plus

import pandas as pd
from dotenv import load_dotenv
from pymongo import MongoClient, ReplaceOne
from pymongo.errors import BulkWriteError
from tqdm import tqdm

In [10]:
# Pull the connection settings from a local .env file into the process environment.
load_dotenv()

mongo_user = os.getenv("MONGO_USER")
mongo_pass = os.getenv("MONGO_PASS")
mongo_host = os.getenv("MONGO_HOST", "localhost")
mongo_port = os.getenv("MONGO_PORT", "27017")
mongo_db   = os.getenv("MONGO_DB")

# Fail fast with a readable message instead of a driver error further down.
if not all([mongo_user, mongo_pass, mongo_host, mongo_port, mongo_db]):
    raise ValueError("Mongo ENV Variablen unvollständig. Bitte .env prüfen: MONGO_USER, MONGO_PASS, MONGO_HOST, MONGO_PORT, MONGO_DB")

# Credentials have to be percent-encoded before they are embedded in the URI;
# otherwise characters such as "@", ":", "/" or "%" in the user name or
# password make the connection string unparsable.
uri = (
    f"mongodb://{quote_plus(mongo_user)}:{quote_plus(mongo_pass)}"
    f"@{mongo_host}:{mongo_port}/?authSource=admin"
)
client = MongoClient(uri)
db = client[mongo_db]

# Run a ping so that connection or authentication problems surface in this cell.
db.command("ping")
print("MongoDB Verbindung OK:", mongo_host, mongo_port, "DB:", mongo_db)

MongoDB Verbindung OK: localhost 27017 DB: LuftqualitaetDB


In [11]:
def load_csv(path: str) -> pd.DataFrame:
    """Read a semicolon-separated, UTF-8 encoded CSV file into a DataFrame.

    The first row of the file supplies the column names. Numbers are parsed
    with "," as the decimal mark and "." as the thousands separator.

    Args:
        path: Location of the CSV file.

    Returns:
        The parsed table as produced by ``pandas.read_csv``.
    """
    read_options = {
        "sep": ";",
        "header": 0,
        "encoding": "utf-8",
        "decimal": ",",
        "thousands": ".",
        # Disable pandas' chunked low-memory parsing mode.
        "low_memory": False,
    }
    return pd.read_csv(path, **read_options)


In [12]:
def insert_df(collection_name: str, df: pd.DataFrame, batch_size: int = 5000, add_meta: bool = True) -> int:
    """Insert every row of ``df`` as one document into a MongoDB collection.

    Rows are converted to dictionaries keyed by column name and written in
    batches with unordered ``insert_many`` calls on the module-level ``db``
    handle.

    Args:
        collection_name: Name of the target collection in ``db``.
        df: Table whose rows become documents.
        batch_size: Maximum number of documents per ``insert_many`` call;
            must be at least 1.
        add_meta: When true, every document gets an ``_imported_at`` field
            holding the UTC time at which this call prepared the records.

    Returns:
        The number of documents inserted.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A zero step makes range() fail with an unrelated message and a negative
    # step would silently insert nothing, so reject both explicitly.
    if batch_size < 1:
        raise ValueError(f"batch_size muss >= 1 sein, erhalten: {batch_size}")

    col = db[collection_name]
    records = df.to_dict(orient="records")

    if add_meta:
        # One timezone-aware timestamp shared by all documents of this import run.
        imported_at = datetime.now(timezone.utc)
        for record in records:
            record["_imported_at"] = imported_at

    inserted = 0
    for start in tqdm(range(0, len(records), batch_size), desc=f"Insert -> {collection_name}"):
        res = col.insert_many(records[start:start + batch_size], ordered=False)
        inserted += len(res.inserted_ids)

    return inserted

In [13]:

     
# Number of documents passed per batch to the upload step.
BATCH_SIZE = 5000

# Load the source CSV and show its dimensions plus the first rows as a sanity check.
# NOTE(review): the preview lists an "Unnamed: 28" column with empty values —
# presumably caused by a trailing ";" in the file; confirm whether it should be
# dropped before the upload.
df = load_csv(CSV_PATH)
print("CSV geladen. Shape:", df.shape)
display(df.head())

CSV geladen. Shape: (10, 29)


Unnamed: 0,Bundesland,1980,1990,2000,2001,2002,2003,2004,2005,2006,...,2016,2017,2018,2019,2020,2021,2022,2023,2024,Unnamed: 28
0,Burgenland,277000,393000,551000,565000,548000,559000,570000,575000,581000,...,649000,659000,668000,675000,681000,684000,679000,683000,692000,
1,Kaernten,296000,390000,554000,567000,516000,524000,534000,541000,547000,...,623000,632000,641000,648000,654000,656000,654000,656000,661000,
2,Niederoesterreich,307000,431000,560000,570000,550000,561000,566000,570000,574000,...,633000,641000,649000,654000,659000,661000,655000,657000,662000,
3,Oberoesterreich,305000,416000,531000,542000,522000,531000,535000,541000,546000,...,614000,622000,630000,636000,641000,643000,639000,641000,645000,
4,Salzburg,321000,418000,479000,487000,474000,483000,482000,484000,488000,...,551000,557000,564000,567000,570000,572000,569000,569000,573000,


In [14]:
# Write all rows of the loaded CSV to MongoDB and report how many documents were inserted.
# NOTE(review): insert_df only inserts, so re-running this cell presumably
# appends the same rows again — confirm this is intended.
n = insert_df(COLLECTION_NAME, df, batch_size=BATCH_SIZE, add_meta=True)
print(f"Insgesamt {n} Dokumente in Collection '{COLLECTION_NAME}' eingefügt.")

Insert -> Verkehr2: 100%|██████████| 1/1 [00:00<00:00, 27.78it/s]

Insgesamt 10 Dokumente in Collection 'Verkehr2' eingefügt.



