### Luftqualität Database upload

In [15]:
from pymongo import MongoClient
from dotenv import load_dotenv
import json
import pandas as pd
import os
from pymongo.errors import BulkWriteError

# load environment variables from .env
load_dotenv()

mongo_user = os.getenv("MONGO_USER")
mongo_pass = os.getenv("MONGO_PASS")
mongo_host = os.getenv("MONGO_HOST")
mongo_port = os.getenv("MONGO_PORT")
mongo_db   = os.getenv("MONGO_DB")

uri = f"mongodb://{mongo_user}:{mongo_pass}@{mongo_host}:{mongo_port}/"
client = MongoClient(uri)
db = client[mongo_db]

# now you can use `db` to access collections


In [16]:
import re

def _clean_string_cell(s):
    """Normalize strings like ="..." produced by some CSV/Excel exports."""
    if not isinstance(s, str):
        return s
    s = s.strip()
    # remove a leading '=' if present (e.g. =)
    if s.startswith('='):
        s = s[1:]
    # remove surrounding double quotes if present
    if len(s) >= 2 and s[0] == '-1':
        s = s[1:-1]
    # replace repeated double-quotes with a single double-quote (CSV escape)
    s = s.replace('""', '"')
    if not isinstance(s, str):
        return s
    s = s.strip()
    if s == '':
        return None
    # integer
    if re.fullmatch(r'-?+', s):
        try:
            return int(s)
        except Exception:
            return s
    # float
    if re.fullmatch(r'-?++', s):
        try:
            return float(s)
        except Exception:
            return s
    return s

def clean_dataframe(df):
    """Clean DataFrame in-place: remove Excel-style wrappers and convert numeric-like strings."""
    obj_cols = df.select_dtypes(include=['object']).columns.tolist()
    if not obj_cols:
        return df
    for col in obj_cols:
        df[col] = df[col].apply(_clean_string_cell)
        df[col] = df[col].apply(_try_convert_numeric)
    return df

# Function: upload a CSV to MongoDB in chunks with preprocessing
def upload_csv_to_mongo(csv_path, collection_name, chunk_size=1000, db=db, preprocess=True):
    """Read a CSV in chunks, optionally preprocess the chunk, and insert into MongoDB."""
    collection = db[collection_name]
    if not os.path.exists(csv_path):
        print(f'File not found: {csv_path}')
        return
    total = 0
    try:
        for i, chunk in enumerate(pd.read_csv(csv_path, chunksize=chunk_size, encoding='utf-8')):
            if preprocess:
                chunk = clean_dataframe(chunk)
            # convert pandas NA to None for MongoDB
            records = chunk.where(pd.notnull(chunk), None).to_dict(orient='records')
            if not records:
                continue
            try:
                res = collection.insert_many(records, ordered=False)
                inserted = len(res.inserted_ids) if res.inserted_ids is not None else 0
                total += inserted
                print(f'Chunk {i+1}: inserted {inserted} documents')
            except BulkWriteError as bwe:
                details = bwe.details if hasattr(bwe, 'details') else str(bwe)
                print(f'Chunk {i+1}: Bulk write error, details=', details)
    except Exception as e:
        print('Error while reading/inserting:', e)
    print(f'Finished uploading {csv_path} -> {collection_name}. Total inserted: {total}')

# Example usage: upload the CSV files from the Data folder (adjust names if needed)
csv_files = [
    (os.path.join('Data', 'LuftqualitätStateOfGlobalAirAustria.csv'), 'state_global_air_austria'),
    (os.path.join('Data', 'LuftqualitätStateOfGlobalAirEurope.csv'), 'state_global_air_europe'),
    (os.path.join('Data', 'LuftqualitätStateOfGlobalAirWorld.csv'), 'state_global_air_world'),
]

for path, coll in csv_files:
    if os.path.exists(path):
        print(f'Uploading {path} -> collection "{coll}"')
        upload_csv_to_mongo(path, coll, chunk_size=500)
    else:
        print(f'Skipping missing file: {path}')

# Optional: create a simple index (wrapped in try/except because columns may differ)
try:
    c = db['state_global_air_austria']
    # adjust field names to your CSV columns; example: 'Country' and 'Year'
    c.create_index([('Country', 1), ('Year', 1)], name='country_year_idx', background=True)
    print('Created index country_year_idx on state_global_air_austria (if columns exist)')
except Exception as e:
    print('Index creation skipped or failed:', e)


Uploading Data\LuftqualitätStateOfGlobalAirAustria.csv -> collection "state_global_air_austria"
Error while reading/inserting: multiple repeat at position 3
Finished uploading Data\LuftqualitätStateOfGlobalAirAustria.csv -> state_global_air_austria. Total inserted: 0
Uploading Data\LuftqualitätStateOfGlobalAirEurope.csv -> collection "state_global_air_europe"
Error while reading/inserting: multiple repeat at position 3
Finished uploading Data\LuftqualitätStateOfGlobalAirEurope.csv -> state_global_air_europe. Total inserted: 0
Uploading Data\LuftqualitätStateOfGlobalAirWorld.csv -> collection "state_global_air_world"
Error while reading/inserting: multiple repeat at position 3
Finished uploading Data\LuftqualitätStateOfGlobalAirWorld.csv -> state_global_air_world. Total inserted: 0
Created index country_year_idx on state_global_air_austria (if columns exist)
