### Luftqualität Database upload

In [1]:
from pymongo import MongoClient, ReplaceOne
from dotenv import load_dotenv
import json
import pandas as pd
import os
from pymongo.errors import BulkWriteError

# Load connection settings from .env into the process environment.
from urllib.parse import quote_plus

load_dotenv()

mongo_user = os.getenv("MONGO_USER")
mongo_pass = os.getenv("MONGO_PASS")
mongo_host = os.getenv("MONGO_HOST")
mongo_port = os.getenv("MONGO_PORT")
mongo_db   = os.getenv("MONGO_DB")

# Fail fast with a readable message: an unset variable would otherwise be
# formatted into the URI as the literal text "None".
_missing_settings = [
    name
    for name, value in (
        ("MONGO_USER", mongo_user),
        ("MONGO_PASS", mongo_pass),
        ("MONGO_HOST", mongo_host),
        ("MONGO_PORT", mongo_port),
        ("MONGO_DB", mongo_db),
    )
    if value is None
]
if _missing_settings:
    raise RuntimeError(
        "Missing MongoDB settings in environment/.env: " + ", ".join(_missing_settings)
    )

# Build the connection URI and connect. User name and password are
# percent-escaped so reserved characters such as '@', ':', '/' or '%' in the
# credentials cannot corrupt the URI.
uri = (
    f"mongodb://{quote_plus(mongo_user)}:{quote_plus(mongo_pass)}"
    f"@{mongo_host}:{mongo_port}/"
)
client = MongoClient(uri)
db = client[mongo_db]

# Now `db` can be used to access collections. The next cell contains upload helpers and the upload loop.

In [2]:
def _guess_country_field(columns):
    candidates = ['Country','country','Country Name','Entity','Location','State','Region','Country/Region']
    for c in candidates:
        if c in columns:
            return c
    return None

def _guess_year_field(columns):
    candidates = ['Year','year','TIME','Year_']
    for c in candidates:
        if c in columns:
            return c
    return None

def _read_csv_with_fallbacks(csv_path):
    """Parse *csv_path*, trying several encoding/delimiter combinations.

    A parse that yields more than one column is accepted immediately. A parse
    that yields a single column is only remembered as a fallback, because
    reading a file with the wrong delimiter typically "succeeds" with every
    row collapsed into one column.

    Returns:
        The parsed DataFrame, or None (after printing diagnostics) when every
        attempt raised.
    """
    # Common encodings and delimiters for regional CSV formats; the last two
    # attempts use the tolerant python engine and skip malformed lines.
    read_attempts = [
        {'encoding': 'utf-8', 'sep': ','},
        {'encoding': 'utf-8', 'sep': ';'},
        {'encoding': 'latin1', 'sep': ','},
        {'encoding': 'latin1', 'sep': ';'},
        {'encoding': 'utf-8', 'sep': ',', 'engine': 'python', 'on_bad_lines': 'skip'},
        {'encoding': 'latin1', 'sep': ';', 'engine': 'python', 'on_bad_lines': 'skip'},
    ]
    single_column_result = None  # (frame, opts) of the first one-column parse
    last_err = None
    for opts in read_attempts:
        try:
            candidate = pd.read_csv(csv_path, low_memory=False, **opts)
        except Exception as exc:
            # Deliberately broad: any failure just means "try the next strategy".
            last_err = exc
            continue
        if candidate.shape[1] > 1:
            print(f'Loaded with opts: {opts}')
            return candidate
        if single_column_result is None:
            single_column_result = (candidate, opts)

    if single_column_result is not None:
        # No strategy produced several columns; keep the first parse that worked.
        candidate, opts = single_column_result
        print(f'Loaded with opts: {opts}')
        return candidate

    print('Failed to parse CSV using multiple strategies. Last error:')
    print(repr(last_err))
    # Show the first 1024 raw bytes to help debugging; purely best-effort.
    try:
        with open(csv_path, 'rb') as f:
            sample = f.read(1024).decode('utf-8', errors='replace')
    except OSError:
        return None
    print('File sample (first 1024 bytes, utf-8 replace):')
    print(sample)
    return None


def load_and_upload(csv_path, collection_name='state_of_global_air', region=None, chunk_size=1000):
    """Load a CSV file and write its rows into a MongoDB collection.

    If the file has both a recognised country column and a recognised year
    column, rows are upserted with ``ReplaceOne`` keyed on that pair, sent in
    unordered bulk batches. Otherwise all rows are inserted with
    ``insert_many``.

    Args:
        csv_path: Path of the CSV file to load.
        collection_name: Name of the target collection in the module-level ``db``.
        region: Optional value stored in a ``region`` column, added only when
            the file does not already have a column with that name.
        chunk_size: Number of upsert operations sent per ``bulk_write`` call.

    Returns:
        None. Progress and errors are reported with ``print``. If the file
        cannot be parsed the function returns without touching the database.
    """
    print(f'Loading: {csv_path}')
    df = _read_csv_with_fallbacks(csv_path)
    if df is None:
        return

    # Normalize missing values to None for MongoDB. The cast to object is
    # required: on float columns `where(..., None)` would otherwise keep NaN.
    df = df.astype(object).where(pd.notnull(df), None)
    if region is not None and 'region' not in df.columns:
        df['region'] = region
    records = df.to_dict(orient='records')
    coll = db[collection_name]

    country_field = _guess_country_field(df.columns)
    year_field = _guess_year_field(df.columns)

    if country_field and year_field:
        print(f'Upserting using keys: {country_field}, {year_field} (bulk)')
        # NOTE(review): rows sharing the same country/year pair replace one
        # another in the collection; confirm the pair is unique per row.
        ops = []
        for rec in records:
            filter_doc = {country_field: rec.get(country_field), year_field: rec.get(year_field)}
            ops.append(ReplaceOne(filter_doc, rec, upsert=True))
            if len(ops) >= chunk_size:
                coll.bulk_write(ops, ordered=False)
                ops = []
        if ops:
            # Flush the final, partially filled batch.
            coll.bulk_write(ops, ordered=False)
        print(f'Upsert completed for {csv_path}')
    else:
        # Fallback to insert_many; report bulk write errors instead of raising.
        print('No good key fields found — inserting documents (duplicates may error)')
        try:
            if records:
                coll.insert_many(records, ordered=False)
                print(f'Inserted {len(records)} documents into {collection_name}')
            else:
                print('No records found in file')
        except BulkWriteError as bwe:
            print('Bulk write error (some inserts may have failed due to duplicates):')
            print(bwe.details)

# Helper to determine region name from filename
def _region_from_filename(fname):
    lowered = fname.lower()
    if 'austria' in lowered:
        return 'Austria'
    if 'europe' in lowered:
        return 'Europe'
    if 'world' in lowered:
        return 'World'
    return None

# Main: upload every matching CSV found in Data/ into its own collection.
# The collection is named after the detected region, or after a sanitised
# version of the file name when no region keyword is present.
data_dir = 'Data'
if os.path.isdir(data_dir):
    files = [
        entry
        for entry in os.listdir(data_dir)
        if entry.startswith('LuftqualitätStateOfGlobalAir') and entry.lower().endswith('.csv')
    ]
    if files:
        for fname in files:
            full = os.path.join(data_dir, fname)
            region = _region_from_filename(fname)
            if not region:
                # No region keyword: derive a safe collection suffix from the
                # file name without its extension.
                base = os.path.splitext(fname)[0]
                safe = base.replace(' ', '_').replace('-', '_').lower()
                coll_name = f'state_of_global_air_{safe}'
            else:
                coll_name = f'state_of_global_air_{region.lower()}'
            print(f'Uploading {fname} -> collection: {coll_name}')
            load_and_upload(full, collection_name=coll_name, region=region)
        print('All files processed.')
    else:
        print('No matching CSV files found in Data/. Make sure the CSVs are in the `Data` directory.')
else:
    print('Data directory not found; please ensure the CSVs exist in a folder named `Data`')

Uploading LuftqualitätStateOfGlobalAirEurope.csv -> collection: state_of_global_air_europe
Loading: Data\LuftqualitätStateOfGlobalAirEurope.csv
Loaded with opts: {'encoding': 'utf-8', 'sep': ','}
No good key fields found — inserting documents (duplicates may error)
Inserted 852 documents into state_of_global_air_europe
Uploading LuftqualitätStateOfGlobalAirWorld.csv -> collection: state_of_global_air_world
Loading: Data\LuftqualitätStateOfGlobalAirWorld.csv
Loaded with opts: {'encoding': 'utf-8', 'sep': ','}
No good key fields found — inserting documents (duplicates may error)
Inserted 6972 documents into state_of_global_air_world
Uploading LuftqualitätStateOfGlobalAirAustria.csv -> collection: state_of_global_air_austria
Loading: Data\LuftqualitätStateOfGlobalAirAustria.csv
Loaded with opts: {'encoding': 'utf-8', 'sep': ';'}
No good key fields found — inserting documents (duplicates may error)
Inserted 36 documents into state_of_global_air_austria
All files processed.
