In [1]:
import os
from datetime import datetime

import pandas as pd
from pymongo import MongoClient
from pymongo.server_api import ServerApi

# MongoDB connection settings.
# The connection string carries credentials, so it is read from the
# environment rather than being hard-coded in the notebook.
# NOTE(review): a database password was previously embedded on this line;
# treat it as leaked and rotate it in Atlas.
MONGO_URI = os.environ.get("MONGO_URI")
if not MONGO_URI:
    raise RuntimeError(
        "Set the MONGO_URI environment variable to the MongoDB connection string."
    )
DATABASE_NAME = "aqi_feature_store"
COLLECTION_NAME = "aqi_features"
SOURCE_CSV = "data/cleaned_aqi_data_v2.csv"

print("Connecting to MongoDB Atlas...")
client = MongoClient(MONGO_URI, server_api=ServerApi('1'))
# Issue a ping so connection/auth problems surface here, before any data is loaded.
client.admin.command('ping')
print("Connected!\n")

# Load data
print("Loading data...")
df = pd.read_csv(SOURCE_CSV)
print(f"Loaded {len(df)} rows, {len(df.columns)} columns\n")

# Find the timestamp column: the first column whose name contains "time"
# (case-insensitive) is parsed as datetimes.
timestamp_col = None
for col in df.columns:
    if 'time' in col.lower():
        df[col] = pd.to_datetime(df[col])
        timestamp_col = col
        break

# Every record gets a 'timestamp' field; fall back to the load time when the
# CSV has no time-like column.
if timestamp_col:
    df['timestamp'] = df[timestamp_col]
else:
    df['timestamp'] = datetime.now()

# Convert to records.
# The 'timestamp' column is always datetime-typed at this point, so there are
# no strings to parse. What does need handling is pandas' missing-datetime
# marker (NaT), which PyMongo cannot BSON-encode: store it as null instead.
# Timestamps are converted to plain datetime objects for the driver.
records = df.to_dict('records')
for record in records:
    for key, value in record.items():
        if value is pd.NaT:
            record[key] = None
        elif isinstance(value, pd.Timestamp):
            record[key] = value.to_pydatetime()

# Upload to MongoDB (full refresh of the feature collection)
print("Uploading to MongoDB...")
db = client[DATABASE_NAME]
collection = db[COLLECTION_NAME]

# insert_many() rejects an empty document list, so check before the
# destructive delete below. Otherwise an empty CSV would wipe the collection
# and then fail, leaving the feature store empty.
if not records:
    raise ValueError(
        f"No records loaded from {SOURCE_CSV}; existing data left untouched."
    )

collection.delete_many({})  # Clear existing data
collection.insert_many(records)
# Ascending index on timestamp to support time-ordered queries.
collection.create_index([("timestamp", 1)])

print(f"Uploaded {len(records)} records!\n")

# Save metadata: a single document describing the most recent upload.
metadata = {
    # Timezone-aware on purpose: PyMongo assumes naive datetimes are UTC, so a
    # naive local time would be stored shifted by the local UTC offset.
    "upload_date": datetime.now().astimezone(),
    "total_records": len(records),
    "columns": list(df.columns),
    "date_range": {
        "start": str(df['timestamp'].min()),
        "end": str(df['timestamp'].max())
    }
}

# Replace any previous metadata so the collection only describes this upload.
metadata_collection = db['feature_metadata']
metadata_collection.delete_many({})
metadata_collection.insert_one(metadata)
print("SUCCESS!")
print(f"Database: {DATABASE_NAME}")
print(f"Collection: {COLLECTION_NAME}")
print(f"Records: {len(records)}")
print("\nView at: https://cloud.mongodb.com")

# Query example: read back a sample to confirm the upload.
banner = "=" * 60
print("\n" + banner)
print("Query your data:")
print(banner)

# Fetch the five most recent documents (newest timestamp first).
newest_cursor = collection.find({}).sort('timestamp', -1).limit(5)
latest = pd.DataFrame(list(newest_cursor))
# Hide MongoDB's internal document id when it is present.
latest = latest.drop(columns=['_id'], errors='ignore')

print("\nLatest 5 records:")
print(latest)

# Collection-wide statistics.
total_in_db = collection.count_documents({})
print(f"\nTotal records in database: {total_in_db}")


client.close()
print("\nDone!")

Connecting to MongoDB Atlas...
Connected!

Loading data...
Loaded 4340 rows, 46 columns

Uploading to MongoDB...
Uploaded 4340 records!

SUCCESS!
Database: aqi_feature_store
Collection: aqi_features
Records: 4340

View at: https://cloud.mongodb.com

Query your data:

Latest 5 records:
                 time  pm2_5  pm10  nitrogen_dioxide  ozone  sulphur_dioxide  \
0 2026-02-16 19:00:00   31.1  39.1              43.9   34.0             13.9   
1 2026-02-16 18:00:00   29.9  37.8              45.6   34.0             15.1   
2 2026-02-16 17:00:00   27.9  36.0              42.6   39.0             15.2   
3 2026-02-16 16:00:00   26.0  35.0              36.7   47.0             14.7   
4 2026-02-16 15:00:00   24.6  34.9              30.0   56.0             14.3   

   carbon_monoxide  aqi_pm25  aqi_pm10  aqi  ...  aqi_ma_24h  aqi_std_24h  \
0            524.0        91        36   91  ...  111.041667    84.711007   
1            544.0        88        35   88  ...  112.250000    84.619481   
2 