# MVP validation workflow for the Oregon SQM Dashboard project.

Steps:
1. Load raw data tables and display basic info
2. Perform geocoding for all sites and save results
3. Visualize geocoded sites on a Folium map
4. Show bar charts and scatter plots
5. Smoke-test Dash, Flask, and (lightweight) Streamlit app imports
6. Summarize results

## Load raw data tables and display basic info

In [None]:
# Import necessary libraries
from pathlib import Path
import json
import pandas as pd
import importlib
import sys
import logging

In [None]:
# Resolve the project layout from the notebook's working directory.
# PROJECT_ROOT is the parent of the cwd (one level up from 'development').
PROJECT_ROOT = Path.cwd().parent
SHARED_DIR = PROJECT_ROOT / 'shared'

# Prepend the project root, then the shared directory, to sys.path so that the
# `utils` package can be resolved (matches streamlit_app logic). Entries that
# are already present are left where they are.
for import_root in (PROJECT_ROOT, SHARED_DIR):
    if str(import_root) not in sys.path:
        sys.path.insert(0, str(import_root))

RAW_DIR = SHARED_DIR / 'data' / 'raw'
PROCESSED_DIR = SHARED_DIR / 'data' / 'processed'
# Create the processed-data directory (and any missing parents) if needed.
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

logging.basicConfig(level=logging.INFO)
print(f"Project root: {PROJECT_ROOT}")
print(f"Shared dir added to sys.path: {SHARED_DIR}")
print(f"Raw data dir: {RAW_DIR}")
print(f"Processed data dir: {PROCESSED_DIR}")

# Report every CSV sitting directly in the raw data directory
for csv_path in RAW_DIR.glob('*.csv'):
    print('Found raw file:', csv_path.name)

# (Optional) quick confirmation that utils is importable; a failure is only
# reported here, not raised.
try:
    import utils  # noqa: F401
except Exception as e:
    print('utils package import FAILED:', e)
else:
    print('utils package import: OK')

Project root: /home/vidit-agrawal/projects/darksky-oregon-dashboard
Raw data dir: /home/vidit-agrawal/projects/darksky-oregon-dashboard/shared/data/raw
Processed data dir: /home/vidit-agrawal/projects/darksky-oregon-dashboard/shared/data/processed
Found raw file: sites_locations.csv
Found raw file: clear_night_measurements.csv


In [18]:
# 1. Load raw data tables
# Each expected CSV is read from RAW_DIR into raw_dfs, keyed by file name;
# files that are not on disk are reported and skipped.
raw_files = ['sites_locations.csv', 'clear_night_measurements.csv']
raw_dfs = {}
for fname in raw_files:
    fpath = RAW_DIR / fname
    if not fpath.exists():
        print(f"Missing raw file: {fname}")
        continue
    df = pd.read_csv(fpath)
    raw_dfs[fname] = df
    print(f"Loaded {fname}: {df.shape[0]} rows, {df.shape[1]} cols")
    display(df.head(3))

# Basic schema/NA validation: one summary dict per loaded table.
# 'missing_any' is 1 when at least one cell is NA, otherwise 0.
validation_report = {
    table_name: {
        'row_count': len(table),
        'missing_any': int(table.isna().any().any()),
        'duplicate_rows': int(table.duplicated().sum()),
        'columns': list(table.columns),
    }
    for table_name, table in raw_dfs.items()
}

print('\nValidation summary:')
for table_name, summary in validation_report.items():
    print(table_name, '=>', summary)

Loaded sites_locations.csv: 63 rows, 2 cols


Unnamed: 0,Name,Install Number
0,Awbrey Butte,1
1,Pine Mountain Observatory,2
2,Prineville Reservoir State Park,3


Loaded clear_night_measurements.csv: 58 rows, 5 cols


Unnamed: 0,site_name,median_brightness_mag_arcsec2,bortle_sky_level,median_linear_scale_flux_ratio,x_brighter_than_darkest_night_sky
0,Eugene Downtown,18.27,7,12.36,26.79
1,Portland SE,18.45,7,10.47,22.7
2,Westmoreland Park,18.59,6,9.2,19.95



Validation summary:
sites_locations.csv => {'row_count': 63, 'missing_any': 0, 'duplicate_rows': 0, 'columns': ['Name', 'Install Number']}
clear_night_measurements.csv => {'row_count': 58, 'missing_any': 0, 'duplicate_rows': 0, 'columns': ['site_name', 'median_brightness_mag_arcsec2', 'bortle_sky_level', 'median_linear_scale_flux_ratio', 'x_brighter_than_darkest_night_sky']}


In [19]:
# Print the column dtypes and non-null counts of the site locations table
sites_locations_df = raw_dfs['sites_locations.csv']
sites_locations_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63 entries, 0 to 62
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Name            63 non-null     object
 1   Install Number  63 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 1.1+ KB


In [21]:
# Print the column dtypes and non-null counts of the clear-night measurements table
clear_night_df = raw_dfs['clear_night_measurements.csv']
clear_night_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58 entries, 0 to 57
Data columns (total 5 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   site_name                          58 non-null     object 
 1   median_brightness_mag_arcsec2      58 non-null     float64
 2   bortle_sky_level                   58 non-null     int64  
 3   median_linear_scale_flux_ratio     58 non-null     float64
 4   x_brighter_than_darkest_night_sky  58 non-null     float64
dtypes: float64(3), int64(1), object(1)
memory usage: 2.4+ KB


## Perform geocoding for all sites & save results

In [None]:
# Re-apply the path adjustments (in case this cell is run standalone).
# PROJECT_ROOT is the parent of the current working directory.
import sys
from pathlib import Path

PROJECT_ROOT = Path.cwd().parent
SHARED_DIR = PROJECT_ROOT / 'shared'
# Prepend the project root first, then the shared directory, skipping any
# entry that is already on sys.path.
for import_root in (PROJECT_ROOT, SHARED_DIR):
    if str(import_root) not in sys.path:
        sys.path.insert(0, str(import_root))

from utils.geocoding import OregonGeocoder

# Instantiate the geocoder with its cache path under shared/data/processed
geocoder = OregonGeocoder(cache_path=PROJECT_ROOT / 'shared' / 'data' / 'processed' / 'geocode_cache.json')

# Site names come from the 'Name' column of the sites table; use an empty
# list when that table was not loaded into raw_dfs.
if 'sites_locations.csv' in raw_dfs:
    site_names = raw_dfs['sites_locations.csv']['Name'].tolist()
else:
    site_names = []
print(f'Total site names to geocode: {len(site_names)} (showing first 5):', site_names[:5])

# Limited batch for a quick run: only the first 10 names (expand later)
subset = site_names[:10]
print('Geocoding subset of size:', len(subset))

# auto_confirm selects the batch call; turning it off geocodes each name
# individually through geocode_site instead.
auto_confirm = True
if not auto_confirm:
    geocode_results = {name: geocoder.geocode_site(name) for name in subset}
else:
    geocode_results = geocoder.geocode_batch(subset, pause_seconds=1.0)

# Count the entries whose geocoding result is truthy
print('Geocoding complete. Successful coords:', sum(bool(result) for result in geocode_results.values()))
list(geocode_results.items())[:5]

ModuleNotFoundError: No module named 'utils'

## Visualize geocoded sites on a Folium map

In [None]:
# Map visualization (requires successful geocoding)
import folium

# Keep only the sites whose geocoding result is truthy
coords = {site: position for site, position in geocode_results.items() if position}
if not coords:
    print('No coordinates to map yet.')
else:
    # Center roughly on Oregon
    fmap = folium.Map(location=[43.9,-120.6], zoom_start=6, tiles='CartoDB positron')
    # One marker per site; each value unpacks into a (lat, lon) pair
    for name, latlon in coords.items():
        lat, lon = latlon
        folium.Marker([lat, lon], popup=name).add_to(fmap)
    display(fmap)

In [None]:
# Smoke-test app entry points (import only)
# Maps each module path to a short label for the framework it belongs to.
modules_to_test = {
    'dash_app.app': 'dash',
    'flask_api.app': 'flask',
    'streamlit_app.app': 'streamlit'
}

# Try importing each module; an Exception raised during import is recorded
# as a 'FAILED: ...' entry rather than propagated.
results = {}
for mod in modules_to_test:
    try:
        importlib.import_module(mod)
    except Exception as e:
        results[mod] = f'FAILED: {e}'
    else:
        results[mod] = 'import ok'
results

## 6. Summary
This MVP notebook performed:
- Data loading & quick validation for raw CSVs
- Processing via `OregonSQMProcessor`
- Sample geocoding & caching
- Map visualization using Folium
- Import smoke tests for Dash, Flask, Streamlit entry points

Next potential steps:
- Add pytest integration in a separate test module
- Expand geocoding to all sites with rate limiting & progress
- Compute derived KPIs (darkest sites, trend deltas)
- Build interactive comparison plots
