<h1 style="text-align: center">Database to JSON Export Pipeline</h1>

This notebook extracts data from the PostgreSQL database and converts it to JSON/GeoJSON files for the frontend application. The exported files are placed in `../frontend/public/data/` for static serving.

To update json data files, simply re-run all of these cells and the json files will be overwritten with updated value from the PostgreSQL database. 

## AVAs

In [1]:
import os
import json
import geopandas as gpd
import pandas as pd
from sqlalchemy import create_engine, text
from dotenv import load_dotenv
from pathlib import Path

# Load environment variables
dotenv_path = '../.env'
load_dotenv(dotenv_path=dotenv_path)

DATABASE_URL = os.getenv('DATABASE_URL')
if not DATABASE_URL:
    raise ValueError("DATABASE_URL not found in .env file. Make sure .env file exists in the 'backend' directory.")

# Create database connection
engine = create_engine(DATABASE_URL)

# Define output directory
output_dir = Path('../../frontend/public/data/')
output_dir.mkdir(parents=True, exist_ok=True)

print(f"Database connection configured")
print(f"Output directory: {output_dir.absolute()}")
print(f"Setup complete.")

Database connection configured
Output directory: /home/tabernater/projects/VitisVeritas/backend/notebooks/../../frontend/public/data
Setup complete.


In [3]:
# Query to fetch all AVAs with their geometry
avas_query = """
SELECT 
    id,
    name,
    description,
    ST_AsGeoJSON(geom) as geometry,
    created_at,
    updated_at
FROM avas
ORDER BY name;
"""

print("Fetching AVAs from database...")

# Execute query and create GeoDataFrame
with engine.connect() as connection:
    df_avas = pd.read_sql(avas_query, connection)

print(f"Retrieved {len(df_avas)} AVAs from database")

# Convert to GeoDataFrame
from shapely.geometry import shape

# Parse the geometry from GeoJSON strings using shapely
geometries = df_avas['geometry'].apply(
    lambda x: shape(json.loads(x)) if x else None
)

# Create GeoDataFrame
gdf_avas = gpd.GeoDataFrame(
    df_avas.drop('geometry', axis=1),
    geometry=geometries,
    crs='EPSG:4326'
)

# Display basic info
print(f"AVAs data preview:")
print(f"   - Total AVAs: {len(gdf_avas)}")
print(f"   - Columns: {list(gdf_avas.columns)}")
print(f"   - CRS: {gdf_avas.crs}")
print(f"   - Geometry types: {gdf_avas.geometry.geom_type.value_counts().to_dict()}")

# Display first few rows
print(f"\nFirst 3 AVAs:")
for idx, row in gdf_avas.head(3).iterrows():
    print(f"   - {row['name']}: {len(row['description'] or 'No description')} chars description")

📥 Fetching AVAs from database...
✅ Retrieved 12 AVAs from database
📊 AVAs data preview:
   - Total AVAs: 12
   - Columns: ['id', 'name', 'description', 'created_at', 'updated_at', 'geometry']
   - CRS: EPSG:4326
   - Geometry types: {'MultiPolygon': 12}

📋 First 3 AVAs:
   - Chehalem Mountains: 14 chars description
   - Dundee Hills: 14 chars description
   - Eola-Amity Hills: 14 chars description


In [4]:
# Export to GeoJSON file
avas_output_path = output_dir / 'avas.geojson'

print(f"Exporting AVAs to: {avas_output_path}")

# Export as GeoJSON
gdf_avas.to_file(avas_output_path, driver='GeoJSON')

# Verify the file was created and get file size
if avas_output_path.exists():
    file_size_mb = avas_output_path.stat().st_size / (1024 * 1024)
    print(f"Successfully exported AVAs!")
    print(f"   - File: {avas_output_path}")
    print(f"   - Size: {file_size_mb:.2f} MB")
    print(f"   - Features: {len(gdf_avas)}")
    
    # Read a snippet to verify structure
    with open(avas_output_path, 'r') as f:
        sample = f.read(200)
        print(f"   - Sample content: {sample}...")
else:
    print("Failed to create AVAs GeoJSON file")

💾 Exporting AVAs to: ../../frontend/public/data/avas.geojson
✅ Successfully exported AVAs!
   - File: ../../frontend/public/data/avas.geojson
   - Size: 3.48 MB
   - Features: 12
   - Sample content: {
"type": "FeatureCollection",
"name": "avas",
"crs": { "type": "name", "properties": { "name": "urn:ogc:def:crs:OGC:1.3:CRS84" } },
"features": [
{ "type": "Feature", "properties": { "id": 10, "name"...
✅ Successfully exported AVAs!
   - File: ../../frontend/public/data/avas.geojson
   - Size: 3.48 MB
   - Features: 12
   - Sample content: {
"type": "FeatureCollection",
"name": "avas",
"crs": { "type": "name", "properties": { "name": "urn:ogc:def:crs:OGC:1.3:CRS84" } },
"features": [
{ "type": "Feature", "properties": { "id": 10, "name"...


## Wineries

In [2]:
# Query to fetch all wineries with their geometry and related AVA information
wineries_query = """
SELECT 
    w.id,
    w.name,
    w.address,
    w.website_url,
    w.description,
    w.phone,
    w.ava_id,
    a.name as ava_name,
    ST_AsGeoJSON(w.location) as geometry,
    w.created_at,
    w.updated_at
FROM wineries w
LEFT JOIN avas a ON w.ava_id = a.id
WHERE w.location IS NOT NULL  -- Only include wineries with valid location data
ORDER BY w.name;
"""

print("Fetching wineries from database...")

# Execute query and create DataFrame
with engine.connect() as connection:
    df_wineries = pd.read_sql(wineries_query, connection)

print(f"Retrieved {len(df_wineries)} wineries from database")

# Convert to GeoDataFrame
from shapely.geometry import shape

# Parse the geometry from GeoJSON strings using shapely
geometries = df_wineries['geometry'].apply(
    lambda x: shape(json.loads(x)) if x else None
)

# Create GeoDataFrame
gdf_wineries = gpd.GeoDataFrame(
    df_wineries.drop('geometry', axis=1),
    geometry=geometries,
    crs='EPSG:4326'
)

print(f"Wineries data preview:")
print(f"   - Total wineries: {len(gdf_wineries)}")
print(f"   - Columns: {list(gdf_wineries.columns)}")
print(f"   - CRS: {gdf_wineries.crs}")
print(f"   - Geometry types: {gdf_wineries.geometry.geom_type.value_counts().to_dict()}")

print(f"\nFirst 3 wineries:")
for idx, row in gdf_wineries.head(3).iterrows():
    ava_info = f" ({row['ava_name']})" if row['ava_name'] else " (No AVA)"
    desc_length = len(row['description'] or 'No description')
    print(f"   - {row['name']}{ava_info}: {desc_length} chars description")

# Check for any wineries without location data
no_location_count = df_wineries['geometry'].isna().sum()
if no_location_count > 0:
    print(f"\nNote: {no_location_count} wineries in database have no location data and were excluded")

Fetching wineries from database...
Retrieved 134 wineries from database
Wineries data preview:
   - Total wineries: 134
   - Columns: ['id', 'name', 'address', 'website_url', 'description', 'phone', 'ava_id', 'ava_name', 'created_at', 'updated_at', 'geometry']
   - CRS: EPSG:4326
   - Geometry types: {'Point': 134}

First 3 wineries:
   - Aeolus Vineyards (Eola-Amity Hills): 385 chars description
   - Airlie Winery (Willamette Valley): 363 chars description
   - Alexeli Vineyard + Winery (Willamette Valley): 385 chars description


In [3]:
# Export wineries to GeoJSON file
wineries_output_path = output_dir / 'wineries.geojson'

print(f"Exporting wineries to: {wineries_output_path}")

# Export as GeoJSON
gdf_wineries.to_file(wineries_output_path, driver='GeoJSON')

# Verify the file was created and get file size
if wineries_output_path.exists():
    file_size_mb = wineries_output_path.stat().st_size / (1024 * 1024)
    print(f"Successfully exported wineries!")
    print(f"   - File: {wineries_output_path}")
    print(f"   - Size: {file_size_mb:.2f} MB")
    print(f"   - Features: {len(gdf_wineries)}")
    
    # Read a snippet to verify structure
    with open(wineries_output_path, 'r') as f:
        sample = f.read(300)
        print(f"   - Sample content: {sample}...")
else:
    print("Failed to create wineries GeoJSON file")

# Display summary of AVA distribution
print(f"\nWineries by AVA:")
ava_counts = gdf_wineries['ava_name'].fillna('No AVA assigned').value_counts()
for ava, count in ava_counts.head(10).items():
    print(f"   - {ava}: {count} wineries")
if len(ava_counts) > 10:
    print(f"   - ... and {len(ava_counts) - 10} more AVAs")

Exporting wineries to: ../../frontend/public/data/wineries.geojson
Successfully exported wineries!
   - File: ../../frontend/public/data/wineries.geojson
   - Size: 0.11 MB
   - Features: 134
   - Sample content: {
"type": "FeatureCollection",
"name": "wineries",
"crs": { "type": "name", "properties": { "name": "urn:ogc:def:crs:OGC:1.3:CRS84" } },
"features": [
{ "type": "Feature", "properties": { "id": 80, "name": "Aeolus Vineyards", "address": "3935 Oak Knoll Rd NW, Salem, OR 97304", "website_url": "http:/...

Wineries by AVA:
   - Willamette Valley: 75 wineries
   - Eola-Amity Hills: 41 wineries
   - Van Duzer: 8 wineries
   - Lower Long Tom: 7 wineries
   - Mt Pisgah Polk County: 3 wineries


## Soils

In [1]:
import os
import json
import geopandas as gpd
import pandas as pd
from sqlalchemy import create_engine, text
from dotenv import load_dotenv
from pathlib import Path
from shapely.geometry import shape

dotenv_path = '../.env'
load_dotenv(dotenv_path=dotenv_path)

DATABASE_URL = os.getenv('DATABASE_URL')
if not DATABASE_URL:
    raise ValueError("DATABASE_URL not found in .env file.")

engine = create_engine(DATABASE_URL)

output_dir = Path('../../frontend/public/data/')
output_dir.mkdir(parents=True, exist_ok=True)

print(f"Database connection configured")
print(f"Output directory: {output_dir.absolute()}")
print("Setup complete.")

Database connection configured
Output directory: /home/tabernater/projects/VitisVeritas/backend/notebooks/../../frontend/public/data
Setup complete.


In [2]:
# Query to fetch all soils with their geometry and metadata
# Based on actual table structure: mukey, musym, muname, geom
soils_query = """
SELECT 
    mukey,
    musym,
    muname,
    ST_AsGeoJSON(geom) as geometry
FROM soils
ORDER BY mukey;
"""

print("Fetching soils data from database...")

# Execute query
with engine.connect() as connection:
    df_soils = pd.read_sql(soils_query, connection)

print(f"Retrieved {len(df_soils)} soil polygons from database")

# Display sample data
print(f"\nSample soil data:")
for idx, row in df_soils.head(3).iterrows():
    print(f"   - {row['mukey']} ({row['musym']}): {row['muname']}")

Fetching soils data from database...
Retrieved 24673 soil polygons from database

Sample soil data:
   - 1404559 (1E): Apt-McDuff complex, 5 to 30 percent slopes
   - 1404559 (1E): Apt-McDuff complex, 5 to 30 percent slopes
   - 1404559 (1E): Apt-McDuff complex, 5 to 30 percent slopes


In [3]:
print("Converting to GeoDataFrame...")

# Parse the geometry from GeoJSON strings
geometries = df_soils['geometry'].apply(
    lambda x: shape(json.loads(x)) if x else None
)

# Create GeoDataFrame
gdf_soils = gpd.GeoDataFrame(
    df_soils.drop('geometry', axis=1),
    geometry=geometries,
    crs='EPSG:4326'
)

# Display basic info
print(f"Soils GeoDataFrame created:")
print(f"   - Total features: {len(gdf_soils)}")
print(f"   - Columns: {list(gdf_soils.columns)}")
print(f"   - CRS: {gdf_soils.crs}")
print(f"   - Geometry types: {gdf_soils.geometry.geom_type.value_counts().to_dict()}")

# Check for any null geometries
null_geoms = gdf_soils.geometry.isnull().sum()
if null_geoms > 0:
    print(f"   - Warning: {null_geoms} null geometries found")

Converting to GeoDataFrame...
Soils GeoDataFrame created:
   - Total features: 24673
   - Columns: ['mukey', 'musym', 'muname', 'geometry']
   - CRS: EPSG:4326
   - Geometry types: {'MultiPolygon': 24673}


In [4]:
# Export to GeoJSON file
soils_output_path = output_dir / 'soils.geojson'

print(f"Exporting soils to: {soils_output_path}")

# Export as GeoJSON
gdf_soils.to_file(soils_output_path, driver='GeoJSON')

# Verify the file was created
if soils_output_path.exists():
    file_size_mb = soils_output_path.stat().st_size / (1024 * 1024)
    print(f"Successfully exported soils GeoJSON!")
    print(f"   - File: {soils_output_path}")
    print(f"   - Size: {file_size_mb:.2f} MB")
    print(f"   - Features: {len(gdf_soils)}")
else:
    print("Failed to create soils GeoJSON file")

Exporting soils to: ../../frontend/public/data/soils.geojson
Successfully exported soils GeoJSON!
   - File: ../../frontend/public/data/soils.geojson
   - Size: 74.30 MB
   - Features: 24673


In [5]:
print("\nValidating GeoJSON structure...")

# Read back and verify
try:
    with open(soils_output_path, 'r') as f:
        geojson_data = json.load(f)
    
    print(f"GeoJSON validation:")
    print(f"   - Type: {geojson_data.get('type', 'Unknown')}")
    print(f"   - Features count: {len(geojson_data.get('features', []))}")
    
    # Check first feature structure
    if geojson_data.get('features'):
        first_feature = geojson_data['features'][0]
        properties = first_feature.get('properties', {})
        print(f"   - Sample properties: {list(properties.keys())}")
        print(f"   - Sample mukey: {properties.get('mukey', 'N/A')}")
        print(f"   - Sample soil name: {properties.get('muname', 'N/A')}")
        print(f"   - Sample musym: {properties.get('musym', 'N/A')}")
    
except Exception as e:
    print(f"Error validating GeoJSON: {e}")


Validating GeoJSON structure...
GeoJSON validation:
   - Type: FeatureCollection
   - Features count: 24673
   - Sample properties: ['mukey', 'musym', 'muname']
   - Sample mukey: 1404559
   - Sample soil name: Apt-McDuff complex, 5 to 30 percent slopes
   - Sample musym: 1E
