<h1 style="text-align: center">Database to JSON Export Pipeline</h1>

This notebook extracts data from the PostgreSQL database and converts it to JSON/GeoJSON files for the frontend application. The exported files are placed in `../frontend/public/data/` for static serving.

To update json data files, simply re-run all of these cells and the json files will be overwritten with updated value from the PostgreSQL database. 

## AVAs

In [1]:
import os
import json
import geopandas as gpd
import pandas as pd
from sqlalchemy import create_engine, text
from dotenv import load_dotenv
from pathlib import Path

# Load environment variables
dotenv_path = '../.env'
load_dotenv(dotenv_path=dotenv_path)

DATABASE_URL = os.getenv('DATABASE_URL')
if not DATABASE_URL:
    raise ValueError("DATABASE_URL not found in .env file. Make sure .env file exists in the 'backend' directory.")

# Create database connection
engine = create_engine(DATABASE_URL)

# Define output directory
output_dir = Path('../../frontend/public/data/')
output_dir.mkdir(parents=True, exist_ok=True)

print(f"Database connection configured")
print(f"Output directory: {output_dir.absolute()}")
print(f"Setup complete.")

Database connection configured
Output directory: /home/tabernater/projects/VitisVeritas/backend/notebooks/../../frontend/public/data
Setup complete.


In [3]:
# Query to fetch all AVAs with their geometry
avas_query = """
SELECT 
    id,
    name,
    description,
    ST_AsGeoJSON(geom) as geometry,
    created_at,
    updated_at
FROM avas
ORDER BY name;
"""

print("Fetching AVAs from database...")

# Execute query and create GeoDataFrame
with engine.connect() as connection:
    df_avas = pd.read_sql(avas_query, connection)

print(f"Retrieved {len(df_avas)} AVAs from database")

# Convert to GeoDataFrame
from shapely.geometry import shape

# Parse the geometry from GeoJSON strings using shapely
geometries = df_avas['geometry'].apply(
    lambda x: shape(json.loads(x)) if x else None
)

# Create GeoDataFrame
gdf_avas = gpd.GeoDataFrame(
    df_avas.drop('geometry', axis=1),
    geometry=geometries,
    crs='EPSG:4326'
)

# Display basic info
print(f"AVAs data preview:")
print(f"   - Total AVAs: {len(gdf_avas)}")
print(f"   - Columns: {list(gdf_avas.columns)}")
print(f"   - CRS: {gdf_avas.crs}")
print(f"   - Geometry types: {gdf_avas.geometry.geom_type.value_counts().to_dict()}")

# Display first few rows
print(f"\nFirst 3 AVAs:")
for idx, row in gdf_avas.head(3).iterrows():
    print(f"   - {row['name']}: {len(row['description'] or 'No description')} chars description")

📥 Fetching AVAs from database...
✅ Retrieved 12 AVAs from database
📊 AVAs data preview:
   - Total AVAs: 12
   - Columns: ['id', 'name', 'description', 'created_at', 'updated_at', 'geometry']
   - CRS: EPSG:4326
   - Geometry types: {'MultiPolygon': 12}

📋 First 3 AVAs:
   - Chehalem Mountains: 14 chars description
   - Dundee Hills: 14 chars description
   - Eola-Amity Hills: 14 chars description


In [4]:
# Export to GeoJSON file
avas_output_path = output_dir / 'avas.geojson'

print(f"Exporting AVAs to: {avas_output_path}")

# Export as GeoJSON
gdf_avas.to_file(avas_output_path, driver='GeoJSON')

# Verify the file was created and get file size
if avas_output_path.exists():
    file_size_mb = avas_output_path.stat().st_size / (1024 * 1024)
    print(f"Successfully exported AVAs!")
    print(f"   - File: {avas_output_path}")
    print(f"   - Size: {file_size_mb:.2f} MB")
    print(f"   - Features: {len(gdf_avas)}")
    
    # Read a snippet to verify structure
    with open(avas_output_path, 'r') as f:
        sample = f.read(200)
        print(f"   - Sample content: {sample}...")
else:
    print("Failed to create AVAs GeoJSON file")

💾 Exporting AVAs to: ../../frontend/public/data/avas.geojson
✅ Successfully exported AVAs!
   - File: ../../frontend/public/data/avas.geojson
   - Size: 3.48 MB
   - Features: 12
   - Sample content: {
"type": "FeatureCollection",
"name": "avas",
"crs": { "type": "name", "properties": { "name": "urn:ogc:def:crs:OGC:1.3:CRS84" } },
"features": [
{ "type": "Feature", "properties": { "id": 10, "name"...
✅ Successfully exported AVAs!
   - File: ../../frontend/public/data/avas.geojson
   - Size: 3.48 MB
   - Features: 12
   - Sample content: {
"type": "FeatureCollection",
"name": "avas",
"crs": { "type": "name", "properties": { "name": "urn:ogc:def:crs:OGC:1.3:CRS84" } },
"features": [
{ "type": "Feature", "properties": { "id": 10, "name"...


## Wineries

In [2]:
# Query to fetch all wineries with their geometry and related AVA information
wineries_query = """
SELECT 
    w.id,
    w.name,
    w.address,
    w.website_url,
    w.description,
    w.phone,
    w.ava_id,
    a.name as ava_name,
    ST_AsGeoJSON(w.location) as geometry,
    w.created_at,
    w.updated_at
FROM wineries w
LEFT JOIN avas a ON w.ava_id = a.id
WHERE w.location IS NOT NULL  -- Only include wineries with valid location data
ORDER BY w.name;
"""

print("Fetching wineries from database...")

# Execute query and create DataFrame
with engine.connect() as connection:
    df_wineries = pd.read_sql(wineries_query, connection)

print(f"Retrieved {len(df_wineries)} wineries from database")

# Convert to GeoDataFrame
from shapely.geometry import shape

# Parse the geometry from GeoJSON strings using shapely
geometries = df_wineries['geometry'].apply(
    lambda x: shape(json.loads(x)) if x else None
)

# Create GeoDataFrame
gdf_wineries = gpd.GeoDataFrame(
    df_wineries.drop('geometry', axis=1),
    geometry=geometries,
    crs='EPSG:4326'
)

print(f"Wineries data preview:")
print(f"   - Total wineries: {len(gdf_wineries)}")
print(f"   - Columns: {list(gdf_wineries.columns)}")
print(f"   - CRS: {gdf_wineries.crs}")
print(f"   - Geometry types: {gdf_wineries.geometry.geom_type.value_counts().to_dict()}")

print(f"\nFirst 3 wineries:")
for idx, row in gdf_wineries.head(3).iterrows():
    ava_info = f" ({row['ava_name']})" if row['ava_name'] else " (No AVA)"
    desc_length = len(row['description'] or 'No description')
    print(f"   - {row['name']}{ava_info}: {desc_length} chars description")

# Check for any wineries without location data
no_location_count = df_wineries['geometry'].isna().sum()
if no_location_count > 0:
    print(f"\nNote: {no_location_count} wineries in database have no location data and were excluded")

Fetching wineries from database...
Retrieved 180 wineries from database
Wineries data preview:
   - Total wineries: 180
   - Columns: ['id', 'name', 'address', 'website_url', 'description', 'phone', 'ava_id', 'ava_name', 'created_at', 'updated_at', 'geometry']
   - CRS: EPSG:4326
   - Geometry types: {'Point': 180}

First 3 wineries:
   - Aegrina Vineyard (McMinnville): 317 chars description
   - Aeolus Vineyards (Eola-Amity Hills): 385 chars description
   - Airlie Winery (Willamette Valley): 363 chars description


In [3]:
# Export wineries to GeoJSON file
wineries_output_path = output_dir / 'wineries.geojson'

print(f"Exporting wineries to: {wineries_output_path}")

# Export as GeoJSON
gdf_wineries.to_file(wineries_output_path, driver='GeoJSON')

# Verify the file was created and get file size
if wineries_output_path.exists():
    file_size_mb = wineries_output_path.stat().st_size / (1024 * 1024)
    print(f"Successfully exported wineries!")
    print(f"   - File: {wineries_output_path}")
    print(f"   - Size: {file_size_mb:.2f} MB")
    print(f"   - Features: {len(gdf_wineries)}")
    
    # Read a snippet to verify structure
    with open(wineries_output_path, 'r') as f:
        sample = f.read(300)
        print(f"   - Sample content: {sample}...")
else:
    print("Failed to create wineries GeoJSON file")

# Display summary of AVA distribution
print(f"\nWineries by AVA:")
ava_counts = gdf_wineries['ava_name'].fillna('No AVA assigned').value_counts()
for ava, count in ava_counts.head(10).items():
    print(f"   - {ava}: {count} wineries")
if len(ava_counts) > 10:
    print(f"   - ... and {len(ava_counts) - 10} more AVAs")

Exporting wineries to: ../../frontend/public/data/wineries.geojson
Successfully exported wineries!
   - File: ../../frontend/public/data/wineries.geojson
   - Size: 0.14 MB
   - Features: 180
   - Sample content: {
"type": "FeatureCollection",
"name": "wineries",
"crs": { "type": "name", "properties": { "name": "urn:ogc:def:crs:OGC:1.3:CRS84" } },
"features": [
{ "type": "Feature", "properties": { "id": 147, "name": "Aegrina Vineyard", "address": "11000 SW Handley Ln, McMinnville, OR 97128", "website_url": "...

Wineries by AVA:
   - Willamette Valley: 96 wineries
   - Eola-Amity Hills: 48 wineries
   - McMinnville: 11 wineries
   - Van Duzer: 8 wineries
   - Lower Long Tom: 7 wineries
   - Dundee Hills: 7 wineries
   - Mt Pisgah Polk County: 3 wineries


## Soils

In [1]:
import os
import json
import geopandas as gpd
import pandas as pd
from sqlalchemy import create_engine, text
from dotenv import load_dotenv
from pathlib import Path
from shapely.geometry import shape

dotenv_path = '../.env'
load_dotenv(dotenv_path=dotenv_path)

DATABASE_URL = os.getenv('DATABASE_URL')
if not DATABASE_URL:
    raise ValueError("DATABASE_URL not found in .env file.")

engine = create_engine(DATABASE_URL)

output_dir = Path('../../frontend/public/data/')
output_dir.mkdir(parents=True, exist_ok=True)

print(f"Database connection configured")
print(f"Output directory: {output_dir.absolute()}")
print("Setup complete.")

Database connection configured
Output directory: /home/tabernater/projects/VitisVeritas/backend/notebooks/../../frontend/public/data
Setup complete.


In [2]:
# First, let's see what counties are available in the database
counties_query = """
SELECT 
    county,
    COUNT(*) as polygon_count
FROM soils 
GROUP BY county
ORDER BY county;
"""

print("Checking available counties in database...")

with engine.connect() as connection:
    df_counties = pd.read_sql(counties_query, connection)

print(f"Available counties:")
for _, row in df_counties.iterrows():
    print(f"   - {row['county'].title()}: {row['polygon_count']} polygons")

# County selection (you can modify this to process specific counties)
COUNTIES_TO_PROCESS = df_counties['county'].tolist()  # Process all counties or specify specific counties: COUNTIES_TO_PROCESS = ['polk', 'marion']

print(f"\nWill process {len(COUNTIES_TO_PROCESS)} counties: {COUNTIES_TO_PROCESS}")

Checking available counties in database...
Available counties:
   - Benton: 6062 polygons
   - Clackamas: 10053 polygons
   - Lane: 36195 polygons
   - Linn: 20135 polygons
   - Marion: 10644 polygons
   - Multnomah: 2996 polygons
   - Polk: 14029 polygons
   - Washington: 12410 polygons
   - Yamhill: 11342 polygons

Will process 9 counties: ['benton', 'clackamas', 'lane', 'linn', 'marion', 'multnomah', 'polk', 'washington', 'yamhill']


In [3]:
def create_county_geojson(county_name, engine, output_dir):
    """
    Create a GeoJSON file for a specific county's soil data
    """
    print(f"\n{'='*50}")
    print(f"Processing {county_name.title()} County")
    print(f"{'='*50}")
    
    # Query to fetch soils for specific county
    county_soils_query = """
    SELECT 
        mukey,
        musym,
        muname,
        county,
        ST_AsGeoJSON(geom) as geometry
    FROM soils
    WHERE county = %(county)s
    ORDER BY mukey;
    """
    
    print(f"Fetching {county_name} soil data from database...")
    
    # Execute query with county parameter - using dictionary format
    with engine.connect() as connection:
        df_county_soils = pd.read_sql(county_soils_query, connection, params={'county': county_name})
    
    if len(df_county_soils) == 0:
        print(f"   WARNING: No soil data found for {county_name}")
        return None
    
    print(f"   Retrieved {len(df_county_soils)} soil polygons")
    
    # Display sample data
    print(f"   Sample soil data:")
    for idx, row in df_county_soils.head(3).iterrows():
        print(f"      - {row['mukey']} ({row['musym']}): {row['muname']}")
    
    # Convert to GeoDataFrame
    print(f"   Converting to GeoDataFrame...")
    
    # Parse the geometry from GeoJSON strings
    geometries = df_county_soils['geometry'].apply(
        lambda x: shape(json.loads(x)) if x else None
    )
    
    # Create GeoDataFrame
    gdf_county_soils = gpd.GeoDataFrame(
        df_county_soils.drop('geometry', axis=1),
        geometry=geometries,
        crs='EPSG:4326'
    )
    
    # Display basic info
    print(f"   GeoDataFrame created:")
    print(f"      - Total features: {len(gdf_county_soils)}")
    print(f"      - Unique soil types: {gdf_county_soils['mukey'].nunique()}")
    print(f"      - Geometry types: {gdf_county_soils.geometry.geom_type.value_counts().to_dict()}")
    
    # Check for any null geometries
    null_geoms = gdf_county_soils.geometry.isnull().sum()
    if null_geoms > 0:
        print(f"      - Warning: {null_geoms} null geometries found")
    
    # Export to county-specific GeoJSON file
    county_output_path = output_dir / f'{county_name}_soils.geojson'
    
    print(f"   Exporting to: {county_output_path}")
    
    # Export as GeoJSON
    gdf_county_soils.to_file(county_output_path, driver='GeoJSON')
    
    # Verify the file was created and get stats
    if county_output_path.exists():
        file_size_mb = county_output_path.stat().st_size / (1024 * 1024)
        print(f"   ✓ Successfully exported {county_name} soils GeoJSON!")
        print(f"      - File: {county_output_path.name}")
        print(f"      - Size: {file_size_mb:.2f} MB")
        print(f"      - Features: {len(gdf_county_soils)}")
        return county_output_path
    else:
        print(f"   ✗ Failed to create {county_name} GeoJSON file")
        return None

# Process each county
successful_exports = []
failed_exports = []

for county in COUNTIES_TO_PROCESS:
    try:
        result = create_county_geojson(county, engine, output_dir)
        if result:
            successful_exports.append(county)
        else:
            failed_exports.append(county)
    except Exception as e:
        print(f"   ✗ ERROR processing {county}: {e}")
        failed_exports.append(county)

print(f"\n{'='*60}")
print("PROCESSING SUMMARY")
print(f"{'='*60}")
print(f"Successfully processed: {len(successful_exports)} counties")
for county in successful_exports:
    print(f"   ✓ {county}")

if failed_exports:
    print(f"Failed to process: {len(failed_exports)} counties")
    for county in failed_exports:
        print(f"   ✗ {county}")


Processing Benton County
Fetching benton soil data from database...
   Retrieved 6062 soil polygons
   Sample soil data:
      - 1145663 (142): Sevencedars-Newanna complex, 60 to 90 percent slopes
      - 1145663 (142): Sevencedars-Newanna complex, 60 to 90 percent slopes
      - 1145663 (142): Sevencedars-Newanna complex, 60 to 90 percent slopes
   Converting to GeoDataFrame...
   GeoDataFrame created:
      - Total features: 6062
      - Unique soil types: 180
      - Geometry types: {'MultiPolygon': 6062}
   Exporting to: ../../frontend/public/data/benton_soils.geojson
   ✓ Successfully exported benton soils GeoJSON!
      - File: benton_soils.geojson
      - Size: 24.77 MB
      - Features: 6062

Processing Clackamas County
Fetching clackamas soil data from database...
   Retrieved 10053 soil polygons
   Sample soil data:
      - 2711076 (2012A): Waldo silty clay loam, 0 to 3 percent slopes
      - 2711077 (2310C): Woodburn silt loam, 3 to 12 percent slopes
      - 2711077 (2310C)

In [4]:
print("\nValidating generated GeoJSON files...")

# List all generated soil GeoJSON files
soil_files = list(output_dir.glob("*_soils.geojson"))

print(f"\nGenerated soil GeoJSON files ({len(soil_files)}):")

total_size = 0
total_features = 0

for file_path in sorted(soil_files):
    try:
        # Get file stats
        file_size_mb = file_path.stat().st_size / (1024 * 1024)
        total_size += file_size_mb
        
        # Read and validate GeoJSON structure
        with open(file_path, 'r') as f:
            geojson_data = json.load(f)
        
        feature_count = len(geojson_data.get('features', []))
        total_features += feature_count
        
        # Extract county name from filename
        county_name = file_path.stem.replace('_soils', '')
        
        print(f"   📄 {file_path.name}")
        print(f"      County: {county_name.title()}")
        print(f"      Size: {file_size_mb:.2f} MB")
        print(f"      Features: {feature_count}")
        
        # Check first feature structure
        if geojson_data.get('features'):
            first_feature = geojson_data['features'][0]
            properties = first_feature.get('properties', {})
            print(f"      Sample properties: {list(properties.keys())}")
            print(f"      Sample county: {properties.get('county', 'N/A')}")
        
        print()
        
    except Exception as e:
        print(f"   ✗ Error validating {file_path.name}: {e}")

print(f"Total generated files: {len(soil_files)}")
print(f"Total file size: {total_size:.2f} MB")
print(f"Total features across all files: {total_features}")


Validating generated GeoJSON files...

Generated soil GeoJSON files (9):
   📄 benton_soils.geojson
      County: Benton
      Size: 24.77 MB
      Features: 6062
      Sample properties: ['mukey', 'musym', 'muname', 'county']
      Sample county: benton

   📄 clackamas_soils.geojson
      County: Clackamas
      Size: 34.37 MB
      Features: 10053
      Sample properties: ['mukey', 'musym', 'muname', 'county']
      Sample county: clackamas

   📄 lane_soils.geojson
      County: Lane
      Size: 92.41 MB
      Features: 29787
      Sample properties: ['mukey', 'musym', 'muname', 'county']
      Sample county: lane

   📄 linn_soils.geojson
      County: Linn
      Size: 63.94 MB
      Features: 20135
      Sample properties: ['mukey', 'musym', 'muname', 'county']
      Sample county: linn

   📄 marion_soils.geojson
      County: Marion
      Size: 34.49 MB
      Features: 10644
      Sample properties: ['mukey', 'musym', 'muname', 'county']
      Sample county: marion

   📄 multnomah_

In [5]:
# Create an index file that lists all available county soil files
# This can be useful for your frontend to know what counties are available

county_index = {
    "counties": [],
    "generated_at": pd.Timestamp.now().isoformat(),
    "total_files": len(soil_files),
    "total_size_mb": round(total_size, 2)
}

for file_path in sorted(soil_files):
    county_name = file_path.stem.replace('_soils', '')
    file_size_mb = file_path.stat().st_size / (1024 * 1024)
    
    # Read feature count
    with open(file_path, 'r') as f:
        geojson_data = json.load(f)
    feature_count = len(geojson_data.get('features', []))
    
    county_info = {
        "name": county_name,
        "display_name": county_name.title(),
        "filename": file_path.name,
        "size_mb": round(file_size_mb, 2),
        "feature_count": feature_count
    }
    county_index["counties"].append(county_info)

# Save index file
index_path = output_dir / 'counties_index.json'
with open(index_path, 'w') as f:
    json.dump(county_index, f, indent=2)

print(f"Created county index file: {index_path}")
print(f"Index contains {len(county_index['counties'])} counties")

Created county index file: ../../frontend/public/data/counties_index.json
Index contains 9 counties
