# San Diego Epidemiology Tableau Data Extraction

This notebook extracts data from the San Diego County epidemiology Tableau dashboard using session-based API calls.
The extracted data is processed and stored for use in Dagster assets.

In [21]:
import sys
sys.path.append('../public/utils')

import requests
import pandas as pd
import os
import re
import json
from pathlib import Path
from io import StringIO
from bs4 import BeautifulSoup
from tableau_extractor import TableauExtractor, extract_sandiego_epidemiology_data

print("✓ Imports successful")

✓ Imports successful


In [22]:
# Notebook-wide settings; edit these values to point at a different
# dashboard or output location.
TABLEAU_DASHBOARD_URL = "https://public.tableau.com/views/DraftRespDash/RespDash"
TABLEAU_WORKBOOK_URL = "https://public.tableau.com/workbooks/DraftRespDash.twb"
# Sheet document id sent with the export request (points to time series data).
TABLEAU_DOCUMENT_ID = "{6F324CCD-B1F2-4F80-AA86-8AD270C97348}"

# Output folder, relative to the notebook's working directory; created on
# first run and left untouched if it already exists.
DATA_DIR = Path("../../../data/sandiego_epideimilogy")
DATA_DIR.mkdir(parents=True, exist_ok=True)

# Echo the effective settings, one per line.
print(
    f"Data directory: {DATA_DIR.absolute()}",
    f"Dashboard URL: {TABLEAU_DASHBOARD_URL}",
    f"Document ID: {TABLEAU_DOCUMENT_ID}",
    sep="\n",
)

Data directory: /Users/valentin/development/dev_earthcube/resilient_workflows_public/workflows/public/notebooks/../../../data/sandiego_epideimilogy
Dashboard URL: https://public.tableau.com/views/DraftRespDash/RespDash
Document ID: {6F324CCD-B1F2-4F80-AA86-8AD270C97348}


In [None]:
def _workbook_and_view(dashboard_url):
    """Return the (workbook, view) path segments of a Tableau ``/views/`` URL."""
    match = re.search(r'/views/([^/?#]+)/([^/?#]+)', dashboard_url)
    if match:
        return match.group(1), match.group(2)
    # URL is not in /views/<workbook>/<view> form: keep the names this
    # notebook has always used so existing behaviour is preserved.
    return "DraftRespDash", "RespDash"


def _find_session_key(page_text):
    """Return the first session key found in the dashboard HTML, or None."""
    patterns = [
        r'"sessionid":"([^"]+)"',
        r'"session":"([^"]+)"',
        r'sessionId["\']:\s*["\']([^"\']*)["\']'
    ]
    for pattern in patterns:
        match = re.search(pattern, page_text)
        if match:
            return match.group(1)
    return None


def _build_export_form_body(boundary, document_id):
    """Build the CRLF-delimited multipart/form-data body for the export command."""
    fields = [
        ("sheetdocId", document_id),
        ("useTabs", "true"),
        ("sendNotifications", "true"),
        ("telemetryCommandId", "1j18vivat$6u0j-8k-ge-rg-t656ko"),
    ]
    parts = [
        f'--{boundary}\r\nContent-Disposition: form-data; name="{name}"\r\n\r\n{value}\r\n'
        for name, value in fields
    ]
    parts.append(f"--{boundary}--\r\n")
    return "".join(parts)


def extract_session_based_data(dashboard_url, document_id, timeout=30):
    """
    Extract data from a Tableau dashboard using session-based API calls.

    The steps are:
      1. GET the dashboard page to establish a session.
      2. Find a session key in the returned HTML.
      3. POST the ``export-crosstab-to-csvserver`` command to obtain an export key.
      4. GET the temp file for that key, parse it as CSV and save it under
         the notebook-level ``DATA_DIR``.

    Args:
        dashboard_url: Dashboard URL. The workbook and view names used for the
            vizql endpoints are taken from its ``/views/<workbook>/<view>``
            path; if the URL has no such path, ``DraftRespDash``/``RespDash``
            are used.
        document_id: Sheet document id sent as the ``sheetdocId`` form field.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        The parsed ``pandas.DataFrame`` on success, otherwise ``None``. Errors
        are reported with ``print`` rather than raised, so the notebook keeps
        running.
    """
    try:
        workbook, view = _workbook_and_view(dashboard_url)
        vizql_root = f"https://public.tableau.com/vizql/w/{workbook}/v/{view}"

        # The context manager closes the session's pooled connections on
        # every exit path, including the early returns below.
        with requests.Session() as session:
            print("Step 1: Visiting dashboard to establish session...")
            response = session.get(dashboard_url, timeout=timeout)
            response.raise_for_status()
            print("✓ Successfully loaded dashboard")

            # Extract session key from page
            print("Step 2: Extracting session key...")
            session_key = _find_session_key(response.text)

            if not session_key:
                print("✗ Could not find session key in page")
                return None

            print(f"✓ Found session key: {session_key[:10]}...")

            # Step 3: Get export key
            print("Step 3: Getting export key...")
            export_url = f"{vizql_root}/sessions/{session_key}/commands/tabsrv/export-crosstab-to-csvserver"

            boundary = "TLArn3Eh"
            body = _build_export_form_body(boundary, document_id)

            # Headers shared by the export and download requests.
            # NOTE(review): the active-tab header reuses the view name from the
            # URL, which matches the previous hard-coded value for the default
            # dashboard; confirm for dashboards whose tab name differs.
            common_headers = {
                "accept-language": "en-US,en;q=0.7",
                "sec-fetch-dest": "empty",
                "sec-fetch-mode": "cors",
                "sec-fetch-site": "same-origin",
                "x-requested-with": "XMLHttpRequest",
                "x-tableau-version": "2025.2",
                "x-tsi-active-tab": view
            }
            headers = {
                **common_headers,
                "accept": "text/javascript",
                "content-type": f"multipart/form-data; boundary={boundary}",
            }

            export_response = session.post(
                export_url, data=body, headers=headers, timeout=timeout
            )
            print(f"Export request status: {export_response.status_code}")

            if export_response.status_code != 200:
                print(f"✗ Export request failed: {export_response.text[:200]}...")
                return None

            # Parse JSON response to get export key
            try:
                response_data = export_response.json()
            except ValueError as e:
                # The body was not JSON at all (e.g. an HTML error page).
                print(f"✗ Could not parse export key from response: {e}")
                print("Response keys: No response data")
                return None

            try:
                export_key = response_data['vqlCmdResponse']['layoutStatus']['applicationPresModel']['presentationLayerNotification'][0]['presModelHolder']['genExportFilePresModel']['resultKey']
                print(f"✓ Got export key: {export_key[:10]}...")
            except (KeyError, IndexError, TypeError) as e:
                # TypeError covers a payload whose nesting differs in type
                # (e.g. a list or null where a dict was expected).
                print(f"✗ Could not parse export key from response: {e}")
                top_level = list(response_data.keys()) if isinstance(response_data, dict) else 'No response data'
                print(f"Response keys: {top_level}")
                return None

            # Step 4: Download CSV data
            print("Step 4: Downloading CSV data...")
            csv_url = f"{vizql_root}/tempfile/sessions/{session_key}/?key={export_key}&keepfile=yes&attachment=yes"

            csv_headers = {**common_headers, "accept": "*/*"}
            csv_response = session.get(csv_url, headers=csv_headers, timeout=timeout)

            if csv_response.status_code != 200:
                print(f"✗ CSV download failed: {csv_response.status_code}")
                return None

            print(f"✓ Successfully downloaded CSV ({len(csv_response.text)} chars)")

            # Parse CSV into DataFrame
            try:
                # NOTE(review): the export is requested with useTabs=true, so
                # the file may be tab-delimited; confirm the default comma
                # separator yields the expected columns.
                df = pd.read_csv(StringIO(csv_response.text))
                print(f"✓ Parsed CSV into DataFrame: {df.shape}")
                print(f"Columns: {list(df.columns)}")

                # Save CSV to file
                csv_file = DATA_DIR / "respiratory_surveillance_data.csv"
                df.to_csv(csv_file, index=False)
                print(f"✓ Saved to: {csv_file}")

                return df

            except Exception as e:
                print(f"✗ Error parsing CSV: {e}")
                # Save raw CSV for debugging; explicit encoding so the dump
                # does not depend on the platform default.
                raw_file = DATA_DIR / "raw_csv_response.txt"
                raw_file.write_text(csv_response.text, encoding="utf-8")
                print(f"Raw response saved to: {raw_file}")
                return None

    except Exception as e:
        # Deliberate best-effort: report the failure and return None.
        print(f"✗ Error in session extraction: {e}")
        import traceback
        traceback.print_exc()
        return None

# Run the session-based extraction once against the configured dashboard
# and report the outcome.
print("=== Testing Session-Based Data Extraction ===")
session_df = extract_session_based_data(TABLEAU_DASHBOARD_URL, TABLEAU_DOCUMENT_ID)

if session_df is None:
    # The function returns None on any failure.
    print("\n❌ Session-based extraction failed")
else:
    print(f"\n🎉 Success! Extracted {len(session_df)} rows")
    print("\nFirst few rows:")
    print(session_df.head())

## Method 1: Session-Based API Extraction

This method, implemented in the code cell above, replicates in Python the JavaScript request sequence used to export a crosstab from the dashboard:
1. Visit dashboard page to establish session
2. Extract session key from the page
3. Use session to get export key via API
4. Download CSV data using the export key

## Method 2: Using Tableau Extractor Utility

Test the utility function that implements the same approach but with better error handling and multiple fallback methods.

In [None]:
print("=== Testing Tableau Extractor Utility ===")

try:
    # Run the shared utility with the notebook's configuration. The code
    # below treats the first result as a mapping of name -> dataset and the
    # second as a collection of saved file paths.
    extracted_data, saved_files = extract_sandiego_epidemiology_data(
        dashboard_url=TABLEAU_DASHBOARD_URL,
        workbook_url=TABLEAU_WORKBOOK_URL,
        document_id=TABLEAU_DOCUMENT_ID,
        output_dir=str(DATA_DIR)
    )

    print("\nExtraction completed!")
    print(f"Extracted {len(extracted_data)} datasets")
    print(f"Saved {len(saved_files)} files")

    if extracted_data:
        print("\nDatasets found:")
        for position, (name, dataset) in enumerate(extracted_data.items()):
            # Only DataFrame entries are summarised; anything else is skipped.
            if not isinstance(dataset, pd.DataFrame):
                continue
            print(f"  {name}: {dataset.shape} - Columns: {list(dataset.columns)[:3]}...")

            # Show sample data for the first dataset
            if position == 0:
                print(f"\nSample data from {name}:")
                print(dataset.head(3))

    if saved_files:
        print("\nFiles saved:")
        for file_path in saved_files:
            # Path-like entries get their size reported; plain strings are
            # printed as-is.
            if hasattr(file_path, 'stat'):
                size = file_path.stat().st_size
                print(f"  {file_path.name} ({size} bytes)")
            else:
                print(f"  {file_path}")

except Exception as e:
    # Best-effort demo cell: show the failure without aborting the notebook.
    print(f"❌ Error testing utility function: {e}")
    import traceback
    traceback.print_exc()

## Summary

This notebook demonstrates extracting data from the San Diego epidemiology Tableau dashboard using:

1. **Session-based API extraction** - Replicates the JavaScript approach
2. **Tableau Extractor utility** - Production-ready implementation with fallbacks

The extracted data contains respiratory surveillance information that will be processed by the Dagster asset for integration with other health monitoring systems.