# San Diego Epidemiology & Immunization Services Data Extraction

This notebook demonstrates downloading a Tableau workbook from Tableau Public and extracting data using the Tableau Hyper API.

In [24]:
import requests
import zipfile
import xml.etree.ElementTree as ET
from tableauhyperapi import HyperProcess, Connection, Telemetry
import pandas as pd
import os
from pathlib import Path
import json

print("✓ Libraries imported successfully")

✓ Libraries imported successfully


In [25]:
# Configuration
WORKBOOK_URL = "https://public.tableau.com/workbooks/DraftRespDash.twb"
DATA_DIR = Path("../../../data/sandiego_epideimilogy")
DATA_DIR.mkdir(parents=True, exist_ok=True)

workbook_path = DATA_DIR / "DraftRespDash.twb"
extracted_dir = DATA_DIR / "extracted"
extracted_dir.mkdir(exist_ok=True)

print(f"Data directory: {DATA_DIR.absolute()}")
print(f"Workbook URL: {WORKBOOK_URL}")

Data directory: /Users/valentin/development/dev_resilient/resilient_workflows_public/workflows/public/notebooks/../../../data/sandiego_epideimilogy
Workbook URL: https://public.tableau.com/workbooks/DraftRespDash.twb


In [26]:
# Download the workbook
print("Downloading Tableau workbook...")
response = requests.get(WORKBOOK_URL)
response.raise_for_status()

with open(workbook_path, 'wb') as f:
    f.write(response.content)

print(f"✓ Workbook downloaded: {workbook_path} ({len(response.content)} bytes)")

Downloading Tableau workbook...
✓ Workbook downloaded: ../../../data/sandiego_epideimilogy/DraftRespDash.twb (53707 bytes)


In [27]:
# Extract the workbook (it's a zip file)
print("Extracting workbook contents...")
with zipfile.ZipFile(workbook_path, 'r') as zip_ref:
    zip_ref.extractall(extracted_dir)
    file_list = zip_ref.namelist()

print(f"✓ Extracted {len(file_list)} files:")
for file in file_list:
    print(f"  - {file}")

Extracting workbook contents...
✓ Extracted 3 files:
  - Data/Extracts/Demo_Percent+ (COVID Flu RSV Tableau Output_2).hyper
  - Image/New Color Bar.png
  - DraftRespDash.twb


In [28]:
# Examine XML structure
twb_file = extracted_dir / "DraftRespDash.twb"
if twb_file.exists():
    print("Analyzing workbook XML structure...")
    tree = ET.parse(twb_file)
    root = tree.getroot()

    print(f"Root element: {root.tag}")
    print("\nTop-level elements:")
    for child in root:
        print(f"  - {child.tag}: {len(list(child))} children")

    # Look for data sources
    datasources = root.findall('.//datasource')
    print(f"\nFound {len(datasources)} data sources:")
    for ds in datasources:
        name = ds.get('name', 'Unknown')
        print(f"  - {name}")

Analyzing workbook XML structure...
Root element: workbook

Top-level elements:
  - document-format-change-manifest: 18 children
  - repository-location: 0 children
  - preferences: 2 children
  - datasources: 1 children
  - worksheets: 10 children
  - dashboards: 2 children
  - windows: 12 children

Found 13 data sources:
  - federated.0awy2xn0o041gc17mlr2408o222d
  - federated.0awy2xn0o041gc17mlr2408o222d
  - federated.0awy2xn0o041gc17mlr2408o222d
  - federated.0awy2xn0o041gc17mlr2408o222d
  - federated.0awy2xn0o041gc17mlr2408o222d
  - federated.0awy2xn0o041gc17mlr2408o222d
  - federated.0awy2xn0o041gc17mlr2408o222d
  - federated.0awy2xn0o041gc17mlr2408o222d
  - federated.0awy2xn0o041gc17mlr2408o222d
  - federated.0awy2xn0o041gc17mlr2408o222d
  - federated.0awy2xn0o041gc17mlr2408o222d
  - federated.0awy2xn0o041gc17mlr2408o222d
  - federated.0awy2xn0o041gc17mlr2408o222d


In [29]:
# Look for Hyper files
hyper_files = list(extracted_dir.glob('**/*.hyper'))
print(f"Found {len(hyper_files)} Hyper files:")
for hyper_file in hyper_files:
    print(f"  - {hyper_file.relative_to(extracted_dir)}")
    print(f"    Size: {hyper_file.stat().st_size} bytes")

Found 1 Hyper files:
  - Data/Extracts/Demo_Percent+ (COVID Flu RSV Tableau Output_2).hyper
    Size: 131072 bytes


In [46]:
# Extract data from Hyper files
extracted_data = {}

for hyper_file in hyper_files:
    print(f"\nProcessing: {hyper_file.name}")
    path_to_hyper = hyper_file.absolute().resolve()
    if not path_to_hyper.exists():
        print(f"  Hyper file not found: {path_to_hyper}")
        continue
    try:
        with HyperProcess(telemetry=Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU) as hyper:
            with Connection(endpoint=hyper.endpoint, database=str(path_to_hyper)) as connection:
                # List all tables
                tables = connection.catalog.get_table_names("Extract")
                print(f"  Tables found: {len(tables)}")

                for table in tables:
                    print(f"    Reading table: {table}")

                    table_definition = connection.catalog.get_table_definition(name=table)
                    print(f"Table {table.name} has qualified name: {table}")

                    for column in table_definition.columns:
                         print(f"Column {column.name} has type={column.type} and nullability={column.nullability}")
                    print("")

                    # Query all rows
                    result = connection.execute_list_query(f"SELECT * FROM {table}")
                    rows = list(result)

                    if rows:
                        # Get column names
                        # columns_result = connection.execute_list_query(
                        #     f"SELECT column_name FROM information_schema.columns WHERE table_name = '{table.name}'"
                        # )
                        # columns = [row[0] for row in columns_result]

                        # Create DataFrame
                        columns = [col.name for col in table_definition.columns]
                        df = pd.DataFrame(rows, columns=columns)
                       # table_key = f"{hyper_file.stem}_{table.name}"
                        table_key = f"{table.name}"
                        extracted_data[table_key] = df

                        print(f"      Extracted {len(df)} rows, {len(df.columns)} columns")
                        print(f"      Columns: {list(df.columns)[:5]}...")
                    else:
                        print(f"      No data found in table {table}")

    except Exception as e:
        print(f"  Error processing {hyper_file.name}: {e}")


Processing: Demo_Percent+ (COVID Flu RSV Tableau Output_2).hyper
  Tables found: 3
    Reading table: "Extract"."Time Series_0427D0184F5A45B7973E2512848A1EE4"
Table "Time Series_0427D0184F5A45B7973E2512848A1EE4" has qualified name: "Extract"."Time Series_0427D0184F5A45B7973E2512848A1EE4"
Column "FY" has type=TEXT and nullability=Nullability.NULLABLE
Column "CDCWk" has type=BIG_INT and nullability=Nullability.NULLABLE
Column "WkNum" has type=BIG_INT and nullability=Nullability.NULLABLE
Column "WkStart" has type=DATE and nullability=Nullability.NULLABLE
Column "Disease" has type=TEXT and nullability=Nullability.NULLABLE
Column "Metric" has type=TEXT and nullability=Nullability.NULLABLE
Column "Count" has type=BIG_INT and nullability=Nullability.NULLABLE
Column "Rate" has type=DOUBLE and nullability=Nullability.NULLABLE
Column "ThisWk" has type=BIG_INT and nullability=Nullability.NULLABLE
Column "LastWk" has type=BIG_INT and nullability=Nullability.NULLABLE
Column "Update Date" has type=

In [43]:
# Display summary of extracted data
print(f"\n=== EXTRACTION SUMMARY ===")
print(f"Total datasets extracted: {len(extracted_data)}")

for name, df in extracted_data.items():
    print(f"\nDataset: {name}")
    print(f"  Shape: {df.shape}")
    print(f"  Columns: {list(df.columns)}")
    print(f"  Sample data:")
    print(df.head(3).to_string(max_cols=5))


=== EXTRACTION SUMMARY ===
Total datasets extracted: 3

Dataset: Demo_Percent+ (COVID Flu RSV Tableau Output_2)_"Time Series_0427D0184F5A45B7973E2512848A1EE4"
  Shape: (1688, 14)
  Columns: [Name('FY'), Name('CDCWk'), Name('WkNum'), Name('WkStart'), Name('Disease'), Name('Metric'), Name('Count'), Name('Rate'), Name('ThisWk'), Name('LastWk'), Name('Update Date'), Name('End Date'), Name('WkStrtActual'), Name('WkEndActual')]
  Sample data:
    "FY"  "CDCWk"  ...  "WkStrtActual" "WkEndActual"
0  23_24       19  ...      2024-05-05    2024-05-11
1  23_24       20  ...      2024-05-12    2024-05-18
2  23_24       21  ...      2024-05-19    2024-05-25

Dataset: Demo_Percent+ (COVID Flu RSV Tableau Output_2)_"Demo!Percent_D2C90B937EC749F59B0EDE90A43F1138"
  Shape: (32, 6)
  Columns: [Name('Metric'), Name('Category'), Name('Subcategory'), Name('COVID'), Name('Flu'), Name('RSV')]
  Sample data:
  "Metric" "Category"  ...     "Flu"     "RSV"
0    Cases     AgeCat  ...  0.082952  0.558297
1    Ca

In [45]:
# Save extracted data
output_dir = DATA_DIR / "output"
output_dir.mkdir(exist_ok=True)

saved_files = []

for name, df in extracted_data.items():
    # Save as CSV
    csv_path = output_dir / f"{name}.csv"
    df.to_csv(csv_path, index=False)
    saved_files.append(csv_path)

    # Save as JSON
    json_path = output_dir / f"{name}.json"
    df.to_json(json_path, orient='records', date_format='iso')
    saved_files.append(json_path)

    print(f"✓ Saved {name}: CSV ({csv_path.stat().st_size} bytes), JSON ({json_path.stat().st_size} bytes)")

print(f"\n✓ All data saved to: {output_dir}")
print(f"Total files created: {len(saved_files)}")

✓ Saved Demo_Percent+ (COVID Flu RSV Tableau Output_2)_"Time Series_0427D0184F5A45B7973E2512848A1EE4": CSV (169456 bytes), JSON (978121 bytes)
✓ Saved Demo_Percent+ (COVID Flu RSV Tableau Output_2)_"Demo!Percent_D2C90B937EC749F59B0EDE90A43F1138": CSV (1993 bytes), JSON (4673 bytes)
✓ Saved Demo_Percent+ (COVID Flu RSV Tableau Output_2)_"Diff!Table_12C8CA32225C40BEA8180970AB9D03C1": CSV (614 bytes), JSON (2797 bytes)

✓ All data saved to: ../../../data/sandiego_epideimilogy/output
Total files created: 6
