# 02 - The Data Model
In this module we will explore the TEEHR data model and file formats. The idea is that a scientist or researcher formats their data to match the TEEHR data model and caches it for exploration and evaluation. The data model is intended to be simple to understand and easy to load data into.

![data_model.png](../images/data_model.png)

NOTE: 

In [55]:
import duckdb
import pandas as pd
import geopandas as gpd
import hvplot.pandas
from pathlib import Path

# Explore the Data Model
Let's first specify an example cache file for each kind of data: timeseries, geospatial (geometry), crosswalk, and attribute. We will explore each file one at a time, examining the data model, the Parquet schema, what it looks like when opened in Pandas, and some examples of the data.

In [56]:
# Root of the local data cache, kept under the current user's home directory.
CACHE_DIR = Path.home() / "cache"
# Every file used in this notebook lives under a single study directory.
STUDY_DIR = CACHE_DIR / "post-event-example"

# One example Parquet cache file for each kind of data explored below.
GEOMETRY = STUDY_DIR / "geo" / "usgs_geometry.parquet"
TIMESERIES = STUDY_DIR / "timeseries" / "short_range" / "20221218T00Z.parquet"
CROSSWALK = STUDY_DIR / "geo" / "usgs_nwm22_crosswalk.parquet"
ATTRIBUTE = STUDY_DIR / "geo" / "usgs_attr_upstream_area.parquet"

### Geometry

In [57]:
# Geometry: list each column's name, physical type and logical type as stored
# in the Parquet file.
duckdb.query(
    "SELECT name, type, logical_type "
    f"FROM parquet_schema('{GEOMETRY}')"
)

┌───────────────────┬────────────┬──────────────┐
│       name        │    type    │ logical_type │
│      varchar      │  varchar   │   varchar    │
├───────────────────┼────────────┼──────────────┤
│ schema            │ BOOLEAN    │ NULL         │
│ id                │ BYTE_ARRAY │ StringType() │
│ name              │ BYTE_ARRAY │ StringType() │
│ geometry          │ BYTE_ARRAY │ NULL         │
│ __index_level_0__ │ INT64      │ NULL         │
└───────────────────┴────────────┴──────────────┘

In [58]:
# Read the geometry cache file with GeoPandas and print a summary of the
# resulting frame (index, columns, non-null counts, dtypes, memory usage).
geom_gdf = gpd.read_parquet(GEOMETRY)
geom_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 7646 entries, 0 to 7645
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   id        7646 non-null   object  
 1   name      7646 non-null   object  
 2   geometry  7646 non-null   geometry
dtypes: geometry(1), object(2)
memory usage: 238.9+ KB


In [59]:
# Show the geometry frame; a bare name on the last line of a cell is rendered
# as that cell's output.
geom_gdf

Unnamed: 0,id,name,geometry
0,usgs-01010000,"St. John River at Ninemile Bridge, Maine",POINT (-69.71556 46.70056)
1,usgs-01010070,"Big Black River near Depot Mtn, Maine",POINT (-69.75167 46.89389)
2,usgs-01010500,"St. John River at Dickey, Maine",POINT (-69.08806 47.11306)
3,usgs-01011000,"Allagash River near Allagash, Maine",POINT (-69.07944 47.06972)
4,usgs-01011500,"St. Francis River near Connors, New Brunswick",POINT (-68.95643 47.20698)
...,...,...,...
7641,usgs-412453081395500,West Creek at Brooklyn Heights OH,POINT (-81.66528 41.41472)
7642,usgs-441624088045601,GLRI EAST RIVER WATERWAY NUMBER 1 NR GREENLEAF...,POINT (-88.08217 44.27336)
7643,usgs-480608115242901,"Libby Wetland Site bl Schrieber Lake nr Libby, MT",POINT (-115.40892 48.10223)
7644,usgs-08458000,"USIBW Rio Grande at Eagle Pass, TX",POINT (-100.50730 28.71414)


### Attribute

In [60]:
# Attribute: list each column's name, physical type and logical type as stored
# in the Parquet file.
duckdb.query(
    "SELECT name, type, logical_type "
    f"FROM parquet_schema('{ATTRIBUTE}')"
)

┌─────────────────┬────────────┬──────────────┐
│      name       │    type    │ logical_type │
│     varchar     │  varchar   │   varchar    │
├─────────────────┼────────────┼──────────────┤
│ schema          │ BOOLEAN    │ NULL         │
│ location_id     │ BYTE_ARRAY │ StringType() │
│ attribute_value │ DOUBLE     │ NULL         │
│ attribute_name  │ BYTE_ARRAY │ StringType() │
│ attribute_unit  │ BYTE_ARRAY │ StringType() │
└─────────────────┴────────────┴──────────────┘

In [61]:
# Read the attribute cache file with Pandas and print a summary of the
# resulting frame (index, columns, non-null counts, dtypes, memory usage).
attr_df = pd.read_parquet(ATTRIBUTE)
attr_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7715 entries, 0 to 7714
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   location_id      7715 non-null   object 
 1   attribute_value  7715 non-null   float64
 2   attribute_name   7715 non-null   object 
 3   attribute_unit   7715 non-null   object 
dtypes: float64(1), object(3)
memory usage: 241.2+ KB


In [62]:
# Show the attribute frame; a bare name on the last line of a cell is rendered
# as that cell's output.
attr_df

Unnamed: 0,location_id,attribute_value,attribute_name,attribute_unit
0,usgs-11447850,60152.642390,upstream_area,km2
1,usgs-06298000,529.491930,upstream_area,km2
2,usgs-10189000,3116.665710,upstream_area,km2
3,usgs-02093877,10.734600,upstream_area,km2
4,usgs-01446000,99.921110,upstream_area,km2
...,...,...,...,...
7710,usgs-08331160,46032.279990,upstream_area,km2
7711,usgs-04192599,308.574892,upstream_area,km2
7712,usgs-04188100,907.723093,upstream_area,km2
7713,usgs-12413000,2326.103890,upstream_area,km2


### Crosswalk

In [63]:
# Crosswalk: list each column's name, physical type and logical type as stored
# in the Parquet file.
duckdb.query(
    "SELECT name, type, logical_type "
    f"FROM parquet_schema('{CROSSWALK}')"
)

┌───────────────────────┬────────────┬──────────────┐
│         name          │    type    │ logical_type │
│        varchar        │  varchar   │   varchar    │
├───────────────────────┼────────────┼──────────────┤
│ schema                │ BOOLEAN    │ NULL         │
│ primary_location_id   │ BYTE_ARRAY │ StringType() │
│ secondary_location_id │ BYTE_ARRAY │ StringType() │
│ feature_id            │ INT64      │ NULL         │
└───────────────────────┴────────────┴──────────────┘

In [64]:
# Read the crosswalk cache file with Pandas and print a summary of the
# resulting frame.
# NOTE(review): `feature_id` is listed in the Parquet schema, but the summary
# reports only two data columns -- it appears to come back as the frame's
# index rather than as a column; confirm this is intended.
xwalk_df = pd.read_parquet(CROSSWALK)
xwalk_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7647 entries, 9206 to 2776699
Data columns (total 2 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   primary_location_id    7647 non-null   object
 1   secondary_location_id  7647 non-null   object
dtypes: object(2)
memory usage: 179.2+ KB


In [65]:
# Show the crosswalk frame; a bare name on the last line of a cell is rendered
# as that cell's output.
xwalk_df

Unnamed: 0_level_0,primary_location_id,secondary_location_id
feature_id,Unnamed: 1_level_1,Unnamed: 2_level_1
9206,usgs-05106000,nwm22-7086109
12736,usgs-05078520,nwm22-7040481
12777,usgs-05078470,nwm22-7053819
13372,usgs-05125039,nwm22-7111205
15635,usgs-05124982,nwm22-7110249
...,...,...
2775156,usgs-04263000,nwm22-15489152
2776119,usgs-04268000,nwm22-15465127
2776269,usgs-04265432,nwm22-15476223
2776275,usgs-04269000,nwm22-15456882


In [66]:
# Timeseries: list each column's name, physical type and logical type as
# stored in the Parquet file.
duckdb.query(
    "SELECT name, type, logical_type "
    f"FROM parquet_schema('{TIMESERIES}')"
)

┌───────────────────┬────────────┬─────────────────────────────────────────────────────────────────────────────────────┐
│       name        │    type    │                                    logical_type                                     │
│      varchar      │  varchar   │                                       varchar                                       │
├───────────────────┼────────────┼─────────────────────────────────────────────────────────────────────────────────────┤
│ schema            │ BOOLEAN    │ NULL                                                                                │
│ value_time        │ INT64      │ TimestampType(isAdjustedToUTC=0, unit=TimeUnit(MILLIS=<null>, MICROS=MicroSeconds…  │
│ location_id       │ BYTE_ARRAY │ StringType()                                                                        │
│ value             │ DOUBLE     │ NULL                                                                                │
│ measurement_unit  │ BYTE_ARRAY

In [67]:
# Read the timeseries cache file with Pandas and print a summary of the
# resulting frame (index, columns, non-null counts, dtypes, memory usage).
ts_df = pd.read_parquet(TIMESERIES)
ts_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 137502 entries, 0 to 137645
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   value_time        137502 non-null  datetime64[ns]
 1   location_id       137502 non-null  object        
 2   value             137502 non-null  float64       
 3   measurement_unit  137502 non-null  object        
 4   reference_time    137502 non-null  datetime64[ns]
 5   configuration     137502 non-null  object        
 6   variable_name     137502 non-null  object        
dtypes: datetime64[ns](2), float64(1), object(4)
memory usage: 8.4+ MB


In [68]:
# Show the timeseries frame; a bare name on the last line of a cell is
# rendered as that cell's output.
ts_df

Unnamed: 0,value_time,location_id,value,measurement_unit,reference_time,configuration,variable_name
0,2022-12-18 01:00:00,nwm22-7086109,0.240000,m3/s,2022-12-18,short_range,streamflow
1,2022-12-18 01:00:00,nwm22-7040481,0.000000,m3/s,2022-12-18,short_range,streamflow
2,2022-12-18 01:00:00,nwm22-7053819,0.010000,m3/s,2022-12-18,short_range,streamflow
3,2022-12-18 01:00:00,nwm22-7111205,0.050000,m3/s,2022-12-18,short_range,streamflow
4,2022-12-18 01:00:00,nwm22-7110249,0.010000,m3/s,2022-12-18,short_range,streamflow
...,...,...,...,...,...,...,...
137641,2022-12-18 18:00:00,nwm22-15489152,51.529999,m3/s,2022-12-18,short_range,streamflow
137642,2022-12-18 18:00:00,nwm22-15465127,88.549998,m3/s,2022-12-18,short_range,streamflow
137643,2022-12-18 18:00:00,nwm22-15476223,22.469999,m3/s,2022-12-18,short_range,streamflow
137644,2022-12-18 18:00:00,nwm22-15456882,22.340000,m3/s,2022-12-18,short_range,streamflow
