In [1]:
"""
GENERAL STEPS
- run sql query to get all non-spatial TMC data and load into dataframe
- load shp of NHS segments into geodataframe
- left join non-spatial TMC table to NHS geotable
- where spatial data missing:
    - add "tru_shp_yr" flag; set to 0 where a true shape still needs to be built (otherwise it holds the year of the true-shape source)
    - compute "stick" geometry using start/end lat/long vals.
    - CHECK TO DO: can you programmatically check if a TMC's start/end points haven't changed, and if not just plug in the geom from the 2021 INRIX file?
 - export to feature class; DAMS will need to manually correct missing true shapes (how to prioritize?)
    
"""
import datetime as dt
from time import perf_counter
from pathlib import Path

import numpy as np
import geopandas as gpd
import pandas as pd
import arcpy
from arcgis.features import GeoAccessor, GeoSeriesAccessor


from utils import esri_to_df, pts_to_linestring
from sqlqry2pandas import sqlqry_to_df


#=========INPUTS=========================
# True-shape TMC polylines; this feature class only covers NHS segments.
shp_tmc_nhs = r"I:\Projects\Darren\PPA3_GIS\PPA3_GIS.gdb\NPMRDS_2023_NHS_SACOG" # on WIN10-MODEL-2

# Non-spatial data stored in SQL Server.
# WARNING - running the query takes 20+ mins
npmrds_db = 'NPMRDS'
tt_tbl = 'npmrds_2023_alltmc_paxtruck_comb'
tmc_txt_tbl = 'npmrds_2023_alltmc_txt' # full TMC network, but lacking spatial true-shape data
data_year = 2023

# OPTION to load directly from CSV rather than running the long (20min+) query, if a CSV is available.
# csv_speed_data must be defined here: the CSV branch below reads it, and leaving it commented out
# makes use_speed_csv = True fail with a NameError on a fresh kernel. It points at the file that the
# CSV-export cell further down writes.
use_speed_csv = False
csv_speed_data = r"I:\Projects\Darren\PPA3_GIS\CSV\NPMRDS\npmrds_metrics_2023_20241219.csv"


#=================BEGIN LOADING=========================
# Load the NHS true-shape feature class into a GeoDataFrame.
shp_fields = ['Tmc']
crs_sacog_ft = 2226 # SACOG CRS with feet as units
shp_native_crs = arcpy.Describe(shp_tmc_nhs).spatialReference.factoryCode

# NOTE(review): shp_native_crs is computed but not used; crs_val is given the *target* CRS here,
# whereas insert_links() passes the layer's native CRS and then reprojects. If esri_to_df only
# labels the data with crs_val (rather than reprojecting), the to_crs() guard below can never fire
# and a layer stored in another CRS would be mislabeled -- confirm against esri_to_df.
gdf_nhs = esri_to_df(esri_obj_path=shp_tmc_nhs, include_geom=True, field_list=shp_fields, index_field=None,
               crs_val=crs_sacog_ft, dissolve=False)

if gdf_nhs.crs.to_epsg() != crs_sacog_ft:
    gdf_nhs = gdf_nhs.to_crs(f"EPSG:{crs_sacog_ft}")

print('gdf loaded')

# Load speed data, either from the cached CSV or by running the SQL query.
print("loading speed data...")
if use_speed_csv:
    df_npmrds = pd.read_csv(csv_speed_data)
else:
    st = perf_counter()
    with open('PPA_NPMRDS_metrics_latest.sql', 'r') as f:
        sql_template = f.read()

    # Fill the table-name placeholders in the SQL template (file handle is already closed).
    params = dict(tt_tbl=tt_tbl, tmc_tbl=tmc_txt_tbl)
    formatted_sql = sql_template.format(**params)

    # run query and load results to a dataframe
    df_npmrds = sqlqry_to_df(query_str=formatted_sql, dbname=npmrds_db)
    elapsed = round((perf_counter() - st) / 60, 1)
    print(f"NPMRDS query completed in {elapsed} mins.")

print('speed data loaded')

# Left join the non-spatial TMC table to the NHS geotable so every TMC from the query is kept,
# with null geometry where the TMC is absent from the NHS feature class.
f_trushp = 'tru_shp_yr'
dfjn = df_npmrds.merge(gdf_nhs, how='left', left_on='tmc', right_on='Tmc')
dfjn = gpd.GeoDataFrame(dfjn, geometry='geometry')
dfjn[f_trushp] = 0 # by default, assume not true shape
dfjn.loc[~dfjn['geometry'].isnull(), f_trushp] = data_year # TMC has a true shape in the NHS layer: record the data year
print('all data loaded')

gdf loaded
loading speed data...
Executing query. Results loading into dataframe...
Successfully executed query in 20.87 minutes. 7677 rows loaded into dataframe.
NPMRDS query completed in 20.9 mins.
speed data loaded
all data loaded


In [2]:
# Optional cache step: write the query results to CSV so the long-running
# query does not have to be re-run every time.
csv_speed_data = r"I:\Projects\Darren\PPA3_GIS\CSV\NPMRDS\npmrds_metrics_2023_20241219.csv"
df_npmrds.to_csv(path_or_buf=csv_speed_data, index=False)

In [21]:
# Where possible, reuse geometries from the 2021 INRIX shapefile to reduce the amount of manual coding needed.
# 10/22/2024 - TODO: make this a more generalized function (e.g., take in some old true-shape file, then use TMCs
# from it where you can, and allow the tolerance for reusing an old shape to be specified).

#================== INPUTS FOR OLD TMC WHOSE SHAPES YOU WANT TO TRY AND REPURPOSE===========
shp_old_tmcs = r'I:\Projects\Darren\PPA3_GIS\PPA3.0_archive.gdb\INRIX_SHP_2020_2021_SACOG'
old_tmc_yr = 2021

#================RUN PROCESS TO INSERT GEOM FROM OLDER TMC version where possible===============

def insert_links(in_gdf, shps_to_insert, insert_links_yr, dissolve_insert_shps_field, tolerance=0.0017,
                 max_abs_dist_dif=1):
    """Fill in missing TMC geometries from an older true-shape layer, else from straight "stick" lines.

    For every row whose true-shape flag is 0:
      1. If the older layer has the same TMC and its start-to-end straight-line length differs from
         the new TMC's start-to-end straight-line length by less than `max_abs_dist_dif`, the older
         geometry is used and the flag is set to `insert_links_yr`.
      2. Otherwise a "stick" line between the new start and end points is built. If the stick's
         length is within `tolerance` (relative) of the row's 'miles' value, the flag is set to
         `data_year`; otherwise the flag stays 0.

    Reads these module-level names: `crs_sacog_ft` (target EPSG code), `f_trushp` (name of the
    true-shape flag column) and `data_year`, plus the helpers `esri_to_df` and `pts_to_linestring`.

    Parameters
    ----------
    in_gdf : GeoDataFrame
        Must contain 'tmc', 'miles', 'geometry', the flag column named by `f_trushp`, and
        'start_latitude', 'start_longitude', 'end_latitude', 'end_longitude' (read as EPSG:4326).
    shps_to_insert : str
        Path to the ESRI feature class holding the older geometries; must have a 'Tmc' field.
    insert_links_yr : int
        Year label for the older layer. Used as the flag value for reused shapes and as the
        column-name suffix when the two tables are merged.
    dissolve_insert_shps_field : str
        Field the older layer is dissolved on before its end points are extracted.
    tolerance : float, default 0.0017
        Maximum relative difference between 'miles' and stick length (in miles) for a stick to be
        accepted as the true shape.
    max_abs_dist_dif : float, default 1
        Upper bound (exclusive), in CRS units, on the difference in start-to-end length between
        the new and older TMC for the older geometry to be reused.

    Returns
    -------
    GeoDataFrame
        Merged frame with 'abs_dist_dif' and 'pctdiff_v_trushp' added, and the lat/long columns and
        all non-active geometry columns removed.
    """
    from shapely import get_point

    f_gdflat1, f_gdflon1 = 'start_latitude', 'start_longitude'
    f_gdflat2, f_gdflon2 = 'end_latitude', 'end_longitude'
    target_crs = f"EPSG:{crs_sacog_ft}"

    def _straight_len(start_pts, end_pts):
        # Straight-line distance between paired start/end points; NaN where either point is missing.
        return np.sqrt((end_pts.x - start_pts.x)**2 + (end_pts.y - start_pts.y)**2)

    # load shp of links you want to insert, labeled with the layer's own CRS
    inslink_shp_fields = ['Tmc']
    insert_links_crs = arcpy.Describe(shps_to_insert).spatialReference.factoryCode
    gdf_insert_links = esri_to_df(esri_obj_path=shps_to_insert, include_geom=True, field_list=inslink_shp_fields,
                                  index_field=None, crs_val=f"EPSG:{insert_links_crs}", dissolve=False)

    # reproject to the target CRS
    gdf_insert_links_prj = gdf_insert_links.to_crs(target_crs)

    # 10/17/2024 - intentional duplicate of the to_crs() call above due to a known recent bug with geopandas.
    # more details - https://github.com/geopandas/geopandas/issues/3433
    gdf_insert_links_prj = gdf_insert_links.to_crs(target_crs)

    # get start/end points for links to insert
    f_start_insertlink = 'startpt_inslink'
    f_end_insertlink = 'endpt_inslink'
    # Dissolve first: per the original author it is needed to get rid of multipart geometries
    # before taking end points.
    # NOTE(review): rows that are still multipart after dissolving presumably get null end points
    # from get_point and therefore never qualify for reuse -- verify.
    gdf_insert_links_prj = gdf_insert_links_prj.dissolve(by=dissolve_insert_shps_field).reset_index()

    gdf_insert_links_prj[f_start_insertlink] = get_point(gdf_insert_links_prj.geometry, 0)
    gdf_insert_links_prj[f_end_insertlink] = get_point(gdf_insert_links_prj.geometry, -1)

    # merge input gdf to gdf of links whose geoms you want to insert where possible;
    # clashing column names from the older layer get a "_<year>" suffix
    in_gdf = in_gdf.merge(gdf_insert_links_prj, how='left', left_on='tmc', right_on='Tmc',
                          suffixes=('', f'_{insert_links_yr}'))

    # build start/end points of the new TMCs from their lat/long values, projected to the target CRS
    f_spnew, f_epnew = 'start_pt_new', 'end_pt_new'
    update_dict = {f_spnew: [f_gdflon1, f_gdflat1],
                   f_epnew: [f_gdflon2, f_gdflat2]}

    for cname, (f_lon, f_lat) in update_dict.items():
        in_gdf[cname] = gpd.points_from_xy(in_gdf[f_lon], in_gdf[f_lat], crs="EPSG:4326").to_crs(target_crs)

    # difference between the new TMC's start-to-end length and that of the link you want to insert
    f_abs_dist_dif = 'abs_dist_dif'
    new_len = _straight_len(in_gdf[f_spnew], in_gdf[f_epnew])
    old_len = _straight_len(in_gdf[f_start_insertlink], in_gdf[f_end_insertlink])
    in_gdf[f_abs_dist_dif] = abs(new_len - old_len)

    # thresholds and the /5280 conversion below assume the CRS is in US feet
    dist_units = in_gdf.crs.to_dict()['units']
    if dist_units != 'us-ft':
        print(f'WARNING: distance units are in {dist_units}')

    # if not already a true shape from NHS *and* the length difference between old and new is under the
    # threshold, then sub in the older geometry.
    # want tight tolerance to minimize gaps or overlapping TMC segments.
    reuse_old = (in_gdf[f_trushp] == 0) & (in_gdf[f_abs_dist_dif] < max_abs_dist_dif)
    in_gdf.loc[reuse_old, 'geometry'] = in_gdf[f'geometry_{insert_links_yr}']
    in_gdf.loc[reuse_old, f_trushp] = insert_links_yr # flag records which year's layer the shape came from

    # everything still flagged 0 gets a straight "stick" between its start and end points
    needs_stick = in_gdf[f_trushp] == 0
    if needs_stick.any():
        in_gdf.loc[needs_stick, 'geometry'] = in_gdf.loc[needs_stick].apply(
            lambda row: pts_to_linestring(row, f_spnew, f_epnew), axis=1)

    # relative difference between listed miles and stick length; stays 0.0 for rows that are not sticks.
    # initialised as float so the float assignment below does not change the column dtype.
    f_pctdiff_v_trushp = 'pctdiff_v_trushp'
    in_gdf[f_pctdiff_v_trushp] = 0.0

    # if the "stick" length is sufficiently close to the spec file miles (within `tolerance`), mark the
    # link as not needing manual edit, because its "true" shape is effectively a straight line
    in_gdf.loc[needs_stick, f_pctdiff_v_trushp] = abs(in_gdf['miles'] - (in_gdf['geometry'].length/5280)) / in_gdf['miles']
    in_gdf.loc[needs_stick & (in_gdf[f_pctdiff_v_trushp] <= tolerance), f_trushp] = data_year

    # delete unneeded columns: raw lat/longs plus every geometry-typed column other than the active one
    uneeded_geo_cols = [c for c in in_gdf.columns if in_gdf[c].dtype.name == 'geometry' and c != 'geometry']
    fields_to_delete = [f_gdflon1, f_gdflat1, f_gdflon2, f_gdflat2, *uneeded_geo_cols]
    in_gdf = in_gdf.drop(columns=fields_to_delete)

    return in_gdf
    
# Fill missing geometries from the older TMC layer (or stick lines), then tally rows per true-shape flag value.
# NOTE: insert_links() removes the start/end lat-long columns it reads and this cell overwrites dfjn,
# so re-running this cell without first re-running the load/join cell will fail on the missing columns.
dfjn = insert_links(in_gdf=dfjn, shps_to_insert=shp_old_tmcs, insert_links_yr=old_tmc_yr, dissolve_insert_shps_field='Tmc')
dfjn[f_trushp].value_counts()


  proj = self._crs.to_proj4(version=version)


tru_shp_yr
2021    3665
2023    3080
0        932
Name: count, dtype: int64

In [22]:
# Write the joined TMC layer to a timestamped ESRI feature class.
sufx = dt.datetime.now().strftime('%Y%m%d_%H%M')
outname = f"NPMRDS_{data_year}data_{sufx}"
out_gdb = r'I:\Projects\Darren\PPA3_GIS\PPA3_GIS.gdb'
out_path = str(Path(out_gdb) / outname)

# Helper/intermediate columns that should not be written out; names absent from the frame are ignored.
fields_to_delete = [
    'tmc_appearance_n',
    'start_latitude', 'start_longitude', 'end_latitude', 'end_longitude',
    'Tmc', 'Tmc_2021', 'StartLat', 'StartLong',
    'EndLat', 'EndLong', 'geometry_2021', 'start_pt_new', 'end_pt_new',
    'start_pt_old', 'end_pt_old',
]

# Convert to a spatially enabled dataframe, keep only the output columns, then export.
sedf = pd.DataFrame.spatial.from_geodataframe(dfjn)
output_fields = [col for col in sedf.columns if col not in fields_to_delete]
sedf = sedf.loc[:, output_fields]
print(sedf.spatial.to_featureclass(out_path, sanitize_columns=False))

'I:\\Projects\\Darren\\PPA3_GIS\\PPA3_GIS.gdb\\NPMRDS_2023data_20241024_1051'

In [23]:
# Show the feature class path from the export cell above (rather than a hard-coded, stale copy of it).
print(out_path)

I:\Projects\Darren\PPA3_GIS\PPA3_GIS.gdb\NPMRDS_2023data_20241024_1051
