# 99a - Script 21 nodes output comparison

This notebook compares times by osmid between the original proxanalysis (2020) and the Script 21 output for Aguascalientes.

* First result was successful but not perfect. After analysing the small difference node by node, the result was successful. The difference is due to very small decimals (on the order of e-13).
* Also, there are differences in the number of nodes. Tests indicate that aup.graph_from_hippo can still generate disconnected networks.
  * This could be easily solved by deleting rows that have NaN values in all examined amenities.

## Import libraries

In [2]:
import os
import sys

import pandas as pd
import geopandas as gpd
import osmnx as ox
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Register the directory three levels up on sys.path so that `aup` can be
# imported from it.
module_path = os.path.abspath(os.path.join('../../../'))
if module_path not in sys.path:
    sys.path.append(module_path)

# Import unconditionally: if the path was already registered (for example when
# this cell is re-run), an import nested inside the `if` would be skipped and
# `aup` would never be bound.
import aup

  ox.config(


## Load data

### Load data - Load original data (Already in database)

In [3]:
# Source of the original proximity analysis: one schema/table pair, filtered by city.
city = 'Aguascalientes'
schema = 'prox_analysis'
table = 'nodes_proximity_2020'

# Keep only the rows whose "metropolis" column matches the city.
source_table = f"{schema}.{table}"
query = f'SELECT * FROM {source_table} WHERE "metropolis" LIKE \'{city}\''
nodes_gdf = aup.gdf_from_query(query, geometry_col='geometry')

# Show
print(nodes_gdf.shape)
nodes_gdf.head(1)

(2208953, 7)


Unnamed: 0,osmid,x,y,geometry,time,amenity,metropolis
0,272921360,-102.295073,21.872876,POINT (-102.29507 21.87288),12.607833,sip_centro_admin,Aguascalientes


In [4]:
# Reshape the original nodes data from long (one row per osmid/amenity pair)
# to wide (one row per osmid, one time column per amenity).
id_columns = ['osmid', 'geometry', 'metropolis']
nodes_analysis = nodes_gdf.drop_duplicates(subset='osmid', keep="last")[id_columns].copy()

for amenidad in nodes_gdf.amenity.unique():
    # Times for this amenity only, with the 'time' column named after the amenity
    is_amenity = nodes_gdf.amenity == amenidad
    amenity_times = nodes_gdf.loc[is_amenity, ['osmid', 'time']].rename(columns={'time': amenidad})

    # A mean of exactly 0 is treated as "no data" and replaced by NaN
    if amenity_times[amenidad].mean() == 0:
        amenity_times[amenidad] = np.nan

    # Default (inner) merge: only osmids that have a row for this amenity are kept
    nodes_analysis = nodes_analysis.merge(amenity_times, on='osmid')

# Show
print(nodes_analysis.shape)
nodes_analysis.head(1)

(51371, 46)


Unnamed: 0,osmid,geometry,metropolis,sip_centro_admin,sip_teatro,sip_cancha,sip_unidad_deportiva,sip_espacio_publico,sip_mercado,clues_primer_nivel,...,denue_cines,denue_centro_cultural,denue_parque_natural,denue_papelerias,denue_libros,denue_revistas_periodicos,denue_ferreteria_tlapaleria,denue_art_limpieza,denue_pintura,denue_peluqueria
0,301187451,POINT (-102.25150 21.87168),Aguascalientes,0.764097,54.641889,151.583113,55.092683,5.590739,27.916878,1.911369,...,22.428319,11.485214,,4.332055,49.528436,15.744918,12.132738,1.172275,5.034638,5.654589


In [5]:
# Aguascalientes has no parques naturales, so the only value expected here is NaN
nodes_analysis['denue_parque_natural'].unique()

array([nan])

### Load data - Load prox script 21 data (Aguascalientes, version 1)

In [6]:
# Script 21 test output for Aguascalientes, read from a local GeoPackage.
# The path is held in `nodes_test_path` instead of `dir` so that the builtin
# `dir()` stays usable for the rest of the kernel session.
nodes_test_path = "../../../data/processed/proximity_v2/test_ags_proxanalysis_scriptv1_nodes.gpkg"
nodes_test = gpd.read_file(nodes_test_path)

# Show
print(nodes_test.shape)
nodes_test.head(1)

(51434, 34)


Unnamed: 0,osmid,denue_preescolar,denue_primaria,denue_secundaria,clues_primer_nivel,denue_guarderias,denue_dif,denue_supermercado,denue_abarrotes,denue_carnicerias,...,denue_cafe,sip_cancha,sip_unidad_deportiva,sip_espacio_publico,denue_parque_natural,denue_cines,denue_museos,x,y,geometry
0,272921360,10.770629,5.615674,11.812107,6.486561,5.09561,6.486561,3.592175,1.673421,6.486561,...,5.25473,156.885198,28.44305,5.692959,,26.069111,7.589368,-102.295073,21.872876,POINT (-102.29507 21.87288)


## Compare data

In [7]:
# Amenities to compare: every nodes_test column except the identifier and
# the location columns.
amenities_list = list(nodes_test.columns)
for non_amenity_col in ('osmid', 'x', 'y', 'geometry'):
    amenities_list.remove(non_amenity_col)
amenities_list

['denue_preescolar',
 'denue_primaria',
 'denue_secundaria',
 'clues_primer_nivel',
 'denue_guarderias',
 'denue_dif',
 'denue_supermercado',
 'denue_abarrotes',
 'denue_carnicerias',
 'sip_mercado',
 'denue_peluqueria',
 'denue_farmacias',
 'denue_ferreteria_tlapaleria',
 'denue_art_limpieza',
 'denue_ropa',
 'denue_calzado',
 'denue_muebles',
 'denue_lavanderia',
 'denue_revistas_periodicos',
 'denue_pintura',
 'denue_restaurante_insitu',
 'denue_restaurante_llevar',
 'denue_bares',
 'denue_cafe',
 'sip_cancha',
 'sip_unidad_deportiva',
 'sip_espacio_publico',
 'denue_parque_natural',
 'denue_cines',
 'denue_museos']

In [64]:
# Inner merge (to compare same osmids). Amenity columns present in both frames
# receive the default pandas suffixes: _x (nodes_analysis) and _y (nodes_test).
compare = nodes_analysis.merge(nodes_test, on='osmid', how='inner')

# Compare old and new amenities: per-node difference = new time - old time
compare_list = []
for amenity in amenities_list:
    diff_col = f"{amenity}_diff"
    compare[diff_col] = compare[f"{amenity}_y"] - compare[f"{amenity}_x"]
    compare_list.append(diff_col)

# Save df with time differences only
compare_diff = compare[compare_list]

# Visualize sum of time differences.
# The frame is built once from a list of records instead of being enlarged
# cell by cell through .loc with a manual row counter.
summary = pd.DataFrame(
    [{'amenity': diff_col, 'diff': compare[diff_col].sum()} for diff_col in compare_list],
    columns=['amenity', 'diff'],
)

# Show
# denue_parque_natural_diff is 0 because there are no parques naturales in
# Aguascalientes (the column is all NaN, and NaN values are skipped by .sum())
summary

Unnamed: 0,amenity,diff
0,denue_preescolar_diff,-7.665077e-12
1,denue_primaria_diff,-1.437461e-12
2,denue_secundaria_diff,1.616421e-11
3,clues_primer_nivel_diff,-6.788167e-12
4,denue_guarderias_diff,-6.542461e-12
5,denue_dif_diff,-1.041363e-11
6,denue_supermercado_diff,-2.764117e-11
7,denue_abarrotes_diff,-8.497182e-12
8,denue_carnicerias_diff,-2.564849e-11
9,sip_mercado_diff,-1.812901e-11


## Find difference in nodes time value

Successful: the difference in times is due to very small decimals in each node (on the order of e-13 mins; no difference shown in the table below exceeds e-12 mins).

In [59]:
# How much is the difference between old and new time by node?
# Thresholds (in minutes) against which every per-node difference is checked
differences_to_analyse = [1, 1e-12, 1e-13, 1e-14, 1e-15]

# One record per *_diff column. For each threshold it stores how many nodes
# have a difference above +threshold and how many below -threshold.
# NaN differences satisfy neither comparison, so they are never counted.
diff_records = []
for diff_col in compare_list:
    record = {'amenity': diff_col}
    for threshold in differences_to_analyse:
        record[f"{threshold}mins_diff"] = int((compare[diff_col] > threshold).sum())
        record[f"{-threshold}mins_diff"] = int((compare[diff_col] < -threshold).sum())
    diff_records.append(record)

# Built once from the records; counts are stored as integers
nodes_diff_df = pd.DataFrame(diff_records)

# There are 51,371 nodes
nodes_diff_df

Unnamed: 0,amenity,1mins_diff,-1mins_diff,1e-12mins_diff,-1e-12mins_diff,1e-13mins_diff,-1e-13mins_diff,1e-14mins_diff,-1e-14mins_diff,1e-15mins_diff,-1e-15mins_diff
0,denue_preescolar_diff,0.0,0.0,0.0,0.0,0.0,0.0,262.0,430.0,3397.0,3315.0
1,denue_primaria_diff,0.0,0.0,0.0,0.0,0.0,0.0,373.0,360.0,3937.0,4239.0
2,denue_secundaria_diff,0.0,0.0,0.0,0.0,0.0,0.0,1054.0,495.0,6649.0,4463.0
3,clues_primer_nivel_diff,0.0,0.0,0.0,0.0,0.0,0.0,434.0,877.0,4070.0,4585.0
4,denue_guarderias_diff,0.0,0.0,0.0,0.0,0.0,0.0,257.0,576.0,3856.0,4394.0
5,denue_dif_diff,0.0,0.0,0.0,0.0,0.0,4.0,475.0,636.0,3910.0,4022.0
6,denue_supermercado_diff,0.0,0.0,0.0,0.0,0.0,0.0,461.0,1517.0,4270.0,6266.0
7,denue_abarrotes_diff,0.0,0.0,0.0,0.0,0.0,0.0,144.0,370.0,1973.0,1814.0
8,denue_carnicerias_diff,0.0,0.0,0.0,0.0,8.0,0.0,321.0,837.0,2992.0,3935.0
9,sip_mercado_diff,0.0,0.0,0.0,0.0,3.0,45.0,2245.0,2854.0,4850.0,7525.0


## Find difference in amount of nodes on both gdfs

* nodes_analysis (original, from db) has __51,371__ unique nodes for Aguascalientes
* nodes_test (Script 21) has more, __51,434__ unique nodes.
* Why?                                                

In [16]:
# FIND NODES ON NODES ANALYSIS THAT ARE NOT IN NODES_TEST
# The left merge with indicator=True adds a '_merge' column telling where each
# osmid was found; rows flagged 'left_only' exist in nodes_analysis only.
missing_nodes_1 = (
    nodes_analysis
    .merge(nodes_test, on='osmid', how='left', indicator=True)
    .loc[lambda merged: merged['_merge'] == 'left_only']
    .drop(columns=['_merge'])
)
missing_nodes_1

Unnamed: 0,osmid,geometry_x,metropolis,sip_centro_admin,sip_teatro,sip_cancha_x,sip_unidad_deportiva_x,sip_espacio_publico_x,sip_mercado_x,clues_primer_nivel_x,...,denue_cafe_y,sip_cancha_y,sip_unidad_deportiva_y,sip_espacio_publico_y,denue_parque_natural_y,denue_cines_y,denue_museos_y,x,y,geometry_y


In [29]:
# FIND NODES ON NODES_TEST THAT ARE NOT IN NODES ANALYSIS
# Left merge from nodes_test; the '_merge' indicator records where each osmid was found
merged_from_test = nodes_test.merge(nodes_analysis, on='osmid', how='left', indicator=True)

# Rows flagged 'left_only' exist in nodes_test only
only_in_test = merged_from_test['_merge'] == 'left_only'
missing_nodes_2 = merged_from_test.loc[only_in_test].drop(columns=['_merge'])

#Show
print(missing_nodes_2.shape)
missing_nodes_2.head(1)

(63, 79)


Unnamed: 0,osmid,denue_preescolar_x,denue_primaria_x,denue_secundaria_x,clues_primer_nivel_x,denue_guarderias_x,denue_dif_x,denue_supermercado_x,denue_abarrotes_x,denue_carnicerias_x,...,denue_cines_y,denue_centro_cultural,denue_parque_natural_y,denue_papelerias,denue_libros,denue_revistas_periodicos_y,denue_ferreteria_tlapaleria_y,denue_art_limpieza_y,denue_pintura_y,denue_peluqueria_y
5783,1103032963,,,,,,,,,,...,,,,,,,,,,


In [30]:
# Pull the rows of nodes_test whose osmid is among the missing ones
# Result: all 63 missing nodes appear to have NaN values only.
missing_nodes_lst = list(missing_nodes_2['osmid'].unique())
is_missing = nodes_test['osmid'].isin(missing_nodes_lst)
look = nodes_test.loc[is_missing].copy()

#Show
print(look.shape)
look.head(1)

(63, 34)


Unnamed: 0,osmid,denue_preescolar,denue_primaria,denue_secundaria,clues_primer_nivel,denue_guarderias,denue_dif,denue_supermercado,denue_abarrotes,denue_carnicerias,...,denue_cafe,sip_cancha,sip_unidad_deportiva,sip_espacio_publico,denue_parque_natural,denue_cines,denue_museos,x,y,geometry
5783,1103032963,,,,,,,,,,...,,,,,,,,-102.33579,22.051572,POINT (-102.33579 22.05157)


In [34]:
# Count the NaN values of every amenity column in nodes_test, to check whether
# each one has exactly 63 (if true, find those nodes in GIS)
# Result: affirmative.
for amenity in amenities_list:
    nan_count = nodes_test[amenity].isna().sum()
    print(f"{amenity} has {nan_count} nans.")

denue_preescolar has 63 nans.
denue_primaria has 63 nans.
denue_secundaria has 63 nans.
clues_primer_nivel has 63 nans.
denue_guarderias has 63 nans.
denue_dif has 63 nans.
denue_supermercado has 63 nans.
denue_abarrotes has 63 nans.
denue_carnicerias has 63 nans.
sip_mercado has 63 nans.
denue_peluqueria has 63 nans.
denue_farmacias has 63 nans.
denue_ferreteria_tlapaleria has 63 nans.
denue_art_limpieza has 63 nans.
denue_ropa has 63 nans.
denue_calzado has 63 nans.
denue_muebles has 63 nans.
denue_lavanderia has 63 nans.
denue_revistas_periodicos has 63 nans.
denue_pintura has 63 nans.
denue_restaurante_insitu has 63 nans.
denue_restaurante_llevar has 63 nans.
denue_bares has 63 nans.
denue_cafe has 63 nans.
sip_cancha has 63 nans.
sip_unidad_deportiva has 63 nans.
sip_espacio_publico has 63 nans.
denue_parque_natural has 51434 nans.
denue_cines has 63 nans.
denue_museos has 63 nans.


In [36]:
# Export the NaN-only nodes as a GeoPackage to analyse the situation in QGIS
# NOTE(review): the input file name says "scriptv1" while this output says "scriptv2" — confirm intended
nan_nodes_path = "../../../data/external/temporal_fromjupyter/proximity_v2/ags_scriptv2_nan_nodes.gpkg"
look.to_file(nan_nodes_path, driver='GPKG')

### Data to analyse nodes difference situation in QGIS

In [2]:
# Network tables used to calculate the nearest node to each poi
network_schema = 'osmnx'
edges_table = 'edges_speed'
nodes_table = 'nodes'

# Area of interest: the rows of the metropolis table for the city, dissolved
# into a single geometry
city = 'Aguascalientes'
metro_schema = 'metropolis'
metro_table = 'metro_gdf_2020'

query = f'SELECT * FROM {metro_schema}.{metro_table} WHERE "city" LIKE \'{city}\''
mun_gdf = aup.gdf_from_query(query, geometry_col='geometry').set_crs("EPSG:4326")
aoi = mun_gdf.dissolve()

# Download the network (graph, nodes and edges) for the area of interest
G, nodes, edges = aup.graph_from_hippo(
    aoi,
    schema=network_schema,
    edges_folder=edges_table,
    nodes_folder=nodes_table,
)

In [3]:
# Export the downloaded network (nodes first, then edges) as GeoPackages to
# analyse the situation in QGIS
qgis_output_dir = "../../../data/external/temporal_fromjupyter/proximity_v2"
for layer_name, layer_gdf in (("nodes", nodes), ("edges", edges)):
    layer_gdf.to_file(f"{qgis_output_dir}/ags_{layer_name}_2020.gpkg", driver='GPKG')