# tests_01 - Script 21 nodes output comparison

This notebook compares times by osmid between the original proxanalysis located in data base (nodes_proximity_2020) and the Script 21 output for Aguascalientes (local save=True, test mode).

__Results:__
* There was a small difference in times. After analysing that small difference node by node, it turned out to be due to very very very (e-13) small decimals. Therefore, __the result was deemed as successful.__
  
* Showed differences in the amount of nodes. Tests indicate that the source of the problem is that function aup.graph_from_hippo can generate disconected networks (e.g. two or more different networks for an area of interest). This was solved by creating a function (filter_city_osmnx_network) and inserting it onto Script 09-hex_speed.py so that aup.graph_from_hippo recieves an already filtered edges_speed for each city. __[Applies to Version 2 (edges_speed_23_line) only, so not visible in current version 1 comparison]__

## Import libraries

In [1]:
import os
import sys

import pandas as pd
import geopandas as gpd
import osmnx as ox
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

module_path = os.path.abspath(os.path.join('../../../'))
if module_path not in sys.path:
    sys.path.append(module_path)
    import aup

## Load data

### Load data - Load original data (Already in database)

In [2]:
# Version 1 original (Script 01 + 02 + 15)
city = 'Aguascalientes'
schema = 'prox_analysis'
table = 'nodes_proximity_2020'
query = f"SELECT * FROM {schema}.{table} WHERE \"metropolis\" LIKE \'{city}\'"
nodes_gdf = aup.gdf_from_query(query, geometry_col='geometry')

# Show
print(nodes_gdf.shape)
nodes_gdf.head(1)

(2208953, 7)


Unnamed: 0,osmid,x,y,geometry,time,amenity,metropolis
0,272921360,-102.295073,21.872876,POINT (-102.29507 21.87288),10.770629,denue_preescolar,Aguascalientes


In [3]:
# Transform original nodes data (rows to columns)
nodes_analysis = nodes_gdf.drop_duplicates(subset='osmid', keep="last")[['osmid','geometry','metropolis']].copy()
for amenidad in list(nodes_gdf.amenity.unique()):
    nodes_tmp = nodes_gdf.loc[nodes_gdf.amenity == amenidad,['osmid','time']]
    nodes_tmp = nodes_tmp.rename(columns={'time':amenidad})

    if nodes_tmp[amenidad].mean() == 0:
        nodes_tmp[amenidad] = np.nan

    nodes_analysis = nodes_analysis.merge(nodes_tmp, on='osmid')

# Show
print(nodes_analysis.shape)
nodes_analysis.head(1)

(51371, 46)


Unnamed: 0,osmid,geometry,metropolis,denue_preescolar,denue_primaria,denue_secundaria,denue_escuela_mixta,denue_casa_adultos_mayores,denue_guarderias,denue_dif,...,sip_cancha,sip_unidad_deportiva,sip_espacio_publico,sip_mercado,clues_primer_nivel,clues_segundo_nivel,clues_tercer_nivel,sigade_preescolar,sigade_primaria,sigade_secundaria
0,272921360,POINT (-102.29507 21.87288),Aguascalientes,10.770629,5.615674,11.812107,5.25473,9.729402,5.09561,6.486561,...,156.885198,28.44305,5.692959,22.145997,6.486561,6.486561,53.636174,3.327789,3.327789,3.327789


In [4]:
# There are no parques naturales in Aguascalientes
nodes_analysis.denue_parque_natural.unique()

array([nan])

### Load data - Load prox script 21 data (Aguascalientes, version 1)

In [18]:
# In order to compare data with nodes_proximity_2020, Script 21 should be run with the following configuration:
config = """
version = 1

# Databases and schemas
nodes_table = 'nodes'
edges_table = 'edges_speed'
denue_schema = 'denue'
denue_table = 'denue_2020'
clues_schema = 'denue'
clues_table = 'clues'
sip_schema = 'denue'
sip_table = 'sip_2020'

# Network distance method used in function pois_time.
prox_measure = 'time_min'

# Count available amenities at given time proximity (minutes)?
count_pois = (False,15)

# Run with pop_output, but since pop output is not passed to nodes,
# That's only comparable on Notebook test_02 - Script 21 hex output comparison
pop_output = True

# Hexagon resolutions of output
res_list = [8]

# Stop at any given point of script's main function?
stop = False

# Saving
nodes_save = False
final_save = False 
local_save = True
"""

In [6]:
# test
dir = "../../../data/processed/proximity_v2/test_ags_proxanalysis_scriptv1_nodes.gpkg"
nodes_test = gpd.read_file(dir)

# Show
print(nodes_test.shape)
nodes_test.head(1)

(51434, 34)


Unnamed: 0,osmid,denue_preescolar,denue_primaria,denue_secundaria,clues_primer_nivel,denue_guarderias,denue_dif,denue_supermercado,denue_abarrotes,denue_carnicerias,...,denue_cafe,sip_cancha,sip_unidad_deportiva,sip_espacio_publico,denue_parque_natural,denue_cines,denue_museos,x,y,geometry
0,272921360,10.770629,5.615674,11.812107,6.486561,5.09561,6.486561,3.592175,1.673421,6.486561,...,5.25473,156.885198,28.44305,5.692959,,26.069111,7.589368,-102.295073,21.872876,POINT (-102.29507 21.87288)


## Compare data

### Proximity data cols to compare

In [8]:
# Create amenities list which will be compared by node
amenities_list = list(nodes_test.columns)
amenities_list.remove('osmid')
amenities_list.remove('x')
amenities_list.remove('y')
amenities_list.remove('geometry')
amenities_list

['denue_preescolar',
 'denue_primaria',
 'denue_secundaria',
 'clues_primer_nivel',
 'denue_guarderias',
 'denue_dif',
 'denue_supermercado',
 'denue_abarrotes',
 'denue_carnicerias',
 'sip_mercado',
 'denue_peluqueria',
 'denue_farmacias',
 'denue_ferreteria_tlapaleria',
 'denue_art_limpieza',
 'denue_ropa',
 'denue_calzado',
 'denue_muebles',
 'denue_lavanderia',
 'denue_revistas_periodicos',
 'denue_pintura',
 'denue_restaurante_insitu',
 'denue_restaurante_llevar',
 'denue_bares',
 'denue_cafe',
 'sip_cancha',
 'sip_unidad_deportiva',
 'sip_espacio_publico',
 'denue_parque_natural',
 'denue_cines',
 'denue_museos']

### Compare

In [9]:
# Inner merge (to compare same osmids)
compare = nodes_analysis.merge(nodes_test,on='osmid',how='inner')

# Compare old and new amenities
compare_list = []
for amenity in amenities_list:

    old_amenity = f"{amenity}_x"
    new_amenity = f"{amenity}_y"
    compare[f"{amenity}_diff"] = compare[new_amenity] - compare[old_amenity]

    compare_list.append(f"{amenity}_diff")

# Save df with time differences only
compare_diff = compare[compare_list]

# Visualize sum of time differences
summary = pd.DataFrame()
i = 0
for compare_amenity in compare_list:

    diff_value = compare[compare_amenity].sum()
    summary.loc[i,'amenity'] = compare_amenity
    summary.loc[i,'diff'] = diff_value

    i = i+1

# Show
# denue_parque_natural_diff is 0 because there are no parques naturales in Aguascalientes
summary

Unnamed: 0,amenity,diff
0,denue_preescolar_diff,-7.665077e-12
1,denue_primaria_diff,-1.437461e-12
2,denue_secundaria_diff,1.616421e-11
3,clues_primer_nivel_diff,-6.788167e-12
4,denue_guarderias_diff,-6.542461e-12
5,denue_dif_diff,-1.041363e-11
6,denue_supermercado_diff,-2.764117e-11
7,denue_abarrotes_diff,-8.497182e-12
8,denue_carnicerias_diff,-2.564849e-11
9,sip_mercado_diff,-1.812901e-11


## Solve problems [Solved]

In [29]:
# If want to check how problems were solved, turn to True
run_solved_problems = False

### Difference in times on previous comparison [Solved]

Problem: The diff is not 0.
Result: the difference in times is due to small decimals (e-13 mins at least) in each node.

Approach: The following cell counts the number of nodes in each difference range to analyze in order to know if the difference is due to very small decimals in all nodes or due to big differences that balance themselves.

In [20]:
if run_solved_problems:
    # How much is the difference between old and new time by node?
    differences_to_analyse = [1,0.000000000001,0.0000000000001,0.00000000000001,0.000000000000001]
    
    nodes_diff_df = pd.DataFrame()
    
    for time in differences_to_analyse:
        i = 0
        for diff_col in compare_list:
            nodes_diff_df.loc[i,'amenity'] = diff_col
            
            nodes_with_positive_diff = compare.loc[(compare[diff_col]>time)].shape[0]
            nodes_with_negative_diff = compare.loc[(compare[diff_col]<-time)].shape[0]
    
            nodes_diff_df.loc[i,f"{time}mins_diff"] = nodes_with_positive_diff
            nodes_diff_df.loc[i,f"{-time}mins_diff"] = nodes_with_negative_diff
    
            i = i+1
    
    # There are 51,371 nodes
    nodes_diff_df

### Difference in amount of nodes on both gdfs [Solved]

Problem:
* nodes_analysis (original, from db) had __51,371__ unique nodes for Aguascalientes
* nodes_test (Script 21) had more, __51,434__ unique nodes.

Result: Solved on Script 09-hex_speed.py It shouldn't be the case anymore.                                        

In [21]:
if run_solved_problems:
    # FIND NODES ON NODES ANALYSIS THAT ARE NOT IN NODES_TEST (Result: 0 missing nodes)
    # Find how osmids relate (through indicator)
    missing_nodes_1 = nodes_analysis.merge(nodes_test, on='osmid', how='left', indicator=True)
    # Select nodes which are in left_gdf only ('left_only')
    missing_nodes_1 = missing_nodes_1.loc[missing_nodes_1['_merge']=='left_only']
    missing_nodes_1 = missing_nodes_1.drop(columns=['_merge'])
    missing_nodes_1

In [22]:
if run_solved_problems:
    # FIND NODES ON NODES_TEST ANALYSIS THAT ARE NOT IN NODES ANALYSIS (Result: 63 missing nodes)
    # Find how osmids relate (through indicator)
    missing_nodes_2 = nodes_test.merge(nodes_analysis, on='osmid', how='left', indicator=True)
    # Select nodes which are in left_gdf only ('left_only')
    missing_nodes_2 = missing_nodes_2.loc[missing_nodes_2['_merge']=='left_only']
    missing_nodes_2 = missing_nodes_2.drop(columns=['_merge'])
    
    #Show
    print(missing_nodes_2.shape)
    missing_nodes_2.head(1)

In [23]:
if run_solved_problems:
    #Find those nodes in nodes_test (Result: all 63 missing nodes appear to have NaN values only.)
    missing_nodes_lst = list(missing_nodes_2.osmid.unique())
    look = nodes_test.loc[nodes_test.osmid.isin(missing_nodes_lst)].copy()
    
    #Show
    print(look.shape)
    look.head(1)

In [24]:
if run_solved_problems:
    # Find if all columns in nodes_test have exactly 63 nans (if true, find those nodes in GIS)
    # (Result: affirmative)
    for amenity in amenities_list:
        nans = nodes_test.loc[nodes_test[amenity].isna()]
        print(f"{amenity} has {nans.shape[0]} nans.")

In [25]:
if run_solved_problems:
    # Analyse situation in QGIS
    look.to_file("../../../data/external/temporal_fromjupyter/proximity_v2/ags_scriptv2_nan_nodes.gpkg", driver='GPKG')

#### First solution: Fixing nan rows issue by removing nodes with sum of times = 0

In [26]:
# This fix was proposed as a quickfix, but later fix on Script 09-hex_speed.py was implemented. 
if run_solved_problems:
    print(nodes_test.shape)
    nodes_fixed = nodes_test.copy()
    nodes_fixed['check'] = nodes_fixed[amenities_list].sum(axis=1)
    nodes_fixed = nodes_fixed.loc[nodes_fixed['check'] > 0].copy()
    nodes_fixed.drop(columns=['check'],inplace=True)
    
    # Show
    print(nodes_fixed.shape)
    print(f"Deleted {nodes_test.shape[0] - nodes_fixed.shape[0]} rows with nan values only.")
    nodes_fixed.head(1)

#### Second solution: Analyse nodes difference situation in QGIS and solve [Implemented]

In [28]:
# From this analysis emerged solution on Script 09-hex_speed.py 
if run_solved_problems:
    city = 'Aguascalientes'
    metro_schema = 'metropolis'
    metro_table = 'metro_gdf_2020'
    
    # Download area of interest
    query = f"SELECT * FROM {metro_schema}.{metro_table} WHERE \"city\" LIKE \'{city}\'"
    mun_gdf = aup.gdf_from_query(query, geometry_col='geometry')
    mun_gdf = mun_gdf.set_crs("EPSG:4326")
    aoi = mun_gdf.dissolve()
    
    # Download Network used to calculate nearest note to each poi
    network_schema = 'osmnx'
    edges_table = 'edges_speed'
    nodes_table = 'nodes'
    G, nodes, edges = aup.graph_from_hippo(aoi, schema=network_schema, edges_folder=edges_table, nodes_folder=nodes_table)

    # Analyse situation in QGIS
    nodes.to_file("../../../data/external/temporal_fromjupyter/proximity_v2/ags_nodes_2020.gpkg", driver='GPKG')
    edges.to_file("../../../data/external/temporal_fromjupyter/proximity_v2/ags_edges_2020.gpkg", driver='GPKG')