# tests_02 - Script 21 hexs output comparison

This notebook compares proximity times at hexagon resolution 8 between the original proximity analysis stored in the database (`time_15_min_analysis_hexres8`) and the Script 21 output for Aguascalientes (run with `local_save = True`, test mode).

__Results:__

* There was a small difference in times. After analysing that difference hex by hex, it turned out to be due to extremely small floating-point decimals (on the order of e-13 or smaller). Therefore, __the result was deemed successful.__

## Import libraries

In [1]:
import os
import sys

import pandas as pd
import geopandas as gpd
import osmnx as ox
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Make the directory three levels up importable (presumably the repository root
# that contains the project-local `aup` package — confirm against the repo layout).
module_path = os.path.abspath(os.path.join('../../../'))
if module_path not in sys.path:
    sys.path.append(module_path)

# Import unconditionally: when the import lived inside the `if` above, it was
# skipped whenever the path was already registered (e.g. through PYTHONPATH),
# leaving `aup` undefined for the rest of the notebook.
import aup

## Load data

### Load data - Load original data (Already in database)

In [16]:
# Version 1 original (Script 01 + 02 + 15)
city = 'Aguascalientes'
prox_schema = 'prox_analysis'
prox_table = 'time_15_min_analysis_hexres8'

# Column names in the database table -> names used for the comparison below
v1_column_renames = {
    'hex_id_8': 'hex_id',
    'max_idx_15_min': 'max_time',
    'dens_pobha': 'dens_pob_ha',
}

# Fetch every row of the original table for the selected city
query = (
    f"SELECT * FROM {prox_schema}.{prox_table} "
    f"WHERE \"city\" LIKE '{city}'"
)
v1_original_prox_gdf = aup.gdf_from_query(query, geometry_col='geometry').rename(
    columns=v1_column_renames
)

# Show
print(v1_original_prox_gdf.shape)
v1_original_prox_gdf.head(1)

(309, 30)


Unnamed: 0,hex_id,geometry,max_escuelas,max_preescolar,max_primaria,max_secundaria,max_servicios comunitarios,max_salud,max_guarderías,max_asistencia social,...,max_time,pobtot,pobfem,pobmas,pob_0a14,pob_15a24,pob_25a59,p_60ymas,dens_pob_ha,city
0,88498e36dbfffff,"POLYGON ((-102.34777 21.92336, -102.34267 21.9...",15.057619,13.586498,15.057619,11.969535,62.167866,53.640209,50.912589,62.167866,...,71.010665,264.7125,131.68126,133.03125,75.9375,50.9625,117.731249,20.081251,3.063433,Aguascalientes


### Load data - Load prox script 21 data (Aguascalientes, version 1)

In [3]:
# To reproduce the Script 21 output compared in this notebook, Script 21 should be
# run with the configuration below. The settings are stored in a plain string for
# reference only; the cells shown here do not parse or execute it.
# NOTE(review): the original comment referred to nodes_proximity_2020, while this
# notebook compares against time_15_min_analysis_hexres8 — confirm which is intended.
config = """
version = 1

# Databases and schemas
nodes_table = 'nodes'
edges_table = 'edges_speed'
denue_schema = 'denue'
denue_table = 'denue_2020'
clues_schema = 'denue'
clues_table = 'clues'
sip_schema = 'denue'
sip_table = 'sip_2020'

# Network distance method used in function pois_time.
prox_measure = 'time_min'

# Count available amenities at given time proximity (minutes)?
count_pois = (False,15)

# Run with pop_output, but since pop output is not passed to nodes,
# That's only comparable on Notebook test_02 - Script 21 hex output comparison
pop_output = True

# Hexagon resolutions of output
res_list = [8]

# Stop at any given point of script's main function?
stop = False

# Saving
nodes_save = False
final_save = False 
local_save = True
"""

In [5]:
# Load the Script 21 test output saved locally as a GeoPackage.
# The path is stored in `hex_test_path` rather than `dir` so that the built-in
# dir() is not shadowed for the rest of the kernel session.
hex_test_path = "../../../data/processed/proximity_v2/test_ags_proxanalysis_scriptv1_hex.gpkg"
hex_test = gpd.read_file(hex_test_path)

# Show
print(hex_test.shape)
hex_test.head(1)

(309, 42)


Unnamed: 0,hex_id,res,max_escuelas,max_preescolar,max_primaria,max_secundaria,max_servicios comunitarios,max_salud,max_guarderías,max_asistencia social,...,idx_actividad física,idx_cultural,mean_time,median_time,max_time,idx_sum,pobtot,dens_pob_ha,city,geometry
0,88498e36dbfffff,8,15.057619,13.586498,15.057619,11.969535,62.167866,53.640209,50.912589,62.167866,...,0.854075,0.002455,28.977914,17.066295,71.010665,8.47822,264.7125,3.063433,Aguascalientes,"POLYGON ((-102.34777 21.92336, -102.34267 21.9..."


## Compare data

### Data cols to compare

In [23]:
# Columns of the original table that are NOT compared hex by hex
excluded_cols = (
    'hex_id', 'geometry',
    'pobfem', 'pobmas',
    'pob_0a14', 'pob_15a24', 'pob_25a59', 'p_60ymas',
    'city',
)

# Start from every column of the original table and drop the excluded ones.
# list.remove raises ValueError if an expected column is missing.
col_list = v1_original_prox_gdf.columns.tolist()
for excluded in excluded_cols:
    col_list.remove(excluded)
col_list

['max_escuelas',
 'max_preescolar',
 'max_primaria',
 'max_secundaria',
 'max_servicios comunitarios',
 'max_salud',
 'max_guarderías',
 'max_asistencia social',
 'max_comercio',
 'max_alimentos',
 'max_personal',
 'max_farmacias',
 'max_hogar',
 'max_complementarios',
 'max_entretenimiento',
 'max_social',
 'max_actividad física',
 'max_cultural',
 'max_time',
 'pobtot',
 'dens_pob_ha']

In [24]:
# Inner merge (to compare same hexs). Column names shared by both frames receive
# the default pandas suffixes: `_x` for the original table (left) and `_y` for
# the Script 21 output (right).
compare = v1_original_prox_gdf.merge(hex_test, on='hex_id', how='inner')

# Difference (new - old) for every compared column
compare_list = []
for col in col_list:
    diff_col = f"{col}_diff"
    compare[diff_col] = compare[f"{col}_y"] - compare[f"{col}_x"]
    compare_list.append(diff_col)

# Save df with differences only
compare_diff = compare[compare_list]

# Signed sum of differences per column, built in a single constructor call
# instead of growing an empty DataFrame row by row through .loc.
# Each column is summed on its own Series so the floating-point result is the
# same as a plain `compare[col].sum()`.
summary = pd.DataFrame({
    'amenity': compare_list,
    'diff': [compare[diff_col].sum() for diff_col in compare_list],
})

# Show
# A signed sum close to 0 does not by itself prove the hexs match: large
# differences of opposite sign would cancel out, so per-hex magnitudes still
# need to be checked separately.
summary

Unnamed: 0,amenity,diff
0,max_escuelas_diff,-3.552714e-15
1,max_preescolar_diff,-8.881784e-15
2,max_primaria_diff,1.776357e-15
3,max_secundaria_diff,-3.552714e-15
4,max_servicios comunitarios_diff,-1.776357e-15
5,max_salud_diff,7.105427e-15
6,max_guarderías_diff,0.0
7,max_asistencia social_diff,-8.881784e-15
8,max_comercio_diff,-5.329071e-15
9,max_alimentos_diff,1.776357e-15


## Solve problems [Solved]

### Difference in times on previous comparison [Solved]

__Problem:__ Same as the one detected on Notebook tests_01. The diff is not 0. 

__Result:__ The difference in times is due to very small decimals (on the order of e-14 minutes or less) in each hex.

__Approach:__ The following cell counts, for each compared column, the number of hexs whose difference exceeds each threshold (in both the positive and the negative direction). This shows whether the non-zero sums come from very small decimals across the hexs or from large differences that cancel each other out.

In [27]:
# How much is the difference between old and new values by hex?
# Thresholds against which every per-hex difference is checked, in both directions.
differences_to_analyse = [1, 1e-12, 1e-13, 1e-14, 1e-15]

# Build every column up front and create the DataFrame once, instead of
# enlarging it cell by cell with .loc (which also turned the counts into floats
# and rewrote the 'amenity' column on every threshold pass).
hex_counts = {'amenity': list(compare_list)}
for threshold in differences_to_analyse:
    # Number of hexs whose difference is strictly above +threshold
    hex_counts[f"{threshold}mins_diff"] = [
        int((compare[diff_col] > threshold).sum()) for diff_col in compare_list
    ]
    # Number of hexs whose difference is strictly below -threshold
    hex_counts[f"{-threshold}mins_diff"] = [
        int((compare[diff_col] < -threshold).sum()) for diff_col in compare_list
    ]

hexs_diff_df = pd.DataFrame(hex_counts)
hexs_diff_df

Unnamed: 0,amenity,1mins_diff,-1mins_diff,1e-12mins_diff,-1e-12mins_diff,1e-13mins_diff,-1e-13mins_diff,1e-14mins_diff,-1e-14mins_diff,1e-15mins_diff,-1e-15mins_diff
0,max_escuelas_diff,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,max_preescolar_diff,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0
2,max_primaria_diff,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,max_secundaria_diff,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,max_servicios comunitarios_diff,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5,max_salud_diff,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
6,max_guarderías_diff,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
7,max_asistencia social_diff,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0
8,max_comercio_diff,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
9,max_alimentos_diff,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0
