# 13c_encuestas Merge physical variables

This notebook takes __rutas_edges.gpkg__ (a layer created manually in GIS by selecting (For each poll whose route was manually drawn) the edges from edges_var_gdl were the route passed) and identifies the original edges from edges_var_gdl.gpkg in order to merge in the output Poll code and variables data for each edge.

* __Output: rutas_edges_var.gpkg__ A layer containing poll code (e.g. JC1V008) and variables data for each edge.

## Import libraries

In [1]:
first_folder_path = '../../../'

In [2]:
import warnings
warnings.filterwarnings('ignore')
import geopandas as gpd
from geopandas.tools import overlay
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np
import seaborn as sns
import random
import scipy.stats as stats

import os
import sys
module_path = os.path.abspath(os.path.join(first_folder_path))
if module_path not in sys.path:
    sys.path.append(module_path)
    import aup
else:
    import aup

## Load physical variables and create unique edge_id [Canceled]

In [5]:
def create_unique_edge_id(edges, order='uvkey'):
    """
    Create a unique edge_id based on the 'u', 'v' and 'key' columns of the edges GeoDataFrame.

    Args:
        edges (geopandas.GeoDataFrame): GeoDataFrame containing the edges of the network.
        order (str, optional): Order for the unique id. Defaults to 'uvkey'.

    Returns:
        geopandas.GeoDataFrame: GeoDataFrame with the unique edge_id column.
    """
    # Turn ID data to string
    edges['u'] = edges['u'].astype('str')
    edges['v'] = edges['v'].astype('str')
    edges['key'] = edges['key'].astype('str')
    # Concatenate ID data to create unique edge_id
    if order == 'uvkey':
        edges['edge_id'] = edges['u']+edges['v']+edges['key']
    elif order == 'vukey':
        edges['edge_id'] = edges['v']+edges['u']+edges['key']
    # Turn ID data back to int
    edges['u'] = edges['u'].astype('int')
    edges['v'] = edges['v'].astype('int')
    edges['key'] = edges['key'].astype('int')

    return edges

In [6]:
edges_var_gdl = create_unique_edge_id(edges_var_gdl)

# Show
print(edges_var_gdl.shape)
edges_var_gdl.head(2)

(197439, 22)


Unnamed: 0,u,v,key,grade_abs,ndvi_mean,entropy,sidewalk_available,no_sidewalk_available,banqueta_pct,pobtot_u,...,average_population,dens_u,dens_v,average_density,interdens_u,interdens_v,average_intersdens,geometry,unique_edge_id,edge_id
0,67637870229114485,67640019229114922,0,0.0,0.118923,1.144632,179.791185,69.176583,0.722146,80.0,...,53.5,153.566318,111.104363,132.335341,19.498764,19.071526,19.285145,"MULTILINESTRING ((676378.709 2291144.858, 6763...",0 67637870229114485\n1 6764001...,67637870229114485676400192291149220
1,67640019229114922,67640526229117504,0,0.0,0.188798,0.31737,181.014513,68.547498,0.725329,27.0,...,16.0,111.104363,31.101049,71.102706,19.071526,19.041632,19.056579,"MULTILINESTRING ((676400.196 2291149.223, 6764...",0 67637870229114485\n1 6764001...,67640019229114922676405262291175040


In [7]:
#edges_var_gdl.to_file(first_folder_path + f"data/processed/volvo/edges_var_gdl/edges_var_gdl_id.gpkg")

## Merge physical variables (edges) with edges that have route codes (Manually selected following coincidence with manually drawn routes)

### __Load data__ - Physical variables

In [3]:
edges_var_gdl = gpd.read_file(first_folder_path + f"data/processed/volvo/edges_var_gdl/edges_var_gdl.gpkg")

# Show
print(edges_var_gdl.crs)
print(edges_var_gdl.shape)
edges_var_gdl.head(2)

EPSG:32613
(197439, 20)


Unnamed: 0,u,v,key,grade_abs,ndvi_mean,entropy,sidewalk_available,no_sidewalk_available,banqueta_pct,pobtot_u,pobtot_v,total_pobtot,average_population,dens_u,dens_v,average_density,interdens_u,interdens_v,average_intersdens,geometry
0,67637870229114485,67640019229114922,0,0.0,0.118923,1.144632,179.791185,69.176583,0.722146,80.0,27.0,107.0,53.5,153.566318,111.104363,132.335341,19.498764,19.071526,19.285145,"MULTILINESTRING ((676378.709 2291144.858, 6763..."
1,67640019229114922,67640526229117504,0,0.0,0.188798,0.31737,181.014513,68.547498,0.725329,27.0,5.0,32.0,16.0,111.104363,31.101049,71.102706,19.071526,19.041632,19.056579,"MULTILINESTRING ((676400.196 2291149.223, 6764..."


### __Load data__ - Routes with codes

In [4]:
rutas_edges = gpd.read_file(first_folder_path + f"data/processed/volvo/encuestas/03_rutas_en_edges/rutas_edges.gpkg")
rutas_edges.drop(columns=['unique_edge_id','has_id'],inplace=True) #Added in GIS, ignore if gpkg does not have those cols

# Show
print(rutas_edges.crs)
print(rutas_edges.shape)
rutas_edges.head(2)

EPSG:4326
(1865, 2)


Unnamed: 0,code,geometry
0,JC1V008,"LINESTRING (-103.38511 20.68946, -103.38498 20..."
1,JC1V008,"LINESTRING (-103.38498 20.69015, -103.38489 20..."


### Find each edge in rutas_edges in original edges_var_gdl

In [5]:
# Change rutas's crs to match edges_var_gdl
rutas_edges_32613 = rutas_edges.copy()
rutas_edges_32613.to_crs("EPSG:32613",inplace=True)

# Reset ruta to have ID
rutas_edges_32613.reset_index(inplace=True,drop=True)
rutas_edges_32613.reset_index(inplace=True)
rutas_edges_32613.rename(columns={'index':'ruta_idx'},inplace=True)

# Show
print(rutas_edges_32613.crs)
print(rutas_edges_32613.shape)
rutas_edges_32613.head(2)

EPSG:32613
(1865, 3)


Unnamed: 0,ruta_idx,code,geometry
0,0,JC1V008,"LINESTRING (668194.953 2288617.514, 668207.517..."
1,1,JC1V008,"LINESTRING (668207.517 2288693.987, 668216.476..."


In [6]:
# Create a small buffer for each ruta
rutas_buff = rutas_edges_32613.buffer(0.1)
# Convert into DataFrame and reset index again to have unique identifier that matches rutas_edges_32613
rutas_buff = pd.DataFrame(rutas_buff)
rutas_buff.reset_index(inplace=True)
# Convert into GeoDataFrame#
rutas_buff.rename(columns={0:'geometry'},inplace=True)
rutas_buff_gdf = gpd.GeoDataFrame(rutas_buff,geometry='geometry',crs='EPSG:32613')
# Rename unique identifier
rutas_buff_gdf.rename(columns={'index':'ruta_idx'},inplace=True)

# Show
print(rutas_buff_gdf.crs)
print(rutas_buff_gdf.shape)
rutas_buff_gdf.head(2)

EPSG:32613
(1865, 2)


Unnamed: 0,ruta_idx,geometry
0,0,"POLYGON ((668207.418 2288694.003, 668207.420 2..."
1,1,"POLYGON ((668216.377 2288748.475, 668216.379 2..."


In [8]:
# Reset original edges from edges_var_gdl to have traceable ID
org_edges = edges_var_gdl.copy()
org_edges.reset_index(inplace=True,drop=True)
org_edges.reset_index(inplace=True)
# Rename unique identifier
org_edges.rename(columns={'index':'org_idx'},inplace=True)

# Create mid points for each original edge
org_edges['mid_point'] = org_edges.interpolate(org_edges.length / 2)
# Assign mid_point to its own gdf and drop column 'mid_point' from org_edges
org_mid_points = org_edges[['org_idx','mid_point']].copy()
org_mid_points.rename(columns={'mid_point':'geometry'},inplace=True)
org_edges.drop(columns=['mid_point'],inplace=True)

# Show
print(org_mid_points.crs)
print(org_mid_points.shape)
org_mid_points.head(2)

EPSG:32613
(197439, 2)


Unnamed: 0,org_idx,geometry
0,0,POINT (676389.972 2291144.488)
1,1,POINT (676407.108 2291161.274)


In [11]:
# Intersect buffer and centroids
join_gdf = gpd.sjoin(org_mid_points, rutas_buff_gdf, how='inner', predicate='within')
join_gdf = join_gdf[['ruta_idx','org_idx']]

# Show
print(join_gdf.shape)
join_gdf.head(2)

(1861, 2)


Unnamed: 0,ruta_idx,org_idx
977,403,977
978,404,978


In [35]:
# There should be 0 duplicated ruta_idx. Else, indicates that one ruta edge found multiple original edges.
print(len(join_gdf.loc[join_gdf.duplicated("ruta_idx")]))

0


In [14]:
org_edges.columns

Index(['org_idx', 'u', 'v', 'key', 'grade_abs', 'ndvi_mean', 'entropy',
       'sidewalk_available', 'no_sidewalk_available', 'banqueta_pct',
       'pobtot_u', 'pobtot_v', 'total_pobtot', 'average_population', 'dens_u',
       'dens_v', 'average_density', 'interdens_u', 'interdens_v',
       'average_intersdens', 'geometry'],
      dtype='object')

In [18]:
# Find routes and assign original edge's data
found_routes = list(join_gdf.ruta_idx.unique())

for found_route in found_routes:
    # Original edge found
    corresponding_org_idx = join_gdf.loc[join_gdf.ruta_idx==found_route]['org_idx'].unique()[0]
    # Original edge's data
    idx_org = org_edges['org_idx']==corresponding_org_idx
    # Transfer data to rutas_edges_32613
    idx = rutas_edges_32613['ruta_idx']==found_route
    rutas_edges_32613.loc[idx,'u'] = str(org_edges.loc[idx_org,'u'].unique()[0])
    rutas_edges_32613.loc[idx,'v'] = str(org_edges.loc[idx_org,'v'].unique()[0])
    rutas_edges_32613.loc[idx,'key'] = str(org_edges.loc[idx_org,'key'].unique()[0])

# Show
print(rutas_edges_32613.shape)
rutas_edges_32613.head(2)

(1865, 6)


Unnamed: 0,ruta_idx,code,geometry,u,v,key
0,0,JC1V008,"LINESTRING (668194.953 2288617.514, 668207.517...",66819495228861751,66820751228869398,0
1,1,JC1V008,"LINESTRING (668207.517 2288693.987, 668216.476...",66820751228869398,66821647228874845,0


In [19]:
# Missing data (GIS)
rutas_edges_32613.loc[rutas_edges_32613.u.isna()]

Unnamed: 0,ruta_idx,code,geometry,u,v,key
382,382,DS2M025,"LINESTRING (665063.333 2288910.066, 665062.305...",,,
383,383,DS2M025,"LINESTRING (665067.072 2288910.147, 665063.333...",,,
387,387,DS2M015,"LINESTRING (665063.333 2288910.066, 665062.305...",,,
388,388,DS2M015,"LINESTRING (665067.072 2288910.147, 665063.333...",,,


In [21]:
# Fixed missing data in GIS
#rutas_edges_32613.to_file(first_folder_path + f"data/processed/volvo/encuestas/03_rutas_en_edges/rutas_edges_id_1.gpkg")

# Read fixed file
rutas_edges_fixed = gpd.read_file(first_folder_path + f"data/processed/volvo/encuestas/03_rutas_en_edges/rutas_edges_id_2.gpkg")

# Show
print(rutas_edges_fixed.crs)
print(rutas_edges_fixed.shape)
rutas_edges_fixed.head(2)

EPSG:32613
(1865, 6)


Unnamed: 0,ruta_idx,code,u,v,key,geometry
0,0,JC1V008,66819495228861751,66820751228869398,0,"LINESTRING (668194.953 2288617.514, 668207.517..."
1,1,JC1V008,66820751228869398,66821647228874845,0,"LINESTRING (668207.517 2288693.987, 668216.476..."


In [23]:
rutas_edges_fixed.loc[rutas_edges_fixed.u.isna()]

Unnamed: 0,ruta_idx,code,u,v,key,geometry


In [31]:
# Change to int for compatibility
rutas_edges_fixed['u'] = rutas_edges_fixed['u'].astype('int')
rutas_edges_fixed['v'] = rutas_edges_fixed['v'].astype('int')
rutas_edges_fixed['key'] = rutas_edges_fixed['key'].astype('int')
# Merge physical variables
rutas_edges_var = pd.merge(rutas_edges_fixed[['code','u','v','key']],edges_var_gdl,on=['u','v','key'])

# Turn into GeoDataFrame
rutas_edges_var = gpd.GeoDataFrame(rutas_edges_var,geometry='geometry',crs="EPSG:32613")

# Show
print(rutas_edges_var.crs)
print(rutas_edges_var.shape)
rutas_edges_var.head(2)                      

EPSG:32613
(1861, 21)


Unnamed: 0,code,u,v,key,grade_abs,ndvi_mean,entropy,sidewalk_available,no_sidewalk_available,banqueta_pct,...,pobtot_v,total_pobtot,average_population,dens_u,dens_v,average_density,interdens_u,interdens_v,average_intersdens,geometry
0,JC1V008,66819495228861751,66820751228869398,0,0.012849,0.15356,1.726807,316.602353,0.0,1.0,...,70.0,172.0,86.0,65.5488,50.632364,58.090582,12.761922,12.153251,12.457586,"MULTILINESTRING ((668194.953 2288617.514, 6682..."
1,MeV1V006,66819495228861751,66820751228869398,0,0.012849,0.15356,1.726807,316.602353,0.0,1.0,...,70.0,172.0,86.0,65.5488,50.632364,58.090582,12.761922,12.153251,12.457586,"MULTILINESTRING ((668194.953 2288617.514, 6682..."


In [36]:
rutas_edges_var.to_file(first_folder_path + f"data/processed/volvo/encuestas/03_rutas_en_edges/rutas_edges_var.gpkg")