In [1]:
import warnings
warnings.filterwarnings('ignore')
import geopandas as gpd
from geopandas.tools import overlay
import matplotlib.pyplot as plt
import os 
import sys
module_path = os.path.abspath(os.path.join('../../'))
if module_path not in sys.path:
    sys.path.append(module_path)
    import aup
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV 
import xgboost as xgb
import random
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, mean_squared_error
#import tensorflow as tf
#from keras.models import Sequential
#from keras.layers import Dense
#from tensorflow.keras.utils import to_categorical

## Espacios Públicos GDL

In [2]:
schema = 'espacios_publicos'
table = 'ep_amg'
gdf_gdl = aup.gdf_from_db(table,schema)

In [10]:
gdf_gdl.shape

(8499, 4)

## Espacios Públicos QRO

In [3]:
schema = 'espacios_publicos'
table = 'ep_qro'
gdf_qro = aup.gdf_from_db(table,schema)

In [11]:
gdf_qro.shape

(4814, 9)

## Espacios Públicos CDMX

In [4]:
schema = 'espacios_publicos'
table = 'ep_cdmx'
gdf_cdmx = aup.gdf_from_db(table,schema)

In [12]:
gdf_cdmx.shape

(11609, 13)

## Juntamos en un dataset

In [5]:
gdf_gdl = gdf_gdl.set_crs("EPSG:4326", allow_override=True)
gdf_qro = gdf_qro.set_crs("EPSG:4326", allow_override=True)
gdf_cdmx = gdf_cdmx.set_crs("EPSG:4326", allow_override=True)
print("CRS of gdf_gdl:", gdf_gdl.crs)
print("CRS of gdf_qro:", gdf_qro.crs)
print("CRS of gdf_cdmx:", gdf_cdmx.crs)

CRS of gdf_gdl: EPSG:4326
CRS of gdf_qro: EPSG:4326
CRS of gdf_cdmx: EPSG:4326


In [7]:
geodataframes = [gdf_gdl, gdf_qro, gdf_cdmx]
gdf = pd.concat(geodataframes)
gdf = gdf['geometry']
gdf.head()

0    POLYGON ((672507.02800 2272132.20130, 672520.1...
1    POLYGON ((672520.12560 2272170.35300, 672507.0...
2    POLYGON ((672494.32530 2272095.20510, 672480.3...
3    POLYGON ((672480.36630 2272060.73140, 672494.3...
4    POLYGON ((672468.71720 2272020.61270, 672456.8...
Name: geometry, dtype: geometry

In [8]:
gdf.shape

(24922,)

## Hexágonos 3 ciudades

In [9]:
hex_schema = "raster_analysis"
hex_table = "ndvi_analysis_hex"
c1 = 'Guadalajara'
c2 ='ZMVM'
c3 = 'Queretaro'
n = '11'
query = f'SELECT hex_id, geometry, city FROM {hex_schema}.{hex_table} WHERE \"city\" IN (\'{c1}\',\'{c2}\',\'{c3}\') AND \"res\" = {n}'
hex_gdl = aup.gdf_from_query(query, geometry_col='geometry')
#query = 'fSelect geometry, hex_id, city from raster_analysis.ndvi_analysis where city = guadalajara and res = 11'
hex_gdl.head(5)

Unnamed: 0,hex_id,geometry,city
0,8b49ab4b2758fff,"POLYGON ((-103.39247 20.75090, -103.39230 20.7...",Guadalajara
1,8b49ab4b270cfff,"POLYGON ((-103.39091 20.74832, -103.39074 20.7...",Guadalajara
2,8b49ab4b2792fff,"POLYGON ((-103.38489 20.74954, -103.38472 20.7...",Guadalajara
3,8b49ab4b2722fff,"POLYGON ((-103.39016 20.74679, -103.38998 20.7...",Guadalajara
4,8b49ab4b26f4fff,"POLYGON ((-103.38863 20.75304, -103.38845 20.7...",Guadalajara


In [29]:
hex_gdl.shape

(807141, 4)

In [24]:
hex_gdl['city'].unique()

array(['Guadalajara', 'Queretaro', 'ZMVM'], dtype=object)

In [18]:
contador_gdll = np.sum(hex_gdl['city'] == 'Guadalajara')
contador_cdmxx = np.sum(hex_gdl['city'] == 'ZMVM')
contador_qroo = np.sum(hex_gdl['city'] == 'Queretaro')
print(f'Número de hexágonos Gdl: {contador_gdll} \nNúmero de hexágonos Cdmx: {contador_cdmxx} \nNúmero de hexágonos Qro: {contador_qroo}')

Número de hexágonos Gdl: 407141 
Número de hexágonos Cdmx: 300000 
Número de hexágonos Qro: 100000


In [10]:
hex_gdl = hex_gdl.set_crs("EPSG:4326", allow_override=True)
print("CRS of gdf_gdl:", gdf_gdl.crs)

CRS of gdf_gdl: EPSG:4326


Como tenemos 800,000 hexágonos tenemos que reducir la dimensión a 400,000 porque mi compu no puede con tantos datos. Vamos a reducir la dimensión de las 3 ciudades a la mitad. Inicialmente, tenemos que Guadalajara tiene 400,000 hexágonos. Cdmx tiene 300,000 hexágonos y Querétaro tiene 100,000 hexágonos. Pero después de reducir las dimensiones serán 200,000; 150,000 y 50,000 respectivamente.

In [19]:
gdl_samples = contador_gdll/2
cdmx_samples = contador_cdmxx/2
qro_samples = contador_qroo/2

# Randomly sample 200,000 hexagons of Guadalajra
sample_with_Guadalajara = hex_gdl[hex_gdl['city'] == 'Guadalajara'].sample(n=gdl_samples, random_state=42)

# Randomly sample 150,000 hexagons of Cdmx
sample_with_Cdmx = hex_gdl[hex_gdl['city'] == 'ZMVM'].sample(n=cdmx_samples, random_state=42)

# Randomly sample 50,000 hexagons of Queretaro
sample_with_Queretaro = hex_gdl[hex_gdl['city'] == 'Queretaro'].sample(n=qro_samples, random_state=42)

# Concatenate the two samples to get the final subsample
hex_gdf = pd.concat([sample_with_Guadalajara, sample_with_Cdmx, sample_with_Queretaro])
hex_gdf.reset_index(drop = True, inplace = True)
hex_gdf.head()

Unnamed: 0,hex_id,geometry,city
0,8b49aa2f0943fff,"POLYGON ((-103.29497 20.46059, -103.29479 20.4...",Guadalajara
1,8b498c94cd48fff,"POLYGON ((-103.35049 20.70770, -103.35032 20.7...",Guadalajara
2,8b49ab594ae1fff,"POLYGON ((-103.41820 20.60143, -103.41803 20.6...",Guadalajara
3,8b49ab5b2511fff,"POLYGON ((-103.40008 20.57395, -103.39991 20.5...",Guadalajara
4,8b498c955971fff,"POLYGON ((-103.27315 20.67382, -103.27297 20.6...",Guadalajara


In [20]:
contador_gdl = np.sum(hex_gdf['city'] == 'Guadalajara')
contador_cdmx = np.sum(hex_gdf['city'] == 'ZMVM')
contador_qro = np.sum(hex_gdf['city'] == 'Queretaro')
print(f'Número de hexágonos Gdl: {contador_gdl} \nNúmero de hexágonos Cdmx: {contador_cdmx} \nNúmero de hexágonos Qro: {contador_qro}')

Número de hexágonos Gdl: 200000 
Número de hexágonos Cdmx: 150000 
Número de hexágonos Qro: 50000


## Dummy Variables
1 = Hexágonos que cuentan con espacios públicos

0 = Hexágonos que no cuentan con espacios públicos

In [21]:
gdf_sindex = gdf.sindex

# Create a Series of boolean values indicating which hexagons intersect with public spaces
intersects_public_spaces = hex_gdf['geometry'].apply(lambda geom: any(gdf_sindex.query(geom)))

hex_gdf['EspPublico'] = intersects_public_spaces.astype(int)
hex_gdf.head()

Unnamed: 0,hex_id,geometry,city,EspPublico
0,8b49aa2f0943fff,"POLYGON ((-103.29497 20.46059, -103.29479 20.4...",Guadalajara,0
1,8b498c94cd48fff,"POLYGON ((-103.35049 20.70770, -103.35032 20.7...",Guadalajara,0
2,8b49ab594ae1fff,"POLYGON ((-103.41820 20.60143, -103.41803 20.6...",Guadalajara,0
3,8b49ab5b2511fff,"POLYGON ((-103.40008 20.57395, -103.39991 20.5...",Guadalajara,0
4,8b498c955971fff,"POLYGON ((-103.27315 20.67382, -103.27297 20.6...",Guadalajara,0


In [22]:
contador_ceros = np.sum(hex_gdf['EspPublico'] == 0)
contador_uno = np.sum(hex_gdf['EspPublico'])
print(f'Números de hexagonos sin Espacios Públicos: {contador_ceros} \nNúmero de Hexágonos con Espacios Públicos: {contador_uno}\nNúmero total de Hexágonos: {contador_ceros+contador_uno}')

Números de hexagonos sin Espacios Públicos: 353125 
Número de Hexágonos con Espacios Públicos: 46875
Número total de Hexágonos: 400000


## Hexágonos con info de humedad, vegetación y temperatura

select {ndmi}.hex_id, {ndmi}.geometry, {ndmi}.city, {ndmi}.res,

ndvi_mean, ndvi_std, ndvi_median, ndvi_max, ndvi_min, ndvi_diff, ndvi_tend,

ndmi_mean, ndmi_median, ndmi_diff

from raster_analysis.ndmi_analysis_hex ndmi

inner join raster_analysis.ndvi_analysis_hex ndvi

on ndmi.hex_id = ndvi.hex_id 

where ndmi.city  in ('Queretaro','Guadalajara','ZMVM') and ndmi.res = 11

In [22]:
hex_schema = "raster_analysis"
hex_tablee = "ndvi_analysis_hex"
hex_table = "ndmi_analysis_hex"
ndmi = "ndmi"
ndvi = "ndvi"
hex_id = "hex_id"
c1 = 'Guadalajara'
c2 ='ZMVM'
c3 = 'Queretaro'
n = '11'
query = f'SELECT {ndmi}.hex_id, {ndmi}.geometry, {ndmi}.city, {ndmi}.res, ndvi_mean, ndvi_std, ndvi_median, ndvi_max, ndvi_min, ndvi_diff, ndvi_tend,ndmi_mean, ndmi_median, ndmi_diff FROM {hex_schema}.{hex_table} {ndmi} INNER JOIN {hex_schema}.{hex_tablee} {ndvi} ON {ndmi}.{hex_id} = {ndvi}.{hex_id} WHERE {ndmi}.\"city\" IN (\'{c1}\',\'{c2}\',\'{c3}\') AND {ndmi}.\"res\" = {n}'
intt_gdl = aup.gdf_from_query(query, geometry_col='geometry')
intt_gdl.head(10)


KeyboardInterrupt


KeyboardInterrupt



In [33]:
intt_gdl.shape

(799837, 14)

In [34]:
intt_gdl['city'].unique()

array(['Guadalajara', 'Queretaro', 'ZMVM'], dtype=object)

In [None]:
intt_gdll = np.sum(intt_gdl['city'] == 'Guadalajara')
intt_cdmxx = np.sum(intt_gdl['city'] == 'ZMVM')
intt_qroo = np.sum(intt_gdl['city'] == 'Queretaro')
print(f'Número de hexágonos Gdl: {intt_gdll} \nNúmero de hexágonos Cdmx: {intt_cdmxx} \nNúmero de hexágonos Qro: {intt_qroo}')

Como tenemos 800,000 hexágonos tenemos que reducir la dimensión a 400,000 porque mi compu no puede con tantos datos. Vamos a reducir la dimensión de las 3 ciudades a la mitad. Inicialmente, tenemos que Guadalajara tiene 400,000 hexágonos. Cdmx tiene 300,000 hexágonos y Querétaro tiene 100,000 hexágonos. Pero después de reducir las dimensiones serán 200,000; 150,000 y 50,000 respectivamente.

In [None]:
gdl_samples_int = intt_gdll/2
cdmx_samples_int = intt_cdmxx/2
qro_samples_int = intt_qroo/2

# Randomly sample 200,000 hexagons of Guadalajra
sample_with_Guadalajara = intt_gdl[intt_gdl['city'] == 'Guadalajara'].sample(n=gdl_samples_int, random_state=42)

# Randomly sample 150,000 hexagons of Cdmx
sample_with_Cdmx = intt_gdl[intt_gdl['city'] == 'ZMVM'].sample(n=cdmx_samples_int, random_state=42)

# Randomly sample 50,000 hexagons of Queretaro
sample_with_Queretaro = intt_gdl[intt_gdl['city'] == 'Queretaro'].sample(n=qro_samples_int, random_state=42)

# Concatenate the two samples to get the final subsample
int_gdf = pd.concat([sample_with_Guadalajara, sample_with_Cdmx, sample_with_Queretaro])
int_gdf.reset_index(drop = True, inplace = True)
int_gdf.head()

In [None]:
int_gdf.shape

Ahora que ya tenemos dimensiones más pequeñas ya podemos hacer la intersección.

## Intersección

In [13]:
int_gdf = int_gdf.set_crs("EPSG:4326")
hex_gdf = hex_gdf.set_crs("EPSG:4326")

In [None]:
inter = gpd.sjoin(hex_gdf, int_gdf, op='intersects')

In [None]:
inter.drop(columns=['index_right','city_right','res','hex_id_right'], inplace=True)
inter.rename(columns={"hex_id_left": "hex_id","city_left": "city"}, inplace=True)
inter.head()

In [None]:
pd.isna(inter).sum()