# Missing NDVI and/or temperature data in cities

This notebook identifies __(as of 2025-09-23) the cities were NDVI and/or temperature have not been yet processed and the amount of tiles (rasters) that comprise the area of interest__ in order to try the new raster download method (Joining the all the available images in a month when specific dates fail).

## __Import libraries__

In [1]:
from pathlib import Path
current_path = Path().resolve()
for parent in current_path.parents:
    if parent.name == "accesibilidad-urbana":
        module_path = str(parent)+'/'
        break
print(module_path)

/home/jovyan/accesibilidad-urbana/


In [2]:
import os
import sys

import pandas as pd
import geopandas as gpd
import osmnx as ox
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

if module_path not in sys.path:
    sys.path.append(module_path)
import aup

## __Load all cities__

In [3]:
# Read all cities
metro_gdf = aup.gdf_from_db('metro_gdf_2020', 'metropolis')
metro_gdf_lst = metro_gdf.city.unique()

# Show
print(len(metro_gdf_lst))
metro_gdf_lst

71


array(['Aguascalientes', 'Ensenada', 'Mexicali', 'Tijuana', 'La Paz',
       'Los Cabos', 'Campeche', 'Laguna', 'Monclova', 'Piedras Negras',
       'Saltillo', 'Colima', 'Tapachula', 'Tuxtla', 'Chihuahua',
       'Delicias', 'Juarez', 'CDMX', 'ZMVM', 'Durango', 'Celaya',
       'Guanajuato', 'Leon', 'Irapuato', 'Acapulco', 'Chilpancingo',
       'Pachuca', 'Tulancingo', 'Guadalajara', 'Vallarta', 'Piedad',
       'Toluca', 'Morelia', 'Zamora', 'Uruapan', 'Cuautla', 'Cuernavaca',
       'Tepic', 'Monterrey', 'Oaxaca', 'Puebla', 'San Martin', 'Tehuacan',
       'Queretaro', 'Cancun', 'Chetumal', 'Playa', 'SLP', 'Culiacan',
       'Los Mochis', 'Mazatlan', 'Guaymas', 'Ciudad Obregon',
       'Hermosillo', 'Nogales', 'Villahermosa', 'Victoria', 'Matamoros',
       'Nuevo Laredo', 'Reynosa', 'Tampico', 'Tlaxcala', 'Coatzacoalcos',
       'Cordoba', 'Minatitlan', 'Orizaba', 'Poza Rica', 'Veracruz',
       'Xalapa', 'Merida', 'Zacatecas'], dtype=object)

## __Identify already processed cities__

In [4]:
# NDVI
query = f"SELECT city FROM raster_analysis.ndvi_analysis_hex WHERE \"res\" = \'{8}\'"
processed_ndvi_city_lst = aup.df_from_query(query)
processed_ndvi_city_lst = list(processed_ndvi_city_lst.city.unique())

# Show
print(len(processed_ndvi_city_lst))
processed_ndvi_city_lst

49


['Acapulco',
 'Aguascalientes',
 'CDMX',
 'Chihuahua',
 'Chilpancingo',
 'Ciudad Obregon',
 'Colima',
 'Cuautla',
 'Cuernavaca',
 'Delicias',
 'Durango',
 'Ensenada',
 'Guadalajara',
 'Hermosillo',
 'Juarez',
 'Laguna',
 'La Paz',
 'Los Cabos',
 'Los Mochis',
 'Matamoros',
 'Mazatlan',
 'Mexicali',
 'Monclova',
 'Monterrey',
 'Nogales',
 'Nuevo Laredo',
 'Oaxaca',
 'Pachuca',
 'Piedras Negras',
 'Poza Rica',
 'Puebla',
 'Queretaro',
 'Reynosa',
 'San Martin',
 'Tapachula',
 'Tehuacan',
 'Tepic',
 'Tijuana',
 'Tlaxcala',
 'Toluca',
 'Tulancingo',
 'Uruapan',
 'Vallarta',
 'Victoria',
 'Villahermosa',
 'Xalapa',
 'Zacatecas',
 'Zamora',
 'ZMVM']

In [5]:
# Temperature
query = f"SELECT city FROM raster_analysis.temperature_analysis_hex WHERE \"res\" = \'{8}\'"
processed_temp_city_lst = aup.df_from_query(query)
processed_temp_city_lst = list(processed_temp_city_lst.city.unique())

# Show
print(len(processed_temp_city_lst))
processed_temp_city_lst

51


['Acapulco',
 'Aguascalientes',
 'CDMX',
 'Chihuahua',
 'Chilpancingo',
 'Ciudad Obregon',
 'Colima',
 'Cordoba',
 'Cuautla',
 'Cuernavaca',
 'Delicias',
 'Durango',
 'Guadalajara',
 'Guanajuato',
 'Guaymas',
 'Irapuato',
 'Juarez',
 'Laguna',
 'La Paz',
 'Leon',
 'Los Cabos',
 'Matamoros',
 'Mazatlan',
 'Merida',
 'Minatitlan',
 'Monterrey',
 'Nuevo Laredo',
 'Oaxaca',
 'Orizaba',
 'Pachuca',
 'Piedad',
 'Piedras Negras',
 'Puebla',
 'Queretaro',
 'Saltillo',
 'San Martin',
 'SLP',
 'Tapachula',
 'Tehuacan',
 'Tepic',
 'Tijuana',
 'Toluca',
 'Tulancingo',
 'Tuxtla',
 'Vallarta',
 'Veracruz',
 'Victoria',
 'Villahermosa',
 'Zacatecas',
 'Zamora',
 'ZMVM']

## __Identify cities missing processing__

In [6]:
# Summary df
df_missing = pd.DataFrame()
df_missing['city'] = ''
df_missing['ndvi'] = 0
df_missing['temp'] = 0

# Identification of missing cities
ndvi_diff = [x for x in metro_gdf_lst if x not in processed_ndvi_city_lst]
temp_diff = [x for x in metro_gdf_lst if x not in processed_temp_city_lst]

# Missing cities annotation
i = 0
for city in ndvi_diff:
    df_missing.loc[i,'city'] = city
    df_missing.loc[i,'ndvi'] = 1
    i+=1

for city in temp_diff:
    if city in ndvi_diff:
        df_missing.loc[df_missing.city==city,'temp'] = 1
    else:
        df_missing.loc[i,'city'] = city
        df_missing.loc[i,'temp'] = 1
        i+=1

# Show
print(df_missing.shape)
df_missing.head(2)

(33, 3)


Unnamed: 0,city,ndvi,temp
0,Campeche,1.0,1.0
1,Saltillo,1.0,


## __Count number of tiles that comprise each city's area of interest__

In [7]:
# Simplification extracted from function aup.download_raster_from_pc():
def analyse_raster_availability(gdf, projection_crs, start_date, end_date, freq, query, satellite, band_name_dict):

    # Set problem_type to none (Changes if high %missing_months or multiple missing months together)
    problem_type = 'No problem'

    # Create area of interest coordinates from hexagons to download raster data
    #print('Extracting bounding coordinates from hexagons')
    # Create buffer around hexagons
    poly = gdf.to_crs(projection_crs).buffer(500)
    poly = poly.to_crs("EPSG:4326")
    poly = gpd.GeoDataFrame(geometry=poly).dissolve().geometry
    # Extract coordinates from polygon as DataFrame
    coord_val = poly.bounds
    # Get coordinates for bounding box
    n = coord_val.maxy.max()
    s = coord_val.miny.min()
    e = coord_val.maxx.max()
    w = coord_val.minx.min()

    # Set the coordinates for the area of interest
    area_of_interest = {
        "type": "Polygon",
        "coordinates": [
            [
                [e, s],
                [w, s],
                [w, n],
                [e, n],
                [e, s],
            ]
        ],
    }

    # Create time of interest (Creates a list for all to-be-analysed-months with structure [start_day/end_day,(...)])
    #print('Defining time of interest')
    time_of_interest = aup.create_time_of_interest(start_date, end_date, freq=freq)
    # Gather items for time and area of interest (Creates of list of available image items)
    #print('Gathering items for time and area of interest')
    items = aup.gather_items(time_of_interest, area_of_interest, query=query, satellite=satellite)
    #print(f'Fetched {len(items)} items')

    # Count available tiles for area of interest (Creates a list of available tiles, inside create_raster_by_month() logs available tiles per date vs total of area of interest)
    aoi_tiles = []
    for i in items:
        # Retrieve current tile
        if satellite == "sentinel-2-l2a":
            tile = i.properties['s2:mgrs_tile']
        elif satellite == "landsat-c2-l2":
            tile = i.properties['landsat:wrs_path']+i.properties['landsat:wrs_row']
        # Append if first find
        if tile not in aoi_tiles:
            aoi_tiles.append(tile)

    #print('Checking available tiles for area of interest')
    date_list = aup.available_datasets(items, satellite, query)
    
    # Create dictionary from links (assets_hrefs is a dict. of dates and links with structure {available_date:{band_n:[link]}})
    band_name_list = list(band_name_dict.keys())[:-1]
    assets_hrefs = aup.link_dict(band_name_list, items, date_list)
    #print('Created dictionary from items')

    # Analyze available data according to raster properties (Creates df_len for the first time)
    df_len, missing_months = aup.df_date_links(assets_hrefs, start_date, end_date,
                                               band_name_list, freq)

    # Extracted from function aup.available_data_check():
    pct_limit=50
    window_limit=6
    
    pct_missing = round(missing_months/len(df_len),2)*100
    if pct_missing >= pct_limit:
        problem_type = 'more_than_50%_missing'
    df_rol = df_len['data_id'].rolling(window_limit).sum()
    # If any rolling window has a sum of 0, it means there are multiple missing months together
    if (df_rol == 0).any():
        problem_type = 'multiple_missing_together'
    del df_rol
    
    return aoi_tiles, problem_type, pct_missing

In [8]:
start_date = '2018-01-01'
end_date = '2023-12-31'
freq = 'MS'
res = [8,11]
projection_crs="EPSG:6372"
satellite = 'landsat-c2-l2'
sat_query = {"eo:cloud_cover": {"lt": 15},
             "platform": {"in": ["landsat-8", "landsat-9"]}}

# For ndvi
ndvi_band_name_dict = {'nir08':[False], #If GSD(resolution) of band is different, set True.
                       'red':[False], #If GSD(resolution) of band is different, set True.
                       'eq':['(nir08-red)/(nir08+red)']}
# For temperature
temp_band_name_dict = {'lwir11':[False],
                       'eq':["((lwir11*0.00341802) + 149.0)-273.15"]}

# Tests
city_test = ['Campeche']
#for city in city_test:
# All
for city in df_missing.city.unique():

    # ------------------------------ CREATE AREA OF INTEREST ------------------------------
    print(f"{city} - Creating area of interest.")
    ### Create city area of interest with biggest hexs
    big_res = min(res)
    schema_hex = 'hexgrid'
    table_hex = f'hexgrid_{big_res}_city_2020'
    
    # Download hexagons with hex_type --> urban
    hex_type = 'urban'
    query = f"SELECT hex_id_{big_res},geometry FROM {schema_hex}.{table_hex} WHERE \"city\" = '{city}\' AND \"type\" = '{hex_type}\'"
    hex_urban = aup.gdf_from_query(query, geometry_col='geometry')
    
    # Download hexagons with hex_type --> rural within 500m buffer
    poly = hex_urban.to_crs("EPSG:6372").buffer(500).reset_index()
    poly = poly.to_crs("EPSG:4326")
    poly_wkt = poly.dissolve().geometry.to_wkt()[0]
    hex_type = 'rural'
    query = f"SELECT hex_id_{big_res},geometry FROM {schema_hex}.{table_hex} WHERE \"city\" = '{city}\' AND \"type\" = '{hex_type}\' AND (ST_Intersects(geometry, \'SRID=4326;{poly_wkt}\'))"
    hex_rural = aup.gdf_from_query(query, geometry_col='geometry')
    
    # Concatenate urban and rural hex
    hex_city = pd.concat([hex_urban, hex_rural])

    # ------------------------------ ANALYSE RASTER AVAILABILITY AND REGISTER RESULTS ------------------------------ 
    print(f"{city} - Analysing raster availability for NDVI.")
    # Apply simplified function to count analyse NDVI tiles if missing NDVI in current city
    try:
        if df_missing.loc[df_missing.city==city,'ndvi'].unique()[0] == 1:
            aoi_tiles, problem_type, pct_missing = analyse_raster_availability(hex_city,
                                                                  projection_crs,
                                                                  start_date,
                                                                  end_date,
                                                                  freq,
                                                                  sat_query,
                                                                  satellite,
                                                                  band_name_dict = ndvi_band_name_dict
                                                                 )
            # Register NDVI analysed data
            df_missing.loc[df_missing.city==city,'ndvi_tiles'] = len(aoi_tiles)
            df_missing.loc[df_missing.city==city,'ndvi_problem'] = problem_type
            df_missing.loc[df_missing.city==city,'ndvi_%missing'] = pct_missing
    except:
        # Register other problem
        print(f"ERROR ANALYSING NDVI IN CITY {city}.")
        df_missing.loc[df_missing.city==city,'ndvi_tiles'] = 99
        df_missing.loc[df_missing.city==city,'ndvi_problem'] = 'other problem'
        df_missing.loc[df_missing.city==city,'ndvi_%missing'] = 99
    
    # Apply simplified function to count analyse TEMPERATURE tiles if missing TEMPERATURE in current city
    try:
        if df_missing.loc[df_missing.city==city,'temp'].unique()[0] == 1:
            print(f"{city} - Analysing raster availability for TEMPERATURE.")
            aoi_tiles, problem_type, pct_missing = analyse_raster_availability(hex_city,
                                                                  projection_crs,
                                                                  start_date,
                                                                  end_date,
                                                                  freq,
                                                                  sat_query,
                                                                  satellite,
                                                                  band_name_dict = temp_band_name_dict
                                                                 )
            # Register TEMPERATURE analysed data
            df_missing.loc[df_missing.city==city,'temp_tiles'] = len(aoi_tiles)
            df_missing.loc[df_missing.city==city,'temp_problem'] = problem_type
            df_missing.loc[df_missing.city==city,'temp_%missing'] = pct_missing
    except:
        # Register other problem
        print(f"ERROR ANALYSING TEMPERATURE IN CITY {city}.")
        df_missing.loc[df_missing.city==city,'temp_tiles'] = 99
        df_missing.loc[df_missing.city==city,'temp_problem'] = 'other problem'
        df_missing.loc[df_missing.city==city,'temp_%missing'] = 99

# Show
print(df_missing.shape)
df_missing.head(2)

Campeche - Creating area of interest.
Campeche - Analysing raster availability for NDVI.
Campeche - Analysing raster availability for TEMPERATURE.
Saltillo - Creating area of interest.
Saltillo - Analysing raster availability for NDVI.
Tuxtla - Creating area of interest.
Tuxtla - Analysing raster availability for NDVI.
Celaya - Creating area of interest.
Celaya - Analysing raster availability for NDVI.
Celaya - Analysing raster availability for TEMPERATURE.
Guanajuato - Creating area of interest.
Guanajuato - Analysing raster availability for NDVI.
Leon - Creating area of interest.
Leon - Analysing raster availability for NDVI.
Irapuato - Creating area of interest.
Irapuato - Analysing raster availability for NDVI.
Piedad - Creating area of interest.
Piedad - Analysing raster availability for NDVI.
Morelia - Creating area of interest.
Morelia - Analysing raster availability for NDVI.
Morelia - Analysing raster availability for TEMPERATURE.
Cancun - Creating area of interest.
Cancun - A

Unnamed: 0,city,ndvi,temp,ndvi_tiles,ndvi_problem,ndvi_%missing,temp_tiles,temp_problem,temp_%missing
0,Campeche,1.0,1.0,4.0,No problem,10.0,4.0,No problem,10.0
1,Saltillo,1.0,,4.0,No problem,12.0,,,


In [9]:
df_missing

Unnamed: 0,city,ndvi,temp,ndvi_tiles,ndvi_problem,ndvi_%missing,temp_tiles,temp_problem,temp_%missing
0,Campeche,1.0,1.0,4.0,No problem,10.0,4.0,No problem,10.0
1,Saltillo,1.0,,4.0,No problem,12.0,,,
2,Tuxtla,1.0,,2.0,No problem,40.0,,,
3,Celaya,1.0,1.0,4.0,No problem,10.0,4.0,No problem,10.0
4,Guanajuato,1.0,,4.0,No problem,22.0,,,
5,Leon,1.0,,4.0,No problem,22.0,,,
6,Irapuato,1.0,,4.0,No problem,22.0,,,
7,Piedad,1.0,,2.0,No problem,39.0,,,
8,Morelia,1.0,1.0,3.0,No problem,18.0,3.0,No problem,18.0
9,Cancun,1.0,1.0,3.0,No problem,11.0,99.0,other problem,99.0
