In [1]:
import os
import ee
import geemap
import socket
import pandas as pd
import numpy as np
import geopandas as gpd
from shapely.geometry import box 
from shapely.geometry import Polygon
import re # regular expressions
import folium
from folium import GeoJson

def setup_directories(root=None):
    """Resolve the project data directories and create any that are missing.

    Parameters
    ----------
    root : str or os.PathLike, optional
        Data root directory. When omitted, the root is picked from the
        machine's hostname; only the laptop "oMac.local" is recognised.

    Returns
    -------
    dict
        Paths keyed by 'root', 'raw', 'processed' and 'fig'. 'fig' points at
        the "figures" folder next to the data root and is returned
        un-normalised (it still contains the "..").

    Raises
    ------
    Exception
        If no root is given and the hostname is not a known machine.
    """
    if root is None:
        # check if we are on the server or local
        nodename = socket.gethostname()
        if nodename == "oMac.local": # local laptop
            root = os.path.expanduser("~/OneDrive - The University of Chicago/guatamala_ag/data")
        else:
            raise Exception("Unknown environment, Please specify the root directory")
    else:
        # Explicit override: lets the notebook run on machines other than the known laptop
        root = os.path.expanduser(root)

    dirs = {
        'root': root,
        'raw': os.path.join(root, "raw"),
        'processed': os.path.join(root, "processed"),
        'fig': os.path.join(root, "../figures")
    }

    # 'root' is listed first, so it exists before the "../figures" path is resolved
    for path in dirs.values():
        os.makedirs(path, exist_ok=True)

    return dirs

# Resolve (and create, if needed) the project directories once for the whole notebook.
# NOTE: the name `dir` shadows Python's built-in dir() from here on; later cells
# index it as dir['raw'], dir['processed'] and dir["fig"].
dir = setup_directories()

## Clean coordinate data and convert to a GeoDataFrame

In [2]:
def add_decimal_if_missing(coord):
    """Insert a decimal point after the first two digits of an all-digit coordinate.

    Strings made up only of digits (optionally with '-' characters) are assumed
    to have lost their decimal point; a leading '-' is kept in front of the
    result. Anything else, including non-string values, is returned unchanged.
    """
    if not isinstance(coord, str):
        return coord
    if not coord.replace('-', '').isdigit():
        return coord

    # Peel off a leading sign so the same two-digit split serves both cases
    if coord.startswith('-'):
        sign, digits = '-', coord[1:]
    else:
        sign, digits = '', coord
    return f"{sign}{digits[:2]}.{digits[2:]}"

def is_valid_guatemala_coordinate(lat, lon):
    """Tell whether (lat, lon) lies inside an approximate bounding box for Guatemala.

    The box spans latitudes 13.1 to 18.2 and longitudes -93 to -88.0, bounds
    included. NaN in either coordinate yields a falsy result.
    """
    lat_in_range = 13.1 <= lat <= 18.2
    if not lat_in_range:
        # Longitude is deliberately not inspected once latitude has failed
        return lat_in_range
    return -93 <= lon <= -88.0

def fix_known_coordinate_issues(df):
    """
    Overwrite a fixed list of known-bad coordinate values with their corrections.

    Values are matched approximately (np.isclose) rather than by exact equality.
    The frame is modified in place and also returned; one line is printed for
    every correction that matched at least one row.
    """
    # (column, value as recorded, value it should be)
    corrections = [
        ('longitude_4', 90.366662, -90.366662),
        ('latitude_1', 5.267439, 15.267439),
        ('longitude_1', -16.352620, -90.1635620),
    ]

    for col, incorrect_value, correct_value in corrections:
        mask = np.isclose(df[col], incorrect_value, rtol=1e-5, atol=1e-8)
        if not mask.any():
            continue
        df.loc[mask, col] = correct_value
        print(f"Fixed {mask.sum()} occurrences of approximately {incorrect_value} to {correct_value} in {col}")

    return df

def split_coordinates(coord_str):
    """Split one raw "latitude, longitude" cell into its two parts.

    Parameters
    ----------
    coord_str : str
        Raw cell content. Missing values (None / NaN) are accepted and give an
        empty result; other non-string values are converted with str().

    Returns
    -------
    pandas.Series
        Index 'latitude' and 'longitude'. Both values are strings (passed
        through add_decimal_if_missing), or empty strings when the cell is
        missing or cannot be split. A message is printed in the latter case.
    """
    # Existing manual fixes
    manual_fixes = {
        "16,3870407, -89,7345351": "16.3870407, -89.7345351",
        "14.177150.3,-90.3989608": "14.1771503, -90.3989608",
        "16,3869101, -89,7344694": "16.3869101, -89.7344694",
        "14.141996-90-147208": "14.141996, -90.147208",
        "16,3871767, -89,7348127": "16.3871767, -89.7348127",
        "16,3869863, -89,7349780": "16.3869863, -89.7349780"
    }

    # Empty spreadsheet cells arrive as NaN/None rather than as strings
    if not isinstance(coord_str, str):
        if coord_str is None or (pd.api.types.is_scalar(coord_str) and pd.isna(coord_str)):
            return pd.Series({'latitude': '', 'longitude': ''})
        coord_str = str(coord_str)

    # Look the value up as-is first, then with surrounding whitespace trimmed
    if coord_str in manual_fixes:
        coord_str = manual_fixes[coord_str]
    elif coord_str.strip() in manual_fixes:
        coord_str = manual_fixes[coord_str.strip()]

    # Remove any quotation marks and leading/trailing whitespace
    cleaned = coord_str.strip().strip('"').strip()

    separated = r'^([-]?\d+\.?\d*)[,\s]+([-]?\d+\.?\d*)$'   # Comma or space separated
    period_joined = r'^([-]?\d+\.?\d*)\.([-]?\d+\.?\d*)$'   # Period separated
    sign_joined = r'^(\d+\.?\d*)(-\d+\.?\d*)$'              # No separator with negative longitude

    # The separated pattern must see the text while whitespace is still there,
    # otherwise a space-only separator is lost and the two numbers run together.
    # Only afterwards is all whitespace removed and every pattern tried again.
    compact = re.sub(r'\s', '', cleaned)
    attempts = [
        (separated, cleaned),
        (separated, compact),
        (period_joined, compact),
        (sign_joined, compact),
    ]

    for pattern, text in attempts:
        match = re.match(pattern, text)
        if match:
            lat, lon = match.group(1), match.group(2)
            lat = add_decimal_if_missing(lat)
            lon = add_decimal_if_missing(lon)
            return pd.Series({'latitude': lat, 'longitude': lon})

    # If we couldn't split it, return empty strings
    print(f"Could not split coordinates: {coord_str}")
    return pd.Series({'latitude': '', 'longitude': ''})

# Load the producer survey, keeping the phone id and the four corner-coordinate columns
survey_path = os.path.join(dir['raw'], "Datos de Impacto Productores 2023.xlsx")
df = pd.read_excel(survey_path, sheet_name=0, skiprows=4)
vars_to_keep = ["id_phone"] + [f"id_coordinates_{i}" for i in range(1, 5)]
# rows without a first coordinate are dropped
df = df[vars_to_keep].dropna(subset=["id_coordinates_1"])


# Split each raw coordinate cell into numeric latitude/longitude and flag
# whether the pair falls inside Guatemala's approximate bounding box
for i in range(1, 5):
    lat_col, lon_col = f'latitude_{i}', f'longitude_{i}'
    parsed = df[f'id_coordinates_{i}'].apply(split_coordinates)
    df[lat_col] = pd.to_numeric(parsed['latitude'], errors='coerce')
    df[lon_col] = pd.to_numeric(parsed['longitude'], errors='coerce')
    df[f'valid_coordinate_{i}'] = df.apply(
        lambda row: is_valid_guatemala_coordinate(row[lat_col], row[lon_col]),
        axis=1
    )

# Apply the hand-maintained corrections, then recompute the validity flags
df = fix_known_coordinate_issues(df)

for i in range(1, 5):
    lat_col, lon_col = f'latitude_{i}', f'longitude_{i}'
    df[f'valid_coordinate_{i}'] = df.apply(
        lambda row: is_valid_guatemala_coordinate(row[lat_col], row[lon_col]),
        axis=1
    )

# Report whatever is still outside the bounding box, one coordinate slot at a time
for i in range(1, 5):
    invalid_coords = df.loc[~df[f'valid_coordinate_{i}'], [f'latitude_{i}', f'longitude_{i}']]
    if invalid_coords.empty:
        continue
    print(f"\nRemaining invalid coordinates for id_coordinates_{i}:")
    print(invalid_coords)

# A row counts as fully valid only when all four of its corners are valid
all_valid = df[[f'valid_coordinate_{i}' for i in range(1, 5)]].all(axis=1)
print(f"\nTotal rows with all valid coordinates: {all_valid.sum()} out of {len(df)}")

# Save the processed data
processed_path = os.path.join(dir['processed'], "coordinates_processed.csv")
df.to_csv(processed_path, index=False)
print("\nData processing complete. Results saved to 'coordinates_processed.csv'.")

Fixed 1 occurrences of approximately 90.366662 to -90.366662 in longitude_4
Fixed 1 occurrences of approximately 5.267439 to 15.267439 in latitude_1
Fixed 1 occurrences of approximately -16.35262 to -90.163562 in longitude_1

Total rows with all valid coordinates: 125 out of 125

Data processing complete. Results saved to 'coordinates_processed.csv'.


  warn(msg)


In [3]:
# create lat and lon min and max columns
# Per-row extent of the four recorded corners
lat_cols = [f'latitude_{i}' for i in range(1, 5)]
lon_cols = [f'longitude_{i}' for i in range(1, 5)]
df['lat_min'] = df[lat_cols].min(axis=1)
df['lat_max'] = df[lat_cols].max(axis=1)
df['lon_min'] = df[lon_cols].min(axis=1)
df['lon_max'] = df[lon_cols].max(axis=1)

# Function to create a polygon from min/max coordinates (an axis-aligned bounding box).
# The jury is still out on whether this or the four-corner polygon (commented out below) is better.
def create_polygon(row):
    """Build the rectangle spanned by a row's lon_min/lat_min and lon_max/lat_max."""
    min_x, min_y = row['lon_min'], row['lat_min']
    max_x, max_y = row['lon_max'], row['lat_max']
    return box(min_x, min_y, max_x, max_y)

# Alternative Function to create a polygon from coordinates
# def create_polygon(row):
#     coords = [
#         (float(row['longitude_1']), float(row['latitude_1'])),
#         (float(row['longitude_2']), float(row['latitude_2'])),
#         (float(row['longitude_3']), float(row['latitude_3'])),
#         (float(row['longitude_4']), float(row['latitude_4'])),
#         (float(row['longitude_1']), float(row['latitude_1']))  # Close the polygon
#     ]
#     return Polygon(coords)


# Create the geometry column
# One rectangle per row, stored on the frame as its geometry column
parcel_shapes = df.apply(create_polygon, axis=1)
df['geometry'] = parcel_shapes

# Wrap the frame as a GeoDataFrame; the coordinates are plain lon/lat (EPSG:4326)
gdf = gpd.GeoDataFrame(df, crs="EPSG:4326", geometry='geometry')

# Function to calculate area in square meters
def calculate_area(geometry, lat):
    """Compute the area of a lon/lat geometry after projecting it locally.

    The geometry is treated as EPSG:4326 and reprojected to an azimuthal
    equidistant projection centred on (`lat`, the geometry centroid's
    longitude); the area of the projected shape is returned. The caller in
    this notebook stores the result as square meters.

    Returns None, after printing a diagnostic, when the projected area is NaN.
    """
    centroid_lon = geometry.centroid.x

    # The run of spaces in the middle keeps the PROJ string byte-identical to
    # the one this function has always produced
    local_azimuthal_projection = (
        f"+proj=aeqd +lat_0={lat}"
        f"        +lon_0={centroid_lon} +x_0=0 +y_0=0"
    )

    # Wrap the single geometry so geopandas can reproject it
    projected_geoseries = gpd.GeoSeries([geometry], crs="EPSG:4326").to_crs(
        local_azimuthal_projection
    )
    area = projected_geoseries.iloc[0].area

    if not pd.isna(area):
        return area

    # Projection produced no usable area: report the inputs and give up
    print(f"Area is nan for {geometry}")
    print(f"Lat: {lat}, Lon: {centroid_lon}")
    return None

# Calculate area for each polygon
# Largest parcel area kept, in square meters (0.5 km^2, i.e. roughly a 707 m x 707 m square)
MAX_PARCEL_AREA_SQM = 500_000

# Calculate area for each polygon, projecting around its own centroid latitude
gdf['area_sqm'] = gdf.apply(lambda row: calculate_area(row['geometry'],
                            row['geometry'].centroid.y), axis=1)

# count number of rows with area over the threshold
oversized = gdf['area_sqm'] > MAX_PARCEL_AREA_SQM
print(f"Number of rows with area over 500k sqm: {int(oversized.sum())}")

# drop exactly the rows counted above; rows whose area is NaN fail the
# comparison and are dropped as well
gdf = gdf[gdf['area_sqm'] <= MAX_PARCEL_AREA_SQM]

# Display info about the GeoDataFrame
print(f"\nGeoDataFrame shape: {gdf.shape}")
print(f"GeoDataFrame CRS: {gdf.crs}")

# print summary statistics for area and round to 2 decimal places
print(gdf['area_sqm'].describe().round(2))

Number of rows with area over 500k sqm: 8

GeoDataFrame shape: (117, 23)
GeoDataFrame CRS: EPSG:4326
count       117.00
mean      12407.17
std       31590.46
min           0.00
25%        1884.04
50%        4174.59
75%       10579.19
max      243667.68
Name: area_sqm, dtype: float64


## Make Interactive Plot to Look at the Parcels

In [4]:
# Ensure the GeoDataFrame is in WGS84 (EPSG:4326) for Folium
# Folium works in WGS84 (EPSG:4326), so make sure the parcels are in that CRS
gdf = gdf.to_crs(epsg=4326)

# Centre the initial view on the mean of the parcel centroids
center_lat = gdf.geometry.centroid.y.mean()
center_lon = gdf.geometry.centroid.x.mean()
map_center = [center_lat, center_lon]
m = folium.Map(location=map_center, zoom_start=10)

# Esri World Imagery tiles, registered as a base layer in the layer control
esri_imagery = folium.TileLayer(
    name='Satellite Imagery',
    attr='Esri',
    tiles='https://server.arcgisonline.com/ArcGIS/rest/services/World_Imagery/MapServer/tile/{z}/{y}/{x}',
    control=True,
    overlay=False,
)
esri_imagery.add_to(m)

# Function to style the GeoJson features
def style_function(feature):
    """Return the drawing style for a parcel: red outline, half-transparent red fill.

    The same style is used for every feature; `feature` is not inspected.
    """
    return dict(fillColor='red', color='red', weight=2, fillOpacity=0.5)

# Add the GeoDataFrame to the map
# Draw every parcel, showing its area in a tooltip on hover
GeoJson(
    gdf,
    style_function=style_function,
    tooltip=folium.GeoJsonTooltip(fields=['area_sqm'],
                                  aliases=['Area:'],
                                  style=("background-color: white; color: #333333; font-family: arial; font-size: 12px; padding: 10px;"))
).add_to(m)

# Add Layer Control
folium.LayerControl().add_to(m)

# Save the map; the message reports the name the file is actually written under
map_filename = 'guatamala_farm_parcel_map.html'
m.save(os.path.join(dir["fig"], map_filename))

print(f"Interactive map has been saved as '{map_filename}'")


Interactive map has been saved as 'interactive_map.html'



  center_lat = gdf.geometry.centroid.y.mean()

  center_lon = gdf.geometry.centroid.x.mean()


In [12]:
import ee
import folium
from folium import GeoJson
from folium.plugins import MarkerCluster
from shapely.geometry import mapping

# Initialize the Earth Engine client library; the ee.* calls below depend on it.
# NOTE(review): presumably needs credentials from an earlier ee.Authenticate() — confirm.
ee.Initialize()

# Function to get Sentinel-2 imagery
def get_sentinel2_imagery(geometry, start_date, end_date,
                          collection_id='COPERNICUS/S2_SR', max_cloud_pct=20):
    """Build a median true-colour Sentinel-2 composite clipped to a polygon.

    Parameters
    ----------
    geometry : shapely polygon
        Only the exterior ring is used; interior rings (holes) are ignored.
    start_date, end_date : str
        Date range handed to ImageCollection.filterDate.
    collection_id : str, optional
        Earth Engine image collection to query.
        NOTE(review): the Earth Engine catalog lists 'COPERNICUS/S2_SR' as
        superseded by 'COPERNICUS/S2_SR_HARMONIZED' — confirm and consider
        passing the harmonized id.
    max_cloud_pct : number, optional
        Scenes whose CLOUDY_PIXEL_PERCENTAGE is at or above this are excluded.

    Returns
    -------
    ee.Image or None
        Median of bands B4, B3, B2 clipped to the polygon, or None when no
        scene passes the filters.
    """
    ee_geometry = ee.Geometry.Polygon(list(geometry.exterior.coords))

    s2_collection = (ee.ImageCollection(collection_id)
                        .filterBounds(ee_geometry)
                        .filterDate(start_date, end_date)
                        .filter(ee.Filter.lt('CLOUDY_PIXEL_PERCENTAGE', max_cloud_pct)))

    # size().getInfo() fetches the count from Earth Engine before continuing
    if s2_collection.size().getInfo() == 0:
        return None

    s2_rgb = s2_collection.select(['B4', 'B3', 'B2'])
    s2_median = s2_rgb.median()
    return s2_median.clip(ee_geometry)

# Calculate the center of the map
# Centre the initial view on the mean of the parcel centroids
center_lat = gdf.geometry.centroid.y.mean()
center_lon = gdf.geometry.centroid.x.mean()
map_center = [center_lat, center_lon]
m = folium.Map(location=map_center, zoom_start=10)

# Esri World Imagery tiles, registered as a base layer in the layer control
esri_imagery = folium.TileLayer(
    name='Satellite Imagery',
    attr='Esri',
    tiles='https://server.arcgisonline.com/ArcGIS/rest/services/World_Imagery/MapServer/tile/{z}/{y}/{x}',
    control=True,
    overlay=False,
)
esri_imagery.add_to(m)

# Function to style the GeoJson features
def style_function(feature):
    """Return the drawing style for a parcel: red outline, half-transparent red fill.

    The same style is used for every feature; `feature` is not inspected.
    """
    parcel_style = {}
    parcel_style['fillColor'] = 'red'
    parcel_style['color'] = 'red'
    parcel_style['weight'] = 2
    parcel_style['fillOpacity'] = 0.5
    return parcel_style

# Create a MarkerCluster to improve performance
# Collect the parcel outlines in one toggleable layer. A MarkerCluster groups
# point markers, so it does nothing useful for polygons; a FeatureGroup is the
# plain container for them.
parcel_layer = folium.FeatureGroup(name='Farm parcels').add_to(m)

tooltip_style = "background-color: white; color: #333333; font-family: arial; font-size: 12px; padding: 10px;"

# Process each polygon
# NOTE(review): every parcel triggers its own Earth Engine requests and adds its
# own tile layer, so run time and layer-control length grow with the parcel count.
for idx, row in gdf.iterrows():
    geometry = row['geometry']

    # Get Sentinel-2 imagery
    s2_image = get_sentinel2_imagery(geometry, '2023-01-01', '2023-12-31')

    if s2_image is not None:
        # Get the map ID and token for the Sentinel-2 image
        map_id_dict = s2_image.getMapId({'min': 0, 'max': 3000, 'gamma': 1.4})

        # Add the Sentinel-2 image as a TileLayer (hidden until switched on)
        folium.TileLayer(
            tiles=map_id_dict['tile_fetcher'].url_format,
            attr='Google Earth Engine',
            name=f'Sentinel-2 Image {idx}',
            overlay=True,
            show=False
        ).add_to(m)

    # Create a GeoJSON-like structure for the polygon
    geojson_data = {
        "type": "Feature",
        "geometry": mapping(geometry),
        "properties": {
            # plain float so the value is serialised as an ordinary JSON number
            "area_sqm": float(row['area_sqm'])
        }
    }

    # Add the polygon to the map
    GeoJson(
        geojson_data,
        style_function=style_function,
        tooltip=folium.GeoJsonTooltip(fields=['area_sqm'],
                                      aliases=['Area:'],
                                      style=tooltip_style)
    ).add_to(parcel_layer)

# Add Layer Control
folium.LayerControl().add_to(m)

# Save the map
m.save(os.path.join(dir["fig"], 'guatamala_farm_parcel_map_with_sentinel.html'))

print("Interactive map with Sentinel-2 imagery has been saved as 'guatamala_farm_parcel_map_with_sentinel.html'")

AttributeError: 'Polygon' object has no attribute 'to_json'

In [7]:
# Create a map centered on the test geometry
# NOTE(review): test_geometry and test_s2_image are not defined in the cells
# visible here — presumably left over from an earlier session; confirm they are
# created before this cell on a fresh kernel.

# Centre the map on the test geometry
map_center = [test_geometry.centroid.y, test_geometry.centroid.x]
m = folium.Map(location=map_center, zoom_start=15)

# Ask Earth Engine for a tile URL for the Sentinel-2 image
map_id_dict = test_s2_image.getMapId({'min': 0, 'max': 3000, 'gamma': 1})

# Show the Sentinel-2 image as an overlay listed in the layer control
sentinel_layer = folium.TileLayer(
    name='Sentinel-2 Image',
    attr='Google Earth Engine',
    tiles=map_id_dict['tile_fetcher'].url_format,
    control=True,
    overlay=True,
)
sentinel_layer.add_to(m)

# Draw the polygon as a red outline with no fill
outline = folium.GeoJson(
    test_geometry,
    style_function=lambda x: {'fillColor': 'none', 'color': 'red', 'weight': 2}
)
outline.add_to(m)

# Add layer control
folium.LayerControl().add_to(m)

# Display the map
m