# Python workshop - 2025

<div>
    <img src="../images/qcbs_logo_v2.svg" style="background-color: #f0f0f0; padding: 20px;"/>
</div>

<div>
    <img src="../images/python_logo_generic.svg" style="background-color: #f0f0f0; padding: 20px;"/>
</div>

**Last update**: 2025-05-19  
**Author**: El-Amine Mimouni  
**Affiliation**: Québec Centre for Biodiversity Science

**Overview**: In this notebook, we will see how to use GeoPandas.

---

# GeoPandas

## General information


Information about GeoPandas can be found at: [https://geopandas.org/en/stable/index.html](https://geopandas.org/en/stable/index.html)


In [45]:
# The star of today
import geopandas as gpd

# Supporting actors
import pandas as pd
import shapely
import pyproj
import folium
import pyogrio
import requests

# Building a GeoDataFrame

In [46]:
# Build by hand the three ingredients of a GeoDataFrame

# 1. Attribute (non-spatial) table: one row, one column
my_nonspatial_df = pd.DataFrame(data={"some_variable": [38]})

# 2. Geometry: a single point
my_geometry = shapely.geometry.Point(-62.2159, -3.4653)

# 3. Coordinate reference system
# This time, use the .from_epsg() method
my_crs = pyproj.CRS.from_epsg(4326)

# Look at it! Each object is printed, followed by its type
print("Nonspatial data:", my_nonspatial_df, type(my_nonspatial_df), sep="\n")
print("\nGeometry:", my_geometry, type(my_geometry), sep="\n")
print("\nCRS:", my_crs, type(my_crs), sep="\n")

Nonspatial data:
   some_variable
0             38
<class 'pandas.core.frame.DataFrame'>

Geometry:
POINT (-62.2159 -3.4653)
<class 'shapely.geometry.point.Point'>

CRS:
EPSG:4326
<class 'pyproj.crs.crs.CRS'>


In [47]:
# Assemble the GeoDataFrame from the three pieces.
# The geometry has to be passed as a list (one entry per row),
# otherwise GeoPandas complains.
my_gdf = gpd.GeoDataFrame(
    data=my_nonspatial_df,
    geometry=[my_geometry],
    crs=my_crs,
)

# Look at it!
print("A complete GeoDataFrame:", my_gdf, type(my_gdf), sep="\n")

A complete GeoDataFrame:
   some_variable                  geometry
0             38  POINT (-62.2159 -3.4653)
<class 'geopandas.geodataframe.GeoDataFrame'>


In [49]:
# Use the .explore() method of the GeoDataFrame to build an interactive map
# (zoom_start=5), then save it as "my_gdf.html" relative to the working directory
my_gdf.explore(zoom_start=5).save(outfile="my_gdf.html")

In [None]:
# Keep the object returned by .explore() to see what kind of object it is
my_explore = my_gdf.explore()

# Look at it!
print("The my_explore object:", my_explore, type(my_explore), sep="\n")

In [None]:
# Write your GeoDataFrame to file, here as GeoJSON in the ../data folder
# (the folder is assumed to exist already)
my_gdf.to_file(filename="../data/my_gdf.geojson", driver="GeoJSON")

In [None]:
# See the various formats (drivers) GeoPandas can read from and write to,
# as reported by pyogrio
pyogrio.list_drivers()

# Loading spatial data

In [None]:
# Load the amphibian distribution areas published by the Quebec Ministry,
# reading the SQLite file directly from its URL.
# The commented-out line is the plain-URL variant, kept for reference.
# NOTE(review): "/vsicurl/" is presumably GDAL's prefix for reading remote
# files over HTTP — confirm against the GDAL documentation.
# amph_gdf = gpd.read_file(filename="https://diffusion.mffp.gouv.qc.ca/Diffusion/DonneeGratuite/Faune/Aires_repartition/Amphibien/SQLite/Aires_repartition_amphibiens.sqlite")
amph_gdf = gpd.read_file("/vsicurl/https://diffusion.mffp.gouv.qc.ca/Diffusion/DonneeGratuite/Faune/Aires_repartition/Amphibien/SQLite/Aires_repartition_amphibiens.sqlite")

# Web landing page:
# https://www.donneesquebec.ca/recherche/dataset/aires-de-repartition-faune/resource/4e32d8a8-113a-466c-994d-8b052f5c669c

In [None]:
# Confirm the object type, then display the first rows
# NOTE(review): the original remark here was "No UTF8" — presumably some text
# columns show encoding problems; confirm
print(type(amph_gdf))
amph_gdf.head()

In [None]:
# Map the distribution of the first species only (first row of the table)
amph_gdf.head(1).explore()

In [None]:
# Create and append a new column holding the area of each geometry
# (an area column already exists in the data, but we want to show how it is done).
# This is a Pandas feature (column assignment) combined with the geometry
# attributes; the values are expressed in the units of the layer's CRS.
amph_gdf["area_now"] = amph_gdf.geometry.area

# Look at it!
amph_gdf.head()

In [None]:
# A GeoDataFrame extends the Pandas DataFrame, so the usual tools still apply:
# summarise the freshly computed areas per family
(
    amph_gdf
    .groupby("famille")["area_now"]
    .agg(["min", "max", "std", "count"])
)

# Clipping

In [None]:
# Bounding box centered over northern Quebec, given by its four limits
my_bbox = shapely.geometry.box(
    minx=-80.18,
    miny=51.24,
    maxx=-70.32,
    maxy=61.14,
)

# Look at it!
print(my_bbox, type(my_bbox), sep="\n")

In [None]:
# Wrap the bounding box in a one-row GeoDataFrame declared in EPSG:4326
boxy = gpd.GeoDataFrame(
    data={"name": ["boxy"]},
    geometry=[my_bbox],
    crs="EPSG:4326",
)
boxy

In [None]:
# Look at the bounding box: build the interactive map and save it
# as "toto.html" relative to the working directory
boxy.explore().save(outfile="toto.html")

In [None]:
# Clip the amphibian distribution areas with the bounding box.
# Note that boxy is in EPSG:4326 while amph_gdf has not been reprojected yet.
amph_clipped = gpd.clip(gdf=amph_gdf, mask=boxy, sort=True)

# Note: Equivalent to the method form
# amph_clipped = amph_gdf.clip(mask=boxy, sort=True)

In [None]:
# Everything went BAD: look at the first few lines of the clipped result
amph_clipped.head()

In [None]:
# The earlier clip went wrong ON PURPOSE (presumably a CRS mismatch).
# Fix: reproject the amphibian distribution area geometries to the
# appropriate CRS (EPSG:4326, the CRS of the box) before clipping.
# The reprojection is done inline to avoid creating additional objects.
amph_clipped = gpd.clip(gdf=amph_gdf.to_crs(epsg=4326), mask=boxy, sort=True)

# Note: Equivalent to the method form
# amph_clipped = amph_gdf.to_crs(epsg=4326).clip(mask=boxy, sort=True)

In [None]:
# Everything went well this time: look at the first few lines of the clipped result
amph_clipped.head()

In [None]:
# Review of folium concepts: start from the bounding box as the base map
# NOTE(review): confirm that `alpha` has an effect here; Leaflet path styling
# usually goes through `opacity` / `fillOpacity`
my_map = boxy.explore(color="red", alpha=0.75)

# Layer 1: the first species' original range, reprojected to EPSG:4326
folium.GeoJson(
    data=amph_gdf[0:1].to_crs(epsg=4326).geometry,
    tooltip="I was not clipped!",
    color="purple",
    alpha=0.75,
).add_to(parent=my_map)

# Layer 2: the first row of the clipped result
folium.GeoJson(
    data=amph_clipped[0:1].geometry,
    tooltip="But I was!",
    color="blue",
    alpha=0.75,
).add_to(parent=my_map)

# Show the map
my_map

In [None]:
# Review of folium concepts, part 2: rebuild the base map from the bounding box
my_map = boxy.explore(color="red", alpha=0.75)

# Overlay the third row of the original data (reprojected to EPSG:4326),
# a range that lies outside the box
folium.GeoJson(
    data=amph_gdf[2:3].to_crs(epsg=4326).geometry,
    tooltip="I was outside the box!",
    color="purple",
    alpha=0.75,
).add_to(parent=my_map)

# Show the map
my_map

# Spatial joins

In [None]:
# Download the zipped shapefile of the Australian digital boundaries
r = requests.get(
    url="https://www.abs.gov.au/statistics/standards/australian-statistical-geography-standard-asgs-edition-3/jul2021-jun2026/access-and-downloads/digital-boundary-files/STE_2021_AUST_SHP_GDA2020.zip",
    timeout=60,  # do not hang forever if the server stops responding
)
# Fail loudly on an HTTP error instead of saving an error page as a ".zip"
r.raise_for_status()

# Save the archive next to the other workshop data
with open("../data/STE_2021_AUST_SHP_GDA2020.zip", "wb") as f:
    f.write(r.content)

# Now read the local archive with geopandas
gdf_aus = gpd.read_file("zip://../data/STE_2021_AUST_SHP_GDA2020.zip")
print(gdf_aus.crs)
print(gdf_aus.head())

In [None]:
# Get Australian territories and boundaries straight from the remote archive
# NOTE(review): this reassigns gdf_aus, which the previous cell already loaded
# from the locally saved copy of the same archive — confirm both cells are needed.
# NOTE(review): verify that the "zip://https://" prefix is supported by the
# installed reader; the commented-out lines are other variants kept for reference.
# gdf_aus = gpd.read_file(filename="https://www.abs.gov.au/statistics/standards/australian-statistical-geography-standard-asgs-edition-3/jul2021-jun2026/access-and-downloads/digital-boundary-files/STE_2021_AUST_SHP_GDA2020.zip")
gdf_aus = gpd.read_file("zip://https://www.abs.gov.au/statistics/standards/australian-statistical-geography-standard-asgs-edition-3/jul2021-jun2026/access-and-downloads/digital-boundary-files/STE_2021_AUST_SHP_GDA2020.zip")
# amph_gdf = gpd.read_file("/vsicurl/https://diffusion.mffp.gouv.qc.ca/Diffusion/DonneeGratuite/Faune/Aires_repartition/Amphibien/SQLite/Aires_repartition_amphibiens.sqlite")


# Look at the first rows
print(gdf_aus.head())

# Have a look at the shape (rows, columns) of the GDF
# Note: I hate you last line...
print(gdf_aus.shape)

# Web landing page:
# https://www.abs.gov.au/statistics/standards/australian-statistical-geography-standard-asgs-edition-3/jul2021-jun2026/access-and-downloads/digital-boundary-files

In [None]:
# Have a look at the geometries on an interactive map
gdf_aus.explore()

In [None]:
# Inspect the CRS: the CRS itself, whether it is projected,
# its area of use, and the type of the CRS object
print(
    "CRS of the Australian digital boundaries:",
    gdf_aus.crs,
    gdf_aus.crs.is_projected,
    gdf_aus.crs.area_of_use,
    type(gdf_aus.crs),
    sep="\n",
)

In [None]:
# Load the GBIF occurrence table from the local CSV file
desert_pea = pd.read_csv("../data/desert_pea.csv")

# Display its first 5 rows
desert_pea.head(5)

In [None]:
# Turn the table into a GeoDataFrame in one go: keep the eventId column and
# build point geometries from the longitude/latitude columns with
# points_from_xy, declaring the coordinates as EPSG:4326
gdf_pea = gpd.GeoDataFrame(
    data=desert_pea["eventId"],
    geometry=gpd.points_from_xy(
        x=desert_pea["decimalLongitude"],
        y=desert_pea["decimalLatitude"],
    ),
    crs=4326,
)

# Look at it!
print(gdf_pea.head())

In [None]:
# Now convert it to the same CRS as that of the Australian digital boundaries data.
# The target is read from gdf_aus itself rather than hard-coded as an EPSG code,
# so the two layers are guaranteed to match for the spatial join.
gdf_pea = gdf_pea.to_crs(crs=gdf_aus.crs)

# Check that it has been correctly changed
assert gdf_pea.crs == gdf_aus.crs, "gdf_pea and gdf_aus must share the same CRS"
print(gdf_pea.crs)

In [None]:
# Spatial join between `gdf_pea` (left) and `gdf_aus` (right):
# each point is matched with the Australian territory it lies within.
# how="left" keeps every point, including those that match no territory.
joined_gdf = gdf_pea.sjoin(gdf_aus, how="left", predicate="within")

# Display the resulting GeoDataFrame
print(joined_gdf)

In [None]:
# Look at the column names "a la Pandas": the joined table carries
# the columns of both inputs
print(joined_gdf.columns)

In [None]:
# VERY IMPORTANT COLUMN: "index_right"
# It gives, for each row, the index of the matching right-hand geometry
# according to the predicate you specified in the join
print(joined_gdf["index_right"], type(joined_gdf["index_right"]), sep="\n")

In [None]:
# How many points matched each right-hand geometry (via "index_right")
print(joined_gdf["index_right"].value_counts())
# Total number of matched points (missing values are not counted)
print(joined_gdf["index_right"].count())

# We lost two along the way... (more later)

In [None]:
# Create a map, using gdf_aus.explore() as the base layer
mapy = gdf_aus.explore()

# Drop one green marker per joined observation
for _, row in joined_gdf.iterrows():
    marker = folium.Marker(
        # folium wants [latitude, longitude], i.e. [y, x]
        location=[row.geometry.y, row.geometry.x],
        # For extra coolness, the variable name is wrapped in bold HTML tags
        popup=f"<b>eventId</b>: {row.eventId}",
        icon=folium.Icon(color="green", prefix="fa", icon="seedling"),
    )
    marker.add_to(mapy)

# Display the map
mapy

In [None]:
# Points that did not make the cut: rows without a match on the right side
# have a missing "index_right"
sad_points = joined_gdf.loc[joined_gdf["index_right"].isna()]

# Which ones?
print(sad_points)

In [None]:
# Create another map, again on top of the Australian boundaries
mapo = gdf_aus.explore()

# Drop one red marker per unmatched observation
for _, row in sad_points.iterrows():
    marker = folium.Marker(
        # folium wants [latitude, longitude], i.e. [y, x]
        location=[row.geometry.y, row.geometry.x],
        # For extra coolness, the variable name is wrapped in bold HTML tags
        popup=f"<b>eventId</b>: {row.eventId}",
        icon=folium.Icon(color="red", prefix="fa", icon="plant-wilt"),
    )
    marker.add_to(mapo)

# Display the map
mapo

In [None]:
# Count the observations per territory with the Pandas .groupby() method
# seen previously, grouping on the territory name column
territory_counts = joined_gdf.groupby(by="STE_NAME21").size()

# Counts in group order
print("\nUnsorted values:", territory_counts, sep="\n")

# Same counts, largest first
print("\nSorted values:", territory_counts.sort_values(ascending=False), sep="\n")

# Case 2: Dugongs

In [None]:
# Same recipe as before, quickly: CSV -> DataFrame -> GeoDataFrame (EPSG:4326)
df_dugong = pd.read_csv("../data/sea_cow.csv")
gdf_dugong = gpd.GeoDataFrame(
    data=df_dugong["eventId"],
    geometry=gpd.points_from_xy(
        x=df_dugong["decimalLongitude"],
        y=df_dugong["decimalLatitude"],
    ),
    crs=4326,
)

# Also fetch the Australian marine parks (shapefile download)
gdf_marineparks = gpd.read_file(
    "https://hub.arcgis.com/api/v3/datasets/2b3eb1d42b8d4319900cf4777f0a83b9_0/downloads/data?format=shp&spatialRefId=4283&where=1%3D1"
)

# Web landing page:
# https://fed.dcceew.gov.au/datasets/erin::australian-marine-parks/about

In [None]:
# Safety check: verify the CRS of both layers before combining them
# (the dugong CRS is the one you set yourself when building gdf_dugong)
print("Dugong sightings CRS:", gdf_dugong.crs)
# The marine parks CRS comes from the downloaded file
print("\nAustralian marine parks CRS:", gdf_marineparks.crs)

In [None]:
# Bring both GeoDataFrames to the same projected CRS (EPSG:3577), in place
for layer in (gdf_dugong, gdf_marineparks):
    layer.to_crs(epsg=3577, inplace=True)

# Safety check: both should now report the same CRS
print("Dugong sightings CRS:", gdf_dugong.crs)
print("\nAustralian marine parks CRS:", gdf_marineparks.crs)

In [None]:
# STOP before going further!
# Check the axes (and their units) of the CRS, since the buffer distance
# used next is expressed in those units
pyproj.CRS.from_epsg(3577).axis_info
# Proceed

In [None]:
# Buffer the marine parks: same attributes, geometries grown by 40 000 CRS units.
# For example, a 40 km buffer seems like a reasonable dugong-ish area (verify)
gdf_mpbuff = gpd.GeoDataFrame(
    data=gdf_marineparks,
    geometry=gdf_marineparks.buffer(distance=40_000),
    crs=3577,
)

# Have a look at the data
gdf_mpbuff.head()

In [None]:
# Have a look at the buffered marine parks on an interactive map
gdf_mpbuff.explore()

In [None]:
# Spatial join: find the dugong sightings that fall within a buffered marine park.
# how="left" keeps every sighting, matched or not.
join_dugong = gdf_dugong.sjoin(gdf_mpbuff, how="left", predicate="within")

# Look at the dimensions of the joined DataFrame
# NONONONO: compare the number of rows with the number of sightings
print(join_dugong.shape)

In [None]:
# Be careful!
# Joins are done per combination: a sighting appears once for every buffer
# it matches, so count the rows per eventId among the matched sightings
(
    join_dugong
    .dropna(subset=["index_right"])["eventId"]
    .value_counts()
)

# Here, some 1s are actual 1s

In [None]:
# Extract happy dugongs (inside at least one buffered marine park)
# and sad dugongs (inside none)

# A sighting matching several buffers appears once per match in the join,
# so keep a single row per eventId. This is just for the map.
# Note: .drop_duplicates() returns a new object, so its result must be kept.
happy_dugongs = (
    join_dugong
    .dropna(subset=["index_right"])
    .drop_duplicates(subset="eventId", keep="first")
    .to_crs(4326)  # back to EPSG:4326 so marker coordinates are lat/lon
)
sad_dugongs = join_dugong[join_dugong["index_right"].isna()].to_crs(4326)

# Map them both on the same map, on top of the buffered marine parks
mapa = gdf_mpbuff.explore()

# Green markers for happy dugongs, red markers for sad ones
for dugongs, marker_color in ((happy_dugongs, "green"), (sad_dugongs, "red")):
    for _, point in dugongs.iterrows():
        folium.Marker(
            # folium wants [latitude, longitude], i.e. [y, x]
            location=[point.geometry.y, point.geometry.x],
            popup=f"<b>eventId</b>: {point.eventId}",
            icon=folium.Icon(color=marker_color, prefix="fa", icon="hippo"),
        ).add_to(mapa)

# Display the map
mapa

In [None]:
# Count the matched sighting/park combinations per IUCN zone and zone name
(
    join_dugong
    .dropna(subset=["index_right"])
    .groupby(by=["ZONEIUCN", "ZONENAME"])
    .size()
)