This notebook prepares the eButterfly data (observations not co-located with eBird): it clusters the observations, creates polygons used to extract the satellite images from Planetary Computer, filters out images that are smaller than 128x128, creates the targets by aggregating the checklists, and saves the final CSV for the hotspots.

In [None]:
import pandas as pd 
import geopandas as gpd
import os
import numpy as np
import glob
from pathlib import Path    
from tqdm import tqdm

from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import haversine_distances
from geopy.distance import great_circle
from shapely.geometry import MultiPoint
from shapely.geometry import Polygon, Point
from math import cos, radians

In [None]:
# Root folder of the eButterfly Darwin Core export. The location can be overridden
# with the EBUTTERFLY_ROOT_DIR environment variable so the notebook is not tied to
# one machine's absolute path; the default is the original hard-coded location.
root_dir = os.environ.get("EBUTTERFLY_ROOT_DIR", "/ebutterfly/Darwin/0177350-230224095556074")
# Sub-folder of root_dir where every file produced by this notebook is written.
dataset_tag = "ebutterfly_data_v3"

In [None]:
# Load the raw US occurrence records and print a (truncated) view of them.
occurrences_csv = os.path.join(root_dir, "occ_usa.csv")
buttefly_data_US = pd.read_csv(occurrences_csv)

print(buttefly_data_US)

# Clustering ebutterfly data

In [None]:
# Earth radius in km, used to express the DBSCAN neighbourhood radius in radians.
# NOTE(review): 6356.7523 km matches the WGS84 polar radius; the mean radius
# (~6371 km) is the more common choice for haversine distances — confirm intent.
RADIUS_EARTH = 6356.7523

coordinates = buttefly_data_US[['decimalLatitude', 'decimalLongitude']].values

eps = 1/RADIUS_EARTH # Maximum distance (1 km, in radians) between points to be considered part of the same cluster
min_samples = 2  # Minimum number of points in a cluster (including the core point)

# The haversine metric expects (lat, lon) in radians.
db = DBSCAN(eps=eps, min_samples=min_samples, algorithm='ball_tree', metric='haversine').fit(np.radians(coordinates))

cluster_labels = db.labels_

# Number of clusters in labels, ignoring noise (-1 is noise)
num_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)
# Number of observations labelled as noise. The previous expression subtracted
# num_clusters from the number of distinct labels, which is only ever 0 or 1.
num_noise = int(np.sum(cluster_labels == -1))
print("Number of clusters:", num_clusters)
print("Number of noise:", num_noise)

# One array of (lat, lon) rows per cluster, positioned by cluster label 0..num_clusters-1.
clusters = pd.Series([coordinates[cluster_labels == n] for n in range(num_clusters)])

# print(clusters)
def get_centermost_point(cluster):
    """Return the member of ``cluster`` that lies closest to the cluster centroid.

    Parameters:
        cluster: sequence of 2-element points. The caller passes (lat, lon) rows,
            so the centroid's x/y are latitude/longitude, which is the order
            ``great_circle`` is given below.
    Returns:
        tuple: the coordinates of the cluster member nearest to the centroid.
    """
    # Build the MultiPoint once instead of once per centroid coordinate.
    centroid_geom = MultiPoint(cluster).centroid
    centroid = (centroid_geom.x, centroid_geom.y)
    centermost_point = min(cluster, key=lambda point: great_circle(point, centroid).m)
    return tuple(centermost_point)

centermost_points = clusters.map(get_centermost_point)
center_lats, center_lons = zip(*centermost_points)
# Arrays indexed by cluster label, so each row's centre is found in one indexing step.
center_lat_by_label = np.asarray(center_lats)
center_lon_by_label = np.asarray(center_lons)

# DBSCAN marks noise with -1; keep only observations that belong to a cluster.
is_clustered = cluster_labels != -1
kept_labels = cluster_labels[is_clustered]

# save final dataframe
# Work on an explicit copy: the previous code aliased the raw frame (mutating it)
# and then wrote new columns onto a filtered slice (SettingWithCopyWarning).
butterfly_data_US_clustered = buttefly_data_US.loc[is_clustered].copy()
butterfly_data_US_clustered["cluster_label"] = kept_labels
butterfly_data_US_clustered["center_lat"] = center_lat_by_label[kept_labels]
butterfly_data_US_clustered["center_lon"] = center_lon_by_label[kept_labels]
butterfly_data_US_clustered["hotspot_id"] = ["L" + str(cl) for cl in kept_labels]

# reset_index returns a new frame; the result was previously discarded.
butterfly_data_US_clustered = butterfly_data_US_clustered.reset_index(drop=True)

print(butterfly_data_US_clustered)

# Make sure the output folder exists before writing into it.
os.makedirs(os.path.join(root_dir, dataset_tag), exist_ok=True)
# The index is still written (as the first, unnamed column) to keep the file layout unchanged.
butterfly_data_US_clustered.to_csv(os.path.join(root_dir, dataset_tag, "butterfly_data_clustered.csv"))

# Generate images

In [None]:
# Reload only the hotspot id and its centre coordinates from the clustered file.
clustered_csv = os.path.join(root_dir, dataset_tag, "butterfly_data_clustered.csv")
center_data_df = pd.read_csv(
    clustered_csv,
    usecols=["hotspot_id", "center_lon", "center_lat"],
)

In [None]:
# Keep one row per distinct (hotspot_id, center_lon, center_lat) combination.
# NOTE(review): reset_index() without drop=True keeps the previous index as an
# "index" column, so re-running this cell changes the frame again — confirm
# whether the downstream download script relies on that column before changing it.
center_data_df = center_data_df.drop_duplicates().reset_index()

In [None]:
# Display the de-duplicated hotspot centres.
center_data_df

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import geopandas as gpd
from shapely.geometry import Point


# Scatter the cluster centres (grey) and the raw observations (red).
fig, ax = plt.subplots(figsize=(10, 6))

ax.scatter(x=center_data_df['center_lon'], y=center_data_df['center_lat'], color='grey', label='cluster centers')
ax.scatter(x=buttefly_data_US['decimalLongitude'], y=buttefly_data_US['decimalLatitude'], color='red', label='observations')

# Title and labels must be set before plt.show(); previously they were applied
# after the figure had already been rendered, so they never appeared.
ax.set_title('Coordinates on USA Map')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.legend()

plt.show()

### 1. Create polygons for the lats, lons

In [None]:
# One shapely Point per hotspot centre, built from (center_lon, center_lat) pairs.
lon_lat_pairs = zip(center_data_df['center_lon'], center_data_df['center_lat'])
geometry = list(map(Point, lon_lat_pairs))

In [None]:
# Display the list of Point geometries built from the hotspot centres.
geometry

In [None]:
# CRS of the centre points. The {'init': 'epsg:4326'} dict form is deprecated
# by pyproj/geopandas; the plain authority string is the supported spelling.
crs = "EPSG:4326"

# Attach the Point geometries to the hotspot centres.
geo_df = gpd.GeoDataFrame(center_data_df,
                          crs=crs,
                          geometry=geometry)

In [None]:
# Display the GeoDataFrame of hotspot centres.
geo_df

In [None]:
def generate_buffer_meter(data, radius, geometry='geometry', crs='epsg:4326', projected_crs='epsg:3857'):
    """ Generates a square buffer around the geometries in a geopandas DataFrame.

    The geometries are projected to ``projected_crs``, buffered there with a
    square cap style (cap_style=3), and projected back to ``crs``. The input
    frame is not modified.

    Parameters:
        data (GeoDataFrame or DataFrame): The geopandas dataframe or a pandas dataframe that contains geometry data.
        radius (float): Buffer distance in units of ``projected_crs``. For a point this gives a
            square with sides of 2 * radius.
        geometry (str, optional): The column in the dataframe that contains the geometry information. Defaults to 'geometry'.
        crs (str, optional): The Coordinate Reference System of the input geometries, also used for the
            returned frame. It is applied to the input only if the input has no CRS set. Defaults to 'epsg:4326'.
        projected_crs (str, optional): The projected CRS to use for buffering. Defaults to 'epsg:3857'.
    Returns:
        GeoDataFrame: A new geopandas dataframe with the buffer applied to the geometry.

    NOTE(review): with the default EPSG:3857 (Web Mercator), projected units are
    stretched away from the equator, so ``radius`` is not true ground meters at
    the latitudes of the data — confirm this is acceptable for the image extent.
    """
    # Make `geometry` the active geometry column so to_crs acts on the requested column.
    data = gpd.GeoDataFrame(data, geometry=geometry)
    # A frame without a CRS cannot be reprojected; fall back to the declared input CRS.
    if data.crs is None:
        data = data.set_crs(crs)
    data = data.to_crs(projected_crs)
    data[geometry] = data[geometry].buffer(radius, cap_style=3)
    data = data.to_crs(crs)
    return data

In [None]:
# Buffer every hotspot centre by 2500 (units of the projected CRS used by
# generate_buffer_meter) to obtain the polygon used for image extraction.
data_df = generate_buffer_meter(geo_df, 2500)

In [None]:
# (rows, columns) of the buffered frame.
data_df.shape

In [None]:
# Count how many rows share each polygon.
data_df["geometry"].value_counts()

In [None]:
# Area of the first polygon, as a sanity check.
# NOTE(review): data_df is presumably back in geographic coordinates here, so this is not in square meters — confirm.
data_df.iloc[0]["geometry"].area

In [None]:
# Rows whose polygon has zero area, kept aside for inspection.
copy_df = data_df[data_df["geometry"].area == 0]

In [None]:
# Display the zero-area rows (if any).
copy_df

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import geopandas as gpd
from shapely.geometry import Point


# Scatter all hotspot centres (grey) and highlight those with a zero-area polygon (red).
fig, ax = plt.subplots(figsize=(10, 6))

ax.scatter(x=data_df['center_lon'], y=data_df['center_lat'], color='grey', label='all hotspots')
ax.scatter(x=copy_df['center_lon'], y=copy_df['center_lat'], color='red', label='zero-area polygons')

# Title and labels must be set before plt.show(); previously they were applied
# after the figure had already been rendered, so they never appeared.
ax.set_title('Coordinates on USA Map')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.legend()

plt.show()

In [None]:
# Distinct polygons in the buffered frame.
data_df["geometry"].unique()

In [None]:
# Display the buffered hotspot polygons.
data_df

In [None]:
# Save the hotspot polygons; this file is the input of the raster download script (step 2 below).
data_df.to_csv(os.path.join(root_dir, dataset_tag, "ebutterfly_center_polygons.csv"))

### 2. Use the polygons file to extract satellite images from Planetary Computer, using the script (data_processing/ebutterfly_data_preparation/download_rasters_from_planetary_computer.py)

### 3. Filter satellite images and save final

In [None]:
import glob
import random
import matplotlib.pyplot as plt
import rasterio as rio
import numpy as np

file_list = glob.glob(os.path.join(root_dir, dataset_tag, "raw_images/*"))

# Select up to 8 random files from the list. random.sample raises ValueError
# when asked for more items than are available, so cap the sample size.
random_files = random.sample(file_list, min(8, len(file_list)))

fig, axes = plt.subplots(2, 4, figsize=(12, 6))

# axes.flat walks the 2x4 grid row by row, matching the previous [i // 4, i % 4] indexing.
for ax, file_path in zip(axes.flat, random_files):
    with rio.open(file_path) as f:
        # Bands 3, 2, 1 are used as R, G, B.
        # NOTE(review): presumably the rasters store B, G, R in bands 1-3 — confirm.
        r = f.read(3)
        g = f.read(2)
        b = f.read(1)

    # Create a composite image from RGB channels
    composite = np.stack((r, g, b), axis=-1)
    print(composite.shape)

    # Clip and normalize the values
    normalized_composite = np.clip((composite / 10000), 0, 1)

    # Plot the image in the corresponding subplot, titled with the file name
    ax.imshow(normalized_composite)
    ax.set_title(os.path.basename(file_path))

# Hide the frame of every panel, including panels left empty when fewer than 8 files exist.
for ax in axes.flat:
    ax.axis('off')

# Adjust spacing and display the plot
plt.tight_layout()
plt.show()

In [None]:
# exclude images less than 128x128
import shutil
from pathlib import Path    

# Minimum height and width (in pixels) an image needs in order to be kept.
MIN_IMAGE_SIZE = 128

dst = os.path.join(root_dir, dataset_tag, "images")
# Without an existing directory, shutil.copy would treat dst as a file name and
# overwrite the same file for every raster.
os.makedirs(dst, exist_ok=True)
file_list = glob.glob(os.path.join(root_dir, dataset_tag, "raw_images/*"))

for file_path in file_list:
    # The raster dimensions are available from the dataset itself; there is no
    # need to read three full bands just to inspect the shape.
    with rio.open(file_path) as f:
        height, width = f.height, f.width
    if height >= MIN_IMAGE_SIZE and width >= MIN_IMAGE_SIZE:
        shutil.copy(file_path, dst)

In [None]:
# Hotspot ids of the retained images: the file name up to its first ".".
file_list = glob.glob(os.path.join(root_dir, dataset_tag, "images/*"))
final_hotspots = [str(Path(image_path).name.split(".")[0]) for image_path in file_list]

In [None]:
# Number of hotspots that have a retained image.
len(final_hotspots)

In [None]:
# Display the retained hotspot ids.
final_hotspots

# Create final csv

In [None]:
# Reload the clustered observations (all columns) written by the clustering step.
butterfly_df = pd.read_csv(os.path.join(root_dir, dataset_tag, "butterfly_data_clustered.csv"))

In [None]:
# Keep only observations whose hotspot has a retained image.
butterfly_df = butterfly_df[butterfly_df['hotspot_id'].isin(final_hotspots)]

In [None]:
# Group by a scalar key rather than a one-element list: with pandas >= 2.0 a
# list key makes iteration yield 1-tuples such as ('L12',) as group names,
# which would leak into the target file names and hotspot ids further down.
grouped_butterfly_data = butterfly_df.groupby('hotspot_id')
group_sizes = grouped_butterfly_data.size()
print(group_sizes)

In [None]:
# save species list of all unique species

# Species frequency table (missing species names are dropped by value_counts).
species_df = (
    butterfly_df['species']
    .value_counts()
    .rename_axis('species')
    .reset_index(name='frequency')
)

# Derive the in-memory species order from the saved table. Previously the list
# came from .unique() (order of appearance, possibly including NaN) while the
# CSV was in frequency order, so indices into the target vectors did not match
# the rows of species_list.csv.
species_list = species_df['species'].tolist()
print(species_list)
print(len(species_list))

species_df.to_csv(os.path.join(root_dir, dataset_tag, 'species_list.csv'), index=False)

print(species_df)

In [None]:
# create targets by aggregating checklists
import json
from tqdm import tqdm

# Position of every species in the target vector (dict lookup instead of list.index per row).
species_index = {sp: i for i, sp in enumerate(species_list)}

# Make sure the output folder exists before writing the per-hotspot JSON files.
targets_dir = os.path.join(root_dir, dataset_tag, 'butterfly_targets')
os.makedirs(targets_dir, exist_ok=True)

for group_name, group_data in tqdm(grouped_butterfly_data):
    # Grouping by a one-element list yields 1-tuples as names in pandas >= 2.0; unwrap them.
    hotspot_id = group_name[0] if isinstance(group_name, tuple) else group_name

    target = {}
    target['num_complete_checklists'] = len(group_data['eventID'].unique())

    # Count each species at most once per checklist (eventID), so the values are
    # the fraction of checklists reporting the species and cannot exceed 1.
    # Rows without a species name are skipped.
    presence = group_data[['eventID', 'species']].dropna(subset=['species']).drop_duplicates()
    checklist_ = np.zeros(len(species_list))
    for sp in presence['species']:
        checklist_[species_index[sp]] += 1
    checklist_ = checklist_ / target['num_complete_checklists']

    target['probs'] = checklist_.tolist()
    target['hotspot_id'] = hotspot_id

    with open(os.path.join(targets_dir, str(hotspot_id) + '.json'), 'w') as fp:
        json.dump(target, fp)

In [None]:
# save final csv
# columns: hotspot_name, lon, lat, number_of_observations, number_of_unique_checklists, number_of_unique_species, env variables
# NOTE(review): the three name lists below are not used in this cell; they are
# kept in case other code relies on them.
bio_env_column_names = ['bio_1', 'bio_2', 'bio_3', 'bio_4', 'bio_5',
       'bio_6', 'bio_7', 'bio_8', 'bio_9', 'bio_10', 'bio_11', 'bio_12',
       'bio_13', 'bio_14', 'bio_15', 'bio_16', 'bio_17', 'bio_18', 'bio_19']
ped_env_column_names = ['bdticm', 'bldfie', 'cecsol', 'clyppt', 'orcdrc', 'phihox', 'sltppt', 'sndppt']
location_info = ['county', 'stateProvince', 'countryCode']

# One summary record per hotspot.
hotspot_records = []
for group_name, group_data in tqdm(grouped_butterfly_data):
    # Grouping by a one-element list yields 1-tuples as names in pandas >= 2.0; unwrap them.
    hotspot_id = group_name[0] if isinstance(group_name, tuple) else group_name
    hotspot_records.append({
        # hotspot_id already carries its "L" prefix (e.g. "L12"); the previous
        # "L" + str(int(group_name)) raised ValueError on such ids.
        'hotspot_id': str(hotspot_id),
        'lat': group_data['center_lat'].iloc[0],
        'lon': group_data['center_lon'].iloc[0],
        # NOTE(review): this column is named county_code but is filled from
        # stateProvince; the name is kept so existing consumers keep working.
        'county_code': group_data['stateProvince'].iloc[0],
        'ebutterfly_occurances': len(group_data['occurrenceID']),
        'num_checklists': len(group_data['eventID'].unique()),
        'num_species': len(group_data['species'].unique()),
    })

final_data_frame = pd.DataFrame(hotspot_records,
                                columns=['hotspot_id', 'lat', 'lon', 'county_code',
                                         'ebutterfly_occurances', 'num_checklists', 'num_species'])

print(final_data_frame)

final_data_frame.to_csv(os.path.join(root_dir, dataset_tag, 'butterfly_hotspots.csv') , index=False)

### split data using DBSCAN (script: make_splits_by_distance.py)

In [None]:
# Load the hotspot table with split assignments; this file is not written by this notebook
# (the heading above points to make_splits_by_distance.py).
butterfly_data_with_split = pd.read_csv(os.path.join(root_dir, dataset_tag, "butterfly_hotspots_with_splits.csv"))

In [None]:
# Display the hotspots together with their assigned split.
butterfly_data_with_split

In [None]:
# Group by a scalar key: with pandas >= 2.0 a one-element list key yields
# 1-tuples as group names, which produced file names such as
# "butterfly_hotspots_('train',).csv" that the visualization cell cannot find.
# A dedicated variable name also avoids overwriting the per-hotspot grouping
# (grouped_butterfly_data) used by the earlier cells.
grouped_by_split = butterfly_data_with_split.groupby('split')

for split_name, split_data in tqdm(grouped_by_split):
    print(split_name)
    # Largest number of occurrences at a single hotspot in this split.
    print(split_data["ebutterfly_occurances"].max())
    split_data.to_csv(os.path.join(root_dir, dataset_tag, "butterfly_hotspots_" + str(split_name) + ".csv"))

### Visualize map after splitting

In [None]:
import os
import sys
from pathlib import Path

sys.path.append(str(Path().resolve().parent))
sys.path.append(str(Path().resolve().parent.parent))

import numpy as np
import pandas as pd
import json

import geopandas as gpd
import matplotlib.pyplot as plt

from shapely.geometry import Point

# Load the hotspots (one row per hotspot_id) and turn their coordinates into points.
path = os.path.join(root_dir, dataset_tag, "butterfly_hotspots_with_splits.csv")
df = pd.read_csv(path).drop_duplicates("hotspot_id")

# County outlines used as the background of the map.
geoDatav = gpd.read_file('https://raw.githubusercontent.com/holtzy/The-Python-Graph-Gallery/master/static/data/US-counties.geojson')

geometry = [Point(lon, lat) for lon, lat in zip(df['lon'], df['lat'])]
gdf = gpd.GeoDataFrame(df, geometry=geometry)

train = pd.read_csv(os.path.join(root_dir, dataset_tag, "butterfly_hotspots_train.csv"))
val = pd.read_csv(os.path.join(root_dir, dataset_tag, "butterfly_hotspots_valid.csv"))
test = pd.read_csv(os.path.join(root_dir, dataset_tag, "butterfly_hotspots_test.csv"))

# Label every hotspot with the split file it appears in (train, then val, then test;
# a later split overrides an earlier one).
gdf["split"] = ""
for split_label, split_frame in (("train", train), ("val", val), ("test", test)):
    idx = gdf[gdf["hotspot_id"].isin(list(split_frame["hotspot_id"]))].index
    gdf.loc[idx, "split"] = split_label

fig, ax = plt.subplots(figsize=(15, 10))

# Leave out STATE codes "02" and "15" (presumably Alaska and Hawaii — confirm) from the outline.
outline = geoDatav[~geoDatav["STATE"].isin(["02", "15"])]
outline.boundary.plot(ax=ax, alpha=0.1, edgecolor="gray")

split_colors = (("train", "mediumslateblue"), ("val", "lightseagreen"), ("test", "lightsalmon"))
for split_label, split_color in split_colors:
    gdf[gdf["split"] == split_label].plot(ax=ax, marker='o', color=split_color, markersize=1, label=split_label)

plt.legend(fontsize=16, markerscale=5, loc='lower right', bbox_to_anchor=(0.92, 0.25))
plt.title("butterfly Hotspots")
plt.show()

Final files saved:
[('valid', 1164), ('test', 1166), ('train', 5436)]
- butterfly_hotspots.csv
- butterfly_hotspots_train.csv
- butterfly_hotspots_valid.csv
- butterfly_hotspots_test.csv
- species_list.csv
- butterfly_targets/