### 1. data preprocessing

In [None]:
from decimal import Decimal
import os
import json
import pandas as pd

def load_coordinates(folder_path):
    """Collect coordinate pairs from every JSON-lines file in a folder.

    Each ``*.json`` entry directly inside ``folder_path`` is read line by line and
    every non-blank line is parsed as one JSON document. Documents whose
    ``"coordinates"`` value is a two-element list contribute one row, with the
    list read as ``[longitude, latitude]``.

    Parameters
    ----------
    folder_path : str or path-like
        Directory that holds the ``.json`` files.

    Returns
    -------
    pandas.DataFrame
        Columns ``longitude`` and ``latitude`` holding ``decimal.Decimal``
        values. Both columns exist even when no coordinates were found, so
        callers can index them unconditionally.

    Notes
    -----
    Loading is best-effort: files that cannot be opened and lines that cannot be
    parsed are reported with ``print`` and skipped instead of raising.
    """
    columns = ["longitude", "latitude"]
    all_rows = []
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order reproducible between runs.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".json"):
            continue
        file_path = os.path.join(folder_path, filename)
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        # Blank lines are separators, not malformed records.
                        continue
                    try:
                        # parse_float=Decimal keeps the digits exactly as written in
                        # the file; Decimal(<float>) would instead capture the
                        # binary float expansion (e.g. 0.1000000000000000055...).
                        entry = json.loads(line, parse_float=Decimal)
                        coords = entry.get("coordinates")
                        if coords and isinstance(coords, list) and len(coords) == 2:
                            all_rows.append({
                                "longitude": Decimal(coords[0]),
                                "latitude": Decimal(coords[1]),
                            })
                    except Exception as e:
                        print(f"Error parsing line in {filename}: {e}")
        except Exception as e:
            print(f"Error opening file {filename}: {e}")

    # Passing columns explicitly guarantees the schema for an empty result.
    return pd.DataFrame(all_rows, columns=columns)

# Directory of JSON-lines coordinate files. Set the COORDINATES_DIR environment
# variable to point at another copy of the data; when it is unset the original
# location below is used.
folder_path = os.environ.get(
    "COORDINATES_DIR",
    "/Users/yunchi/Downloads/text_coordinates_regions",
)
df = load_coordinates(folder_path)

df

In [None]:
import folium

# Centre the initial view on the median latitude/longitude of the data.
map_center = [df["latitude"].median(), df["longitude"].median()]

# Named `point_map` (not `map`) so the builtin map() is not shadowed for the
# rest of the notebook.
point_map = folium.Map(location=map_center, zoom_start=2, tiles='CartoDB positron')

# One small circle per coordinate. Iterating the two columns directly avoids
# building a Series object for every row, which DataFrame.iterrows() does.
for lat, lon in zip(df['latitude'], df['longitude']):
    folium.CircleMarker(
        location=[lat, lon],
        radius=1,
        color='blue',
        fill=True,
        fill_opacity=0.6
    ).add_to(point_map)

# point_map.save('map.html')
point_map

In [None]:
from folium.plugins import HeatMap

# Density view of the same points, centred like the marker map.
heat_map = folium.Map(location=map_center, zoom_start=2)

# Build the [lat, lon] list in one vectorised step instead of row-by-row with
# iterrows(). The explicit float cast hands the plugin plain Python floats even
# while the columns still hold Decimal objects; `df` itself is left untouched.
heat_data = df[['latitude', 'longitude']].astype(float).to_numpy().tolist()
HeatMap(heat_data, radius=6, blur=5).add_to(heat_map)

heat_map.save('heat_map.html')
heat_map

### 2. topological analysis: SoftMapper

### 3. traditional clustering: DBSCAN

In [None]:
from sklearn.cluster import DBSCAN
import numpy as np

# numpy cannot do arithmetic on the stored coordinate objects, so cast both
# columns to 64-bit floats first
df['latitude'] = df['latitude'].astype('float64')
df['longitude'] = df['longitude'].astype('float64')

# the haversine metric expects angles in radians, ordered [latitude, longitude]
coords = np.deg2rad(df[['latitude', 'longitude']].values)

kms_per_radian = 6371.0088  # Earth's radius in km (about 6371)
eps_km = 10  # neighbourhood radius: points within 10 km of each other are grouped
eps_rad = eps_km / kms_per_radian  # the same radius expressed as an angle in radians

# haversine measures great-circle (spherical) distance, which suits lat/lon data
db = DBSCAN(metric='haversine', min_samples=10, eps=eps_rad)
db.fit(coords)
df['cluster'] = db.labels_

In [None]:
# Inspect the radian-converted [latitude, longitude] array that was passed to DBSCAN.
coords

In [None]:
# Count the distinct cluster labels, leaving out the noise label (-1).
n_clusters = df.loc[df['cluster'] != -1, 'cluster'].nunique()
print(f"Number of clusters formed: {n_clusters}")

In [None]:
# The plotting API lives in matplotlib.pyplot; importing the top-level
# `matplotlib` package as `plt` makes `plt.figure(...)` fail because
# `matplotlib.figure` is a module, not a function.
import matplotlib.pyplot as plt

# Scatter of all points in lon/lat space, coloured by DBSCAN label
# (label -1 marks points DBSCAN left unclustered).
fig, ax = plt.subplots(figsize=(12, 8))
scatter = ax.scatter(
    df['longitude'],
    df['latitude'],
    c=df['cluster'],
    cmap='tab10',
    s=5,
    alpha=0.6
)

ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.set_title('DBSCAN Clustering of Geospatial Coordinates')
fig.colorbar(scatter, ax=ax, label='Cluster Label')
ax.grid(True)
fig.tight_layout()
plt.show()