*In this notebook, we cluster pings for each mobile, and we write a function to visualize results on a map. Then, we visualize clusters for the first mobile.*

# Import dataset

In [None]:
import pandas as pd
clean_mega_data = pd.read_csv("../datasets/clean_mega_data.csv")

# Cluster all points [DB scan]

In [None]:
"""
Use DBscan algorithm to cluster pings for each mobile. Note that a single ping can be its own cluster if it is isolated enough.
"""

import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
from tqdm import tqdm  # Progress bar

# Assuming clean_mega_data is already loaded
results = []

# DBSCAN parameters
eps = 50 / 111000  # 50 meters converted to degrees

# Loop through each unique CAID
for caid in tqdm(clean_mega_data['caid'].unique(), desc="Processing CAIDs"):
    data_filtered = clean_mega_data[clean_mega_data['caid'] == caid].reset_index(drop=True)

    if data_filtered.empty:
        continue

    # Extract coordinates
    coords = data_filtered[['latitude', 'longitude']].values.astype(np.float32)

    # Apply DBSCAN
    dbscan = DBSCAN(eps=eps, min_samples=1).fit(coords)
    data_filtered['cluster'] = dbscan.labels_

    # Calculate centroids
    centroid_dict = data_filtered.groupby('cluster')[['latitude', 'longitude']].mean().to_dict('index')

    # Add centroid addresses to data
    data_filtered['centroid_address'] = data_filtered['cluster'].map(centroid_dict)

    results.append(data_filtered)

# Combine results into one DataFrame
final_data = pd.concat(results).reset_index(drop=True)

# Save to CSV
final_data.to_csv('clustered_data.csv', index=False)

In [None]:
final_data.head()


Unnamed: 0,caid,zipcode,utc_timestamp,latitude,longitude,horizontal_accuracy,region,ping_near_replicate_matches,quarter,cluster,centroid_address
0,3924b2d36e1b036021dd5cc9ccabf33e20ba55e0f3a531...,90301,2024-02-14 05:33:33.000,33.95756,-118.37116,33.0,california,1,2,0,"{'latitude': 33.95752090909091, 'longitude': -..."
1,3924b2d36e1b036021dd5cc9ccabf33e20ba55e0f3a531...,90301,2024-02-14 06:18:44.000,33.95456,-118.35538,5.0,california,1,2,1,"{'latitude': 33.954546, 'longitude': -118.3553..."
2,3924b2d36e1b036021dd5cc9ccabf33e20ba55e0f3a531...,90301,2024-02-14 05:12:58.000,33.95747,-118.37079,12.0,california,1,2,0,"{'latitude': 33.95752090909091, 'longitude': -..."
3,3924b2d36e1b036021dd5cc9ccabf33e20ba55e0f3a531...,90301,2024-02-14 05:33:38.000,33.95754,-118.37114,33.0,california,1,2,0,"{'latitude': 33.95752090909091, 'longitude': -..."
4,3924b2d36e1b036021dd5cc9ccabf33e20ba55e0f3a531...,90301,2024-02-14 06:42:13.000,33.95672,-118.35954,5.0,california,1,2,2,"{'latitude': 33.956678, 'longitude': -118.3595..."


# Visualize results (function)

In [4]:
import pandas as pd
final_data = pd.read_csv('../datasets/clustered_data.csv')

In [5]:
final_data.shape

(10332575, 11)

In [6]:
import pandas as pd
import folium
import random  # For color generation

def visualize_caid(data, selected_caid):
    # Filter data for the selected CAID
    data_filtered = data[data['caid'] == selected_caid].reset_index(drop=True)

    if data_filtered.empty:
        print(f"No data found for CAID {selected_caid}")
        return

    # Extract centroid dictionary
    centroid_dict = data_filtered.groupby('cluster')[['latitude', 'longitude']].mean().to_dict('index')

    # Step 7: Visualize with Folium
    map_center = [data_filtered['latitude'].mean(), data_filtered['longitude'].mean()]
    m = folium.Map(location=map_center, zoom_start=12)

    # Generate distinct colors for each cluster
    cluster_colors = {cluster_id: f'#{random.randint(0, 0xFFFFFF):06x}' for cluster_id in centroid_dict.keys()}

    # Add all pings with cluster-specific colors
    for idx, row in data_filtered.iterrows():
        folium.CircleMarker(
            location=[row['latitude'], row['longitude']],
            radius=3,
            color=cluster_colors[row['cluster']],
            fill=True,
            fill_opacity=0.6,
            popup=f"Ping {idx}<br>Cluster {row['cluster']}"
        ).add_to(m)

    # Add cluster centroids as red markers
    for cluster_id, centroid in centroid_dict.items():
        folium.Marker(
            location=[centroid['latitude'], centroid['longitude']],
            icon=folium.Icon(color='red', icon='home'),
            popup=f"Cluster {cluster_id}<br>Centroid: {centroid}"
        ).add_to(m)

    print(f"Selected CAID: {selected_caid}")
    print(f"Filtered Data Size: {data_filtered.shape}")
    print(f"Clusters Found: {len(centroid_dict)}")
    print(f"Centroids: {centroid_dict}")
    print(f"Map Center: {map_center}")

    return m






# Visualize first caid

In [11]:
# Visualize for the first CAID
first_caid = final_data['caid'].unique()[220]  # Get the first CAID
visualize_caid(final_data, first_caid)

Selected CAID: eb825d402c58ad560a6cef19a0840aaa8aa8b5a118a431c03cc15dfd05487c96
Filtered Data Size: (102, 11)
Clusters Found: 10
Centroids: {0: {'latitude': 34.169926999999994, 'longitude': -118.26993966666667}, 1: {'latitude': 34.14358958333333, 'longitude': -118.19105541666666}, 2: {'latitude': 34.112777161290325, 'longitude': -118.26858483870967}, 3: {'latitude': 34.14101333333333, 'longitude': -118.14754}, 4: {'latitude': 34.076953333333336, 'longitude': -118.2489}, 5: {'latitude': 34.09064769230769, 'longitude': -118.21174353846155}, 6: {'latitude': 34.09025666666667, 'longitude': -118.2121}, 7: {'latitude': 34.095762, 'longitude': -118.20762914285714}, 8: {'latitude': 34.094967600000004, 'longitude': -118.20805800000001}, 9: {'latitude': 34.14224, 'longitude': -118.14581}}
Map Center: [np.float64(34.12129282352941), np.float64(-118.22910319607841)]
