# Phase 1: Station Boundary Detection

This notebook uses clustering algorithms to identify the distinct stations/zones where groups spend time during a workshop.

## Objectives:
- Visualize raw RTLS data
- Apply K-Means clustering to identify stations
- Use DBSCAN as an alternative clustering method
- Export station centroids and assignments

## Output:
- Station centroids (coordinates)
- Data with station assignments
- Clustering model and scaler

## Workshop Selection

In [None]:
# ============================================
# WORKSHOP SELECTION
# ============================================
# Change WORKSHOP to analyze a different workshop.
# The value is interpolated into the raw-data path and the results directory,
# so it is validated here to fail fast on a typo instead of surfacing later
# as a confusing file error.

VALID_WORKSHOPS = ("Workshop1", "Workshop2", "Workshop3")

WORKSHOP = "Workshop1"  # 👈 CHANGE THIS VALUE

# ============================================

if WORKSHOP not in VALID_WORKSHOPS:
    raise ValueError(
        f"Unknown workshop {WORKSHOP!r}; expected one of {list(VALID_WORKSHOPS)}"
    )

print(f"🎯 Selected Workshop: {WORKSHOP}")
print(f"{'='*50}")
print(f"This analysis will run on {WORKSHOP} data only.")
print(f"{'='*50}\n")

## Setup and Data Loading

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from pathlib import Path
import pickle
import warnings
warnings.filterwarnings('ignore')

# Set up plotting style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
%matplotlib inline

print("âœ… Libraries imported successfully!")

In [None]:
# Load data for the selected workshop
data_file = f'../data/raw/{WORKSHOP}.csv'

# Fail early with an actionable message rather than a bare pandas traceback.
if not Path(data_file).is_file():
    raise FileNotFoundError(
        f"No raw data file found at {data_file!r}; check WORKSHOP and the data directory."
    )

print(f"Loading data from: {data_file}")
df = pd.read_csv(data_file)

# These columns are read by this notebook (sorting, statistics, clustering,
# export); verify them once here so a schema problem is reported up front.
required_columns = ['name', 'x', 'y', 'z', 'time']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise ValueError(
        f"{data_file} is missing required column(s): {missing_columns}"
    )

# Convert time to datetime
df['time'] = pd.to_datetime(df['time'])

# Sort by group and time
df = df.sort_values(['name', 'time']).reset_index(drop=True)

print(f"\n{'='*60}")
print(f"📊 {WORKSHOP} Dataset Summary")
print(f"{'='*60}")
print(f"Dataset Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print(f"Date Range: {df['time'].min()} to {df['time'].max()}")
print(f"Groups in this workshop: {sorted(df['name'].unique())}")
print(f"Number of groups: {df['name'].nunique()}")
print(f"Total Data Points: {len(df):,}")
print(f"{'='*60}\n")

# Last expression of the cell: rich preview of the first rows.
df.head(10)

In [None]:
# Summary statistics for the three coordinate columns.
coordinate_summary = df[['x', 'y', 'z']].describe()
print("Coordinate Statistics:")
print(coordinate_summary)

# Number of rows recorded for each group, shown as a one-column 'count' table.
rows_per_group = df.groupby('name').size().rename('count').to_frame()
print("\nData Points per Group:")
print(rows_per_group)

## 1.1 Visualize Raw Data

In [None]:
# Two side-by-side views of the raw positions: per-group scatter and density.
fig, (ax_groups, ax_density) = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))

# Left panel: one scatter series per group, in order of first appearance.
for group_name in df['name'].unique():
    in_group = df['name'] == group_name
    ax_groups.scatter(
        df.loc[in_group, 'x'],
        df.loc[in_group, 'y'],
        alpha=0.3,
        s=10,
        label=group_name,
    )
ax_groups.set(
    xlabel='X Coordinate (m)',
    ylabel='Y Coordinate (m)',
    title='All Data Points by Group',
)
ax_groups.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax_groups.grid(True, alpha=0.3)

# Right panel: hexagonal binning of all points; empty bins are not drawn.
density = ax_density.hexbin(df['x'], df['y'], gridsize=30, cmap='YlOrRd', mincnt=1)
ax_density.set(
    xlabel='X Coordinate (m)',
    ylabel='Y Coordinate (m)',
    title='Location Density Heatmap',
)
fig.colorbar(density, ax=ax_density, label='Point Density')

fig.tight_layout()
plt.show()

print("High-density areas likely represent stations where groups spent significant time.")

## 1.2 K-Means Clustering: Find Optimal Number of Stations

In [None]:
# Prepare clustering data (use X, Y coordinates)
coords = df[['x', 'y']].values

# Standardize coordinates for better clustering. The fitted scaler is kept so
# cluster centers can be mapped back to the original coordinate scale later.
scaler = StandardScaler()
coords_scaled = scaler.fit_transform(coords)

# Sample data for faster computation (if dataset is large).
# The draw is seeded: an unseeded sample would make the silhouette curve and
# the recommended k differ between runs even though KMeans itself is seeded.
SAMPLE_SEED = 42
sample_size = min(20000, len(df))
sample_rng = np.random.default_rng(SAMPLE_SEED)
sample_idx = sample_rng.choice(len(df), size=sample_size, replace=False)
coords_sample = coords_scaled[sample_idx]

print(f"Using {sample_size:,} sample points for optimal k analysis")

In [None]:
# Elbow method and silhouette analysis on the sampled, scaled coordinates.
# k is capped below the sample size: the silhouette score needs fewer
# clusters than samples, so a tiny dataset would otherwise crash mid-loop.
max_k = min(15, len(coords_sample) - 1)
if max_k < 2:
    raise ValueError(
        f"Need at least 3 sample points to compare cluster counts, got {len(coords_sample)}"
    )
k_range = range(2, max_k + 1)
inertias = []
silhouette_scores = []

# The per-k models use their own names so that re-running this cell never
# overwrites `kmeans`, the final fitted model that the save cell pickles.
for k in k_range:
    candidate_kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    candidate_labels = candidate_kmeans.fit_predict(coords_sample)
    inertias.append(candidate_kmeans.inertia_)
    silhouette_scores.append(silhouette_score(coords_sample, candidate_labels))
    print(f"k={k}: Inertia={candidate_kmeans.inertia_:.2f}, Silhouette={silhouette_scores[-1]:.3f}")

# Plot results
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Elbow curve
axes[0].plot(k_range, inertias, 'bo-', linewidth=2)
axes[0].set_xlabel('Number of Clusters (k)', fontsize=12)
axes[0].set_ylabel('Inertia (Within-cluster Sum of Squares)', fontsize=12)
axes[0].set_title('Elbow Method', fontsize=14, fontweight='bold')
axes[0].grid(True, alpha=0.3)

# Silhouette scores
axes[1].plot(k_range, silhouette_scores, 'ro-', linewidth=2)
axes[1].set_xlabel('Number of Clusters (k)', fontsize=12)
axes[1].set_ylabel('Silhouette Score', fontsize=12)
axes[1].set_title('Silhouette Analysis', fontsize=14, fontweight='bold')
axes[1].grid(True, alpha=0.3)
# Recommended k = the k with the highest silhouette score (first one on ties).
best_k_silhouette = k_range[np.argmax(silhouette_scores)]
axes[1].axvline(x=best_k_silhouette, color='green', linestyle='--', label=f'Best k={best_k_silhouette}')
axes[1].legend()

plt.tight_layout()
plt.show()

print(f"\n✅ Recommended number of stations (clusters): {best_k_silhouette}")
print(f"   Silhouette score: {max(silhouette_scores):.3f}")

## 1.3 Apply K-Means with Optimal k

In [None]:
# Number of stations to fit. Defaults to the silhouette recommendation; set it
# by hand when domain knowledge says otherwise, e.g. optimal_k = 6
optimal_k = best_k_silhouette

# Cluster every scaled observation and record its label on the frame.
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
station_labels = kmeans.fit_predict(coords_scaled)
df['station'] = station_labels

# Cluster centers live in scaled space; undo the scaling to get real coordinates.
centroids_scaled = kmeans.cluster_centers_
centroids_original = scaler.inverse_transform(centroids_scaled)

# One row per station: its id plus the centroid position in original units.
centroid_x = centroids_original[:, 0]
centroid_y = centroids_original[:, 1]
station_info = pd.DataFrame(
    {
        'station': range(optimal_k),
        'centroid_x': centroid_x,
        'centroid_y': centroid_y,
    }
)

print("Station Centroids:")
print(station_info)

points_per_station = df['station'].value_counts().sort_index()
print("\nPoints per station:")
print(points_per_station)

In [None]:
# Visualize stations
fig, ax = plt.subplots(figsize=(14, 10))

# Use a discrete colormap with exactly one entry per station. 'tab10' has only
# 10 colors, so with more than 10 stations neighbouring IDs would share a
# color; switch to 'tab20' in that case. (Beyond 20 stations colors repeat.)
palette_name = 'tab10' if optimal_k <= 10 else 'tab20'
station_cmap = plt.get_cmap(palette_name, optimal_k)

# Plot all points colored by station. vmin/vmax are centred on the integer IDs
# so station i maps to colormap entry i.
scatter = ax.scatter(df['x'], df['y'], c=df['station'], cmap=station_cmap,
                     vmin=-0.5, vmax=optimal_k - 0.5, alpha=0.4, s=20)

# Plot centroids
ax.scatter(station_info['centroid_x'], station_info['centroid_y'],
           c='red', marker='X', s=500, edgecolors='black', linewidths=2,
           label='Station Centroids', zorder=5)

# Add station labels, taken from the 'station' column rather than the row index
# so the text stays correct even if station_info is re-indexed.
for row in station_info.itertuples(index=False):
    ax.annotate(f'Station {row.station}',
                xy=(row.centroid_x, row.centroid_y),
                xytext=(10, 10), textcoords='offset points',
                fontsize=12, fontweight='bold',
                bbox=dict(boxstyle='round,pad=0.5', facecolor='yellow', alpha=0.7))

ax.set_xlabel('X Coordinate (m)', fontsize=12)
ax.set_ylabel('Y Coordinate (m)', fontsize=12)
ax.set_title(f'{WORKSHOP}: Detected Stations (K-Means, k={optimal_k})', fontsize=14, fontweight='bold')
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)
# One colorbar tick per station ID instead of a continuous scale.
plt.colorbar(scatter, ax=ax, label='Station ID', ticks=list(range(optimal_k)))
plt.tight_layout()
plt.show()

## 1.4 Alternative: DBSCAN Clustering (Density-Based)

In [None]:
# Apply DBSCAN on the standardized coordinates, so eps is expressed in scaled
# units rather than metres. Adjust eps and min_samples based on data.
dbscan = DBSCAN(eps=0.3, min_samples=50)
df['station_dbscan'] = dbscan.fit_predict(coords_scaled)

# DBSCAN labels noise as -1. Test the *values* explicitly: `-1 in series`
# checks the Series index, so it would never detect the noise label and the
# noise group would be counted as a station.
noise_mask = df['station_dbscan'] == -1
n_noise = int(noise_mask.sum())
n_clusters_dbscan = int(df.loc[~noise_mask, 'station_dbscan'].nunique())

print(f"DBSCAN Results:")
print(f"  Number of stations detected: {n_clusters_dbscan}")
print(f"  Noise points (not assigned to any station): {n_noise}")
print(f"\nPoints per station:")
print(df['station_dbscan'].value_counts().sort_index())

In [None]:
# Visualize DBSCAN results
fig, ax = plt.subplots(figsize=(14, 10))

# Sort the labels so the color assignment and legend order are stable
# (iterating a set gives an arbitrary order); noise (-1) comes first.
unique_labels = sorted(int(label) for label in df['station_dbscan'].unique())
colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))

for label, color in zip(unique_labels, colors):
    if label == -1:
        # Noise points: small, faint and black so they recede behind stations.
        color = 'black'
        marker_size = 10
        alpha = 0.3
        label_text = 'Noise'
    else:
        marker_size = 20
        alpha = 0.5
        label_text = f'Station {label}'

    mask = df['station_dbscan'] == label
    # Single .loc lookup instead of chained df[mask]['x'] indexing.
    ax.scatter(df.loc[mask, 'x'], df.loc[mask, 'y'],
               c=[color], s=marker_size, alpha=alpha, label=label_text)

ax.set_xlabel('X Coordinate (m)', fontsize=12)
ax.set_ylabel('Y Coordinate (m)', fontsize=12)
ax.set_title(f'{WORKSHOP}: DBSCAN Station Detection ({n_clusters_dbscan} stations)',
             fontsize=14, fontweight='bold')
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print("\n💡 Choose between K-Means and DBSCAN based on your domain knowledge.")
print("   Continuing with K-Means stations for subsequent phases.")

## Save Phase 1 Results

In [None]:
# Create output directory for this workshop
output_dir = Path(f'../data/phase1_results/{WORKSHOP}')
output_dir.mkdir(parents=True, exist_ok=True)

# Save data with station assignments
df[['name', 'x', 'y', 'z', 'time', 'station', 'station_dbscan']].to_csv(
    output_dir / 'data_with_stations.csv', index=False
)

# Save station centroids
station_info.to_csv(output_dir / 'station_centroids.csv', index=False)

# Save K-Means model and scaler
with open(output_dir / 'kmeans_model.pkl', 'wb') as f:
    pickle.dump(kmeans, f)

with open(output_dir / 'scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

# Report the silhouette score that belongs to the k actually used. optimal_k
# may have been overridden by hand, in which case max(silhouette_scores) would
# describe a different k. Scores come from the sampled analysis; if the chosen
# k was never evaluated there, record NaN rather than a misleading number.
if optimal_k in k_range:
    chosen_silhouette = silhouette_scores[k_range.index(optimal_k)]
else:
    chosen_silhouette = float('nan')

# Save metadata
metadata = {
    'workshop': WORKSHOP,
    'optimal_k': optimal_k,
    'silhouette_score': chosen_silhouette,
    'dbscan_clusters': n_clusters_dbscan,
    'total_data_points': len(df)
}
pd.DataFrame([metadata]).to_csv(output_dir / 'phase1_metadata.csv', index=False)

print(f"✅ Phase 1 results saved to {output_dir}/")
print(f"\nSaved files:")
print(f"  • data_with_stations.csv - Data with station assignments")
print(f"  • station_centroids.csv - Station centroid coordinates")
print(f"  • kmeans_model.pkl - Trained K-Means model")
print(f"  • scaler.pkl - StandardScaler for coordinates")
print(f"  • phase1_metadata.csv - Clustering metadata")
print(f"\n🎯 Proceed to phase2_temporal_analysis.ipynb")