# Phase 2: Temporal Analysis

This notebook analyzes how groups move between stations and calculates temporal metrics.

## Prerequisites:
- Complete Phase 1 (Station Detection) first
- Phase 1 outputs must exist in `../data/phase1_results/{WORKSHOP}/`

## Objectives:
- Identify distinct station visits for each group
- Calculate dwell time (how long groups stay at each station)
- Calculate travel time between stations
- Analyze transition patterns

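## Output:
Saved as CSV files in `../data/phase2_results/{WORKSHOP}/`:
- Station visits dataframe (`station_visits.csv`)
- Travel times dataframe (`travel_times.csv`)
- Transition matrix of average travel times (`transition_matrix.csv`)
- Per-station dwell-time statistics (`dwell_statistics.csv`)
- Overall summary statistics (`phase2_summary.csv`)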

## Workshop Selection

In [None]:
# ============================================
# WORKSHOP SELECTION
# ============================================
# Name of the workshop to analyze. It is interpolated into the Phase 1 input
# path and the Phase 2 output path, so it must match the workshop used in Phase 1.
# Valid options: "Workshop1", "Workshop2", "Workshop3"

WORKSHOP = "Workshop1"  # 👈 CHANGE THIS VALUE

# ============================================

# Echo the selection so the rendered notebook records which workshop was analyzed.
print(f"🎯 Selected Workshop: {WORKSHOP}")
print("=" * 50)
print(f"Loading Phase 1 results for {WORKSHOP}...")
print("=" * 50 + "\n")

## Setup and Load Phase 1 Results

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Set up plotting style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
%matplotlib inline

print("‚úÖ Libraries imported successfully!")

In [None]:
# Load Phase 1 results
phase1_dir = Path(f'../data/phase1_results/{WORKSHOP}')

# Check if Phase 1 results exist
if not phase1_dir.exists():
    raise FileNotFoundError(
        f"Phase 1 results not found for {WORKSHOP}.\n"
        f"Please run phase1_station_detection.ipynb first!"
    )

# Load data with station assignments
df = pd.read_csv(phase1_dir / 'data_with_stations.csv')

# Fail early with a clear message: the cells below group by 'name', order by
# 'time' and segment on 'station', and would otherwise stop with a bare KeyError.
required_columns = ('time', 'name', 'station')
missing_columns = [column for column in required_columns if column not in df.columns]
if missing_columns:
    raise ValueError(
        f"data_with_stations.csv for {WORKSHOP} is missing required column(s): {missing_columns}"
    )

df['time'] = pd.to_datetime(df['time'])

# Load station centroids
station_info = pd.read_csv(phase1_dir / 'station_centroids.csv')

# Load metadata (only the first row of the file is used)
metadata = pd.read_csv(phase1_dir / 'phase1_metadata.csv').iloc[0]

print(f"\n{'='*60}")
print(f"📊 Loaded {WORKSHOP} Phase 1 Results")
print(f"{'='*60}")
print(f"Data points: {len(df):,}")
print(f"Number of stations: {metadata['optimal_k']}")
print(f"Groups: {sorted(df['name'].unique())}")
print(f"{'='*60}\n")

df.head()

## 2.1 Identify Station Visits

In [None]:
# Identify consecutive station visits for each group
def identify_visits(group_df):
    """
    Split one group's track into distinct station visits.

    A visit is a maximal run of consecutive rows that share the same value in
    the ``station`` column. Rows are processed in the order given, so the
    caller should pass them already sorted by ``time``.

    Args:
        group_df: DataFrame for a single group with ``station`` and ``time``
            columns. Subtracting two ``time`` values must yield an object
            with ``total_seconds()`` (e.g. pandas Timestamps).

    Returns:
        A list of dicts, one per visit in row order, with the keys
        ``visit_id`` (0-based), ``station``, ``start_time``, ``end_time``,
        ``duration_seconds`` and ``num_points``. ``duration_seconds`` spans
        the first to the last row of the run, so a single-row visit has a
        duration of 0. An empty input returns an empty list.
    """
    if len(group_df) == 0:
        return []

    station_values = group_df['station'].to_numpy()
    times = group_df['time']

    # A visit starts at row 0 and at every row whose station differs from the
    # row before it. Comparing the array with itself shifted by one finds all
    # boundaries at once, instead of building a row Series per iteration.
    station_changed = np.asarray(station_values[1:] != station_values[:-1], dtype=bool)
    start_positions = np.flatnonzero(np.concatenate(([True], station_changed)))

    # Each visit ends on the row just before the next visit starts; the last
    # visit ends on the final row.
    end_positions = np.append(start_positions[1:], len(station_values)) - 1

    visits = []
    for visit_id, (start, end) in enumerate(zip(start_positions, end_positions)):
        start_time = times.iloc[int(start)]
        end_time = times.iloc[int(end)]
        visits.append({
            'visit_id': visit_id,
            'station': station_values[start],
            'start_time': start_time,
            'end_time': end_time,
            'duration_seconds': (end_time - start_time).total_seconds(),
            'num_points': int(end - start + 1)
        })

    return visits

# Run visit detection group by group and collect every visit in one table
all_visits = []

for group_name, group_df in df.groupby('name'):
    # Visit detection relies on row order, so order each group's rows by time first
    chronological = group_df.sort_values('time')
    all_visits.extend(
        {**visit, 'group': group_name}
        for visit in identify_visits(chronological)
    )

# One row per visit; add the duration in minutes next to the seconds value
visits_df = pd.DataFrame(all_visits).assign(
    duration_minutes=lambda frame: frame['duration_seconds'] / 60
)

print(f"Total station visits identified: {len(visits_df)}")
print(f"\nVisits per group:")
print(visits_df.groupby('group').size())

visits_df.head(10)

## 2.2 Calculate Dwell Time per Station

In [None]:
# Per-station dwell-time summary (minutes, rounded to two decimals)
dwell_stats = (
    visits_df
    .groupby('station')['duration_minutes']
    .agg(['mean', 'median', 'std', 'min', 'max', 'count'])
    .round(2)
    .rename(columns={
        'mean': 'Mean (min)',
        'median': 'Median (min)',
        'std': 'Std Dev',
        'min': 'Min (min)',
        'max': 'Max (min)',
        'count': 'Visit Count',
    })
)

print("Dwell Time Statistics by Station:")
print(dwell_stats)

# Two stacked panels: distribution on top, per-station average underneath
fig, (box_ax, bar_ax) = plt.subplots(nrows=2, ncols=1, figsize=(14, 10))

# Top panel: box plot of visit durations, one box per station
visits_df.boxplot(column='duration_minutes', by='station', ax=box_ax)
box_ax.set_title('Dwell Time Distribution by Station', fontsize=14, fontweight='bold')
box_ax.set_xlabel('Station ID', fontsize=12)
box_ax.set_ylabel('Dwell Time (minutes)', fontsize=12)
fig.suptitle('')  # Drop the figure-level title that boxplot(by=...) adds

# Bottom panel: mean dwell time per station, shortest first
mean_dwell = visits_df.groupby('station')['duration_minutes'].mean().sort_values()
mean_dwell.plot.bar(ax=bar_ax, color='steelblue')
bar_ax.set_title('Average Dwell Time per Station', fontsize=14, fontweight='bold')
bar_ax.set_xlabel('Station ID', fontsize=12)
bar_ax.set_ylabel('Average Dwell Time (minutes)', fontsize=12)
bar_ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

## 2.3 Calculate Travel Time Between Stations

In [None]:
# Build one transition record for every pair of consecutive visits in a group
travel_times = []

for group, group_visits in visits_df.groupby('group'):
    ordered_rows = [
        row for _label, row in group_visits.sort_values('start_time').iterrows()
    ]

    # Pair each visit with the visit that follows it
    for origin, destination in zip(ordered_rows, ordered_rows[1:]):
        # Travel time is the gap between leaving one station and reaching the next
        gap_seconds = (destination['start_time'] - origin['end_time']).total_seconds()

        travel_times.append(dict(
            group=group,
            from_station=origin['station'],
            to_station=destination['station'],
            travel_time_seconds=gap_seconds,
            travel_time_minutes=gap_seconds / 60,
            departure_time=origin['end_time'],
            arrival_time=destination['start_time'],
        ))

travel_df = pd.DataFrame(travel_times)

print(f"Total station transitions: {len(travel_df)}")
print(f"\nTravel Time Summary:")
print(travel_df['travel_time_minutes'].describe())

travel_df.head(10)

In [None]:
# Create transition matrix (average travel time between stations)
# Rows are the departure station, columns the arrival station. Station pairs
# with no recorded transition are stored as 0 in this matrix.
travel_by_pair = travel_df.groupby(['from_station', 'to_station'])['travel_time_minutes']
transition_matrix = travel_by_pair.mean().unstack(fill_value=0)

# True where a station pair has no recorded transition. Used to blank those
# cells in the heatmap so a filled-in 0 is not read as "0.0 minutes".
unobserved_pairs = travel_by_pair.size().unstack(fill_value=0).eq(0)

print("Average Travel Time Matrix (minutes):")
print(transition_matrix.round(2))

# Visualize transition matrix
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    transition_matrix,
    mask=unobserved_pairs,  # leave never-observed pairs blank
    vmin=0,                 # keep the colour scale anchored at zero
    annot=True,
    fmt='.1f',
    cmap='YlOrRd',
    cbar_kws={'label': 'Travel Time (minutes)'},
    ax=ax,
)
ax.set_title(f'{WORKSHOP}: Average Travel Time Between Stations', fontsize=14, fontweight='bold')
ax.set_xlabel('To Station', fontsize=12)
ax.set_ylabel('From Station', fontsize=12)
plt.tight_layout()
plt.show()

In [None]:
# Analyze most common transitions
# Number of recorded transitions per (from_station, to_station) pair, most frequent first
transition_counts = travel_df.groupby(['from_station', 'to_station']).size().sort_values(ascending=False)

print("Most Common Station Transitions:")
print(transition_counts.head(10))

# Visualize
fig, ax = plt.subplots(figsize=(12, 6))
transition_counts.head(15).plot(kind='barh', ax=ax, color='coral')
# barh draws the first row at the bottom; flip the axis so the most frequent
# transition is at the top of the chart.
ax.invert_yaxis()
ax.set_title(f'{WORKSHOP}: Top 15 Most Frequent Station Transitions', fontsize=14, fontweight='bold')
ax.set_xlabel('Number of Transitions', fontsize=12)
ax.set_ylabel('Transition (From → To)', fontsize=12)
ax.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.show()

## Summary Statistics

In [None]:
# Plain-text recap of the visit and transition tables built above
print("=" * 70)
print(f"PHASE 2 TEMPORAL ANALYSIS SUMMARY - {WORKSHOP}")
print("=" * 70)

print("\n📊 Station Visits")
print(f"  • Total visits: {len(visits_df)}")
print(f"  • Average visit duration: {visits_df['duration_minutes'].mean():.1f} minutes")
print(f"  • Median visit duration: {visits_df['duration_minutes'].median():.1f} minutes")
# mode() can return several values on a tie; the first one is reported
print(f"  • Most visited station: {visits_df['station'].mode().values[0]}")

print("\n🚶 Station Transitions")
print(f"  • Total transitions: {len(travel_df)}")
print(f"  • Average travel time: {travel_df['travel_time_minutes'].mean():.1f} minutes")
print(f"  • Median travel time: {travel_df['travel_time_minutes'].median():.1f} minutes")

print("\n📍 Per-Station Summary")
# groupby yields the stations in sorted order, one pass over the table
for station, station_visits in visits_df.groupby('station'):
    print(f"  Station {station}:")
    print(f"    - Visits: {len(station_visits)}")
    print(f"    - Avg dwell time: {station_visits['duration_minutes'].mean():.1f} min")

print("\n" + "=" * 70)

## Save Phase 2 Results

In [None]:
# Create output directory (parents included; an existing directory is reused)
output_dir = Path(f'../data/phase2_results/{WORKSHOP}')
output_dir.mkdir(parents=True, exist_ok=True)

# Save visits data
visits_df.to_csv(output_dir / 'station_visits.csv', index=False)

# Save travel times
travel_df.to_csv(output_dir / 'travel_times.csv', index=False)

# Save transition matrix (index kept: it holds the from_station labels)
transition_matrix.to_csv(output_dir / 'transition_matrix.csv')

# Save dwell statistics (index kept: it holds the station labels)
dwell_stats.to_csv(output_dir / 'dwell_statistics.csv')

# Save summary statistics as a one-row table
summary_stats = {
    'workshop': WORKSHOP,
    'total_visits': len(visits_df),
    'avg_dwell_time_min': visits_df['duration_minutes'].mean(),
    'median_dwell_time_min': visits_df['duration_minutes'].median(),
    'total_transitions': len(travel_df),
    'avg_travel_time_min': travel_df['travel_time_minutes'].mean(),
    'median_travel_time_min': travel_df['travel_time_minutes'].median()
}
pd.DataFrame([summary_stats]).to_csv(output_dir / 'phase2_summary.csv', index=False)

print(f"✅ Phase 2 results saved to {output_dir}/")
print("\nSaved files:")
print("  • station_visits.csv - All station visits with durations")
print("  • travel_times.csv - Travel times between stations")
print("  • transition_matrix.csv - Avg travel time matrix")
print("  • dwell_statistics.csv - Per-station dwell statistics")
print("  • phase2_summary.csv - Overall summary statistics")
print("\n🎯 Proceed to phase3_predictive_modeling.ipynb")