# Uber Movement Data: Traffic Speed Prediction - Data Exploration

This notebook explores the Uber Movement traffic data and prepares it for machine learning models.

## Overview
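- Data loading and initial exploration
- Data quality assessment
- Traffic patterns analysis (temporal and spatial)
- Feature engineering and correlation analysis
- Data visualization, including the time series view
- Preprocessing recommendations and next steps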

In [None]:
import warnings
from datetime import datetime, timedelta
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
# NOTE(review): this silences every warning for the whole notebook, including
# deprecation warnings from pandas/matplotlib — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Set style
# The bundled seaborn style sheets were renamed to 'seaborn-v0_8*' in newer
# Matplotlib releases; fall back to the legacy 'seaborn' name when the new one
# is not available so the notebook still starts on older installations.
_preferred_style = 'seaborn-v0_8'
plt.style.use(_preferred_style if _preferred_style in plt.style.available else 'seaborn')
sns.set_palette("husl")

print("Libraries loaded successfully!")

## 1. Data Loading

In [None]:
# Read the raw traffic observations and the road-segment table from CSV.
# NOTE(review): later cells read 'hour', 'day_of_week', 'month', 'speed_mph' and
# 'segment_id' columns that are never derived in this notebook — presumably they
# are already present in the traffic CSV; confirm against the data file.
traffic_data = pd.read_csv('../data/raw/san_francisco_traffic_data.csv')
segments_data = pd.read_csv('../data/raw/san_francisco_segments.csv')

# Parse the timestamp strings into datetimes so they can be compared and subtracted
parsed_timestamps = pd.to_datetime(traffic_data['timestamp'])
traffic_data['timestamp'] = parsed_timestamps

traffic_shape = traffic_data.shape
segments_shape = segments_data.shape
print(f"Traffic data shape: {traffic_shape}")
print(f"Segments data shape: {segments_shape}")

print("\nData loaded successfully!")

In [None]:
# Basic data info
print("=== Traffic Data Info ===")
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called on its own; wrapping it in print() would append a stray "None" line.
traffic_data.info()
print("\n=== First 5 rows ===")
# Last expression of the cell: rendered as a rich table by the notebook
traffic_data.head()

In [None]:
# Descriptive statistics as produced by DataFrame.describe()
print("=== Traffic Data Statistics ===")
traffic_summary = traffic_data.describe()
# Last expression of the cell: rendered as a rich table by the notebook
traffic_summary

## 2. Data Quality Assessment

In [None]:
# Count of missing entries in each column
print("=== Missing Values ===")
missing_data = traffic_data.isna().sum()
print(missing_data)

# Number of rows that exactly repeat an earlier row
duplicates = traffic_data.duplicated().sum()
print(f"\nDuplicate rows: {duplicates}")

# First and last timestamp, and the elapsed time between them
first_timestamp = traffic_data['timestamp'].min()
last_timestamp = traffic_data['timestamp'].max()
print(f"\nDate range: {first_timestamp} to {last_timestamp}")
print(f"Total time span: {last_timestamp - first_timestamp}")

In [None]:
# Speed distribution
# Plot from the non-missing speeds only: box-plot statistics turn into NaN (and
# the box is not drawn) when the input contains NaN. The pandas summary
# statistics below skip NaN anyway, so the printed numbers are unaffected.
speed = traffic_data['speed_mph'].dropna()
mean_speed = speed.mean()

fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Histogram with the mean marked
axes[0].hist(speed, bins=50, alpha=0.7, color='skyblue')
axes[0].set_xlabel('Speed (mph)')
axes[0].set_ylabel('Frequency')
axes[0].set_title('Speed Distribution')
axes[0].axvline(mean_speed, color='red', linestyle='--', label=f"Mean: {mean_speed:.1f}")
axes[0].legend()

# Box plot
axes[1].boxplot(speed)
axes[1].set_ylabel('Speed (mph)')
axes[1].set_title('Speed Box Plot')

plt.tight_layout()
plt.show()

print("Speed statistics:")
print(f"Mean: {mean_speed:.2f} mph")
print(f"Median: {speed.median():.2f} mph")
print(f"Std: {speed.std():.2f} mph")
print(f"Min: {speed.min():.2f} mph")
print(f"Max: {speed.max():.2f} mph")

## 3. Temporal Patterns Analysis

In [None]:
# Hourly patterns: mean, standard deviation and row count of speed per hour value
hourly_stats = (
    traffic_data.groupby('hour')['speed_mph']
    .agg(['mean', 'std', 'count'])
    .reset_index()
)

fig, axes = plt.subplots(2, 1, figsize=(15, 10))
speed_ax, coverage_ax = axes
hours = hourly_stats['hour']

# Top panel: mean speed per hour with a +/- one standard deviation band
band_low = hourly_stats['mean'] - hourly_stats['std']
band_high = hourly_stats['mean'] + hourly_stats['std']
speed_ax.plot(hours, hourly_stats['mean'], marker='o', linewidth=2, markersize=6)
speed_ax.fill_between(hours, band_low, band_high, alpha=0.3)
speed_ax.set(xlabel='Hour of Day',
             ylabel='Average Speed (mph)',
             title='Average Speed by Hour of Day')
speed_ax.grid(True, alpha=0.3)
speed_ax.set_xticks(range(0, 24, 2))

# Bottom panel: how many observations back each hourly mean
coverage_ax.bar(hours, hourly_stats['count'], alpha=0.7, color='orange')
coverage_ax.set(xlabel='Hour of Day',
                ylabel='Number of Observations',
                title='Data Coverage by Hour')
coverage_ax.set_xticks(range(0, 24, 2))

plt.tight_layout()
plt.show()


def _mean_of_hourly_means(first_hour, last_hour):
    """Average the per-hour mean speeds for hours in [first_hour, last_hour] (inclusive)."""
    in_window = (hourly_stats['hour'] >= first_hour) & (hourly_stats['hour'] <= last_hour)
    return hourly_stats.loc[in_window, 'mean'].mean()


# Identify rush hours: each figure is an unweighted mean of the hourly means
print("\n=== Rush Hour Analysis ===")
morning_rush = _mean_of_hourly_means(7, 9)
evening_rush = _mean_of_hourly_means(17, 19)
off_peak = _mean_of_hourly_means(10, 16)

print(f"Morning rush (7-9 AM): {morning_rush:.1f} mph")
print(f"Evening rush (5-7 PM): {evening_rush:.1f} mph")
print(f"Off-peak (10 AM-4 PM): {off_peak:.1f} mph")
# The reported impact compares the morning rush only against off-peak
print(f"Rush hour impact: {((off_peak - morning_rush) / off_peak * 100):.1f}% speed reduction")

In [None]:
# Day of week patterns; day_of_week values index into dow_names (0 -> 'Monday')
dow_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
dow_stats = (
    traffic_data.groupby('day_of_week')['speed_mph']
    .agg(['mean', 'std', 'count'])
    .reset_index()
)
dow_stats['day_name'] = dow_stats['day_of_week'].map(lambda day: dow_names[day])

# Days 0-4 are drawn in red, days 5-6 in green
bar_colors = ['red' if day < 5 else 'green' for day in dow_stats['day_of_week']]

plt.figure(figsize=(12, 6))
bars = plt.bar(
    dow_stats['day_name'],
    dow_stats['mean'],
    yerr=dow_stats['std'],
    capsize=5,
    alpha=0.7,
    color=bar_colors,
)
plt.xlabel('Day of Week')
plt.ylabel('Average Speed (mph)')
plt.title('Average Speed by Day of Week')
plt.xticks(rotation=45)
plt.grid(True, alpha=0.3)

# Write each bar's mean just above its top edge
for bar, value in zip(bars, dow_stats['mean']):
    label_x = bar.get_x() + bar.get_width()/2
    label_y = bar.get_height() + 0.5
    plt.text(label_x, label_y, f'{value:.1f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

# Unweighted averages of the per-day means for days 0-4 versus days 5-6
print("\n=== Weekday vs Weekend ===")
is_weekday_row = dow_stats['day_of_week'] < 5
is_weekend_row = dow_stats['day_of_week'] >= 5
weekday_avg = dow_stats.loc[is_weekday_row, 'mean'].mean()
weekend_avg = dow_stats.loc[is_weekend_row, 'mean'].mean()
print(f"Weekday average: {weekday_avg:.1f} mph")
print(f"Weekend average: {weekend_avg:.1f} mph")
print(f"Weekend speed increase: {((weekend_avg - weekday_avg) / weekday_avg * 100):.1f}%")

In [None]:
# Heatmap: Hour vs Day of Week
# Mean speed for every (hour, day_of_week) pair, reshaped to hours x days
pivot_data = traffic_data.groupby(['hour', 'day_of_week'])['speed_mph'].mean().reset_index()
heatmap_data = pivot_data.pivot(index='hour', columns='day_of_week', values='speed_mph')

# Derive the tick labels from the pivot itself instead of assuming all 7 days
# and all 24 hours are present; with a hard-coded label list, a day or hour
# missing from the data would shift or mismatch the axis labels.
day_labels = [dow_names[int(day)] for day in heatmap_data.columns]
hour_labels = [int(hour) for hour in heatmap_data.index]

plt.figure(figsize=(10, 8))
sns.heatmap(heatmap_data,
            annot=True,
            fmt='.1f',
            cmap='RdYlGn',
            cbar_kws={'label': 'Average Speed (mph)'},
            xticklabels=day_labels,
            yticklabels=hour_labels)
plt.title('Average Speed Heatmap: Hour vs Day of Week')
plt.xlabel('Day of Week')
plt.ylabel('Hour of Day')
plt.tight_layout()
plt.show()

## 4. Spatial Analysis

In [None]:
# Per-segment speed summary: mean, spread, extremes and number of observations
segment_stats = (
    traffic_data.groupby('segment_id')['speed_mph']
    .agg(avg_speed='mean',
         std_speed='std',
         min_speed='min',
         max_speed='max',
         observations='count')
    .reset_index()
)

# Attach the segment table; merge() defaults to an inner join, so only
# segment_ids present in both frames are kept
segment_analysis = segment_stats.merge(segments_data, on='segment_id')

print("=== Segment Performance Statistics ===")
print(segment_analysis.describe())

# 2x2 overview of segment performance
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(avg_ax, variability_ax), (range_ax, coverage_ax) = axes

# Distribution of the per-segment average speed
avg_ax.hist(segment_analysis['avg_speed'], bins=20, alpha=0.7, color='skyblue')
avg_ax.set(xlabel='Average Speed (mph)',
           ylabel='Number of Segments',
           title='Distribution of Average Speeds Across Segments')

# Average speed against its standard deviation, one point per segment
variability_ax.scatter(segment_analysis['avg_speed'], segment_analysis['std_speed'], alpha=0.6)
variability_ax.set(xlabel='Average Speed (mph)',
                   ylabel='Speed Standard Deviation (mph)',
                   title='Speed vs Variability by Segment')

# Minimum against maximum speed, with a dashed y = x reference from 0 to 60
range_ax.scatter(segment_analysis['min_speed'], segment_analysis['max_speed'], alpha=0.6, color='green')
range_ax.set(xlabel='Minimum Speed (mph)',
             ylabel='Maximum Speed (mph)',
             title='Speed Range by Segment')
range_ax.plot([0, 60], [0, 60], 'r--', alpha=0.5)

# How many observations each segment contributes
coverage_ax.hist(segment_analysis['observations'], bins=20, alpha=0.7, color='orange')
coverage_ax.set(xlabel='Number of Observations',
                ylabel='Number of Segments',
                title='Data Coverage per Segment')

plt.tight_layout()
plt.show()

# Rankings: five fastest, five slowest and five most variable segments
ranking_columns = ['segment_id', 'avg_speed', 'std_speed']

print("\n=== Top 5 Fastest Segments ===")
fastest_segments = segment_analysis.nlargest(5, 'avg_speed')
print(fastest_segments[ranking_columns])

print("\n=== Top 5 Slowest Segments ===")
slowest_segments = segment_analysis.nsmallest(5, 'avg_speed')
print(slowest_segments[ranking_columns])

print("\n=== Most Variable Segments ===")
most_variable_segments = segment_analysis.nlargest(5, 'std_speed')
print(most_variable_segments[ranking_columns])

## 5. Feature Engineering

In [None]:
# Work on a copy so the loaded traffic_data frame stays untouched
traffic_features = traffic_data.copy()

# Binary calendar flags
# is_weekend: 1 when day_of_week is 5 or 6, else 0
traffic_features['is_weekend'] = traffic_features['day_of_week'].ge(5).astype(int)

# is_rush_hour: 1 when the hour falls in 7-9 or 17-19 (both ends inclusive), else 0
hour_of_day = traffic_features['hour']
in_morning_peak = hour_of_day.ge(7) & hour_of_day.le(9)
in_evening_peak = hour_of_day.ge(17) & hour_of_day.le(19)
traffic_features['is_rush_hour'] = (in_morning_peak | in_evening_peak).astype(int)

# Time of day categories
def time_of_day(hour):
    """Map an hour value to a part-of-day label.

    Returns 'morning' for 6 <= hour < 12, 'afternoon' for 12 <= hour < 18,
    'evening' for 18 <= hour < 24, and 'night' for anything else.
    """
    day_parts = (
        (6, 12, 'morning'),
        (12, 18, 'afternoon'),
        (18, 24, 'evening'),
    )
    for start, end, label in day_parts:
        if start <= hour < end:
            return label
    # Everything outside the three windows above falls through to 'night'
    return 'night'

# Label every row with its part of the day, one hour value at a time
traffic_features['time_of_day'] = traffic_features['hour'].map(time_of_day)

# Speed categories
def speed_category(speed):
    """Bucket a speed in mph into a coarse traffic-state label.

    Returns 'congested' for speed < 15, 'slow' for 15 <= speed < 25,
    'moderate' for 25 <= speed < 35 and 'fast' for speed >= 35.

    A missing speed (None / NaN) returns NaN. Without this guard every `<`
    comparison against NaN is False, so a missing reading would fall through
    to the final branch and be reported as 'fast'.
    """
    if pd.isna(speed):
        return np.nan
    if speed < 15:
        return 'congested'
    elif speed < 25:
        return 'slow'
    elif speed < 35:
        return 'moderate'
    else:
        return 'fast'

# Label every row with its speed bucket, one speed value at a time
traffic_features['speed_category'] = traffic_features['speed_mph'].map(speed_category)

# Report the size of each engineered group
weekend_rows = traffic_features['is_weekend'].sum()
rush_hour_rows = traffic_features['is_rush_hour'].sum()
time_of_day_counts = traffic_features['time_of_day'].value_counts()
speed_category_counts = traffic_features['speed_category'].value_counts()

print("=== New Features Created ===")
print(f"Weekend observations: {weekend_rows}")
print(f"Rush hour observations: {rush_hour_rows}")
print("\nTime of day distribution:")
print(time_of_day_counts)
print("\nSpeed category distribution:")
print(speed_category_counts)

In [None]:
# Speed distributions across the engineered features (a visual comparison; no
# feature-importance scores are computed here).
# Two deliberate choices in this cell:
#   * Tick labels are set with set_xticklabels() after drawing. The boxplot
#     `labels=` keyword was renamed to `tick_labels` in Matplotlib 3.9 and the
#     old name deprecated, so this form works on both sides of the rename.
#   * Missing speeds are dropped first, because a NaN in a group makes its
#     box-plot statistics NaN and the box is not drawn.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

valid_rows = traffic_features.dropna(subset=['speed_mph'])

# Speed by weekend
weekend_speeds = [valid_rows.loc[valid_rows['is_weekend'] == flag, 'speed_mph'] for flag in (0, 1)]
axes[0,0].boxplot(weekend_speeds)
axes[0,0].set_xticklabels(['Weekday', 'Weekend'])
axes[0,0].set_ylabel('Speed (mph)')
axes[0,0].set_title('Speed Distribution: Weekday vs Weekend')

# Speed by rush hour
rush_speeds = [valid_rows.loc[valid_rows['is_rush_hour'] == flag, 'speed_mph'] for flag in (0, 1)]
axes[0,1].boxplot(rush_speeds)
axes[0,1].set_xticklabels(['Non-Rush', 'Rush Hour'])
axes[0,1].set_ylabel('Speed (mph)')
axes[0,1].set_title('Speed Distribution: Rush Hour vs Non-Rush')

# Speed by time of day: one list of speeds per label, in the groupby's sorted label order
time_groups = valid_rows.groupby('time_of_day')['speed_mph'].apply(list)
axes[1,0].boxplot(list(time_groups.values))
axes[1,0].set_xticklabels(list(time_groups.index))
axes[1,0].set_ylabel('Speed (mph)')
axes[1,0].set_title('Speed by Time of Day')
axes[1,0].tick_params(axis='x', rotation=45)

# Speed category counts
speed_cat_counts = traffic_features['speed_category'].value_counts()
axes[1,1].pie(speed_cat_counts.values, labels=speed_cat_counts.index, autopct='%1.1f%%')
axes[1,1].set_title('Distribution of Speed Categories')

plt.tight_layout()
plt.show()

## 6. Correlation Analysis

In [None]:
# Pairwise correlations (DataFrame.corr defaults) between speed and the numeric features
numerical_features = ['speed_mph', 'hour', 'day_of_week', 'month', 'is_weekend', 'is_rush_hour']
correlation_matrix = traffic_features[numerical_features].corr()

heatmap_options = dict(annot=True, cmap='coolwarm', center=0, square=True, fmt='.3f')

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

# Rank features by the absolute size of their correlation with speed,
# leaving out the trivial self-correlation entry
print("=== Key Correlations with Speed ===")
speed_correlations = correlation_matrix['speed_mph'].sort_values(key=abs, ascending=False)
for feature, corr in speed_correlations.drop('speed_mph').items():
    print(f"{feature}: {corr:.3f}")

## 7. Time Series Analysis

In [None]:
# Mean speed over all rows sharing the same timestamp, in chronological order.
# NOTE(review): nothing here resamples to an hourly grid — the rows are hourly
# only if the source timestamps are; confirm against the data.
hourly_avg = (
    traffic_features.groupby('timestamp')['speed_mph']
    .mean()
    .reset_index()
    .sort_values('timestamp')
)

# Centered moving average over 24 consecutive rows.
# NOTE(review): this equals 24 hours only for gap-free hourly timestamps — confirm.
smoothed_speed = hourly_avg['speed_mph'].rolling(window=24, center=True).mean()
hourly_avg['rolling_24h'] = smoothed_speed

# Raw series with the smoothed series drawn on top
plt.figure(figsize=(15, 6))
plt.plot(hourly_avg['timestamp'], hourly_avg['speed_mph'], linewidth=1, alpha=0.8)
plt.xlabel('Time')
plt.ylabel('Average Speed (mph)')
plt.title('Traffic Speed Time Series')
plt.grid(True, alpha=0.3)
plt.xticks(rotation=45)
plt.plot(hourly_avg['timestamp'], hourly_avg['rolling_24h'],
         color='red', linewidth=2, label='24-hour Rolling Average')
plt.legend()
plt.tight_layout()
plt.show()

series_length = len(hourly_avg)
print(f"Time series length: {series_length} hours")
print(f"Average speed over time: {hourly_avg['speed_mph'].mean():.2f} mph")
print(f"Speed volatility (std): {hourly_avg['speed_mph'].std():.2f} mph")

## 8. Data Quality and Preprocessing Recommendations

In [None]:
# Outlier detection with the 1.5 * IQR rule on speed_mph
speed = traffic_features['speed_mph']
Q1, Q3 = speed.quantile(0.25), speed.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Rows strictly below the lower fence or strictly above the upper fence
is_outlier = speed.lt(lower_bound) | speed.gt(upper_bound)
outliers = traffic_features[is_outlier]

total_records = len(traffic_features)
outlier_count = len(outliers)

print("=== Data Quality Assessment ===")
print(f"Total records: {total_records}")
print(f"Outliers detected: {outlier_count} ({outlier_count/total_records*100:.1f}%)")
print(f"Speed range: {speed.min():.1f} - {speed.max():.1f} mph")
print(f"Outlier bounds: {lower_bound:.1f} - {upper_bound:.1f} mph")

# Number of rows recorded for each segment
segment_completeness = traffic_features.groupby('segment_id').size()
min_observations = segment_completeness.min()
max_observations = segment_completeness.max()

print(f"\n=== Data Completeness ===")
print(f"Observations per segment: {min_observations} - {max_observations}")
print(f"Average observations per segment: {segment_completeness.mean():.1f}")

# Segments with fewer than 10 rows
is_sparse_segment = segment_completeness < 10
insufficient_data = segment_completeness[is_sparse_segment]
print(f"Segments with < 10 observations: {len(insufficient_data)}")

print("\n=== Preprocessing Recommendations ===")
preprocessing_recommendations = (
    "1. Remove or cap outliers beyond realistic speed limits",
    "2. Consider interpolation for segments with sparse data",
    "3. Create lag features for temporal modeling",
    "4. Normalize features for neural network models",
    "5. Create train/validation/test splits preserving temporal order",
)
for recommendation in preprocessing_recommendations:
    print(recommendation)

## 9. Summary and Next Steps

In [None]:
# Save processed features for modeling
output_path = '../data/processed/traffic_features_exploratory.csv'
# to_csv() does not create missing folders, so make sure the target directory
# exists first; otherwise a fresh checkout fails at the very last cell.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
traffic_features.to_csv(output_path, index=False)

# Headline figures; these reuse values computed in the temporal and spatial
# analysis cells above (off_peak, morning_rush, weekday_avg, weekend_avg,
# segment_analysis), so those cells must have been run first.
rush_reduction_pct = (off_peak - morning_rush) / off_peak * 100
weekend_increase_pct = (weekend_avg - weekday_avg) / weekday_avg * 100
min_segment_std = segment_analysis['std_speed'].min()
max_segment_std = segment_analysis['std_speed'].max()

print("=== Data Exploration Summary ===")
print(f"• Dataset contains {len(traffic_features)} traffic observations")
print(f"• {len(segments_data)} road segments in San Francisco")
print(f"• Clear rush hour patterns with {rush_reduction_pct:.1f}% speed reduction")
print(f"• Weekend speeds are {weekend_increase_pct:.1f}% higher than weekdays")
print(f"• Speed variability ranges from {min_segment_std:.1f} to {max_segment_std:.1f} mph across segments")

print("\n=== Key Features for Modeling ===")
print("• Temporal: hour, day_of_week, is_weekend, is_rush_hour")
print("• Spatial: segment_id, coordinates")
print("• Historical: lag features, rolling averages")
print("• Target: speed_mph")

print("\n=== Next Steps ===")
print("1. Implement LSTM model for temporal sequence prediction")
print("2. Build GNN model for spatial relationships")
print("3. Create comprehensive evaluation framework")
print("4. Develop real-time prediction API")
print("5. Build interactive visualizations")

print(f"\nExploration complete! Processed data saved to '{output_path}'")