# Statistical Validation
Builds the hourly distribution of casual rides (each hour's percentage share of a segment's total rides) for Anchor vs. Noise stations, to power the Behavioral Peak line chart in Power BI.

In [1]:
import pandas as pd
from pathlib import Path

In [2]:
# Processed-data directory, relative to the notebook's working directory.
DATA_DIR = Path("../data/processed")

# Inputs read by this notebook and the single output it writes.
master_path = DATA_DIR / "fact_trips.csv"
segment_path = DATA_DIR / "station_behavior_segments.csv"
output_path = DATA_DIR / "hourly_validation_metrics.csv"

# Fail fast before any heavy loading, and name exactly which inputs are absent
# so the reader knows which upstream step has to be re-run.
missing_inputs = [str(path) for path in (master_path, segment_path) if not path.exists()]
if missing_inputs:
    raise FileNotFoundError(
        "\u274c Required datasets missing. Run pipeline and segmentation first. "
        f"Missing: {', '.join(missing_inputs)}"
    )

In [3]:
print("Building Hourly Statistical Validation dataset...")

# 1. Load trips, reading only the columns this analysis needs, then keep casual riders.
df = pd.read_csv(master_path, usecols=['started_at', 'member_casual', 'start_station_name'])
# .copy() so that adding the 'hour' column below writes to an independent frame,
# not to a slice of the full trips table.
df = df[df['member_casual'] == 'casual'].copy()

segments = pd.read_csv(segment_path, usecols=['start_station_name', 'final_status'])

# 2. Extract the start hour and attach each trip's station segment.
df['hour'] = pd.to_datetime(df['started_at']).dt.hour
# The inner join drops trips whose start station has no row in `segments`.
# validate="many_to_one" makes pandas raise a MergeError if a station name occurs
# more than once in `segments`; without the check, such duplicates would silently
# multiply trip rows and inflate the hourly ride counts computed downstream.
merged = df.merge(segments, on="start_station_name", how="inner", validate="many_to_one")

# 3. Keep only the two segments being compared (Anchors vs. Noise).
filtered = merged[merged["final_status"].isin(["Confirmed Behavioral Anchor", "Inconsistent / Noise"])]

Building Hourly Statistical Validation dataset...


In [4]:
# 4. Aggregate Hourly Distribution
ride_counts = filtered.groupby(["final_status", "hour"]).size()

# An hour in which a segment recorded no rides has no row in `ride_counts`.
# Reindex onto the full 24-hour grid for every segment that is present so those
# hours are written as explicit zeros; otherwise a line chart would connect the
# neighbouring hours straight across the gap.
observed_statuses = ride_counts.index.get_level_values("final_status").unique()
full_day_grid = pd.MultiIndex.from_product(
    [observed_statuses, range(24)], names=["final_status", "hour"]
)
hourly_dist = ride_counts.reindex(full_day_grid, fill_value=0).reset_index(name="rides")

# Calculate % share of the day per group.
# transform("sum") broadcasts each segment's total back onto its rows, replacing a
# per-group Python lambda with the built-in aggregation (same arithmetic).
segment_totals = hourly_dist.groupby("final_status")["rides"].transform("sum")
hourly_dist["pct_of_daily_rides"] = hourly_dist["rides"] / segment_totals * 100

# Invariant: within each segment the hourly shares must add up to 100%.
share_sums = hourly_dist.groupby("final_status")["pct_of_daily_rides"].sum()
assert share_sums.sub(100).abs().lt(1e-6).all(), "hourly shares do not sum to 100% per segment"

# 5. Save Output
hourly_dist.to_csv(output_path, index=False)

print("-" * 50)
print(f"\u2705 SUCCESS: Validation metrics saved to {output_path}")
print("This file will power your 'Behavioral Peak' Line Chart in Power BI.")
hourly_dist.head(10)

--------------------------------------------------
✅ SUCCESS: Validation metrics saved to ..\data\processed\hourly_validation_metrics.csv
This file will power your 'Behavioral Peak' Line Chart in Power BI.


Unnamed: 0,final_status,hour,rides,pct_of_daily_rides
0,Confirmed Behavioral Anchor,0,37,0.49778
1,Confirmed Behavioral Anchor,1,20,0.26907
2,Confirmed Behavioral Anchor,2,12,0.161442
3,Confirmed Behavioral Anchor,3,10,0.134535
4,Confirmed Behavioral Anchor,4,10,0.134535
5,Confirmed Behavioral Anchor,5,44,0.591955
6,Confirmed Behavioral Anchor,6,199,2.67725
7,Confirmed Behavioral Anchor,7,455,6.121351
8,Confirmed Behavioral Anchor,8,713,9.592358
9,Confirmed Behavioral Anchor,9,472,6.350061
