# Habitual Analysis
Scores casual-rider stations by rush-hour consistency (Ch) and mid-week focus (Cd) to identify habitual commuter-like behaviour.

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
# Notebook-wide settings; edit here rather than in the cells below.
# Folder holding the processed pipeline output (relative to the notebook).
DATA_DIR = Path("..") / "data" / "processed"
# Hours of the day (0-23) counted as rush hour: 07-09 and 17-19 inclusive.
RUSH_WINDOW = [*range(7, 10), *range(17, 20)]

In [3]:
# Locate the trip fact table and fail fast if it has not been generated yet.
input_path = DATA_DIR.joinpath("fact_trips.csv")
if not input_path.exists():
    raise FileNotFoundError("\u274c fact_trips.csv not found. Run pipeline first.")

# Read only the required columns, keep casual riders, parse the start
# timestamp, then derive the calendar fields used by the scoring cells.
# NOTE(review): 'is_commute' is loaded but not referenced in the cells visible
# here; it is presumably produced by the upstream pipeline - confirm it is needed.
# NOTE(review): 'month' is a month name without a year, so data spanning more
# than one year would pool same-named months together - confirm the date range.
df = (
    pd.read_csv(
        input_path,
        usecols=['start_station_name', 'started_at', 'member_casual', 'is_commute'],
    )
    .loc[lambda trips: trips['member_casual'].eq('casual')]
    .assign(started_at=lambda trips: pd.to_datetime(trips['started_at']))
    .assign(
        month=lambda trips: trips['started_at'].dt.month_name(),
        hour=lambda trips: trips['started_at'].dt.hour,
        day_name=lambda trips: trips['started_at'].dt.day_name(),
    )
)

print(f"Analyzing habitual patterns for {len(df):,} casual rides...")

Analyzing habitual patterns for 1,568,655 casual rides...


In [4]:
# 1. Volume Check
# Count rides per (station, month) and keep only the combinations whose ride
# count reaches the 75th percentile of all station-month counts. The filter is
# applied per station-month, so a station can pass in some months and not others.
station_monthly_vol = (
    df.groupby(['start_station_name', 'month'])
    .size()
    .rename('vol')
    .reset_index()
)
vol_threshold = station_monthly_vol['vol'].quantile(q=0.75)
passes_volume = station_monthly_vol['vol'].ge(vol_threshold)
valid_stations = station_monthly_vol.loc[passes_volume]
print(f"Volume threshold (75th percentile): {vol_threshold}")
print(f"Station-month combos passing threshold: {len(valid_stations)}")

Volume threshold (75th percentile): 103.0
Station-month combos passing threshold: 3489


In [5]:
# 2 & 3. Flag each ride, then average the flags per (station, month).
midweek_days = ['Tuesday', 'Wednesday', 'Thursday']

# 1 when the ride started in a rush-hour hour / on a mid-week day, else 0.
df['in_rush'] = df['hour'].isin(RUSH_WINDOW).astype(int)
df['is_midweek'] = df['day_name'].isin(midweek_days).astype(int)

# The mean of a 0/1 flag is the share of rides with that flag set.
flag_shares = df.groupby(['start_station_name', 'month'])[['in_rush', 'is_midweek']].mean()

# Hourly Consistency (Ch): share of rides starting within RUSH_WINDOW.
ch_scores = flag_shares['in_rush'].reset_index(name='Ch')
# Mid-week Focus (Cd): share of rides starting Tuesday-Thursday.
cd_scores = flag_shares['is_midweek'].reset_index(name='Cd')

In [6]:
# 4. Final Scoring Logic: 60% Rush Hour + 40% Midweek
# The default inner merges keep only the station-month rows present in
# valid_stations, attaching their Ch and Cd shares.
results = valid_stations.merge(ch_scores, on=['start_station_name', 'month'])
results = results.merge(cd_scores, on=['start_station_name', 'month'])
# Ch and Cd are means of 0/1 flags, so the weighted sum stays within [0, 1].
results['routine_score'] = (results['Ch'] * 0.6) + (results['Cd'] * 0.4)

# Tiering for Power BI filters
# Intervals: Low = [0, 0.25], Emerging = (0.25, 0.45], Strong = (0.45, 1.0].
# include_lowest=True closes the first interval on the left; without it a
# routine_score of exactly 0 falls outside every bin and gets a NaN tier.
results['tier'] = pd.cut(
    results['routine_score'],
    bins=[0, 0.25, 0.45, 1.0],
    labels=['Low', 'Emerging', 'Strong'],
    include_lowest=True,
)

output_path = DATA_DIR / "habitual_metrics.csv"
# Only the exported CSV is sorted by score; `results` itself keeps merge order.
results.sort_values('routine_score', ascending=False).to_csv(output_path, index=False)
print(f"\u2705 SUCCESS: Habitual metrics saved to {output_path}")
# Preview of the first rows in merge order (not the top-scoring rows).
results.head(10)

✅ SUCCESS: Habitual metrics saved to ..\data\processed\habitual_metrics.csv


Unnamed: 0,start_station_name,month,vol,Ch,Cd,routine_score,tier
0,63rd St Beach,August,165,0.272727,0.248485,0.26303,Emerging
1,63rd St Beach,July,146,0.30137,0.363014,0.326027,Emerging
2,63rd St Beach,June,138,0.347826,0.26087,0.313043,Emerging
3,63rd St Beach,September,152,0.453947,0.184211,0.346053,Emerging
4,900 W Harrison St,April,227,0.312775,0.45815,0.370925,Emerging
5,900 W Harrison St,August,340,0.326471,0.388235,0.351176,Emerging
6,900 W Harrison St,December,150,0.233333,0.413333,0.305333,Emerging
7,900 W Harrison St,February,112,0.232143,0.535714,0.353571,Emerging
8,900 W Harrison St,July,238,0.306723,0.529412,0.395798,Emerging
9,900 W Harrison St,June,201,0.313433,0.412935,0.353234,Emerging
