# Data Verification: VCC vs Local Storage

This notebook verifies which intersections are available from the VCC API and compares them with what we track and store locally (the local safety-index API and the Parquet files on disk).

In [None]:
import json
import os
from datetime import datetime, date, timedelta
from pathlib import Path

import pandas as pd
import requests

# Notebook-wide pandas display options.
pd.options.display.max_columns = None  # never truncate the columns of wide frames
pd.options.display.max_rows = 100      # show at most 100 rows per table

## 1. Check VCC API - Available Intersections

In [None]:
# VCC API Configuration
#
# Credentials are read from the environment so that no secret lives in the
# notebook. Export VCC_CLIENT_ID and VCC_CLIENT_SECRET before starting Jupyter.
# NOTE(review): the client secret that used to be hard-coded in this cell was
# stored in plain text and should be treated as compromised - rotate it.
BASE_URL = "https://vcc.vtti.vt.edu"
TOKEN_URL = f"{BASE_URL}/api/auth/client"
CLIENT_ID = os.environ.get('VCC_CLIENT_ID')
CLIENT_SECRET = os.environ.get('VCC_CLIENT_SECRET')

# Warn early (instead of failing later with an opaque HTTP error) when the
# credentials have not been provided.
_missing_credentials = [
    env_name
    for env_name, env_value in (
        ('VCC_CLIENT_ID', CLIENT_ID),
        ('VCC_CLIENT_SECRET', CLIENT_SECRET),
    )
    if not env_value
]
if _missing_credentials:
    print(
        "WARNING: missing environment variable(s): "
        f"{', '.join(_missing_credentials)} - VCC authentication will fail until they are set"
    )

# Get access token
def get_access_token(timeout=30):
    """Request an access token from the VCC token endpoint using client credentials.

    Posts the module-level ``CLIENT_ID`` / ``CLIENT_SECRET`` to ``TOKEN_URL``
    and prints a one-line confirmation on success.

    Args:
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook.

    Returns:
        The access token taken from the ``access_token`` field of the response.

    Raises:
        ValueError: If ``CLIENT_ID`` or ``CLIENT_SECRET`` is empty.
        requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
        RuntimeError: If the endpoint answers with a redirect, or the
            response carries no ``access_token``.
    """
    if not CLIENT_ID or not CLIENT_SECRET:
        raise ValueError("CLIENT_ID and CLIENT_SECRET must be set before requesting a VCC token")

    credentials = {
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET
    }

    # Redirects are deliberately not followed (as before), so the credentials
    # are only ever sent to TOKEN_URL itself.
    response = requests.post(TOKEN_URL, data=credentials, allow_redirects=False, timeout=timeout)
    response.raise_for_status()
    # raise_for_status() lets 3xx through; without this check the failure
    # would surface as a confusing JSON decoding error below.
    if 300 <= response.status_code < 400:
        raise RuntimeError(
            f"Token endpoint answered with redirect status {response.status_code}; no token issued"
        )

    token_data = response.json()
    access_token = token_data.get('access_token')
    if not access_token:
        # Report only the field names - the payload itself may be sensitive.
        raise RuntimeError(
            f"Token response contained no access_token (fields: {sorted(token_data)})"
        )

    expires_in = token_data.get('expires_in')
    if isinstance(expires_in, (int, float)):
        print(f"✓ Access token obtained (expires in {expires_in/60:.1f} minutes)")
    else:
        # The expiry field is optional here; do not crash on a missing value.
        print("✓ Access token obtained (expiry not reported)")
    return access_token

# Fetch a token once and build the Authorization header used by the VCC requests below.
access_token = get_access_token()
headers = dict(Authorization='Bearer {}'.format(access_token))

In [None]:
# Get all MapData from VCC
def get_all_mapdata(timeout=30):
    """Fetch the decoded MapData messages from the VCC API.

    Authenticates with the module-level ``headers`` (bearer token) and prints
    how many messages were returned.

    Args:
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook.

    Returns:
        The parsed JSON body of ``/api/mapdata/decoded``.

    Raises:
        requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
    """
    url = f"{BASE_URL}/api/mapdata/decoded"
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    data = response.json()
    print(f"✓ Retrieved {len(data)} MapData messages from VCC")
    return data

mapdata_list = get_all_mapdata()

# Parse intersections
# Column order of the resulting table. Passing it explicitly keeps the schema
# intact even when VCC returns no usable MapData, so later cells can still
# reference df_vcc['Intersection ID'] on an empty frame.
VCC_COLUMNS = [
    'Intersection ID',
    'Revision',
    'Latitude',
    'Longitude',
    'Lane Count',
    'Lane Width (m)',
]

vcc_intersections = []
for md in mapdata_list:
    # Treat a missing, null or empty 'intersections' entry the same way: skip it.
    intersections = md.get('intersections') or []
    if not intersections:
        continue
    # Only the first intersection of each MapData message is recorded
    # (unchanged from the original behaviour).
    intersection = intersections[0]
    vcc_intersections.append({
        'Intersection ID': intersection['id']['id'],
        'Revision': intersection['revision'],
        'Latitude': intersection['refPoint']['lat'],
        'Longitude': intersection['refPoint']['lon'],
        'Lane Count': len(intersection.get('laneSet', [])),
        'Lane Width (m)': intersection.get('laneWidth')
    })

df_vcc = pd.DataFrame(vcc_intersections, columns=VCC_COLUMNS)
print("\n📍 VCC Intersections:")
display(df_vcc)

## 2. Check Our API - Currently Tracked Intersections

In [None]:
# Query our local API
LOCAL_API_URL = "http://localhost:8001/api/v1/safety/index/"

# A timeout keeps the notebook from hanging when the local service is
# unreachable or stalled.
response = requests.get(LOCAL_API_URL, timeout=30)
response.raise_for_status()
local_data = response.json()

print(f"✓ Retrieved {len(local_data)} intersections from local API\n")

# Parse local intersections
# Explicit columns keep the schema intact when the API returns an empty list.
LOCAL_COLUMNS = [
    'Intersection ID',
    'Name',
    'Safety Index',
    'Traffic Volume',
    'Latitude',
    'Longitude',
]

local_intersections = [
    {
        'Intersection ID': item['intersection_id'],
        'Name': item['intersection_name'],
        'Safety Index': item['safety_index'],
        'Traffic Volume': item['traffic_volume'],
        'Latitude': item['latitude'],
        'Longitude': item['longitude']
    }
    for item in local_data
]

df_local = pd.DataFrame(local_intersections, columns=LOCAL_COLUMNS)
print("📍 Local API Intersections:")
display(df_local)

## 3. Check Parquet Storage - Historical Data Files

In [None]:
# Check parquet storage
PARQUET_BASE = Path("../backend/data/parquet")
indices_path = PARQUET_BASE / "indices"
raw_path = PARQUET_BASE / "raw"


def report_parquet_files(directory, pattern, label, limit):
    """Print a short listing of the files in ``directory`` matching ``pattern``.

    Args:
        directory: ``Path`` of the directory to scan (must exist).
        pattern: Glob pattern, e.g. ``"indices_*.parquet"``.
        label: Heading printed above the listing.
        limit: Maximum number of files listed individually; the rest are
            summarised as a count.

    Returns:
        All matching paths, sorted by path.
    """
    files = sorted(directory.glob(pattern))
    print(f"\n📄 {label} ({len(files)} total):")
    for file_path in files[:limit]:
        size_mb = file_path.stat().st_size / (1024 * 1024)
        print(f"  - {file_path.name} ({size_mb:.2f} MB)")
    if len(files) > limit:
        print(f"  ... and {len(files) - limit} more files")
    return files


print("📁 Parquet Storage Paths:")
print(f"  Base: {PARQUET_BASE.absolute()}")
print(f"  Indices: {indices_path.absolute()}")
print(f"  Raw: {raw_path.absolute()}")

# Check if directories exist
print("\n📊 Directory Status:")
print(f"  Base exists: {PARQUET_BASE.exists()}")
print(f"  Indices exists: {indices_path.exists()}")
print(f"  Raw exists: {raw_path.exists()}")

# List indices files
if indices_path.exists():
    indices_files = report_parquet_files(indices_path, "indices_*.parquet", "Indices Files", limit=10)
else:
    print("\n⚠️  No indices directory found!")

# List raw BSM files
if raw_path.exists():
    bsm_path = raw_path / "bsm"
    if bsm_path.exists():
        bsm_files = report_parquet_files(bsm_path, "bsm_*.parquet", "BSM Files", limit=5)
    else:
        # Previously this case printed nothing at all, hiding a missing BSM folder.
        print("\n⚠️  No BSM directory found under raw data!")
else:
    print("\n⚠️  No raw data directory found!")

## 4. Analyze Historical Data (if available)

In [None]:
# Read most recent indices file if it exists.
# File names are sorted lexicographically and the last one is taken as the
# latest - presumably the names embed a sortable date; verify against the writer.
if indices_path.exists():
    indices_files = sorted(indices_path.glob("indices_*.parquet"))
    if indices_files:
        latest_file = indices_files[-1]
        print(f"📖 Reading latest indices file: {latest_file.name}\n")

        df_indices = pd.read_parquet(latest_file)

        print(f"Shape: {df_indices.shape}")
        print(f"Columns: {list(df_indices.columns)}")
        print("\nData types:")
        print(df_indices.dtypes)

        # Check unique intersections
        if 'intersection' in df_indices.columns:
            # Drop missing values first: sorting NaN together with strings
            # raises TypeError, and value_counts() below ignores NaN as well,
            # so both views now agree.
            unique_intersections = df_indices['intersection'].dropna().unique()
            print(f"\n🔍 Unique intersections in data: {len(unique_intersections)}")
            print(f"  Values: {sorted(unique_intersections)}")

            # Count records per intersection
            intersection_counts = df_indices['intersection'].value_counts()
            print("\n📈 Records per intersection:")
            display(intersection_counts)

        # Show sample data
        print("\n📋 Sample data (first 5 rows):")
        display(df_indices.head())
    else:
        print(f"⚠️  No indices files found in {indices_path}")
else:
    print("⚠️  Indices directory not found!")

## 5. Summary and Comparison

In [None]:
print("\n" + "=" * 60)
print("DATA VERIFICATION SUMMARY")
print("=" * 60)

# Distinct IDs per source, normalised to strings so that an integer ID from one
# API matches the same ID delivered as text by the other.
# NOTE(review): assumes the local API's intersection_id and the VCC
# intersection id share one ID space - the previous count-based comparison
# relied on the same assumption; confirm.
vcc_has_ids = 'Intersection ID' in df_vcc.columns
local_has_ids = 'Intersection ID' in df_local.columns
vcc_ids = set(df_vcc['Intersection ID'].astype(str)) if vcc_has_ids else set()
local_ids = set(df_local['Intersection ID'].astype(str)) if local_has_ids else set()

print("\n📡 VCC API:")
print(f"  Available Intersections: {len(df_vcc)}")
# Guarded like the local section: an empty frame may have no columns at all.
if vcc_has_ids and not df_vcc.empty:
    print(f"  Intersection IDs: {sorted(df_vcc['Intersection ID'].tolist())}")

print("\n🏠 Local API:")
print(f"  Tracked Intersections: {len(df_local)}")
if local_has_ids and not df_local.empty:
    print(f"  Intersection IDs: {sorted(df_local['Intersection ID'].tolist())}")

print("\n💾 Parquet Storage:")
if indices_path.exists():
    indices_files = sorted(indices_path.glob("indices_*.parquet"))
    print(f"  Historical Files: {len(indices_files)}")
    if indices_files:
        # Read latest to check intersections
        df_latest = pd.read_parquet(indices_files[-1])
        if 'intersection' in df_latest.columns:
            # Missing values are dropped so sorting cannot fail on NaN.
            stored_intersections = df_latest['intersection'].dropna().unique()
            print(f"  Stored Intersections: {len(stored_intersections)}")
            print(f"  Intersection IDs: {sorted(stored_intersections.tolist())}")
else:
    print("  Status: No storage directory found")

# Compare the ID sets themselves rather than row counts: equal counts do not
# imply the same intersections, and duplicate rows inflate a count.
missing_ids = sorted(vcc_ids - local_ids)
extra_ids = sorted(local_ids - vcc_ids)

print("\n🔍 Analysis:")
if missing_ids:
    print(f"  ⚠️  Missing {len(missing_ids)} intersection(s) from local tracking: {missing_ids}")
    print(f"  ℹ️  VCC has {len(vcc_ids)} intersections, but we're only tracking {len(vcc_ids & local_ids)} of them")
elif vcc_ids:
    print("  ✓ All VCC intersections are being tracked")
else:
    print("  ⚠️  VCC returned no intersections - nothing to compare")
if extra_ids:
    print(f"  ⚠️  Local API has {len(extra_ids)} intersection(s) not present in VCC: {extra_ids}")

print("\n" + "=" * 60)

## 6. Check Data Collector Status

In [None]:
import subprocess
import json as json_module

# Check data collector container logs
print("📋 Data Collector Logs (last 50 lines):\n")
try:
    result = subprocess.run(
        ['docker', 'logs', 'trafficsafety-collector', '--tail', '50'],
        capture_output=True,
        text=True,
        timeout=30,  # do not let a stalled docker daemon hang the notebook
    )
except FileNotFoundError:
    # Raised when the docker executable itself is not installed / not on PATH.
    result = None
    print("⚠️  Docker CLI not found on PATH - cannot read collector logs")
except subprocess.TimeoutExpired:
    result = None
    print("⚠️  Timed out waiting for 'docker logs' - is the Docker daemon responsive?")
else:
    print(result.stdout)
    if result.stderr:
        print("Errors:")
        print(result.stderr)
    if result.returncode != 0:
        # A non-zero status means the command itself failed; say so explicitly
        # instead of leaving only the stderr text.
        print(f"⚠️  'docker logs' exited with status {result.returncode}")