In [None]:
# Setup and Imports
import sys
sys.path.append('../04_Scripts')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Import custom utilities
from utils import setup_logger, load_config
from data_loader import load_csv_data, check_data_quality

# Create the notebook logger and read the project configuration.
# Both helpers come from the project's `utils` module imported above.
logger = setup_logger('data_exploration')
config = load_config('../08_Configuration/config.yaml')

# Plot defaults: seaborn grid theme, then figure size and font size in one update
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (14, 8),
    'font.size': 12,
})

# Announce readiness both in the log and in the cell output
logger.info("✅ Setup complete!")
print("Notebook initialized successfully!")

## 1. Bangkok Traffic Congestion Index

**Source:** CEIC Data / TrafficIndex.org

**Expected:** ~1,682 observations (2019-2025)

In [None]:
# Bangkok Traffic: raw CSV location (relative path)
traffic_file = '../02_Data/Raw/bangkok_traffic_2019_2025.csv'

# Handle the missing-file case first; `df_traffic = None` tells the
# following cells to skip this dataset instead of failing.
if not Path(traffic_file).exists():
    logger.warning("⚠️ Bangkok Traffic file not found. Please download data.")
    df_traffic = None
else:
    df_traffic = load_csv_data(traffic_file)
    logger.info(f"✅ Loaded Bangkok Traffic data: {len(df_traffic)} rows")

In [None]:
# Structural overview of the Bangkok traffic data.
# The whole cell is skipped when the load cell set `df_traffic` to None.
if df_traffic is not None:
    rule = "=" * 60
    print(rule)
    print("BANGKOK TRAFFIC DATASET")
    print(rule)

    # Dimensions
    n_rows, n_cols = df_traffic.shape
    print(f"\nShape: {df_traffic.shape}")
    print(f"Rows: {n_rows:,}")
    print(f"Columns: {n_cols}")

    # Schema
    print("\nColumn Names and Types:")
    print(df_traffic.dtypes)

    # Sample rows and summary statistics (rich display)
    print("\nFirst 5 rows:")
    display(df_traffic.head())

    print("\nBasic Statistics:")
    display(df_traffic.describe())

    # Per-column null counts and their share of all rows;
    # only columns with at least one missing value are shown.
    print("\nMissing Values:")
    missing = df_traffic.isnull().sum()
    missing_pct = missing.div(len(df_traffic)).mul(100)
    missing_df = pd.DataFrame({'Missing': missing, 'Percentage': missing_pct})
    has_missing = missing_df['Missing'] > 0
    display(missing_df.loc[has_missing])

In [None]:
# Run the project's data-quality helper on the traffic data and print its report.
# Skipped when the traffic file was not loaded.
if df_traffic is not None:
    # Columns the dataset must contain and the tolerated missing-value percentage
    quality_args = dict(
        required_columns=['date', 'congestion_index'],
        max_missing_pct=10.0,
    )
    quality_report = check_data_quality(df_traffic, **quality_args)

    rule = "=" * 60
    print("\n" + rule)
    print("QUALITY REPORT")
    print(rule)
    # The report is iterated as key/value pairs, one line per entry
    for key, value in quality_report.items():
        print(f"{key}: {value}")

## 2. US Accidents Dataset (Reference)

**Source:** Kaggle - Sobhan Moosavi et al.

**Expected:** 2.8M+ records

In [None]:
# US Accidents: raw CSV location (relative path)
accidents_file = '../02_Data/Raw/us_accidents.csv'

# Missing file -> warn and set None so the exploration cell is skipped.
if not Path(accidents_file).exists():
    logger.warning("⚠️ US Accidents file not found.")
    df_accidents = None
else:
    # The file is large, so only the first 100,000 rows are read as a sample
    df_accidents = pd.read_csv(accidents_file, nrows=100000)
    logger.info(f"✅ Loaded US Accidents sample: {len(df_accidents)} rows")

In [None]:
# Explore the structure of the US Accidents sample.
# Skipped entirely when the load cell set `df_accidents` to None.
if df_accidents is not None:
    print("=" * 60)
    print("US ACCIDENTS DATASET (SAMPLE)")
    print("=" * 60)

    print(f"\nShape: {df_accidents.shape}")
    print(f"\nColumns ({len(df_accidents.columns)}):")
    print(df_accidents.columns.tolist())

    print("\nFirst 3 rows:")
    display(df_accidents.head(3))

    # Guard the column lookup (as the weather and OSM cells do) so a file
    # without a 'Severity' column does not abort the rest of the exploration.
    if 'Severity' in df_accidents.columns:
        print("\nSeverity Distribution:")
        print(df_accidents['Severity'].value_counts())
    else:
        print("\n⚠️ 'Severity' column not found; skipping severity distribution.")

    # Null count per column and its share of the sampled rows,
    # sorted so the ten most incomplete columns come first.
    print("\nMissing Values (Top 10):")
    missing = df_accidents.isnull().sum()
    missing_pct = (missing / len(df_accidents)) * 100
    missing_df = pd.DataFrame({
        'Missing': missing,
        'Percentage': missing_pct
    }).sort_values('Missing', ascending=False)
    display(missing_df.head(10))

## 3. Weather Data

**Source:** NOAA / NASA APIs

**Expected:** Daily data (2019-2025)

In [None]:
# Weather: raw CSV location (relative path)
weather_file = '../02_Data/Raw/bangkok_weather.csv'

# Missing file -> warn and set None so the exploration cell is skipped.
if not Path(weather_file).exists():
    logger.warning("⚠️ Weather file not found.")
    df_weather = None
else:
    df_weather = load_csv_data(weather_file)
    logger.info(f"✅ Loaded Weather data: {len(df_weather)} rows")

In [None]:
# Explore weather data.
# Skipped entirely when the load cell set `df_weather` to None.

# Warning thresholds (°C) for the average-temperature sanity check.
# Typical Bangkok temperatures are roughly 15-42°C; the check only warns
# outside the wider 10-45°C band.
# NOTE(review): confirm the extra margin around the typical range is intended.
TEMP_WARN_MIN_C = 10
TEMP_WARN_MAX_C = 45

if df_weather is not None:
    print("=" * 60)
    print("WEATHER DATASET")
    print("=" * 60)

    print(f"\nShape: {df_weather.shape}")
    display(df_weather.head())
    display(df_weather.describe())

    # Temperature range validation, only when the column is present
    if 'temp_avg' in df_weather.columns:
        temp_min = df_weather['temp_avg'].min()
        temp_max = df_weather['temp_avg'].max()
        if pd.isna(temp_min) or pd.isna(temp_max):
            # Empty or all-NaN column: min/max are NaN, and NaN comparisons are
            # always False, so the range check would otherwise pass silently.
            print("\n⚠️ WARNING: 'temp_avg' has no valid values; cannot validate temperature range.")
        else:
            print(f"\nTemperature Range: {temp_min:.1f}°C to {temp_max:.1f}°C")
            if temp_min < TEMP_WARN_MIN_C or temp_max > TEMP_WARN_MAX_C:
                print("⚠️ WARNING: Temperature outside expected Bangkok range!")

## 4. OpenStreetMap Road Network

**Source:** OpenStreetMap

**Expected:** GeoJSON/Shapefile for Bangkok

In [None]:
# OSM road network: load the GeoJSON if it has been downloaded.
# geopandas is imported in this cell rather than in the setup cell.
import geopandas as gpd

osm_file = '../02_Data/Raw/bangkok_osm_roads.geojson'

# Missing file -> warn and set None so the exploration cell is skipped.
if not Path(osm_file).exists():
    logger.warning("⚠️ OSM file not found.")
    df_osm = None
else:
    df_osm = gpd.read_file(osm_file)
    logger.info(f"✅ Loaded OSM data: {len(df_osm)} features")

In [None]:
# Overview of the OSM road network.
# Skipped entirely when the load cell set `df_osm` to None.
if df_osm is not None:
    rule = "=" * 60
    print(rule)
    print("OPENSTREETMAP ROAD NETWORK")
    print(rule)

    print(f"\nTotal Features: {len(df_osm)}")
    print(f"\nColumns: {df_osm.columns.tolist()}")

    # Frequency of each value in the 'highway' column, when that column exists
    if 'highway' in df_osm.columns:
        print("\nRoad Types:")
        print(df_osm['highway'].value_counts())

    print("\nFirst 3 features:")
    display(df_osm.head(3))

    # Quick map: plot every feature and title the axes the plot was drawn on
    print("\nQuick visualization:")
    ax = df_osm.plot(figsize=(10, 10))
    ax.set_title('Bangkok Road Network (OSM)')
    plt.show()

## 5. Public Transit Ridership

**Source:** BMA/BTS/MRT (pending) or reference datasets

**Expected:** Station-level ridership data

In [None]:
# Transit ridership: raw CSV location (relative path)
transit_file = '../02_Data/Raw/transit_ridership.csv'

# Missing file -> warn and set None so the exploration cell is skipped.
if not Path(transit_file).exists():
    logger.warning("⚠️ Transit file not found. Using reference data if available.")
    df_transit = None
else:
    df_transit = load_csv_data(transit_file)
    logger.info(f"✅ Loaded Transit data: {len(df_transit)} rows")

In [None]:
# Overview of the transit ridership data.
# Skipped entirely when the load cell set `df_transit` to None.
if df_transit is not None:
    rule = "=" * 60
    print(rule)
    print("TRANSIT RIDERSHIP DATASET")
    print(rule)

    # Dimensions, then sample rows and summary statistics
    print(f"\nShape: {df_transit.shape}")
    display(df_transit.head())
    display(df_transit.describe())

## Summary of Data Exploration

### Datasets Status

Fill in after running the cells above:

| Dataset | Status | Rows | Columns | Quality Issues |
|---------|--------|------|---------|----------------|
| Bangkok Traffic | ☐ Loaded / ☐ Missing | - | - | - |
| US Accidents | ☐ Loaded / ☐ Missing | - | - | - |
| Weather | ☐ Loaded / ☐ Missing | - | - | - |
| OSM Roads | ☐ Loaded / ☐ Missing | - | - | - |
| Transit | ☐ Loaded / ☐ Missing | - | - | - |

### Key Findings

1. **Data Availability:**
   - [ ] All 5 datasets acquired
   - [ ] Missing datasets identified

2. **Data Quality Issues:**
   - Missing values: 
   - Outliers detected: 
   - Data type issues: 

3. **Next Steps:**
   - [ ] Download missing datasets
   - [ ] Proceed to data cleaning (Notebook 02)
   - [ ] Document data quality issues

---

## Next Notebook

→ **02_Data_Cleaning.ipynb** - Clean and prepare data for analysis

In [None]:
# Print the closing banner and a next-steps reminder.
# Nothing is written to disk here; this cell only produces console output.
rule = "=" * 60
closing_lines = [
    "\n" + rule,
    "DATA EXPLORATION COMPLETE",
    rule,
    "\nNext steps:",
    "1. Review findings above",
    "2. Download any missing datasets",
    "3. Proceed to 02_Data_Cleaning.ipynb",
    "\nDocumentation: Update PROJECT_STATUS.md with findings",
]
print("\n".join(closing_lines))