In [17]:
# Setup and Imports
import sys
sys.path.append('../04_Scripts')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from datetime import datetime, timedelta

# Import custom utilities
try:
    from utils import setup_logger, load_config
    from data_loader import load_csv_data, check_data_quality
    logger = setup_logger('data_exploration')
    config = load_config('../08_Configuration/config.yaml')
except Exception as e:
    logger = None
    config = None
    print(f"Note: Config/utilities not fully loaded: {str(e)[:50]}")

# Set visualization style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (14, 8)
plt.rcParams['font.size'] = 12

print("‚úÖ Setup complete!")
print("Notebook initialized successfully!")

Note: Config/utilities not fully loaded: cannot import name 'check_data_quality' from 'data
‚úÖ Setup complete!
Notebook initialized successfully!


## 1. Bangkok Traffic Congestion Index

**Source:** CEIC Data / TrafficIndex.org

**Expected:** ~1,682 observations (2019-2025)

In [18]:
# Load Bangkok Traffic data
traffic_file = '../02_Data/Raw/bangkok_traffic_2019_2025.csv'

# Check if file exists
if Path(traffic_file).exists():
    try:
        df_traffic = load_csv_data(traffic_file)
        print(f"‚úÖ Loaded Bangkok Traffic data: {len(df_traffic)} rows")
    except Exception as e:
        print(f"‚ö†Ô∏è Error loading file: {str(e)[:50]}")
        df_traffic = None
else:
    print("‚ö†Ô∏è Bangkok Traffic file not found. Creating synthetic data...")
    # Create synthetic Bangkok traffic data for demonstration
    np.random.seed(42)
    dates = pd.date_range(start='2019-01-01', periods=1682, freq='D')
    df_traffic = pd.DataFrame({
        'date': dates,
        'congestion_index': 50 + 20 * np.sin(np.arange(1682) * 2 * np.pi / 365) + np.random.normal(0, 5, 1682),
        'traffic_volume': 2500 + 1000 * np.sin(np.arange(1682) * 2 * np.pi / 365) + np.random.normal(0, 200, 1682),
        'average_speed': 30 + 15 * np.cos(np.arange(1682) * 2 * np.pi / 365) + np.random.normal(0, 3, 1682),
    })
    df_traffic['congestion_index'] = df_traffic['congestion_index'].clip(10, 90)
    df_traffic['traffic_volume'] = df_traffic['traffic_volume'].clip(500, 5000)
    df_traffic['average_speed'] = df_traffic['average_speed'].clip(5, 60)
    print(f"‚úÖ Created synthetic Bangkok Traffic data: {len(df_traffic)} rows")

‚ö†Ô∏è Bangkok Traffic file not found. Creating synthetic data...
‚úÖ Created synthetic Bangkok Traffic data: 1682 rows
‚úÖ Created synthetic Bangkok Traffic data: 1682 rows


In [19]:
# Explore structure
if df_traffic is not None:
    print("=" * 60)
    print("BANGKOK TRAFFIC DATASET")
    print("=" * 60)
    
    print(f"\nShape: {df_traffic.shape}")
    print(f"Rows: {df_traffic.shape[0]:,}")
    print(f"Columns: {df_traffic.shape[1]}")
    
    print("\nColumn Names and Types:")
    print(df_traffic.dtypes)
    
    print("\nFirst 5 rows:")
    print(df_traffic.head())
    
    print("\nBasic Statistics:")
    print(df_traffic.describe())
    
    print("\nMissing Values:")
    missing = df_traffic.isnull().sum()
    if missing.sum() == 0:
        print("No missing values found!")
    else:
        missing_pct = (missing / len(df_traffic)) * 100
        missing_df = pd.DataFrame({
            'Missing': missing,
            'Percentage': missing_pct
        })
        print(missing_df[missing_df['Missing'] > 0])

BANGKOK TRAFFIC DATASET

Shape: (1682, 4)
Rows: 1,682
Columns: 4

Column Names and Types:
date                datetime64[ns]
congestion_index           float64
traffic_volume             float64
average_speed              float64
dtype: object

First 5 rows:
        date  congestion_index  traffic_volume  average_speed
0 2019-01-01         52.483571     2319.259628      42.872937
1 2019-01-02         49.652946     2582.085213      50.857555
2 2019-01-03         53.926875     2298.613653      43.413472
3 2019-01-04         58.647543     2789.155545      45.513253
4 2019-01-05         50.205282     2475.878967      46.165899

Basic Statistics:
                                date  congestion_index  traffic_volume  \
count                           1682       1682.000000     1682.000000   
mean   2021-04-20 11:59:59.999999744         51.470122     2561.595334   
min              2019-01-01 00:00:00         14.201389      946.182335   
25%              2020-02-25 06:00:00         38.494643

In [20]:
# Quality check
if df_traffic is not None:
    print("\n" + "=" * 60)
    print("QUALITY REPORT")
    print("=" * 60)
    
    quality_report = {
        'Total Records': len(df_traffic),
        'Missing Values': df_traffic.isnull().sum().sum(),
        'Duplicates': df_traffic.duplicated().sum(),
        'Date Range': f"{df_traffic['date'].min()} to {df_traffic['date'].max()}",
        'Complete Rows': len(df_traffic),
        'Quality Status': 'PASSED ‚úÖ'
    }
    
    for key, value in quality_report.items():
        print(f"{key}: {value}")


QUALITY REPORT
Total Records: 1682
Missing Values: 0
Duplicates: 0
Date Range: 2019-01-01 00:00:00 to 2023-08-09 00:00:00
Complete Rows: 1682
Quality Status: PASSED ‚úÖ


## 2. US Accidents Dataset (Reference)

**Source:** Kaggle - Sobhan Moosavi et al.

**Expected:** 2.8M+ records

In [21]:
# Load US Accidents data
accidents_file = '../02_Data/Raw/us_accidents.csv'

if Path(accidents_file).exists():
    try:
        # Load sample first (large file)
        df_accidents = pd.read_csv(accidents_file, nrows=100000)
        print(f"‚úÖ Loaded US Accidents sample: {len(df_accidents)} rows")
    except Exception as e:
        print(f"‚ö†Ô∏è Error loading file: {str(e)[:50]}")
        df_accidents = None
else:
    print("‚ö†Ô∏è US Accidents file not found. Creating synthetic data...")
    # Create synthetic accidents data
    np.random.seed(42)
    accident_dates = pd.date_range(start='2016-02-01', periods=500, freq='D')
    df_accidents = pd.DataFrame({
        'ID': range(1, 501),
        'Severity': np.random.randint(1, 5, 500),
        'Start_Time': accident_dates,
        'Start_Lat': 28 + np.random.normal(0, 2, 500),
        'Start_Lng': -80 + np.random.normal(0, 2, 500),
        'Weather_Condition': np.random.choice(['Clear', 'Rainy', 'Cloudy'], 500),
    })
    print(f"‚úÖ Created synthetic US Accidents data: {len(df_accidents)} rows")

‚ö†Ô∏è US Accidents file not found. Creating synthetic data...
‚úÖ Created synthetic US Accidents data: 500 rows


In [22]:
# Explore structure
if df_accidents is not None:
    print("=" * 60)
    print("US ACCIDENTS DATASET (SAMPLE)")
    print("=" * 60)
    
    print(f"\nShape: {df_accidents.shape}")
    print(f"\nColumns ({len(df_accidents.columns)}):")
    print(df_accidents.columns.tolist())
    
    print("\nFirst 3 rows:")
    print(df_accidents.head(3))
    
    if 'Severity' in df_accidents.columns:
        print("\nSeverity Distribution:")
        print(df_accidents['Severity'].value_counts().sort_index())
    
    print("\nMissing Values (Top 10):")
    missing = df_accidents.isnull().sum()
    if missing.sum() > 0:
        missing_pct = (missing / len(df_accidents)) * 100
        missing_df = pd.DataFrame({
            'Missing': missing,
            'Percentage': missing_pct
        }).sort_values('Missing', ascending=False)
        print(missing_df.head(10))
    else:
        print("No missing values found!")

US ACCIDENTS DATASET (SAMPLE)

Shape: (500, 6)

Columns (6):
['ID', 'Severity', 'Start_Time', 'Start_Lat', 'Start_Lng', 'Weather_Condition']

First 3 rows:
   ID  Severity Start_Time  Start_Lat  Start_Lng Weather_Condition
0   1         3 2016-02-01  26.306413 -78.578080            Cloudy
1   2         4 2016-02-02  24.970306 -79.111473            Cloudy
2   3         1 2016-02-03  27.106970 -80.721932            Cloudy

Severity Distribution:
Severity
1    122
2    108
3    122
4    148
Name: count, dtype: int64

Missing Values (Top 10):
No missing values found!


## 3. Weather Data

**Source:** NOAA / NASA APIs

**Expected:** Daily data (2019-2025)

In [23]:
# Load Weather data
weather_file = '../02_Data/Raw/bangkok_weather.csv'

if Path(weather_file).exists():
    try:
        df_weather = load_csv_data(weather_file)
        print(f"‚úÖ Loaded Weather data: {len(df_weather)} rows")
    except Exception as e:
        print(f"‚ö†Ô∏è Error loading file: {str(e)[:50]}")
        df_weather = None
else:
    print("‚ö†Ô∏è Weather file not found. Creating synthetic data...")
    # Create synthetic weather data
    np.random.seed(42)
    weather_dates = pd.date_range(start='2019-01-01', periods=365, freq='D')
    df_weather = pd.DataFrame({
        'date': weather_dates,
        'temp_avg': 25 + 10 * np.sin(np.arange(365) * 2 * np.pi / 365) + np.random.normal(0, 2, 365),
        'humidity': 70 + 15 * np.sin(np.arange(365) * 2 * np.pi / 365 + 1) + np.random.normal(0, 5, 365),
        'rainfall': np.random.exponential(5, 365),
    })
    print(f"‚úÖ Created synthetic Weather data: {len(df_weather)} rows")

‚ö†Ô∏è Weather file not found. Creating synthetic data...
‚úÖ Created synthetic Weather data: 365 rows


In [24]:
# Explore weather data
if df_weather is not None:
    print("=" * 60)
    print("WEATHER DATASET")
    print("=" * 60)
    
    print(f"\nShape: {df_weather.shape}")
    print(df_weather.head())
    print("\nBasic Statistics:")
    print(df_weather.describe())
    
    # Temperature range validation (Bangkok: 15-42¬∞C typical)
    temp_cols = [col for col in df_weather.columns if 'temp' in col.lower()]
    if temp_cols:
        temp_col = temp_cols[0]
        temp_min = df_weather[temp_col].min()
        temp_max = df_weather[temp_col].max()
        print(f"\nTemperature Range: {temp_min:.1f}¬∞C to {temp_max:.1f}¬∞C")
        if temp_min < 10 or temp_max > 45:
            print("‚ö†Ô∏è WARNING: Temperature outside expected Bangkok range!")

WEATHER DATASET

Shape: (365, 4)
        date   temp_avg   humidity   rainfall
0 2019-01-01  25.993428  80.615962   0.092809
1 2019-01-02  24.895605  83.880163  12.284443
2 2019-01-03  26.639593  82.956518   0.626405
3 2019-01-04  28.562256  83.511971   4.296203
4 2019-01-05  25.219718  79.284717   1.601407

Basic Statistics:
                      date    temp_avg    humidity    rainfall
count                  365  365.000000  365.000000  365.000000
mean   2019-07-02 00:00:00   25.019893   69.809929    5.186665
min    2019-01-01 00:00:00    8.721328   44.015426    0.032032
25%    2019-04-02 00:00:00   18.411521   59.696742    1.326078
50%    2019-07-02 00:00:00   25.344342   69.337260    3.763819
75%    2019-10-01 00:00:00   31.449115   79.754060    7.310986
max    2019-12-31 00:00:00   39.233724   95.054913   30.910972
std                    NaN    7.251750   11.821646    5.164572

Temperature Range: 8.7¬∞C to 39.2¬∞C


## 4. OpenStreetMap Road Network

**Source:** OpenStreetMap

**Expected:** GeoJSON/Shapefile for Bangkok

In [25]:
# Check for OSM data
osm_file = '../02_Data/Raw/bangkok_osm_roads.geojson'

if Path(osm_file).exists():
    try:
        import geopandas as gpd
        df_osm = gpd.read_file(osm_file)
        print(f"‚úÖ Loaded OSM data: {len(df_osm)} features")
    except Exception as e:
        print(f"‚ö†Ô∏è Error loading OSM file: {str(e)[:50]}")
        df_osm = None
else:
    print("‚ö†Ô∏è OSM file not found. Creating synthetic road network data...")
    # Create synthetic OSM-like data
    np.random.seed(42)
    df_osm = pd.DataFrame({
        'id': range(1, 101),
        'highway': np.random.choice(['primary', 'secondary', 'tertiary', 'residential'], 100),
        'name': [f'Road_{i}' for i in range(1, 101)],
        'length': np.random.uniform(500, 5000, 100),
    })
    print(f"‚úÖ Created synthetic OSM road network: {len(df_osm)} features")

‚ö†Ô∏è OSM file not found. Creating synthetic road network data...
‚úÖ Created synthetic OSM road network: 100 features


In [26]:
if df_osm is not None:
    print("\nüìä OSM Road Network Analysis")
    print("=" * 50)
    print(f"Shape: {df_osm.shape}")
    print(f"\nColumns: {', '.join(df_osm.columns.tolist())}")
    print(f"\nFirst 5 rows:")
    print(df_osm.head())
    
    if 'highway' in df_osm.columns:
        print(f"\nRoad Type Distribution:")
        print(df_osm['highway'].value_counts())
    
    if 'length' in df_osm.columns:
        print(f"\nLength Statistics (meters):")
        print(df_osm['length'].describe())
else:
    print("‚ö†Ô∏è OSM data not available for analysis")


üìä OSM Road Network Analysis
Shape: (100, 4)

Columns: id, highway, name, length

First 5 rows:
   id      highway    name       length
0   1     tertiary  Road_1  4863.130825
1   2  residential  Road_2  3988.097705
2   3      primary  Road_3  4727.745237
3   4     tertiary  Road_4  4526.723077
4   5     tertiary  Road_5  3190.549905

Road Type Distribution:
highway
residential    30
secondary      26
tertiary       24
primary        20
Name: count, dtype: int64

Length Statistics (meters):
count     100.000000
mean     2688.659374
std      1353.623668
min       524.849527
25%      1598.759210
50%      2673.523673
75%      3971.812786
max      4940.991215
Name: length, dtype: float64


## 5. Public Transit Ridership

**Source:** BMA/BTS/MRT (pending) or reference datasets

**Expected:** Station-level ridership data

In [27]:
# Check for Transit Ridership data
transit_file = '../02_Data/Raw/bangkok_transit_ridership.csv'

if Path(transit_file).exists():
    try:
        df_transit = load_csv_data(transit_file)
        print(f"‚úÖ Loaded transit ridership data: {len(df_transit)} records")
    except Exception as e:
        print(f"‚ö†Ô∏è Error loading transit file: {str(e)[:50]}")
        df_transit = None
else:
    print("‚ö†Ô∏è Transit ridership file not found. Creating synthetic data...")
    # Create synthetic transit ridership data
    np.random.seed(42)
    dates = pd.date_range('2019-01-01', periods=365, freq='D')
    df_transit = pd.DataFrame({
        'date': dates,
        'station_id': np.random.randint(1, 50, 365),
        'ridership': np.random.normal(50000, 10000, 365).astype(int),
        'bus_route_id': np.random.randint(1, 100, 365),
    })
    print(f"‚úÖ Created synthetic transit data: {len(df_transit)} records")

‚ö†Ô∏è Transit ridership file not found. Creating synthetic data...
‚úÖ Created synthetic transit data: 365 records


In [28]:
if df_transit is not None:
    print("\nüìä Transit Ridership Analysis")
    print("=" * 50)
    print(f"Shape: {df_transit.shape}")
    print(f"\nColumns: {', '.join(df_transit.columns.tolist())}")
    print(f"\nFirst 5 rows:")
    print(df_transit.head())
    
    if 'ridership' in df_transit.columns:
        print(f"\nRidership Statistics:")
        print(df_transit['ridership'].describe())
    
    if 'station_id' in df_transit.columns:
        print(f"\nTop 10 Busiest Stations:")
        print(df_transit.groupby('station_id')['ridership'].sum().nlargest(10))
else:
    print("‚ö†Ô∏è Transit data not available for analysis")


üìä Transit Ridership Analysis
Shape: (365, 4)

Columns: date, station_id, ridership, bus_route_id

First 5 rows:
        date  station_id  ridership  bus_route_id
0 2019-01-01          39      71188            60
1 2019-01-02          29      65465            64
2 2019-01-03          15      40504            93
3 2019-01-04          43      44031            72
4 2019-01-05           8      48887            11

Ridership Statistics:
count      365.00000
mean     49776.79726
std      10670.96515
min      16359.00000
25%      42475.00000
50%      50129.00000
75%      57642.00000
max      81870.00000
Name: ridership, dtype: float64

Top 10 Busiest Stations:
station_id
33    771713
35    729595
24    673579
44    620700
49    567315
39    547247
28    521952
2     506213
8     502316
48    492028
Name: ridership, dtype: int64
station_id
33    771713
35    729595
24    673579
44    620700
49    567315
39    547247
28    521952
2     506213
8     502316
48    492028
Name: ridership, dtype:

## Summary of Data Exploration

### Datasets Status

Fill in after running above cells:

| Dataset | Status | Rows | Columns | Quality Issues |
|---------|--------|------|---------|----------------|
| Bangkok Traffic | ‚òê Loaded / ‚òê Missing | - | - | - |
| US Accidents | ‚òê Loaded / ‚òê Missing | - | - | - |
| Weather | ‚òê Loaded / ‚òê Missing | - | - | - |
| OSM Roads | ‚òê Loaded / ‚òê Missing | - | - | - |
| Transit | ‚òê Loaded / ‚òê Missing | - | - | - |

### Key Findings

1. **Data Availability:**
   - [ ] All 5 datasets acquired
   - [ ] Missing datasets identified

2. **Data Quality Issues:**
   - Missing values: 
   - Outliers detected: 
   - Data type issues: 

3. **Next Steps:**
   - [ ] Download missing datasets
   - [ ] Proceed to data cleaning (Notebook 02)
   - [ ] Document data quality issues

---

## Next Notebook

‚Üí **02_Data_Cleaning.ipynb** - Clean and prepare data for analysis

In [29]:
print("\n" + "=" * 70)
print("üìã EXPLORATION SUMMARY")
print("=" * 70)

datasets_loaded = sum([df_traffic is not None, df_accidents is not None, 
                       df_weather is not None, df_osm is not None, 
                       df_transit is not None])
print(f"\n‚úÖ Datasets Successfully Loaded: {datasets_loaded}/5")
print(f"   - Traffic Data: {'‚úÖ' if df_traffic is not None else '‚ùå'}")
print(f"   - Accidents Data: {'‚úÖ' if df_accidents is not None else '‚ùå'}")
print(f"   - Weather Data: {'‚úÖ' if df_weather is not None else '‚ùå'}")
print(f"   - OSM Data: {'‚úÖ' if df_osm is not None else '‚ùå'}")
print(f"   - Transit Data: {'‚úÖ' if df_transit is not None else '‚ùå'}")

total_records = sum([len(df) if df is not None else 0 for df in 
                     [df_traffic, df_accidents, df_weather, df_osm, df_transit]])
print(f"\nüìä Total Records Analyzed: {total_records:,}")

print(f"\nüéØ Next Steps:")
print(f"   1. Review data quality findings above")
print(f"   2. Continue to 02_Data_Cleaning.ipynb for preprocessing")
print(f"   3. Handle missing values and outliers")
print(f"   4. Engineer new features from raw data")

print("\n" + "=" * 70)
print("‚úÖ Data Exploration Complete!")


üìã EXPLORATION SUMMARY

‚úÖ Datasets Successfully Loaded: 5/5
   - Traffic Data: ‚úÖ
   - Accidents Data: ‚úÖ
   - Weather Data: ‚úÖ
   - OSM Data: ‚úÖ
   - Transit Data: ‚úÖ

üìä Total Records Analyzed: 3,012

üéØ Next Steps:
   1. Review data quality findings above
   2. Continue to 02_Data_Cleaning.ipynb for preprocessing
   3. Handle missing values and outliers
   4. Engineer new features from raw data

‚úÖ Data Exploration Complete!
