In [5]:
# Comprehensive Visualization & EDA
# Merging Traffy Fondue Data with External Factors (Weather, PM2.5, Holidays)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Thai font support for Matplotlib
import matplotlib.font_manager as fm
# Attempt to find a Thai font
candidates = ['TH Sarabun New', 'Noto Sans Thai', 'Tahoma', 'Segoe UI']
available = {f.name for f in fm.fontManager.ttflist}
found = [c for c in candidates if c in available]
if found:
    plt.rc('font', family=found[0])
    print(f"Using font: {found[0]}")
else:
    print("No specific Thai font found, using default.")

# Visual settings
sns.set(style="whitegrid")
plt.rcParams['axes.unicode_minus'] = False
pd.set_option('display.max_columns', None)

Using font: Tahoma


In [6]:
# Load Data
traffy_path = "../data/interim/bangkok_traffy_cleaned.csv"
external_path = "../data/external/bangkok_external_data.csv"

print("Loading Traffy data...")
df_traffy = pd.read_csv(traffy_path, parse_dates=['timestamp', 'last_activity'], low_memory=False)
print(f"Traffy data shape: {df_traffy.shape}")

print("Loading External data...")
df_external = pd.read_csv(external_path, parse_dates=['date'])
print(f"External data shape: {df_external.shape}")

# Preprocessing for Merge
# Extract date from timestamp in Traffy data to merge with External data
# Ensure timestamp is datetime objects before using .dt accessor
df_traffy['timestamp'] = pd.to_datetime(df_traffy['timestamp'], errors='coerce')
df_traffy['date'] = df_traffy['timestamp'].dt.date
df_traffy['date'] = pd.to_datetime(df_traffy['date'])

# Merge
print("Merging datasets...")
df_merged = pd.merge(df_traffy, df_external, on='date', how='left')
print(f"Merged data shape: {df_merged.shape}")

df_merged.head(3)

Loading Traffy data...
Traffy data shape: (633341, 20)
Loading External data...
External data shape: (1492, 19)
Traffy data shape: (633341, 20)
Loading External data...
External data shape: (1492, 19)
Merging datasets...
Merging datasets...
Merged data shape: (633341, 39)
Merged data shape: (633341, 39)


Unnamed: 0,ticket_id,type,organization,comment,photo,photo_after,coords,address,subdistrict,district,province,timestamp,state,count_reopen,last_activity,type_clean,has_photo,comment_len,lon,lat,date,pm25_avg,pm25_max,pm25_min,rainfall_mm,rainfall_hours,has_rain,heavy_rain,year,month,day_of_week,quarter,day_of_year,is_weekend,is_holiday,holiday,holiday_type,weather_severity,year_month
0,2021-CGPMUN,"น้ำท่วม,ร้องเรียน","เขตประเวศ,ฝ่ายโยธา เขตประเวศ",น้ำท่วมเวลาฝนตกและทะลุเข้าบ้านเดือดร้อนมากทุกๆ...,https://storage.googleapis.com/traffy_public_b...,https://storage.googleapis.com/traffy_public_b...,"100.66709,13.67891",189 เฉลิมพระเกียรติ ร.9 แขวง หนองบอน เขต ประเว...,หนองบอน,ประเวศ,กรุงเทพมหานคร,2021-09-19 14:56:08.924992+00:00,เสร็จสิ้น,0,2022-06-21 08:21:09.532782+00,"น้ำท่วม,ร้องเรียน",True,154,100.66709,13.67891,2021-09-19,25.35,,,1.6,9.0,1.0,0.0,2021.0,9.0,6.0,3.0,262.0,1.0,1.0,,,0.0,2021-09
1,2021-4D9Y98,,"เขตลาดพร้าว,การไฟฟ้านครหลวง เขตนวลจันทร์",หน้าปากซอย ลาดพร้าววังหิน26,https://storage.googleapis.com/traffy_public_b...,https://storage.googleapis.com/traffy_public_b...,"100.59131,13.80910",17/73 17/73 ถ. ลาดพร้าววังหิน แขวงลาดพร้าว เขต...,ลาดพร้าว,ลาดพร้าว,กรุงเทพมหานคร,2021-12-13 05:53:36.861064+00:00,เสร็จสิ้น,0,2023-03-14 12:09:14.947437+00,,True,27,100.59131,13.8091,2021-12-13,25.35,,,0.0,0.0,0.0,0.0,2021.0,12.0,0.0,4.0,347.0,0.0,0.0,,,0.0,2021-12
2,2021-7U9RED,,เขตดุสิต,ยังไม่มีหน่วยงานไหนมาดูแลครับ รถจะเชี่ยวหลายคน...,https://storage.googleapis.com/traffy_public_b...,https://storage.googleapis.com/traffy_public_b...,"100.50848,13.77832",627 ถนนสามเสน แขวง ดุสิต เขตดุสิต กรุงเทพมหานค...,ดุสิต,ดุสิต,กรุงเทพมหานคร,2021-12-17 08:46:02.610983+00:00,เสร็จสิ้น,0,2023-05-17 06:11:32.463984+00,,True,50,100.50848,13.77832,2021-12-17,25.35,,,0.0,0.0,0.0,0.0,2021.0,12.0,4.0,4.0,351.0,0.0,0.0,,,0.0,2021-12


In [7]:
# 1. Overview: Ticket Volume Over Time
daily_counts = df_merged.groupby('date').size().reset_index(name='ticket_count')

fig = px.line(daily_counts, x='date', y='ticket_count', title='Daily Ticket Volume (2021-2025)')
fig.update_xaxes(rangeslider_visible=True)
fig.show()

# 2. Top Issue Types
top_types = df_merged['type_clean'].value_counts().head(15).reset_index()
top_types.columns = ['type', 'count']

fig = px.bar(top_types, x='count', y='type', orientation='h', 
             title='Top 15 Issue Types', color='count', color_continuous_scale='Viridis')
fig.update_layout(yaxis={'categoryorder':'total ascending'})
fig.show()

In [8]:
# 3. Geospatial Distribution (Heatmap)
# Filter for valid coordinates
df_geo = df_merged.dropna(subset=['lat', 'lon'])
# Sample if too large for performance
if len(df_geo) > 50000:
    df_geo_sample = df_geo.sample(50000, random_state=42)
else:
    df_geo_sample = df_geo

fig = px.density_mapbox(df_geo_sample, lat='lat', lon='lon', radius=10,
                        center=dict(lat=13.7563, lon=100.5018), zoom=10,
                        mapbox_style="open-street-map", title="Heatmap of Traffy Issues")
fig.show()


*density_mapbox* is deprecated! Use *density_map* instead. Learn more at: https://plotly.com/python/mapbox-to-maplibre/



In [9]:
# 4. Impact of Rainfall on Issues
# Let's look at specific issue types that might be related to rain: "น้ำท่วม" (Flooding), "ถนน" (Roads), "ต้นไม้" (Trees)
rain_related_keywords = ['น้ำท่วม', 'ระบายน้ำ', 'ถนน', 'ต้นไม้']
df_merged['is_rain_related'] = df_merged['type_clean'].astype(str).apply(lambda x: any(k in x for k in rain_related_keywords))

# Group by date
daily_stats = df_merged.groupby('date').agg({
    'ticket_id': 'count',
    'is_rain_related': 'sum',
    'rainfall_mm': 'first', # rainfall is same for all rows on that date
    'has_rain': 'first'
}).reset_index()

# Scatter plot: Rainfall vs Rain-related Issues
fig = px.scatter(daily_stats, x='rainfall_mm', y='is_rain_related', 
                 title='Daily Rainfall vs. Rain-Related Issues',
                 labels={'is_rain_related': 'Count of Rain/Flood/Road Issues', 'rainfall_mm': 'Rainfall (mm)'},
                 trendline="ols")
fig.show()

# Box plot: Issues on Rainy vs Non-Rainy Days
fig = px.box(daily_stats, x='has_rain', y='ticket_id', 
             title='Total Ticket Volume: Rainy vs. No-Rain Days',
             labels={'has_rain': 'Has Rain (0=No, 1=Yes)', 'ticket_id': 'Total Tickets'})
fig.show()

# 5. Impact of PM2.5 on Issues
# Look for "ฝุ่น" (Dust), "อากาศ" (Air), "เผา" (Burn)
dust_keywords = ['ฝุ่น', 'อากาศ', 'ควัน', 'กลิ่น']
df_merged['is_dust_related'] = df_merged['type_clean'].astype(str).apply(lambda x: any(k in x for k in dust_keywords))

daily_pm25 = df_merged.groupby('date').agg({
    'is_dust_related': 'sum',
    'pm25_avg': 'first'
}).reset_index()

# Scatter plot: PM2.5 vs Dust Issues
fig = px.scatter(daily_pm25, x='pm25_avg', y='is_dust_related',
                 title='Daily Avg PM2.5 vs. Dust/Air Issues',
                 labels={'is_dust_related': 'Count of Dust/Air Issues', 'pm25_avg': 'PM2.5 Avg (µg/m³)'},
                 trendline="ols")
fig.show()


invalid value encountered in scalar divide



In [10]:
# 6. Temporal Patterns: Day of Week & Hour
df_merged['day_name'] = df_merged['timestamp'].dt.day_name()
df_merged['hour'] = df_merged['timestamp'].dt.hour

# Order days
days_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Day of Week
dow_counts = df_merged['day_name'].value_counts().reindex(days_order).reset_index()
dow_counts.columns = ['day', 'count']

fig = px.bar(dow_counts, x='day', y='count', title='Ticket Volume by Day of Week', color='count')
fig.show()

# Hour of Day
hour_counts = df_merged['hour'].value_counts().sort_index().reset_index()
hour_counts.columns = ['hour', 'count']

fig = px.line(hour_counts, x='hour', y='count', title='Ticket Volume by Hour of Day', markers=True)
fig.show()

In [11]:
# 7. Impact of Holidays
# Compare average daily tickets: Holiday vs. Non-Holiday
holiday_stats = df_merged.groupby(['date', 'is_holiday']).size().reset_index(name='count')
avg_holiday = holiday_stats.groupby('is_holiday')['count'].mean().reset_index()
avg_holiday['is_holiday'] = avg_holiday['is_holiday'].map({0: 'Non-Holiday', 1: 'Holiday'})

fig = px.bar(avg_holiday, x='is_holiday', y='count', 
             title='Average Daily Tickets: Holiday vs. Non-Holiday',
             color='is_holiday')
fig.show()

In [12]:
# 5. Impact of PM2.5 on Issues
# Look for "ฝุ่น" (Dust), "อากาศ" (Air), "เผา" (Burn)
dust_keywords = ['ฝุ่น', 'อากาศ', 'ควัน', 'กลิ่น']
df_merged['is_dust_related'] = df_merged['type_clean'].astype(str).apply(lambda x: any(k in x for k in dust_keywords))

daily_pm25 = df_merged.groupby('date').agg({
    'is_dust_related': 'sum',
    'pm25_avg': 'first'
}).reset_index()

# Scatter plot: PM2.5 vs Dust Issues
fig = px.scatter(daily_pm25, x='pm25_avg', y='is_dust_related',
                 title='Daily Avg PM2.5 vs. Dust/Air Issues',
                 labels={'is_dust_related': 'Count of Dust/Air Issues', 'pm25_avg': 'PM2.5 Avg (µg/m³)'},
                 trendline="ols")
fig.show()


invalid value encountered in scalar divide



In [13]:
# 4. Impact of Rainfall on Issues
# Let's look at specific issue types that might be related to rain: "น้ำท่วม" (Flooding), "ถนน" (Roads), "ต้นไม้" (Trees)
rain_related_keywords = ['น้ำท่วม', 'ระบายน้ำ', 'ถนน', 'ต้นไม้']
df_merged['is_rain_related'] = df_merged['type_clean'].astype(str).apply(lambda x: any(k in x for k in rain_related_keywords))

# Group by date
daily_stats = df_merged.groupby('date').agg({
    'ticket_id': 'count',
    'is_rain_related': 'sum',
    'rainfall_mm': 'first', # rainfall is same for all rows on that date
    'has_rain': 'first'
}).reset_index()

# Scatter plot: Rainfall vs Rain-related Issues
fig = px.scatter(daily_stats, x='rainfall_mm', y='is_rain_related', 
                 title='Daily Rainfall vs. Rain-Related Issues',
                 labels={'is_rain_related': 'Count of Rain/Flood/Road Issues', 'rainfall_mm': 'Rainfall (mm)'},
                 trendline="ols")
fig.show()

# Box plot: Issues on Rainy vs Non-Rainy Days
fig = px.box(daily_stats, x='has_rain', y='ticket_id', 
             title='Total Ticket Volume: Rainy vs. No-Rain Days',
             labels={'has_rain': 'Has Rain (0=No, 1=Yes)', 'ticket_id': 'Total Tickets'})
fig.show()

In [14]:
# 3. Geospatial Distribution (Heatmap)
# Filter for valid coordinates
df_geo = df_merged.dropna(subset=['lat', 'lon'])
# Sample if too large for performance
if len(df_geo) > 50000:
    df_geo_sample = df_geo.sample(50000, random_state=42)
else:
    df_geo_sample = df_geo

fig = px.density_mapbox(df_geo_sample, lat='lat', lon='lon', radius=10,
                        center=dict(lat=13.7563, lon=100.5018), zoom=10,
                        mapbox_style="open-street-map", title="Heatmap of Traffy Issues")
fig.show()


*density_mapbox* is deprecated! Use *density_map* instead. Learn more at: https://plotly.com/python/mapbox-to-maplibre/



In [15]:
# 1. Overview: Ticket Volume Over Time
daily_counts = df_merged.groupby('date').size().reset_index(name='ticket_count')

fig = px.line(daily_counts, x='date', y='ticket_count', title='Daily Ticket Volume (2021-2025)')
fig.update_xaxes(rangeslider_visible=True)
fig.show()

# 2. Top Issue Types
top_types = df_merged['type_clean'].value_counts().head(15).reset_index()
top_types.columns = ['type', 'count']

fig = px.bar(top_types, x='count', y='type', orientation='h', 
             title='Top 15 Issue Types', color='count', color_continuous_scale='Viridis')
fig.update_layout(yaxis={'categoryorder':'total ascending'})
fig.show()

In [16]:
# Comprehensive Visualization & EDA
# Merging Traffy Fondue Data with External Factors (Weather, PM2.5, Holidays)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Thai font support for Matplotlib
import matplotlib.font_manager as fm
# Attempt to find a Thai font
candidates = ['TH Sarabun New', 'Noto Sans Thai', 'Tahoma', 'Segoe UI']
available = {f.name for f in fm.fontManager.ttflist}
found = [c for c in candidates if c in available]
if found:
    plt.rc('font', family=found[0])
    print(f"Using font: {found[0]}")
else:
    print("No specific Thai font found, using default.")

# Visual settings
sns.set(style="whitegrid")
plt.rcParams['axes.unicode_minus'] = False
pd.set_option('display.max_columns', None)

Using font: Tahoma
