In [19]:
from pathlib import Path

import pandas as pd

In [20]:
# Load the raw Bangkok Traffy CSV (the path is relative to the kernel's working directory).
df = pd.read_csv("../data/raw/bangkok_traffy.csv")
# Preview five random rows; the fixed seed keeps the preview identical across re-runs.
df.sample(5, random_state=42)

Unnamed: 0,ticket_id,type,organization,comment,photo,photo_after,coords,address,subdistrict,district,province,timestamp,state,star,count_reopen,last_activity
141443,2022-6VETY4,{},"เขตบางรัก,เขตสาทร,สน.ยานนาวา,ฝ่ายเทศกิจ เขตสาท...",รบกวนกวดขันรถไม่ติดแผ่นป้ายทะเบียนด้วยครับ,https://storage.googleapis.com/traffy_public_b...,https://storage.googleapis.com/traffy_public_b...,"100.51376,13.71901",สะพานตากสิน แขวง ยานนาวา เขต สาทร กรุงเทพมหานค...,บางรัก,บางรัก,กรุงเทพมหานคร,2022-09-11 04:12:22.12772+00,เสร็จสิ้น,4.0,0,2023-02-07 04:41:30.370022+00
533783,G748PE,{ถนน},"เขตจอมทอง,ฝ่ายสิ่งแวดล้อมฯ เขตจอมทอง",ปัญหา: เรื่องเดิม มีดอกหญ้าปลิวเข้าบ้านจำนวนมา...,https://storage.googleapis.com/traffy_public_b...,https://storage.googleapis.com/traffy_public_b...,"100.47009,13.66759",1/20 ถ. พระราม ๒ แขวงบางมด เขตจอมทอง กรุงเทพมห...,บางมด,จอมทอง,จังหวัดกรุงเทพมหานคร,2024-03-25 06:19:54.055843+00,เสร็จสิ้น,,0,2024-04-10 06:12:24.617929+00
785875,2025-7KHGCG,"{จราจร,กีดขวาง,ถนน}","เขตราษฎร์บูรณะ,สน.บางคอแหลม",15/1/2568\n06:40 น \nถนนราษฎร์บูรณะ บริเวณ ริม...,https://storage.googleapis.com/traffy_public_b...,https://storage.googleapis.com/traffy_public_b...,"100.49283,13.68650",351/6 ถ. ราษฎร์บูรณะ แขวงบางปะกอก เขตราษฎร์บูร...,บางปะกอก,ราษฎร์บูรณะ,กรุงเทพมหานคร,2025-01-14 23:38:28.942083+00,เสร็จสิ้น,,0,2025-01-15 00:27:50.167537+00
209794,DXJJ3W,{},เขตบางบอน,*รบกวนให้ข้อมูลประชาชนเพิ่มเติม\nคำถาม : ร้องท...,https://storage.googleapis.com/traffy_public_b...,https://storage.googleapis.com/traffy_public_b...,"100.36901,13.63490",181/23 ถ. เอกชัย บางบอนเหนือ เขตบางบอน กรุงเทพ...,บางบอนใต้,บางบอน,จังหวัดกรุงเทพมหานคร,2023-02-02 06:21:22.081649+00,เสร็จสิ้น,,0,2023-02-10 00:23:41.518603+00
95281,2022-C4P8NK,{น้ำท่วม},"เขตสะพานสูง,ฝ่ายโยธา เขตสะพานสูง,ผอ.เขตสะพานสู...",สวัสดีค่ะ \nซอยกรุงเทพกรีฑา 32 เมื่อคืนไม่มีฝน...,https://storage.googleapis.com/traffy_public_b...,,"100.69429,13.73857",1 ซอย กรุงเทพกรีฑา 47 แขวง สะพานสูง เขตสะพานสู...,สะพานสูง,สะพานสูง,กรุงเทพมหานคร,2022-07-22 00:43:37.978113+00,เสร็จสิ้น,5.0,0,2022-07-25 05:37:39.888145+00


In [21]:
import re

# 'star' is missing for roughly 65% of rows, so the column is removed entirely;
# rows without a ticket_id cannot be identified and are discarded as well.
df = df.drop(columns=['star']).dropna(subset=['ticket_id'])

# Function to extract subdistrict (แขวง) from address
def extract_subdistrict(address):
    """Pull the subdistrict name that follows the 'แขวง' marker in an address.

    Returns the first run of non-whitespace characters after the marker
    (any whitespace directly after the marker is skipped), or None when the
    address is missing or contains no marker.
    """
    if not pd.isna(address):
        found = re.search(r'แขวง\s*([^\s]+)', str(address))
        if found is not None:
            return found.group(1)
    return None

# Function to extract district (เขต) from address
def extract_district(address):
    """Pull the district name that follows the 'เขต' marker in an address.

    Returns the first run of non-whitespace characters after the marker
    (any whitespace directly after the marker is skipped), or None when the
    address is missing or contains no marker.
    """
    if not pd.isna(address):
        found = re.search(r'เขต\s*([^\s]+)', str(address))
        if found is not None:
            return found.group(1)
    return None

# Backfill missing subdistrict / district values from the free-text address.
# Only rows whose value is missing are parsed, one column at a time, instead of
# a row-wise DataFrame.apply(axis=1) that visits every row of the frame.
for column, extractor in (
    ('subdistrict', extract_subdistrict),
    ('district', extract_district),
):
    missing = df[column].isna()
    parsed = df.loc[missing, 'address'].map(extractor)
    # fillna aligns on the index, so rows that already have a value are left
    # untouched; addresses without a marker yield None and stay missing.
    df[column] = df[column].fillna(parsed)

print(f"Rows after cleaning: {len(df)}")

Rows after cleaning: 778254


## Data Type Conversion

In [22]:
# Parse both timestamp columns. The raw strings vary in fractional-second
# precision, hence format='mixed'; utc=True normalises every value to UTC so the
# result is a single timezone-aware datetime dtype even if offsets differ by row.
for ts_column in ('timestamp', 'last_activity'):
    df[ts_column] = pd.to_datetime(df[ts_column], format='mixed', utc=True)

# Split coords ("longitude,latitude") into two numeric columns. n=1 caps the
# split at two parts, so a value with an extra comma cannot create a third
# column and break the two-column assignment; unparsable parts become NaN.
df[['longitude', 'latitude']] = df['coords'].str.split(',', n=1, expand=True)
df['longitude'] = pd.to_numeric(df['longitude'], errors='coerce')
df['latitude'] = pd.to_numeric(df['latitude'], errors='coerce')

# Strip leading/trailing whitespace from the text columns that are present
text_columns = ['address', 'subdistrict', 'district', 'province', 'state', 'type']
for col in text_columns:
    if col in df.columns:
        df[col] = df[col].str.strip()

# Keep only the first row for each ticket_id
df = df.drop_duplicates(subset=['ticket_id'], keep='first')

print(f"Rows after all cleaning steps: {len(df)}")

Rows after all cleaning steps: 778254


## Feature Engineering

In [23]:
# Extract temporal features in Bangkok local time.
# The raw timestamps carry a +00 (UTC) offset, while the reports describe events
# in Bangkok (UTC+7). Example from the raw data: a report whose text says 06:40
# on 15 Jan has the timestamp 2025-01-14 23:38+00, so taking day/hour directly
# from the UTC value would put it on the wrong day and hour.
local_ts = df['timestamp']
if local_ts.dt.tz is None:
    # Naive values are interpreted as UTC, matching the offset in the raw file.
    local_ts = local_ts.dt.tz_localize('UTC')
local_ts = local_ts.dt.tz_convert('Asia/Bangkok')

df['year'] = local_ts.dt.year
df['month'] = local_ts.dt.month
df['day'] = local_ts.dt.day
df['hour'] = local_ts.dt.hour
df['day_of_week'] = local_ts.dt.dayofweek  # 0=Monday, 6=Sunday
df['day_name'] = local_ts.dt.day_name()

# Hours between ticket creation and its last recorded activity
# (a difference of two instants, so it does not depend on the timezone)
df['resolution_time_hours'] = (df['last_activity'] - df['timestamp']).dt.total_seconds() / 3600

# Binary flags: 1 when an "after" photo exists / when the ticket was reopened at least once
df['has_photo_after'] = df['photo_after'].notna().astype(int)
df['is_reopened'] = (df['count_reopen'] > 0).astype(int)

print("Created new features:")
print("- Temporal: year, month, day, hour, day_of_week, day_name")
print("- Duration: resolution_time_hours")
print("- Flags: has_photo_after, is_reopened")
print(f"\nNew shape: {df.shape}")

Created new features:
- Temporal: year, month, day, hour, day_of_week, day_name
- Duration: resolution_time_hours
- Flags: has_photo_after, is_reopened

New shape: (778254, 26)


## Save Cleaned Data

In [24]:
# Save cleaned data to the interim folder
output_path = "../data/interim/bangkok_traffy_cleaned.csv"
# Create the target directory if needed so the write cannot fail at the very
# end of the run just because the folder is missing.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)

print(f"✓ Cleaned data saved to: {output_path}")
print(f"✓ Total rows saved: {len(df)}")
print(f"✓ Total columns: {len(df.columns)}")

✓ Cleaned data saved to: ../data/interim/bangkok_traffy_cleaned.csv
✓ Total rows saved: 778254
✓ Total columns: 26
