## Web scraping
- `https://www.timeanddate.com/weather/thailand/bangkok/historic`
- START_YEAR = `2024`, END_YEAR = `2025` (ดึงข้อมูลตั้งแต่ ม.ค. 2024 ถึง ม.ค. 2025 — ปี END_YEAR ดึงเฉพาะเดือนมกราคม)
- Data 
    - 'month'
    - 'year'
    - 'temp_high'
    - 'temp_low'
    - 'humidity_high'
    - 'humidity_low'
    - 'pressure_high'
    - 'pressure_low'
- `High Temp`: ช่วยทำนายความเสียหายที่เกิดจากความร้อนสะสม
- `Low Temp`: ช่วยทำนายความเสียหายจากการหดตัวของวัสดุ
- `High Humidity`: บ่งชี้ว่าเดือนนั้นมีฝนตกหนักและโอกาสน้ำท่วมสูง
- `Low Humidity`: อาจบ่งชี้ถึงสภาวะแล้งหรือฝุ่นควัน

In [18]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import time
import re

# Inclusive range of years to request from the historic-weather page.
START_YEAR, END_YEAR = 2024, 2025

# Monthly history endpoint; month/year are appended as query parameters.
BASE_URL = "https://www.timeanddate.com/weather/thailand/bangkok/historic"

# Browser-like User-Agent sent with every request.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/108.0.0.0 Safari/537.36'
    ),
}

# Accumulator for the per-month record dicts collected by the scraping loop.
SCRIBED_DATA = []

def clean_data(text):
    """Extract the first numeric value from a text fragment.

    Units and any other surrounding characters (for example "35 °C" or
    "1012 mbar") are ignored.

    Args:
        text: Raw text to parse; may be None or empty.

    Returns:
        The first number found, as a float with its sign preserved, or
        None when the text is empty or contains no number.
    """
    if not text:
        return None

    # Optional sign (ASCII hyphen or Unicode minus U+2212) followed by either
    # digits with an optional fraction ("35", "35.", "35.5") or a bare
    # fraction (".5"). Every string this pattern matches is a valid float
    # literal once the Unicode minus is normalised.
    match = re.search(r'[-\u2212]?(?:\d+\.?\d*|\.\d+)', text)
    if match is None:
        return None
    return float(match.group(0).replace('\u2212', '-'))

def scrape_weather_month(year, month):
    """Scrape the monthly high/low weather summary for one month.

    Requests ``BASE_URL`` with ``month`` and ``year`` query parameters,
    locates the summary table by its CSS class string and reads the first
    three cells of the "High" row (table row index 1) and the "Low" row
    (table row index 2) as temperature, humidity and pressure.

    Args:
        year: Calendar year to request.
        month: Calendar month number to request.

    Returns:
        A list holding a single record dict with the keys ``month``,
        ``year``, ``temp_high``, ``temp_low``, ``humidity_high``,
        ``humidity_low``, ``pressure_high`` and ``pressure_low``; or an
        empty list when the request fails or the expected table layout is
        not found. Individual values are whatever ``clean_data`` returns
        for the cell text.
    """
    url = f"{BASE_URL}?month={month}&year={year}"
    print(f"\n--- Scraping START: {year}-{month:02d} (Full Monthly Summary) ---")

    # Only the HTTP round-trip can raise RequestException, so the try block
    # is limited to it; parsing happens afterwards.
    try:
        response = requests.get(url, headers=HEADERS, timeout=15)
        print(f"DEBUG 1 (Status Code): {response.status_code}")
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"DEBUG 1 (Status Code): FAILED (Request Error: {e})")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    # Locate the "High & Low Weather Summary" table.
    table = soup.find('table', {'class': 'zebra tb-wt fw tb-hover'})
    if table is None:
        print("DEBUG 4 (Table Found): NO. (Class 'zebra tb-wt fw tb-hover' not found)")
        return []
    print("DEBUG 4 (Table Found)")

    # The table must contain a header row, the High row and the Low row.
    rows = table.find_all('tr')
    if len(rows) < 3:
        print(f"DEBUG 5: Insufficient rows in summary table (Expected >= 3, Got {len(rows)})")
        return []

    high_cols = rows[1].find_all('td')
    low_cols = rows[2].find_all('td')
    if len(high_cols) < 3 or len(low_cols) < 3:
        print(
            "DEBUG 5: Insufficient column data in High/Low rows "
            f"(Expected >= 3, Got High={len(high_cols)}, Low={len(low_cols)})"
        )
        return []

    # Cell order in both rows: temperature, humidity, pressure.
    monthly_record = {'month': month, 'year': year}
    for position, metric in enumerate(('temp', 'humidity', 'pressure')):
        monthly_record[f'{metric}_high'] = clean_data(high_cols[position].text)
        monthly_record[f'{metric}_low'] = clean_data(low_cols[position].text)

    print("DEBUG 6 (Record Collected)")
    return [monthly_record]

# --- Run Scraping Loop ---
for year in range(START_YEAR, END_YEAR + 1):
    # Every year is scraped January-December, except END_YEAR, for which
    # only January is requested.
    last_month = 1 if year == END_YEAR else 12

    for month in range(1, last_month + 1):
        SCRIBED_DATA.extend(scrape_weather_month(year, month))
        # Pause between requests.
        time.sleep(3)

# One row per successfully scraped month.
df_weather_scraped = pd.DataFrame(SCRIBED_DATA)

print(f"\n✅ Scraping Completed: {len(df_weather_scraped)} unique monthly records collected.")
print(f"Variables collected: {list(df_weather_scraped.columns)}")


--- Scraping START: 2024-01 (Full Monthly Summary) ---
DEBUG 1 (Status Code): 200
DEBUG 4 (Table Found)
DEBUG 6 (Record Collected)

--- Scraping START: 2024-02 (Full Monthly Summary) ---
DEBUG 1 (Status Code): 200
DEBUG 4 (Table Found)
DEBUG 6 (Record Collected)

--- Scraping START: 2024-03 (Full Monthly Summary) ---
DEBUG 1 (Status Code): 200
DEBUG 4 (Table Found)
DEBUG 6 (Record Collected)

--- Scraping START: 2024-04 (Full Monthly Summary) ---
DEBUG 1 (Status Code): 200
DEBUG 4 (Table Found)
DEBUG 6 (Record Collected)

--- Scraping START: 2024-05 (Full Monthly Summary) ---
DEBUG 1 (Status Code): 200
DEBUG 4 (Table Found)
DEBUG 6 (Record Collected)

--- Scraping START: 2024-06 (Full Monthly Summary) ---
DEBUG 1 (Status Code): 200
DEBUG 4 (Table Found)
DEBUG 6 (Record Collected)

--- Scraping START: 2024-07 (Full Monthly Summary) ---
DEBUG 1 (Status Code): 200
DEBUG 4 (Table Found)
DEBUG 6 (Record Collected)

--- Scraping START: 2024-08 (Full Monthly Summary) ---
DEBUG 1 (Status Code

In [19]:
# Preview the first rows of the scraped monthly weather table.
df_weather_scraped.head()

Unnamed: 0,month,year,temp_high,temp_low,humidity_high,humidity_low,pressure_high,pressure_low
0,1,2024,35.0,23.0,94.0,28.0,1020.0,1008.0
1,2,2024,37.0,22.0,94.0,30.0,1021.0,1007.0
2,3,2024,39.0,25.0,94.0,20.0,1016.0,1005.0
3,4,2024,41.0,28.0,89.0,19.0,1012.0,1002.0
4,5,2024,40.0,24.0,100.0,29.0,1014.0,1001.0


## Merge
- `df_weather_scraped`
- รวม (left merge) คอลัมน์ `temp_high`, `temp_low`, `humidity_high`, `humidity_low`, `pressure_high`, `pressure_low` เข้ากับแต่ละตาราง โดยใช้คีย์ `year` และ `month`

In [21]:
# Full-size datasets.
df_full, df_completed, df_pending = (
    pd.read_csv("../data/traffy_sla_labeled_full.csv"),
    pd.read_csv("../data/traffy_completed_for_model.csv"),
    pd.read_csv("../data/traffy_pending_for_predict.csv"),
)

# Sample datasets stored under data_samples/.
df_completed_samples, df_pending_samples = (
    pd.read_csv("../data_samples/02-completed_for_model_sample.csv"),
    pd.read_csv("../data_samples/02-pending_for_predict_sample.csv"),
)

In [27]:
def _attach_weather(frame, weather):
    """Left-join the monthly weather columns onto *frame*.

    Rows are matched on the ``year`` and ``month`` columns. Every row of
    *frame* is kept; rows with no matching month get missing values in the
    weather columns.

    Args:
        frame: DataFrame containing ``year`` and ``month`` columns.
        weather: Monthly weather DataFrame keyed by ``year`` and ``month``.

    Returns:
        A new DataFrame with the weather columns appended.

    Raises:
        pandas.errors.MergeError: If *weather* holds more than one row for
            the same (year, month), which would otherwise silently
            duplicate rows of *frame*.
    """
    return frame.merge(
        weather,
        on=["year", "month"],
        how="left",
        validate="many_to_one",
    )


df_full = _attach_weather(df_full, df_weather_scraped)
df_completed = _attach_weather(df_completed, df_weather_scraped)
df_pending = _attach_weather(df_pending, df_weather_scraped)
df_completed_samples = _attach_weather(df_completed_samples, df_weather_scraped)
df_pending_samples = _attach_weather(df_pending_samples, df_weather_scraped)


In [29]:
# Show the first three rows of df_full to check the merged weather columns.
df_full.head(3)

Unnamed: 0,ticket_id,std_type,type,type_list,organization,comment,comment_length,photo,photo_after,lng,...,month,day,urgency,SLA_days,temp_high,temp_low,humidity_high,humidity_low,pressure_high,pressure_low
0,2024-CVGNCL,road,{ถนน},['ถนน'],"เขตราชเทวี,ฝ่ายเทศกิจ เขตราชเทวี",ปัญหา: บริเวณหน้าบ้านเลขที่ 425/1-2 พบคนเรร่อน...,351,https://storage.googleapis.com/traffy_public_b...,https://storage.googleapis.com/traffy_public_b...,100.54204,...,7,31,high,12.46195,36.0,24.0,100.0,47.0,1012.0,1001.0
1,2024-ATFNH8,cleaning,{ความสะอาด},['ความสะอาด'],"เขตลาดกระบัง,ฝ่ายรักษาความสะอาดฯ เขตลาดกระบัง",ขยะไม่เก็บ,10,https://storage.googleapis.com/traffy_public_b...,https://storage.googleapis.com/traffy_public_b...,100.73608,...,11,28,mid,11.866649,35.0,21.0,100.0,41.0,1016.0,1006.0
2,ZN3ZZV,road,"{ถนน,กีดขวาง}","['ถนน', 'กีดขวาง']","เขตลาดพร้าว,ฝ่ายเทศกิจ เขตลาดพร้าว",ปัญหา: ริมถนนดังกล่าว บริเวณหน้าเซ่เวน อีเลฟเว...,424,https://storage.googleapis.com/traffy_public_b...,https://storage.googleapis.com/traffy_public_b...,100.59014,...,12,7,high,12.46195,34.0,19.0,94.0,35.0,1018.0,1006.0


In [30]:
# Show the first three rows of df_completed to check the merged weather columns.
df_completed.head(3)

Unnamed: 0,ticket_id,std_type,district,subdistrict,province_clean,urgency,comment,comment_length,day_of_week,hour_of_day,...,lng,lat,SLA_days,breach,temp_high,temp_low,humidity_high,humidity_low,pressure_high,pressure_low
0,2024-CVGNCL,road,ราชเทวี,ถนนพญาไท,กรุงเทพมหานคร,high,ปัญหา: บริเวณหน้าบ้านเลขที่ 425/1-2 พบคนเรร่อน...,351,2,2,...,100.54204,13.7542,12.46195,1,36.0,24.0,100.0,47.0,1012.0,1001.0
1,2024-ATFNH8,cleaning,ลาดกระบัง,คลองสองต้นนุ่น,กรุงเทพมหานคร,mid,ขยะไม่เก็บ,10,3,8,...,100.73608,13.73114,11.866649,0,35.0,21.0,100.0,41.0,1016.0,1006.0
2,2024-ET2E9B,road,คลองสามวา,บางชัน,กรุงเทพมหานคร,high,ไฟส่องสว่าง บริเวณเกาะกลางดับ ถนนสามวา (สามวา ...,54,6,7,...,100.72685,13.84308,12.46195,1,40.0,24.0,100.0,29.0,1014.0,1001.0


In [31]:
# Show the first three rows of df_pending to check the merged weather columns.
df_pending.head(3)

Unnamed: 0,ticket_id,std_type,district,subdistrict,province_clean,urgency,comment,comment_length,day_of_week,hour_of_day,...,year,lng,lat,SLA_days,temp_high,temp_low,humidity_high,humidity_low,pressure_high,pressure_low
0,ZN3ZZV,road,ลาดพร้าว,ลาดพร้าว,กรุงเทพมหานคร,high,ปัญหา: ริมถนนดังกล่าว บริเวณหน้าเซ่เวน อีเลฟเว...,424,5,18,...,2024,100.59014,13.80482,12.46195,34.0,19.0,94.0,35.0,1018.0,1006.0
1,2024-N9H4PR,road,พญาไท,สามเสนใน,กรุงเทพมหานคร,high,โคมไฟที่ติดตั้งใต้ท้องสะพาน ของทางพิเศษเฉลิมมห...,723,5,11,...,2024,100.54818,13.76384,12.46195,39.0,25.0,94.0,20.0,1016.0,1005.0
2,2024-GDN42R,road,หนองจอก,คลองสิบสอง,กรุงเทพมหานคร,high,ไฟดับ ถนนประชาสำราญ บริเวณตรงข้ามร้าน น้องหมิว...,98,5,3,...,2024,100.86285,13.89961,12.46195,34.0,19.0,94.0,35.0,1018.0,1006.0


## Save data

In [32]:
# Write each weather-enriched frame to its destination CSV, without the index.
for frame, destination in (
    (df_full, "../data/traffy_sla_labeled_full_with_weather.csv"),
    (df_completed, "../data/traffy_completed_for_model_with_weather.csv"),
    (df_pending, "../data/traffy_pending_for_predict_with_weather.csv"),
    (df_completed_samples, "../data_samples/03-traffy_completed_for_model_with_weather.csv"),
    (df_pending_samples, "../data_samples/03-traffy_pending_for_predict_with_weather.csv"),
):
    frame.to_csv(destination, index=False)