In [23]:
import pandas as pd
from pathlib import Path
from IPython.display import display
from sklearn.preprocessing import OneHotEncoder

## Inspect Dataframe

`inspect_dataframe(df)` — prints the shape, the first rows, `info()`, and the per-column missing-value counts.

In [24]:
def inspect_dataframe(df):
    """
    Print a quick overview of a DataFrame.

    Shows, in order: row/column counts, the first rows (rich display),
    the `info()` summary, and the number of missing values per column.

    - df: pandas DataFrame to inspect; it is only read, never modified

    Returns None; everything goes to the cell output.
    """
    # Every call below is read-only, so the frame is inspected directly
    # instead of paying for a full defensive copy of the data.
    print(f"Loaded {len(df):,} rows × {len(df.columns):,} columns")
    display(df.head())

    print("\nDataFrame info:")
    df.info()

    print("\nMissing values per column:")
    print(df.isna().sum())
    print("----------------------------------------")

## drop_rows_with_nan

drop_rows_with_nan(df, cols, inplace=False)

In [25]:
def drop_rows_with_nan(df, cols, inplace=False, placeholder_values=("unknown",)):
    """
    Drop rows that have a missing value in any column listed in `cols`.

    A value counts as missing when it is NaN, or when its string form
    (whitespace-stripped, case-insensitive) equals one of `placeholder_values`.

    - df: pandas DataFrame
    - cols: list-like of column names to check
    - inplace: if True, modify df in-place and return it; otherwise return a new DataFrame
    - placeholder_values: string (or iterable of strings) treated like NaN;
      defaults to ("unknown",). Pass () to drop on NaN only.

    Raises ValueError if any name in `cols` is not a column of `df`.
    """
    cols = list(cols)
    missing_cols = [c for c in cols if c not in df.columns]
    if missing_cols:
        raise ValueError(f"Columns not found in DataFrame: {missing_cols}")

    # A bare string would otherwise be iterated character by character.
    if isinstance(placeholder_values, str):
        placeholder_values = (placeholder_values,)
    placeholders = {str(value).strip().lower() for value in placeholder_values}

    before = len(df)
    result = df.dropna(subset=cols)

    # Compare on the string form so non-string columns can be checked as well.
    mask_bad = result[cols].apply(
        lambda s: s.astype(str).str.strip().str.lower().isin(placeholders)
    ).any(axis=1)
    result = result[~mask_bad]
    after = len(result)

    # The count covers both NaN rows and placeholder rows, so the message says so.
    print(
        f"Dropped {before - after} rows with NaN or placeholder values in {cols} "
        f"(before: {before}, after: {after})"
    )
    print("----------------------------------------")

    if inplace:
        # Replace the contents of the caller's DataFrame object so that
        # existing references to `df` see the filtered rows.
        df.drop(df.index, inplace=True)
        for col in result.columns:
            df[col] = result[col]
        return df
    return result

## add_time_range_and_filter

In [26]:
import pandas as pd

def add_time_range_and_filter(
    df,
    start_col="start",
    end_col="end",
    duration_col="duration_minutes",
    *,
    range_col="time_range",
):
    """
    Clean a start/end time pair and derive duration and range columns.

    - Parse `start_col` / `end_col` ("HH:MM" strings) as datetimes
    - Remove rows where end < start
    - Add `duration_col`: end - start, in minutes (float)
    - Add `range_col`: the string "HH:MM - HH:MM"

    - df: pandas DataFrame; it is copied, the caller's frame is not modified
    - range_col: keyword-only name of the range column (default "time_range")

    Returns the cleaned copy. A short report is printed along the way.

    NOTE: a later cell in this notebook defines a function with the same
    name; after running all cells, that later definition is the one in effect.
    NOTE(review): values that fail to parse become NaT (errors="coerce") and
    do not appear to be caught by the end < start filter — confirm whether
    such rows should be dropped as well.
    """
    df = df.copy()  # avoid modifying original df

    # Convert HH:MM string → datetime (helper columns, removed before returning)
    df["_start_dt"] = pd.to_datetime(df[start_col], format="%H:%M", errors="coerce")
    df["_end_dt"] = pd.to_datetime(df[end_col], format="%H:%M", errors="coerce")

    total_rows = len(df)

    # Rows whose end time lies before the start time are invalid.
    invalid_mask = df["_end_dt"] < df["_start_dt"]

    print("===== Time Range Cleaning Report =====")
    print(f"Total rows before cleaning : {total_rows}")
    print(f"Rows removed (end < start) : {int(invalid_mask.sum())}")

    # Keep valid rows only
    df = df[~invalid_mask].copy()

    # Duration in minutes: seconds / 60
    df[duration_col] = (df["_end_dt"] - df["_start_dt"]).dt.total_seconds() / 60.0
    # The column label must be a string; the builtin `range` is not a valid key here.
    df[range_col] = (
        df["_start_dt"].dt.strftime("%H:%M") + " - " + df["_end_dt"].dt.strftime("%H:%M")
    )

    print(f"Total rows after cleaning  : {len(df)}")
    print("======================================\n")

    # Drop helper columns
    df = df.drop(columns=["_start_dt", "_end_dt"])

    return df


## add_time_range_and_filter (revised definition)

In [27]:
import pandas as pd

def add_time_range_and_filter(
    df,
    start_col="start",
    end_col="end",
    range_col="time_range",
    duration_col="duration_minutes"
):
    """
    Clean a start/end time pair and derive range and duration columns.

    - Parse `start_col` / `end_col` ("HH:MM" strings) as datetimes
    - Remove rows where end < start
    - Add `range_col`: the string "HH:MM - HH:MM"
    - Add `duration_col`: end - start, in MINUTES (float)

    - df: pandas DataFrame; it is copied, the caller's frame is not modified

    Returns the cleaned copy. A short report is printed along the way.

    NOTE(review): values that fail to parse become NaT (errors="coerce") and
    do not appear to be caught by the end < start filter — confirm whether
    such rows should be dropped as well.
    """
    df = df.copy()

    # Convert HH:MM string → datetime (helper columns, removed before returning)
    df["_start_dt"] = pd.to_datetime(df[start_col], format="%H:%M", errors="coerce")
    df["_end_dt"] = pd.to_datetime(df[end_col], format="%H:%M", errors="coerce")

    # Count rows before cleaning
    total_rows = len(df)

    # Rows whose end time lies before the start time are invalid.
    # Summing the mask counts them without building a throwaway frame.
    invalid_mask = df["_end_dt"] < df["_start_dt"]

    print("===== Time Range Cleaning Report =====")
    print(f"Total rows before cleaning : {total_rows}")
    print(f"Rows with end < start      : {int(invalid_mask.sum())}")

    # Filter valid rows
    df = df[~invalid_mask].copy()

    # 1) Time range column ("HH:MM - HH:MM")
    df[range_col] = (
        df["_start_dt"].dt.strftime("%H:%M")
        + " - "
        + df["_end_dt"].dt.strftime("%H:%M")
    )

    # 2) Duration in minutes: seconds / 60
    df[duration_col] = (df["_end_dt"] - df["_start_dt"]).dt.total_seconds() / 60.0

    print(f"Total rows after cleaning  : {len(df)}")
    print("======================================\n")

    # Clean helper columns
    df = df.drop(columns=["_start_dt", "_end_dt"])

    return df


## call Function

In [28]:
# Read the raw scraped CSV from disk; stop with a clear error if it is missing.
path = Path("..") / "data" / "scraping_data.csv"
if path.exists():
    df_raw = pd.read_csv(path)
else:
    raise FileNotFoundError(f"{path} not found in {Path.cwd()}")

In [29]:
# Work on a copy so df_raw keeps the data exactly as loaded.
df = df_raw.copy()

# 0.) overview of the data before any cleaning
inspect_dataframe(df)

# The cleaning steps run as one chain, in this order:
# 1.) drop rows with missing values in the required columns
# 2.) add the time-range / duration columns and remove rows where end < start
# 3.) drop the location column
# 4.) keep only rows whose province is Bangkok (case-insensitive)
df = (
    drop_rows_with_nan(
        df,
        [
            "date", "day_of_week", "start", "end", "location",
            "district", "province", "temp", "rain", "wind_gust",
        ],
    )
    .pipe(add_time_range_and_filter)
    .drop(columns="location")
    .loc[lambda frame: frame["province"].str.lower() == "bangkok"]
)

Loaded 11,355 rows × 10 columns


Unnamed: 0,date,day_of_week,start,end,location,district,province,temp,rain,wind_gust
0,2025-12-05,Friday,08:30,15:30,"Ratchaphruek Road, Soi nearby Rama 5 Road Roun...",Unknown,Nonthaburi,,,
1,2025-12-05,Friday,08:30,15:30,The area along the side of Bang Kruai - Sai No...,Unknown,Nonthaburi,,,
2,2025-12-04,Thursday,09:00,12:00,"Seri Thai Road, Soi Seri Thai 14, Bangchak Gas...",Bung Kum,Bangkok,,,
3,2025-12-04,Thursday,09:00,14:00,"Pracha Ruam Chai Road, Soi Pracha Ruam Chai 45",Saen Khlong Tan,Bangkok,,,
4,2025-12-04,Thursday,09:00,12:00,"Bypass Nonthaburi Road, the area nearby the be...",Unknown,Nonthaburi,,,



DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11355 entries, 0 to 11354
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   date         11355 non-null  object 
 1   day_of_week  11355 non-null  object 
 2   start        11355 non-null  object 
 3   end          11355 non-null  object 
 4   location     11355 non-null  object 
 5   district     11355 non-null  object 
 6   province     11355 non-null  object 
 7   temp         11322 non-null  float64
 8   rain         11322 non-null  float64
 9   wind_gust    11322 non-null  float64
dtypes: float64(3), object(7)
memory usage: 887.2+ KB

Missing values per column:
date            0
day_of_week     0
start           0
end             0
location        0
district        0
province        0
temp           33
rain           33
wind_gust      33
dtype: int64
----------------------------------------
Dropped 3175 rows with NaN in ['date', 'day_of_week', 

In [30]:
inspect_dataframe(df)

Loaded 6,745 rows × 11 columns


Unnamed: 0,date,day_of_week,start,end,district,province,temp,rain,wind_gust,time_range,duration_minutes
34,2025-11-26,Wednesday,08:30,13:30,Kannayao,Bangkok,23.3435,0.0,14.04,08:30 - 13:30,300.0
35,2025-11-26,Wednesday,09:00,14:00,Thawi Watthana,Bangkok,23.3435,0.0,14.04,09:00 - 14:00,300.0
38,2025-11-24,Monday,09:00,13:00,Taling Chan,Bangkok,24.8435,0.0,24.119999,09:00 - 13:00,240.0
41,2025-11-24,Monday,08:30,13:30,Lat Phrao,Bangkok,25.075998,0.0,24.119999,08:30 - 13:30,300.0
42,2025-11-23,Sunday,08:00,12:00,Phra Nakhon,Bangkok,22.8935,0.0,19.8,08:00 - 12:00,240.0



DataFrame info:
<class 'pandas.core.frame.DataFrame'>
Index: 6745 entries, 34 to 11354
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   date              6745 non-null   object 
 1   day_of_week       6745 non-null   object 
 2   start             6745 non-null   object 
 3   end               6745 non-null   object 
 4   district          6745 non-null   object 
 5   province          6745 non-null   object 
 6   temp              6745 non-null   float64
 7   rain              6745 non-null   float64
 8   wind_gust         6745 non-null   float64
 9   time_range        6745 non-null   object 
 10  duration_minutes  6745 non-null   float64
dtypes: float64(4), object(7)
memory usage: 632.3+ KB

Missing values per column:
date                0
day_of_week         0
start               0
end                 0
district            0
province            0
temp                0
rain                0
wind_gust 

## Mapping province to coords

In [31]:
import pandas as pd
import numpy as np

# ---------------------------------------------------------
# 1) Coordinate mapping (edit / extend this dictionary)
#    Keys are normalized location names (lower + single space).
# ---------------------------------------------------------
# Maps a normalized location name to a (latitude, longitude) tuple.
# Every key must already be in normalized form (lowercase, single spaces,
# no leading/trailing whitespace); a key in any other form can never be
# produced by normalize_location_name and would be unreachable through it.
COORDS = {
    # --- Bangkok districts (examples) ---
    # NOTE(review): "kannayao" and "khan na yao" look like two spellings of the
    # same place but carry different coordinates — confirm which is intended.
    "kannayao": (13.8449, 100.6764),
    "khan na yao": (13.8340, 100.6940),
    "thawi watthana": (13.7800, 100.3530),
    "taling chan": (13.7720, 100.4570),
    "lat phrao": (13.8060, 100.6070),
    "phra nakhon": (13.7563, 100.4976),
    "sathon": (13.7180, 100.5290),
    "sathorn": (13.7180, 100.5290),  # alias
    "nonthaburi": (13.8591, 100.5217),
    "phaya thai": (13.7860, 100.5420),
    "bang khun thian": (13.6365, 100.4344),
    "din daeng": (13.7833, 100.5650),
    "watthana": (13.7350, 100.5800),
    "wattana": (13.7350, 100.5800),
    "saphan sung": (13.7710, 100.7000),
    "bang rak": (13.7290, 100.5302),
    "suan luang": (13.7320, 100.6460),
    "bang na": (13.6680, 100.6050),
    "don mueang": (13.9100, 100.5910),
    "pom prap sattru phai": (13.7537, 100.5090),
    "thung khru": (13.6400, 100.5070),
    "khlong san": (13.7289, 100.5103),
    "nong khaem": (13.7080, 100.3890),
    "bueng kum": (13.8070, 100.6750),
    "bung kum": (13.8070, 100.6750),
    "sai mai": (13.9210, 100.6390),
    "lat krabang": (13.7276, 100.7940),
    "min buri": (13.8120, 100.7473),
    "bang bon": (13.6480, 100.4080),
    "bang khen": (13.8717, 100.5960),
    "bangkok noi": (13.7760, 100.4700),
    "dusit": (13.7768, 100.5147),
    "bang phlat": (13.7900, 100.4980),
    "bang phlad": (13.7900, 100.4980),
    "thon buri": (13.7286, 100.4890),
    "bang khae": (13.6940, 100.4090),
    "lat yao": (13.8300, 100.5890),
    "pathum wan": (13.7448, 100.5347),
    "pathumwan": (13.7448, 100.5347),
    "phasi charoen": (13.7130, 100.4440),
    "pasi charoen": (13.7130, 100.4440),
    "lak si": (13.8820, 100.5700),
    "laksi": (13.8820, 100.5700),
    "chom thong": (13.6860, 100.4730),
    "nong chok": (13.8250, 100.8780),
    "yan nawa": (13.6980, 100.5450),
    "yannawa": (13.6980, 100.5450),
    "khlong sam wa": (13.8560, 100.7310),
    "huai khwang": (13.7761, 100.5716),
    "bang sue": (13.8060, 100.5300),
    "bang kho laem": (13.6950, 100.5080),
    "bang khlo laem": (13.6950, 100.5080),
    "bangkapi": (13.7649, 100.6420),
    "bang kapi": (13.7649, 100.6420),
    "chatuchak": (13.8280, 100.5540),
    "bang phli": (13.5840, 100.7510),
    "phra pradaeng": (13.6580, 100.5370),

    # big-area aliases
    "bangkok": (13.7563, 100.5018),
    "sukhumvit": (13.7200, 100.5680),
    "sukhumhumvit": (13.7200, 100.5680),
    "srinakarin": (13.6900, 100.6460),

    # nearby provinces
    "samut prakan": (13.5991, 100.5998),
    "phra nakhon si ayutthaya": (14.3532, 100.5689),

    # TODO: add more from your list as needed...
}

# ---------------------------------------------------------
# 2) Helper: normalize text (lower, clean spaces, etc.)
# ---------------------------------------------------------
def normalize_location_name(raw: str) -> str | None:
    """Normalize raw location string to a simple 'key' in COORDS."""
    if not isinstance(raw, str):
        return None

    # Lowercase and strip basic whitespace
    s = raw.strip().lower()

    # Replace some punctuation with spaces
    for ch in [",", ".", "(", ")", "’", "'"]:
        s = s.replace(ch, " ")

    # Normalize different dashes to spaces
    for ch in ["-", "–", "—", "/"]:
        s = s.replace(ch, " ")

    # Collapse multiple spaces into one
    s = " ".join(s.split())

    return s if s else None

# ---------------------------------------------------------
# 3) Main function: add lat/lon, drop unmapped rows
# ---------------------------------------------------------
def attach_coords_and_drop_unmapped(
    df: pd.DataFrame,
    location_col: str = "province"
) -> pd.DataFrame:
    """
    Add `latitude` / `longitude` columns looked up from COORDS.

    The values of `location_col` are normalized with normalize_location_name
    and used as keys into COORDS. Rows whose key has no entry are dropped,
    and the result gets a fresh 0..n-1 index. The input frame is copied,
    not modified. Before/after row counts are printed.
    """
    def _coord_part(key, position):
        # position 0 -> latitude, 1 -> longitude; NaN marks "no mapping"
        pair = None if key is None else COORDS.get(key)
        return np.nan if pair is None else pair[position]

    working = df.copy()

    # Temporary helper column holding the normalized lookup key
    working["_loc_key"] = working[location_col].apply(normalize_location_name)
    working["latitude"] = working["_loc_key"].apply(lambda key: _coord_part(key, 0))
    working["longitude"] = working["_loc_key"].apply(lambda key: _coord_part(key, 1))

    # Rows without coordinates could not be mapped; remove them
    rows_in = len(working)
    mapped = working.dropna(subset=["latitude", "longitude"]).reset_index(drop=True)
    rows_out = len(mapped)

    print(f"Total rows before mapping : {rows_in}")
    print(f"Total rows after mapping  : {rows_out}")
    print(f"Dropped rows (unmapped)   : {rows_in - rows_out}")

    # The helper column is not part of the result
    return mapped.drop(columns=["_loc_key"])

# ---------------------------------------------------------
# 4) usage
# ---------------------------------------------------------
# Attach coordinates keyed on the province column, then preview the result.
# NOTE(review): df was already filtered to province == "bangkok", so every row
# receives the same "bangkok" coordinates; COORDS mostly holds district keys,
# so `district` may have been the intended column — confirm.
df_mapped = df.pipe(attach_coords_and_drop_unmapped, location_col="province")
df_mapped.head()

Total rows before mapping : 6745
Total rows after mapping  : 6745
Dropped rows (unmapped)   : 0


Unnamed: 0,date,day_of_week,start,end,district,province,temp,rain,wind_gust,time_range,duration_minutes,latitude,longitude
0,2025-11-26,Wednesday,08:30,13:30,Kannayao,Bangkok,23.3435,0.0,14.04,08:30 - 13:30,300.0,13.7563,100.5018
1,2025-11-26,Wednesday,09:00,14:00,Thawi Watthana,Bangkok,23.3435,0.0,14.04,09:00 - 14:00,300.0,13.7563,100.5018
2,2025-11-24,Monday,09:00,13:00,Taling Chan,Bangkok,24.8435,0.0,24.119999,09:00 - 13:00,240.0,13.7563,100.5018
3,2025-11-24,Monday,08:30,13:30,Lat Phrao,Bangkok,25.075998,0.0,24.119999,08:30 - 13:30,300.0,13.7563,100.5018
4,2025-11-23,Sunday,08:00,12:00,Phra Nakhon,Bangkok,22.8935,0.0,19.8,08:00 - 12:00,240.0,13.7563,100.5018


In [32]:
inspect_dataframe(df)

Loaded 6,745 rows × 11 columns


Unnamed: 0,date,day_of_week,start,end,district,province,temp,rain,wind_gust,time_range,duration_minutes
34,2025-11-26,Wednesday,08:30,13:30,Kannayao,Bangkok,23.3435,0.0,14.04,08:30 - 13:30,300.0
35,2025-11-26,Wednesday,09:00,14:00,Thawi Watthana,Bangkok,23.3435,0.0,14.04,09:00 - 14:00,300.0
38,2025-11-24,Monday,09:00,13:00,Taling Chan,Bangkok,24.8435,0.0,24.119999,09:00 - 13:00,240.0
41,2025-11-24,Monday,08:30,13:30,Lat Phrao,Bangkok,25.075998,0.0,24.119999,08:30 - 13:30,300.0
42,2025-11-23,Sunday,08:00,12:00,Phra Nakhon,Bangkok,22.8935,0.0,19.8,08:00 - 12:00,240.0



DataFrame info:
<class 'pandas.core.frame.DataFrame'>
Index: 6745 entries, 34 to 11354
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   date              6745 non-null   object 
 1   day_of_week       6745 non-null   object 
 2   start             6745 non-null   object 
 3   end               6745 non-null   object 
 4   district          6745 non-null   object 
 5   province          6745 non-null   object 
 6   temp              6745 non-null   float64
 7   rain              6745 non-null   float64
 8   wind_gust         6745 non-null   float64
 9   time_range        6745 non-null   object 
 10  duration_minutes  6745 non-null   float64
dtypes: float64(4), object(7)
memory usage: 632.3+ KB

Missing values per column:
date                0
day_of_week         0
start               0
end                 0
district            0
province            0
temp                0
rain                0
wind_gust 

## Save data

In [33]:
# Write the cleaned frame to CSV without the index column ("utf-8-sig" adds a BOM).
# NOTE(review): this saves `df`, not `df_mapped`, so the latitude/longitude
# columns are not part of the file — confirm that is intended.
df.to_csv(
    path_or_buf="../data/clean_scraping_data.csv",
    encoding="utf-8-sig",
    index=False,
)