In [7]:
# !pip install -U sentence-transformers scikit-learn pandas numpy
# !pip install -U tqdm requests beautifulsoup4

import os, time, random
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
import pandas as pd



In [None]:
trip_com = pd.read_csv('OTA/trip_com_bkk.csv')[['name','name href']].dropna()

MAX_WORKERS = 10                  # ปรับได้ 6–16 ตามสภาพแวดล้อม/ความเร็ว
TIMEOUT     = 20                  # วินาที
RETRIES     = 3                   # จำนวนครั้ง retry ต่อ URL
BACKOFF     = 0.6                 # ตัวคูณหน่วงเวลาระหว่าง retry

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
}

def make_session():
    s = requests.Session()
    retry = Retry(
        total=RETRIES,
        backoff_factor=BACKOFF,
        status_forcelist=(429, 500, 502, 503, 504),
        allowed_methods=["GET", "HEAD", "OPTIONS"]
    )
    adapter = HTTPAdapter(max_retries=retry, pool_connections=MAX_WORKERS, pool_maxsize=MAX_WORKERS)
    s.mount("http://", adapter)
    s.mount("https://", adapter)
    s.headers.update(HEADERS)
    return s

def parse_address(html: str) -> str | None:
    soup = BeautifulSoup(html, "html.parser")
    # selector หลักตามที่ให้มา
    span = soup.find("span", {"class": "headInit_headInit-address_text__D_Atv"})
    if span:
        return span.get_text(strip=True)
    # เผื่อ layout เปลี่ยน ลอง selector อื่น ๆ ที่พบบ่อย
    cand = soup.select_one("span[data-testid*=address], div[data-testid*=hotelAddress], .address")
    return cand.get_text(strip=True) if cand else None

def fetch_one(idx: int, url: str, session: requests.Session):
    try:
        # เพิ่ม jitter เล็กน้อยลด burst
        time.sleep(random.uniform(0.0, 0.15))
        r = session.get(url, timeout=TIMEOUT)
        r.raise_for_status()
        addr = parse_address(r.text)
        return idx, addr, None
    except Exception as e:
        return idx, None, e

# ---------- เตรียมลิสต์ URL ----------
urls = trip_com['name href'].tolist()

# ---------- ดึงแบบขนาน ----------
session = make_session()
addresses_dict = {}
errors = []

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
    futures = [ex.submit(fetch_one, i, u, session) for i, u in enumerate(urls)]
    for fut in tqdm(as_completed(futures), total=len(futures), desc="Fetching hotel addresses"):
        idx, addr, err = fut.result()
        addresses_dict[idx] = addr
        if err is not None:
            errors.append((idx, urls[idx], err))

# ---------- จัดให้อยู่ตามลำดับเดิม ----------
addresses = [addresses_dict.get(i) for i in range(len(urls))]
trip_com["address"] = addresses

# (ถ้าต้องการดู error ไม่เยอะ)
if errors:
    print(f"Errors: {len(errors)} / {len(urls)}")
    for i, u, e in errors[:10]:
        print(f"[{i}] {u} -> {e}")

trip_com.head()
trip_com.to_csv('OTA/trip_com_bkk.csv', index=False)

Fetching hotel addresses: 100%|██████████| 1069/1069 [01:29<00:00, 11.97it/s]


Unnamed: 0,name,name href,address
1,นาซ่า กรุงเทพฯ,https://th.trip.com/hotels/detail/?cityId=359&...,"44 ถ. สุขุมวิท 71 แขวงสวนหลวง, เขตสวนหลวง, กรุ..."
2,โรงแรมอีสติน แกรนด์ พญาไท,https://th.trip.com/hotels/detail/?cityId=359&...,"18 ถ.พญาไท ทุ่งพญาไท, เขตราชเทวี, กรุงเทพฯ, 10400"
4,โรงแรมเอเชียกรุงเทพ,https://th.trip.com/hotels/detail/?cityId=359&...,"269 ถ.พ, เขตราชเทวี, กรุงเทพฯ, 10400"
5,โรงแรมทวิน ทาวเวอร์,https://th.trip.com/hotels/detail/?cityId=359&...,"88 รองเมือง ปทุมวัน, ถ.พระราม 6, เขตปทุมวัน, ก..."
6,โรงแรมเดอะ เบอร์เคลีย์ ประตูน้ำ,https://th.trip.com/hotels/detail/?cityId=359&...,"559 ราชปรารภ, มักกะสัน ราชเทวี, เขตราชเทวี, กร..."


In [9]:
pd.read_csv('OTA/booking_com_bkk.csv').iloc[:,[0,2]]

Unnamed: 0,c17271c4d7 href,b87c397a13
0,https://www.booking.com/hotel/th/somerset-rama...,Somerset Rama 9 Bangkok
1,https://www.booking.com/hotel/th/thonglor-ekam...,Petch Tower Ekkamai
2,https://www.booking.com/hotel/th/thee.th.html?...,"THEE Bangkok by TH District ""newly renovated"""
3,https://www.booking.com/hotel/th/the-mustang-b...,The Mustang Blu
4,https://www.booking.com/hotel/th/public-house....,"Public House Bangkok, a Member of Design Hotels"
...,...,...
1620,https://www.booking.com/hotel/th/banthat-thong...,บรรทัดทองโฮสเทล
1621,https://www.booking.com/hotel/th/vaeh-viean-ho...,Vaeh Viean Silom
1622,https://www.booking.com/hotel/th/paradiso-bout...,พาราดิโซบูติคสวีท
1623,https://www.booking.com/hotel/th/silom-space-h...,Silom Space Hostel


In [6]:
agoda = pd.read_csv('OTA/agoda.csv')
agoda.head()

Unnamed: 0,PropertyCard__Link href,sc-eCssSg src,sc-eCssSg src 2,sc-eCssSg src 3,sc-eCssSg src 4,sc-eCssSg src 5,sc-eCssSg src 6,sc-eCssSg src 7,sc-eCssSg src 8,sc-eCssSg src 9,...,sc-dlfnbm 31,sc-dlfnbm 32,sc-dlfnbm 33,ScreenReaderOnly__ScreenReaderOnlyStyled-sc-szxtre-0 8,Spanstyled__SpanStyled-sc-16tp9kb-0 2,PropertyCardPrice__Currency,PropertyCardPrice__Value,sc-dlfnbm 34,sc-gsTCUz href 12,ScreenReaderOnly__ScreenReaderOnlyStyled-sc-szxtre-0 9
0,https://www.agoda.com/th-th/lub-d-bangkok-siam...,https://pix8.agoda.net/hotelImages/230759/-1/6...,https://pix8.agoda.net/hotelImages/230759/0/1f...,https://q-xx.bstatic.com/xdata/images/hotel/ma...,https://pix8.agoda.net/property/76990112/0/ae8...,https://q-xx.bstatic.com/xdata/images/hotel/ma...,https://pix8.agoda.net/property/76990112/0/bda...,https://pix8.agoda.net/hotelImages/230759/0/5d...,https://pix8.agoda.net/hotelImages/230759/1219...,https://pix8.agoda.net/property/76990112/0/8e0...,...,,,,,,,,,,
1,https://www.agoda.com/th-th/baiyoke-sky-hotel/...,https://pix8.agoda.net/hotelImages/10637/-1/87...,https://pix8.agoda.net/hotelImages/10637/-1/58...,https://pix8.agoda.net/hotelImages/10637/-1/42...,https://pix8.agoda.net/hotelImages/10637/-1/11...,https://pix8.agoda.net/hotelImages/10637/-1/ea...,https://pix8.agoda.net/property/77649553/0/563...,https://pix8.agoda.net/hotelImages/10637/-1/72...,https://q-xx.bstatic.com/xdata/images/hotel/ma...,https://pix8.agoda.net/hotelImages/10637/-1/a7...,...,ลด 70% เหลืออีก 1 ห้อง,ลดจัดเต็ม! จองตอนนี้ ราคาดีสุดๆ,-70%,tooltip,ราคาสูงสุดที่มีคนจองห้องพักประเภทนี้เมื่อปีที่...,฿,3095.0,,,
2,https://www.agoda.com/th-th/chatrium-hotel-riv...,,,,,,,,,,...,,,,,,,,,,
3,https://www.agoda.com/th-th/solaria-nishitetsu...,https://pix8.agoda.net/hotelImages/14654101/-1...,https://pix8.agoda.net/hotelImages/14654101/17...,https://q-xx.bstatic.com/xdata/images/hotel/ma...,https://pix8.agoda.net/hotelImages/14654101/-1...,https://pix8.agoda.net/hotelImages/14654101/-1...,https://pix8.agoda.net/hotelImages/14654101/-1...,https://pix8.agoda.net/property/77004904/0/73d...,https://pix8.agoda.net/hotelImages/14654101/71...,https://q-xx.bstatic.com/xdata/images/hotel/ma...,...,,ลดจัดเต็ม! จองตอนนี้ ราคาดีสุดๆ,-61%,tooltip,ราคาสูงสุดที่มีคนจองห้องพักประเภทนี้เมื่อปีที่...,฿,3847.0,•,[object Object],"ราคาปกติ: ฿ 3,847"
4,https://www.agoda.com/th-th/amari-watergate-ho...,https://pix8.agoda.net/hotelImages/10632/-1/7d...,https://pix8.agoda.net/hotelImages/10632/-1/63...,https://pix8.agoda.net/property/10632/0/d9328b...,https://q-xx.bstatic.com/xdata/images/hotel/ma...,https://pix8.agoda.net/property/69368850/0/809...,https://pix8.agoda.net/hotelImages/10632/-1/50...,https://q-xx.bstatic.com/xdata/images/hotel/ma...,https://pix8.agoda.net/hotelImages/10632/-1/30...,https://pix8.agoda.net/property/10632/0/41e87a...,...,,,-58%,tooltip,ราคาสูงสุดที่มีคนจองห้องพักประเภทนี้เมื่อปีที่...,฿,3570.0,,,tooltip


In [18]:
# -*- coding: utf-8 -*-
import re, json, time, random
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

# ----------------- I/O -----------------
df = pd.read_csv("OTA/booking_com_bkk.csv")
booking_com = df.iloc[:, [0, 2]].copy()
booking_com.columns = ["name href", "name"]

# ----------------- Config -----------------
MAX_WORKERS, TIMEOUT, RETRIES, BACKOFF = 10, 20, 3, 0.6
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "Accept-Language": "th-TH,th;q=0.9,en-US;q=0.8,en;q=0.7",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}

def make_session() -> requests.Session:
    s = requests.Session()
    retry = Retry(total=RETRIES, backoff_factor=BACKOFF,
                  status_forcelist=(429,500,502,503,504),
                  allowed_methods=["GET","HEAD","OPTIONS"], raise_on_status=False)
    adapter = HTTPAdapter(max_retries=retry, pool_connections=MAX_WORKERS, pool_maxsize=MAX_WORKERS)
    s.mount("http://", adapter); s.mount("https://", adapter); s.headers.update(HEADERS)
    return s

def clean(s: str | None) -> str | None:
    return re.sub(r"\s+", " ", s).strip() if s else None

def parse_address(html: str) -> str | None:
    soup = BeautifulSoup(html, "html.parser")
    # ตัดของที่ไม่ใช่เนื้อหา
    for bad in soup.select('[aria-hidden="true"],[hidden],script,style,noscript'):
        bad.decompose()

    # 1) JSON-LD (แม่นสุด)
    for tag in soup.select('script[type="application/ld+json"]'):
        try:
            data = json.loads(tag.string or "")
            objs = data if isinstance(data, list) else [data]
            for o in objs:
                a = o.get("address") if isinstance(o, dict) else None
                if isinstance(a, dict):
                    parts = [a.get("streetAddress"), a.get("addressLocality"), a.get("addressRegion"), a.get("postalCode")]
                    country = a.get("addressCountry"); 
                    if isinstance(country, dict): country = country.get("name") or country.get("@id")
                    if country: parts.append(country)
                    txt = clean(", ".join([str(x) for x in parts if x]))
                    if txt: return txt
        except Exception:
            pass

    # 2) ตาม DOM ที่แนบ: container -> inner address div
    inner = soup.select_one('div.ca9d921c46.a21cb847ab div.b99b6ef58f.cb4b7a25d9.b06461926f') \
         or soup.select_one('div.ca9d921c46 div[class*="b99b6ef58f"][class*="cb4b7a25d9"]')
    if inner:
        txt = clean(inner.get_text(" ", strip=True))
        if txt: return txt

    # 3) ฟอลแบ็กทั่วไป
    cand = soup.select_one('[data-testid="address"], .hp_address_subtitle, span[data-node_tt_id*="address"], address')
    if cand:
        txt = clean(cand.get_text(" ", strip=True))
        if txt: return txt

    # 4) ฟอลแบ็กสุดท้าย: หา ZIP 5 หลักแล้วไต่ขึ้น
    m = soup.find(string=re.compile(r'\b\d{5}\b'))
    node = getattr(m, "parent", None)
    for _ in range(4):
        if not node: break
        if hasattr(node, "get_text"):
            txt = clean(node.get_text(" ", strip=True))
            if txt and "หลังจากจองห้องพัก" not in txt:
                return clean(re.split(r"หลังจากจองห้องพัก", txt)[0])
        node = getattr(node, "parent", None)
    return None

def fetch_one(i: int, url: str, session: requests.Session):
    try:
        time.sleep(random.uniform(0.0, 0.2))
        r = session.get(url, timeout=TIMEOUT); r.raise_for_status()
        return i, parse_address(r.text), None
    except Exception as e:
        return i, None, e

# ----------------- Run -----------------
urls = booking_com["name href"].tolist()
session = make_session()
addresses, errors = {}, []

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
    futures = [ex.submit(fetch_one, i, u, session) for i, u in enumerate(urls)]
    for fut in tqdm(as_completed(futures), total=len(futures), desc="Fetching hotel addresses"):
        i, addr, err = fut.result()
        addresses[i] = addr
        if err: errors.append((i, urls[i], err))

booking_com["address"] = [addresses.get(i) for i in range(len(urls))]

if errors:
    print(f"Errors: {len(errors)} / {len(urls)}")
    for i, u, e in errors[:10]: print(f"[{i}] {u} -> {e}")

booking_com.to_csv("OTA/booking_com_bkk_with_addr.csv", index=False)
print("Saved -> OTA/booking_com_bkk_with_addr.csv")


Fetching hotel addresses: 100%|██████████| 1625/1625 [16:26<00:00,  1.65it/s]

Saved -> OTA/booking_com_bkk_with_addr.csv





In [20]:
pd.read_csv("OTA/booking_com_bkk_with_addr.csv")

Unnamed: 0,name href,name,address
0,https://www.booking.com/hotel/th/somerset-rama...,Somerset Rama 9 Bangkok,"22 Ratchadaphisek Road Huai Khwang, Huai Kwang..."
1,https://www.booking.com/hotel/th/thonglor-ekam...,Petch Tower Ekkamai,
2,https://www.booking.com/hotel/th/thee.th.html?...,"THEE Bangkok by TH District ""newly renovated""","9/1 Sukumvit 20, Wattana, คลองเตย, 10110 กรุงเ..."
3,https://www.booking.com/hotel/th/the-mustang-b...,The Mustang Blu,"721 Maitri Chit Road Pom Prap, Pom Prap Sattru..."
4,https://www.booking.com/hotel/th/public-house....,"Public House Bangkok, a Member of Design Hotels","249 Soi Sukhumvit 31, Khlong Tan Nuea, Watthan..."
...,...,...,...
1620,https://www.booking.com/hotel/th/banthat-thong...,บรรทัดทองโฮสเทล,"1806/16-17 ถ.บรรทัดทอง รองเมือง ปทุมวัน, ปทุมว..."
1621,https://www.booking.com/hotel/th/vaeh-viean-ho...,Vaeh Viean Silom,"23 Pramot Alley, บางรัก, 10500 กรุงเทพมหานคร, ไทย"
1622,https://www.booking.com/hotel/th/paradiso-bout...,พาราดิโซบูติคสวีท,"1/11-12 สุขุมวิท 10, คลองเตย, 10110 กรุงเทพมหา..."
1623,https://www.booking.com/hotel/th/silom-space-h...,Silom Space Hostel,"8 Silom Road Soi 2, Bangrak, บางรัก, 10500 กรุ..."
