In [2]:
import csv, re, time, random
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

In [3]:
# Site root; also used to turn relative hrefs into absolute URLs.
BASE = "https://mogi.vn"
# Listing category to crawl (e.g. the root category).
base_URL = BASE + "/ha-noi/mua-nha-dat"

In [5]:
# One shared session for the whole crawl: pooled connections, a common retry
# policy, and browser-like default headers on every request.
session = requests.Session()

# Retry GET/HEAD requests on connection/read errors and on the listed HTTP
# status codes, using a backoff factor of 1.2 between attempts.
retry = Retry(
    total=5,
    connect=3,
    read=3,
    status=5,
    backoff_factor=1.2,
    allowed_methods=["GET", "HEAD"],
    status_forcelist=[429, 500, 502, 503, 504],
)

# The same adapter (and therefore the same retry policy) serves both schemes.
adapter = HTTPAdapter(pool_connections=50, pool_maxsize=50, max_retries=retry)
for scheme in ("http://", "https://"):
    session.mount(scheme, adapter)

session.headers["User-Agent"] = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/124 Safari/537.36"
)
session.headers["Accept-Language"] = "vi-VN,vi;q=0.9,en-US;q=0.8"

def bs(html):
    """Parse ``html`` into a BeautifulSoup tree.

    The "lxml" parser is tried first; if constructing the soup with it raises
    any exception, the markup is parsed again with "html.parser" instead.

    Args:
        html: Markup to parse.

    Returns:
        The parsed BeautifulSoup object.
    """
    try:
        tree = BeautifulSoup(html, "lxml")
    except Exception:
        # Deliberately broad: any lxml failure falls back to the second parser.
        tree = BeautifulSoup(html, "html.parser")
    return tree

# --- Phase 1: walk the paginated listing and collect detail-page URLs -------

# Crawl limits and pacing for the listing pages.
LAST_LIST_PAGE = 399      # listing pages 1..399 are requested at most
MAX_EMPTY_PAGES = 3       # stop after this many consecutive pages without listings
MAX_FAILED_PAGES = 5      # stop after this many consecutive failed requests
LIST_TIMEOUT = 25         # timeout (seconds) passed to each listing request
LIST_DELAY = (0.6, 1.2)   # random pause range (seconds) between requests

Links = []
empty_streak = 0
fail_streak = 0

for page in range(1, LAST_LIST_PAGE + 1):
    url = f"{base_URL}?cp={page}"

    failed = False
    try:
        resp = session.get(url, timeout=LIST_TIMEOUT)
    except Exception as e:
        # Best-effort crawl: report the error and move on to the next page.
        print("List GET error:", e)
        failed = True
    else:
        if resp.status_code != 200 or not resp.text:
            print("List bad status:", resp.status_code)
            failed = True

    if failed:
        # Failures previously bypassed every stop condition, so an outage or a
        # block would keep requesting all remaining pages. Give up once several
        # requests in a row have failed.
        fail_streak += 1
        if fail_streak >= MAX_FAILED_PAGES:
            print(f"Stopping at page={page}: {fail_streak} consecutive failed requests.")
            break
        time.sleep(random.uniform(*LIST_DELAY))
        continue
    fail_streak = 0

    soup = bs(resp.text)
    anchors = soup.find_all("a", class_="link-overlay")
    if anchors:
        empty_streak = 0
    else:
        # Several empty pages in a row are treated as the end of the listing.
        empty_streak += 1
        if empty_streak >= MAX_EMPTY_PAGES:
            print(f"Không thấy tin trong {empty_streak} trang liền → dừng ở page={page}.")
            break

    for a in anchors:
        href = a.get("href")
        if href:
            Links.append(urljoin(BASE, href))  # always store an absolute URL

    time.sleep(random.uniform(*LIST_DELAY))

# Drop duplicate URLs while keeping first-seen order.
Links = list(dict.fromkeys(Links))
print("Collected detail links:", len(Links))

# --- Phase 2: fetch every detail page and extract one record per listing ----

def _parse_detail(soup, link):
    """Build the record dict for one listing from its parsed detail page.

    Args:
        soup: Parsed detail page (BeautifulSoup object).
        link: URL of the detail page; stored under the "url" key.

    Returns:
        dict with "url", "address", "price", "mo_gioi_ten", "mo_gioi_link" and
        "mo_gioi_phone" always present (None when not found), "loai_hinh" and
        "title" when the breadcrumb has enough entries, plus one entry per
        non-empty label/value pair found in the "info-attr" blocks.
    """
    record = {"url": link}

    # Breadcrumb entries are read by position; guard against short breadcrumbs.
    breadcrumb = soup.find("ul", class_="breadcrumb clearfix")
    if breadcrumb:
        crumbs = breadcrumb.find_all("span")
        if len(crumbs) > 1:
            record["loai_hinh"] = crumbs[1].get_text(strip=True)
        if len(crumbs) > 4:
            record["title"] = crumbs[4].get_text(strip=True)

    address = soup.find("div", class_="address")
    record["address"] = address.get_text(strip=True) if address else None
    price = soup.find("div", class_="price")
    record["price"] = price.get_text(strip=True) if price else None

    # Each attribute block holds a label span followed by a value span; the
    # label text itself becomes the column name.
    for info in soup.find_all("div", class_="info-attr clearfix"):
        cells = info.find_all("span")
        if len(cells) >= 2:
            key = cells[0].get_text(strip=True)
            val = cells[1].get_text(strip=True)
            if key and val:
                record[key] = val

    agent_name = None
    agent_link = None
    phone_raw = None

    agent = soup.find("div", class_="agent-widget widget")
    if agent:
        profile = agent.find("a", href=True)
        if profile:
            agent_name = profile.get_text(strip=True)
            agent_link = urljoin(BASE, profile["href"])

    # The phone number is read from the argument of PhoneFormat('...') inside
    # an ng-bind attribute. The find() filter already requires that attribute,
    # so no separate has_attr check is needed.
    phone_span = soup.find("span", attrs={"ng-bind": re.compile(r"PhoneFormat\(")})
    if phone_span:
        m = re.search(r"PhoneFormat\('(\d{8,12})'\)", phone_span["ng-bind"])
        if m:
            phone_raw = m.group(1)

    record["mo_gioi_ten"] = agent_name
    record["mo_gioi_link"] = agent_link
    record["mo_gioi_phone"] = phone_raw
    return record


Data = []
for link in Links:
    try:
        res = session.get(link, timeout=10)
    except Exception as e:
        # Best-effort crawl: report the failure and skip this listing.
        print("Detail GET error:", e, link)
        time.sleep(random.uniform(0.6, 1.2))
        continue

    if res.status_code != 200 or not res.text:
        print("Detail bad status:", res.status_code, link)
        time.sleep(random.uniform(0.6, 1.2))
        continue

    soup = bs(res.text)
    data = _parse_detail(soup, link)
    Data.append(data)

    # Random pause between detail requests.
    time.sleep(random.uniform(0.5, 1))

# Write all records to CSV.
# Records may not share the same keys, so the header is the sorted union of
# every record's keys; csv.DictWriter fills fields a record lacks with its
# default restval (an empty string).
# `csv` is already imported in the notebook's import cell, so it is not
# re-imported here.
OUTPUT_CSV = 'batdongsan1.csv'  # single source for the path and the message below

fieldnames = sorted({k for d in Data for k in d})
with open(OUTPUT_CSV, 'w', newline='', encoding='utf-8') as f:
    w = csv.DictWriter(f, fieldnames=fieldnames)
    w.writeheader()
    w.writerows(Data)

print(f"Đã lưu {len(Data)} dòng vào {OUTPUT_CSV}")

Collected detail links: 5965
Đã lưu 5965 dòng vào batdongsan1.csv


In [6]:
Links[:10]  # preview only: displaying the whole list floods the notebook output

['https://mogi.vn/quan-hai-ba-trung/mua-nha-hem-ngo/ban-nha-pho-vong-hbt-38m2-mt-3-7m-gia-nhinh-6-ty-oto-do-cua-id22658178',
 'https://mogi.vn/quan-cau-giay/mua-nha-mat-tien-pho/ban-nha-pho-doan-ke-thien-cau-giay-dt-60m2-xay-7-tang-thang-may-id22381248',
 'https://mogi.vn/quan-cau-giay/mua-nha-mat-tien-pho/ban-toa-apartment-tran-thai-tong-2-thoang-120m2-x-9-tang-48-ty-id22668290',
 'https://mogi.vn/quan-long-bien/mua-nha-hem-ngo/ban-nha-thach-ban-30m-x-5-tang-ngo-thong-rong-gan-3m-gia-3-5-ty-id22535596',
 'https://mogi.vn/huyen-gia-lam/mua-nha-hem-ngo/ban-gap-3-tang-x65m-giap-hai-ecopark-da-ton-nha-4pn-2wc-ngo-thoang-id22698617',
 'https://mogi.vn/quan-nam-tu-liem/mua-can-ho-chung-cu/ban-can-ho-chung-cu-khu-do-thi-me-tri-ha-id22712806',
 'https://mogi.vn/quan-thanh-xuan/mua-nha-mat-tien-pho/ban-ccmn-trieu-khuc-100m2-x8t-thang-may-24p-khep-kin-120tr-thang-id22657908',
 'https://mogi.vn/quan-ba-dinh/mua-dat-tho-cu/ban-dat-pho-truc-bach-xay-khach-san-van-phong-310m2-mt-10m-146-ty-id226582