In [7]:
import os
import requests
import pandas as pd
import time
import random

# Cấu hình

In [8]:
# --- Crawl limits ------------------------------------------------------------
# Listing pages requested per category, and products requested per page.
MAX_PAGES = 2
PRODUCTS_PER_PAGE = 40
# (min, max) bounds in seconds for the random pause taken after each page.
DELAY_RANGE = (1.0, 2.5)

# --- HTTP --------------------------------------------------------------------
# Headers sent with every outgoing request (listing API and image downloads).
HEADERS = {
    "User-Agent": "Mozilla/5.0",
    "Accept": "application/json, text/plain, */*",
    "Accept-Language": "en-US,en;q=0.9",
}

# --- Storage -----------------------------------------------------------------
# Root folder for downloaded images; created up front so a fresh checkout works.
IMAGE_DIR = "images"
os.makedirs(IMAGE_DIR, exist_ok=True)

# Danh sách mục cào

In [9]:
# Categories to crawl. "id" is the numeric category id sent to the listing API;
# "name" is the human-readable label used for the image sub-folder and for the
# "category" column of the exported records.
CATEGORIES = [
    {"id": category_id, "name": label}
    for category_id, label in (
        (1795, "Smart Phone"),
        (1805, "Speaker"),
        (1811, "Headphones"),
        (1821, "Power Bank"),
        (3428, "Gaming Mouse"),
        (8039, "Smart Watch"),
    )
]

# Tải ảnh

In [10]:
def download_image(image_url, product_id, category_dir):
    """Download one product image into ``category_dir`` (best effort).

    Args:
        image_url: Address of the image. Protocol-relative URLs
            (``//host/path``) are upgraded to ``https:``.
        product_id: Identifier used as the file stem; the file is always
            saved as ``<product_id>.jpg``.
        category_dir: Directory the file is written to. It is created if it
            does not exist yet.

    Returns:
        ``"<category folder name>/<product_id>.jpg"`` on success, or ``""``
        when an argument is missing, the URL is not a string, the request
        fails, the server sends an empty body, or the file cannot be written.
        Failures are reported with a printed warning rather than raised.
    """
    if not image_url or not product_id:
        return ""
    if not isinstance(image_url, str):
        # A truthy non-string value (e.g. a nested JSON object) would raise
        # AttributeError on .startswith() before the try block is entered and
        # propagate to the caller; treat it like any other failed download.
        print(f"⚠️ Error downloading image for {product_id}: invalid image URL {image_url!r}")
        return ""
    if image_url.startswith("//"):
        image_url = "https:" + image_url
    try:
        response = requests.get(image_url, headers=HEADERS, timeout=10)
        response.raise_for_status()
        if not response.content:
            # Do not leave a zero-byte .jpg behind and report it as a success.
            raise ValueError("empty response body")
        filename = f"{product_id}.jpg"
        if category_dir:
            # Do not depend on the caller having created the folder already.
            os.makedirs(category_dir, exist_ok=True)
        filepath = os.path.join(category_dir, filename)
        with open(filepath, "wb") as f:
            f.write(response.content)
        # normpath drops a trailing separator, which would otherwise make
        # basename() return "" and lose the folder part of the relative path.
        folder = os.path.basename(os.path.normpath(category_dir)) if category_dir else ""
        return os.path.join(folder, filename)  # relative path
    except Exception as e:
        # Deliberately broad: a single bad image must never stop the crawl.
        print(f"⚠️ Error downloading image for {product_id}: {e}")
        return ""

# Cào sản phẩm từ 1 danh mục

In [11]:
def crawl_category(category_id, category_name):
    """Collect product records for one category from the Tiki listing API.

    Requests pages 1..MAX_PAGES (PRODUCTS_PER_PAGE items each, sorted by
    popularity), downloads every product's thumbnail into
    ``IMAGE_DIR/<category_name with spaces replaced by underscores>`` and
    pauses for a random DELAY_RANGE interval after each page.

    Args:
        category_id: Value sent as the ``category`` query parameter.
        category_name: Label stored in each record and used for the folder.

    Returns:
        A list of dicts with the keys ``product_id``, ``category``,
        ``product_name``, ``price``, ``image``, ``image_url`` and ``source``.
        A non-200 response, an empty page or any exception ends the crawl of
        this category early; whatever was gathered so far is returned.
    """
    listing_url = "https://tiki.vn/api/personalish/v1/blocks/listings"

    collected = []
    folder_name = category_name.replace(" ", "_")
    category_dir = os.path.join(IMAGE_DIR, folder_name)
    os.makedirs(category_dir, exist_ok=True)

    for page_number in range(1, MAX_PAGES + 1):
        print(f"🔎 Crawling '{category_name}' - Page {page_number}")
        query = dict(
            limit=PRODUCTS_PER_PAGE,
            category=category_id,
            page=page_number,
            sort_by="popularity",
        )

        try:
            reply = requests.get(listing_url, headers=HEADERS, params=query, timeout=15)
            if reply.status_code != 200:
                print(f"❌ Error {reply.status_code}")
                break

            payload = reply.json()
            items = payload.get("data", [])
            if not items:
                print("⛔ No more products.")
                break

            for entry in items:
                pid = str(entry.get("id", ""))
                title = entry.get("name", "no_name")
                cost = entry.get("price", 0)
                thumbnail = entry.get("thumbnail_url", "")
                # Relative path of the saved file, or "" if the download failed.
                saved_as = download_image(thumbnail, pid, category_dir)

                collected.append({
                    "product_id": pid,
                    "category": category_name,
                    "product_name": title,
                    "price": cost,
                    "image": saved_as,
                    "image_url": thumbnail,
                    "source": f"https://tiki.vn/{entry.get('url_path', '')}"
                })

            # Randomised pause before the next listing request.
            pause = random.uniform(*DELAY_RANGE)
            time.sleep(pause)

        except Exception as e:
            print(f"🚨 Exception: {e}")
            break

    return collected

# Chạy và lưu file

In [12]:
if __name__ == "__main__":
    # Wall-clock start, used for the timing summary printed at the end.
    start_time = time.time()

    # Crawl the categories one after another, in CATEGORIES order, pooling
    # every record into a single flat list.
    all_products = []
    for cat in CATEGORIES:
        products = crawl_category(category_id=cat["id"], category_name=cat["name"])
        all_products += products

    # One spreadsheet row per record; the positional index is not written.
    df = pd.DataFrame(all_products)
    df.to_excel("data.xlsx", index=False)

    end_time = time.time()
    elapsed = end_time - start_time

    print(f"\n✅ Saved {len(all_products)} products to 'data.xlsx'")
    print(f"⏱️ Total crawl time: {elapsed:.2f} seconds")

🔎 Crawling 'Smart Phone' - Page 1
🔎 Crawling 'Smart Phone' - Page 2
🔎 Crawling 'Speaker' - Page 1
🔎 Crawling 'Speaker' - Page 2
🔎 Crawling 'Headphones' - Page 1
🔎 Crawling 'Headphones' - Page 2
🔎 Crawling 'Power Bank' - Page 1
🔎 Crawling 'Power Bank' - Page 2
🔎 Crawling 'Gaming Mouse' - Page 1
🔎 Crawling 'Gaming Mouse' - Page 2
🔎 Crawling 'Smart Watch' - Page 1
🔎 Crawling 'Smart Watch' - Page 2

✅ Saved 480 products to 'data.xlsx'
⏱️ Total crawl time: 57.87 seconds
