In [1]:
import requests
import numpy as np
from bs4 import BeautifulSoup
import pandas as pd
import time
from typing import List, Dict, Optional

In [None]:
class TikiAPIScraper:
    def __init__(self, base_url: str, query: str, max_pages: Optional[int] = None, limit_per_page: Optional[int] = 24):
        self.base_url = base_url
        self.query = query
        self.max_pages = max_pages
        self.limit_per_page = limit_per_page
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
        }
        self.products_data = []

    def fetch_data(self, page: int):
        """Fetch product data from Tiki API."""
        api_url = f"{self.base_url}?limit={self.limit_per_page}&q={self.query}&page={page}"
        try:
            response = requests.get(api_url, headers=self.headers)
            response.raise_for_status()
            data = response.json()
            return data
        except requests.RequestException as e:
            print(f"Error fetching page {page}: {e}")
            return None

    def get_total_pages(self):
        """Get the total number of pages from the API."""
        data = self.fetch_data(1)
        if data and "paging" in data:
            total_products = data["paging"].get("total", 0)
            products_per_page = data["paging"].get("per_page", 24)
            total_pages = (total_products // products_per_page) + (total_products % products_per_page > 0)
            print(f"Total pages available: {total_pages}")
            return total_pages
        return 1

    def scrape(self):
        """Scrape product data across pages."""
        if self.max_pages is None:
            self.max_pages = self.get_total_pages()

        page = 1
        while page <= self.max_pages:
            # In ra link của từng trang mà scraper truy cập
            api_url = f"{self.base_url}?limit=24&q={self.query}&page={page}"
            print(f"Fetching URL: {api_url}")

            response_data = self.fetch_data(page)

            if not response_data or "data" not in response_data:
                print(f"No more products found on page {page}. Stopping scrape.")
                break

            products = response_data["data"]
            for product in products:
                quantity_sold = product.get("quantity_sold", {}).get("value", 0)
                product_info = {
                    "Tên sản phẩm": product.get("name", "N/A"),
                    "Tên cửa hàng": product.get("seller_name", "N/A"),
                    "Giá": product.get("price", "N/A"),
                    "Thương hiệu": product.get("brand_name", "N/A"),
                    "Lượt đánh giá": product.get("review_count", "N/A"),
                    "Điểm đánh giá trung bình": product.get("rating_average","N/A"),
                    "Số lượng đã bán": quantity_sold,
                    "Link": f"https://tiki.vn/{product.get('url_path', '')}"
                }
                self.products_data.append(product_info)

            # Delay 2s before fetching the next page
            page += 1
            time.sleep(2)

        print(f"Total products scraped: {len(self.products_data)}")

    def save_to_csv(self, filename: str):
        """Save the scraped data to a CSV file."""
        df = pd.DataFrame(self.products_data)
        df.to_csv(filename, sep=',', index=False, encoding='utf-8')
        print(f"Data saved to {filename}")


In [None]:

if __name__ == "__main__":
    base_url = "https://tiki.vn/api/v2/products"
    limit_per_page = 48
    max_pages = None  # None to scrape all pages, or specify the number of pages to scrape
    
    search_keywords = ["thời trang", "quần", "áo", "váy"]
    brandedStores = []

    for keyword in search_keywords:
        scraper = TikiAPIScraper(base_url, keyword, max_pages, limit_per_page)
        scraper.scrape()
        scraper.save_to_csv(f"./Data/tiki_products_{keyword}.csv")



Total pages available: 50
Fetching URL: https://tiki.vn/api/v2/products?limit=24&q=thời trang&page=1
Fetching URL: https://tiki.vn/api/v2/products?limit=24&q=thời trang&page=2
Fetching URL: https://tiki.vn/api/v2/products?limit=24&q=thời trang&page=3
Fetching URL: https://tiki.vn/api/v2/products?limit=24&q=thời trang&page=4
Fetching URL: https://tiki.vn/api/v2/products?limit=24&q=thời trang&page=5
Fetching URL: https://tiki.vn/api/v2/products?limit=24&q=thời trang&page=6
Fetching URL: https://tiki.vn/api/v2/products?limit=24&q=thời trang&page=7
Fetching URL: https://tiki.vn/api/v2/products?limit=24&q=thời trang&page=8
Fetching URL: https://tiki.vn/api/v2/products?limit=24&q=thời trang&page=9
Fetching URL: https://tiki.vn/api/v2/products?limit=24&q=thời trang&page=10
Fetching URL: https://tiki.vn/api/v2/products?limit=24&q=thời trang&page=11
Fetching URL: https://tiki.vn/api/v2/products?limit=24&q=thời trang&page=12
Fetching URL: https://tiki.vn/api/v2/products?limit=24&q=thời trang&pag

In [4]:
# from selenium import webdriver
# from selenium.webdriver.common.by import By
# import time

# class ShopeeSeleniumScraper:
#     def __init__(self, base_url="https://shopee.vn/mall", delay=5, limit=10):
#         """
#         Initialize the scraper with the base URL, delay between actions, and limit for categories.
#         :param base_url: Shopee Mall URL.
#         :param delay: Time delay (in seconds) to wait for the content to load.
#         :param limit: Maximum number of categories to scrape.
#         """
#         self.base_url = base_url
#         self.delay = delay
#         self.limit = limit
#         self.driver = webdriver.Chrome()  # Initialize Chrome WebDriver

#     def scrape(self):
#         """
#         Scrape categories from Shopee Mall using Selenium.
#         :return: List of categories with names and URLs.
#         """
#         print("Opening Shopee Mall page...")
#         self.driver.get(self.base_url)
#         time.sleep(self.delay)  # Wait for the content to load

#         categories = []
#         try:
#             # Locate the categories in the "Danh mục" section
#             category_elements = self.driver.find_elements(By.CSS_SELECTOR, "ul.image-carousel__item-list li.image-carousel__item a")
#             for i, category in enumerate(category_elements):
#                 if i >= self.limit:  # Stop after reaching the limit
#                     break
#                 name = category.find_element(By.CSS_SELECTOR, "div.gRCeBZ").text.strip()  # Category name
#                 link = category.get_attribute("href")  # Category link
#                 categories.append({"name": name, "href": link})
#         except Exception as e:
#             print(f"Error scraping categories: {e}")
#         finally:
#             self.driver.quit()  # Close the browser

#         return categories

# # Example usage
# if __name__ == "__main__":
#     scraper = ShopeeSeleniumScraper(delay=5, limit=10)  # Adjust delay and limit as needed
#     categories = scraper.scrape()
#     print("Categories:", categories)

