In [None]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import pandas as pd
import time

In [None]:
class UniqloAPIScraper:
    """Collect product and rating data from a Uniqlo commerce API endpoint.

    Products are read page by page from ``base_url``. For every product a
    second request to ``<base_url>/<productId>/reviews`` supplies the rating
    summary. Collected rows accumulate in ``products_data`` and can be
    written out with :meth:`save_to_csv`.

    Args:
        base_url: Product listing endpoint, without a query string.
        page_size: Number of items requested per listing page.
        timeout: Seconds to wait for each HTTP response before giving up.
        delay: Seconds to pause between listing pages.
    """

    # Storefront page that the generated product links point to.
    PRODUCT_PAGE_URL = "https://www.uniqlo.com/vn/vi/products"

    # Star histogram used when the API does not report one.
    _EMPTY_RATE_COUNT = {"one": 0, "two": 0, "three": 0, "four": 0, "five": 0}

    def __init__(self, base_url: str, page_size: int = 10,
                 timeout: float = 30.0, delay: float = 2.0):
        self.base_url = base_url
        self.page_size = page_size
        self.timeout = timeout
        self.delay = delay
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
        }
        self.products_data = []

    @staticmethod
    def _as_dict(value):
        """Return ``value`` if it is a dict, otherwise an empty dict."""
        return value if isinstance(value, dict) else {}

    @staticmethod
    def _first_code(entries):
        """Return the ``code`` of the first entry, or "N/A" if unavailable."""
        if isinstance(entries, (list, tuple)) and entries and isinstance(entries[0], dict):
            return entries[0].get("code", "N/A")
        return "N/A"

    def fetch_data(self, page: int):
        """Fetch the product list for one 1-based listing page.

        Returns:
            The list found under ``result.items`` in the JSON response, or an
            empty list when the request fails, the body is not valid JSON, or
            the field is missing.
        """
        # Clamp so that a page below 1 cannot produce a negative offset.
        offset = max(page - 1, 0) * self.page_size
        api_url = f"{self.base_url}?offset={offset}&limit={self.page_size}"

        try:
            # A timeout keeps one stalled connection from hanging the run.
            response = requests.get(api_url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers JSON decoding errors on older requests versions.
            print(f"Error fetching data for page {page}: {e}")
            return []

        # Tolerate null or unexpectedly typed fields instead of raising.
        result = self._as_dict(self._as_dict(data).get("result"))
        return result.get("items") or []

    def fetch_rating(self, product_id):
        """Fetch the rating summary for one product.

        Returns:
            A dict with the keys "Rating", "Total Ratings", "Fit" and
            "Rating Count". Every key is always present so that all collected
            rows share the same columns; placeholder values are used when the
            product has no id or the review request fails.
        """
        rating_fields = {
            "Rating": "N/A",
            "Total Ratings": "N/A",
            "Fit": "N/A",
            "Rating Count": dict(self._EMPTY_RATE_COUNT),
        }
        # Without a real id there is nothing to look up; skip the request.
        if product_id in (None, "", "N/A"):
            return rating_fields

        review_url = f"{self.base_url}/{product_id}/reviews?offset=0&limit=5"
        try:
            review_response = requests.get(
                review_url, headers=self.headers, timeout=self.timeout
            )
            if review_response.status_code != 200:
                return rating_fields
            review_data = review_response.json()
        except (requests.RequestException, ValueError) as e:
            # Best effort: one failed review lookup must not abort the scrape.
            print(f"Error fetching reviews for product {product_id}: {e}")
            return rating_fields

        result = self._as_dict(self._as_dict(review_data).get("result"))
        rating_info = self._as_dict(result.get("rating"))
        rating_fields["Rating"] = rating_info.get("average", "N/A")
        rating_fields["Total Ratings"] = rating_info.get("count", "N/A")
        rating_fields["Fit"] = rating_info.get("fit", "N/A")
        rating_fields["Rating Count"] = (
            rating_info.get("rateCount") or dict(self._EMPTY_RATE_COUNT)
        )
        return rating_fields

    def _build_product_info(self, product):
        """Build the listing-derived columns for one product dict."""
        product_id = product.get("productId", "N/A")
        base_price = self._as_dict(self._as_dict(product.get("prices")).get("base"))
        currency = self._as_dict(base_price.get("currency"))
        # Empty or missing colour/size lists fall back to "N/A".
        color_code = self._first_code(product.get("colors"))
        size_code = self._first_code(product.get("sizes"))
        return {
            "Product ID": product_id,
            "Product Name": product.get("name", "N/A"),
            "Price": base_price.get("value", "N/A"),
            "Currency": currency.get("code", "N/A"),
            "URL": f"{self.PRODUCT_PAGE_URL}/{product_id}?colorCode={color_code}&sizeCode={size_code}",
        }

    def scrape(self, max_pages: "int | None" = None):
        """Collect product data across listing pages into ``products_data``.

        Collection stops at the first page for which :meth:`fetch_data`
        returns nothing. That also happens when the request for that page
        fails, so a network error ends the run early.

        Args:
            max_pages: Optional upper bound on the number of pages to read.
                ``None`` (the default) reads until an empty page is returned.
        """
        page = 1
        while max_pages is None or page <= max_pages:
            print(f"Fetching data for page {page}...")
            products = self.fetch_data(page)

            if not products:  # Stop if no products are found on the page
                print(f"No products found on page {page}. Stopping collection.")
                break

            for product in products:
                if not isinstance(product, dict):
                    continue  # Skip malformed entries rather than crash.
                product_info = self._build_product_info(product)
                product_info.update(self.fetch_rating(product_info["Product ID"]))
                self.products_data.append(product_info)

            page += 1

            # Add a delay to avoid overloading the server
            if self.delay > 0:
                time.sleep(self.delay)

        print(f"Collection complete. Total products collected: {len(self.products_data)}")

    def save_to_csv(self, filename: str):
        """Save the collected data to a CSV file.

        Missing parent directories of ``filename`` are created first, so a
        completed scrape is not lost to a missing output folder. Nothing is
        written when no data has been collected.
        """
        if not self.products_data:
            print("No data to save!")
            return

        from pathlib import Path

        Path(filename).parent.mkdir(parents=True, exist_ok=True)
        df = pd.DataFrame(self.products_data)
        df.to_csv(filename, sep=',', index=False, encoding='utf-8')
        print(f"Data has been saved to {filename}")

In [None]:
if __name__ == "__main__":
    # Script entry point: collect every product exposed by the Vietnamese
    # storefront's product API, then export the rows to a CSV file.
    base_url = "https://www.uniqlo.com/vn/api/commerce/v3/vi/products"

    scraper = UniqloAPIScraper(base_url=base_url)
    scraper.scrape()
    scraper.save_to_csv(filename="./Data/uniqlo_products.csv")