# Web Scraping dari Female Daily

- Sumber            : Female Daily
- Toko              : Wardah
- Total produk      : 50 Produk
- Total page/produk : 10 Page (Max)

In [2]:
import os
import csv
import json
import time
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from webdriver_manager.chrome import ChromeDriverManager

In [3]:
class FemaleDaily_Scraper:
    """Selenium-driven scraper for product reviews on reviews.femaledaily.com.

    Typical use: call ``scrape_brand_reviews()`` with a brand listing URL. It
    collects product links, walks each product's paginated review list and can
    write one CSV row per review. Call ``close()`` (or use the instance as a
    context manager) when finished so the browser is shut down.
    """

    # Stop clicking "load more" after this many consecutive rounds that
    # produced no new product links (guards against an endless loop).
    _MAX_STALLED_LOADS = 3

    def __init__(self, headless=True, delay=2):
        """Start a Chrome WebDriver session.

        Args:
            headless: When true, Chrome is started with ``--headless``.
            delay: Seconds to sleep after each navigation or pagination click.
        """
        self.delay = delay
        self.base_url = "https://reviews.femaledaily.com"
        # NOTE: base_url and session are configured here but not used by any
        # method of this class; they are kept for backward compatibility.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

        chrome_options = Options()
        if headless:
            chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--window-size=1920,1080')
        chrome_options.add_argument('--disable-blink-features=AutomationControlled')
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option('useAutomationExtension', False)

        service = Service(ChromeDriverManager().install())
        self.driver = webdriver.Chrome(service=service, options=chrome_options)

        # A plain execute_script() only patches the document that is currently
        # loaded (the initial blank page), so the override is gone after the
        # first driver.get(). Registering it through CDP re-applies it to every
        # new document; fall back to the one-shot script if CDP is unavailable.
        hide_webdriver_js = "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
        try:
            self.driver.execute_cdp_cmd(
                "Page.addScriptToEvaluateOnNewDocument",
                {"source": hide_webdriver_js},
            )
        except Exception:
            self.driver.execute_script(hide_webdriver_js)

        self.wait = WebDriverWait(self.driver, 10)

    def close(self):
        """Quit the browser. Safe to call more than once."""
        driver = getattr(self, 'driver', None)
        if driver is None:
            return
        # Clear the attribute first so a second call (or __del__) is a no-op.
        self.driver = None
        try:
            driver.quit()
        except Exception as e:
            print(f"Error closing browser: {str(e)}")

    def __enter__(self):
        """Allow ``with FemaleDaily_Scraper(...) as scraper:`` usage."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Quit the browser when leaving the ``with`` block; never suppresses errors."""
        self.close()
        return False

    def __del__(self):
        # Last-resort cleanup when the object is garbage collected. Errors are
        # swallowed because __del__ may run during interpreter shutdown.
        try:
            self.close()
        except Exception:
            pass

    def get_product_links(self, brand_url, max_products=None):
        """Collect product page URLs from a brand listing page.

        Repeatedly reads every ``a.product-card`` link and clicks the
        "load more" button until the button is missing or disabled, the product
        cards time out, ``max_products`` links were collected, or several
        consecutive rounds produced no new links.

        Args:
            brand_url: URL of the brand listing page.
            max_products: Optional cap on the number of links; a falsy value
                (``None`` or ``0``) means no cap.

        Returns:
            List of unique product URLs in the order they were discovered.
        """
        product_links = []
        seen_links = set()  # O(1) duplicate check; the list keeps the order
        stalled_rounds = 0

        self.driver.get(brand_url)
        time.sleep(self.delay)

        while True:
            try:
                self.wait.until(EC.presence_of_element_located((By.CLASS_NAME, "product-card")))
            except TimeoutException:
                print("Timeout menunggu produk card")
                break

            links_before = len(product_links)

            # Collect every product link currently rendered on the page
            product_cards = self.driver.find_elements(By.CSS_SELECTOR, "a.product-card")

            for card in product_cards:
                href = card.get_attribute('href')
                if href and href not in seen_links:
                    seen_links.add(href)
                    product_links.append(href)
                    print(f"Found product: {href}")

                    if max_products and len(product_links) >= max_products:
                        return product_links

            # If the button stays enabled but nothing new appears, the original
            # loop would spin forever; give up after a few stalled rounds.
            if len(product_links) == links_before:
                stalled_rounds += 1
                if stalled_rounds >= self._MAX_STALLED_LOADS:
                    print("No new products after loading more, stopping")
                    break
            else:
                stalled_rounds = 0

            # Listing pagination ("load more" button)
            try:
                load_more_btn = self.driver.find_element(By.ID, "button-load-more-products")
                if load_more_btn.is_enabled():
                    self.driver.execute_script("arguments[0].click();", load_more_btn)
                    time.sleep(self.delay)
                    print(f"Loaded more products. Total so far: {len(product_links)}")
                else:
                    print("Load more button disabled, reached end")
                    break
            except NoSuchElementException:
                print("No more products to load")
                break

        return product_links

    def _find_text(self, parent, by, selector):
        """Return the stripped text of the first match under ``parent``, or "" if absent."""
        try:
            return parent.find_element(by, selector).text.strip()
        except NoSuchElementException:
            return ""

    def extract_review_data(self, review_element):
        """Extract the fields of a single review card.

        Args:
            review_element: WebElement of one review card.

        Returns:
            Dict with the keys ``username``, ``user_profile_url``, ``user_age``,
            ``review_date``, ``rating`` (number of filled star icons),
            ``is_recommended`` (bool), ``review_text``, ``usage_period`` and
            ``purchase_point``. Missing text fields are empty strings.
        """
        review_data = {}

        # Username and profile link come from the same anchor element
        try:
            username_elem = review_element.find_element(By.CSS_SELECTOR, ".profile-username a")
            review_data['username'] = username_elem.text.strip()
            review_data['user_profile_url'] = username_elem.get_attribute('href')
        except NoSuchElementException:
            review_data['username'] = ""
            review_data['user_profile_url'] = ""

        review_data['user_age'] = self._find_text(review_element, By.CSS_SELECTOR, ".profile-age")
        review_data['review_date'] = self._find_text(review_element, By.CSS_SELECTOR, ".review-date")

        # Rating = number of filled stars. find_elements() returns an empty list
        # when nothing matches, so no exception handling is needed here.
        filled_stars = review_element.find_elements(By.CSS_SELECTOR, ".cardrv-starlist .icon-ic_big_star_full")
        review_data['rating'] = len(filled_stars)

        # A missing element yields "", which correctly maps to False
        recommend_text = self._find_text(review_element, By.CSS_SELECTOR, ".recommend b")
        review_data['is_recommended'] = "recommends" in recommend_text.lower()

        review_data['review_text'] = self._find_text(review_element, By.CSS_SELECTOR, ".text-content span")
        review_data['usage_period'] = self._find_text(
            review_element, By.XPATH, ".//p[contains(., 'Usage Period')]/b")
        review_data['purchase_point'] = self._find_text(
            review_element, By.XPATH, ".//p[contains(., 'Purchase Point')]/b")

        return review_data

    def get_product_reviews(self, product_url, max_reviews=None, max_pages=10):
        """Scrape the paginated reviews of one product page.

        Args:
            product_url: URL of the product page.
            max_reviews: Optional cap on collected reviews; a falsy value means
                no cap.
            max_pages: Maximum number of review pages to visit.

        Returns:
            Dict with ``product_url``, ``product_name``, ``brand_name``,
            ``overall_rating``, ``total_reviews`` and ``reviews`` (list of dicts
            from ``extract_review_data`` plus a ``page_number`` key).
            NOTE: ``overall_rating`` and ``total_reviews`` are never filled in
            by this method and are always returned as empty strings.
        """
        self.driver.get(product_url)
        time.sleep(self.delay)

        product_data = {
            'product_url': product_url,
            'product_name': '',
            'brand_name': '',
            'overall_rating': '',
            'total_reviews': '',
            'reviews': []
        }

        try:
            # Product name
            product_name_elem = self.driver.find_element(By.CSS_SELECTOR, "h1, .product-title")
            product_data['product_name'] = product_name_elem.text.strip()
        except NoSuchElementException:
            pass

        try:
            # Brand name
            brand_elem = self.driver.find_element(By.CSS_SELECTOR, ".brand-name, .product-brand")
            product_data['brand_name'] = brand_elem.text.strip()
        except NoSuchElementException:
            pass

        current_page = 1
        consecutive_empty_pages = 0
        max_empty_pages = 3

        print(f"Maksimum {max_pages} halaman dari produk")

        while current_page <= max_pages:
            # The key always exists (possibly as ''), so dict.get()'s default
            # would never apply; use `or` to get the intended fallback.
            display_name = product_data['product_name'] or 'Unknown Product'
            print(f"Scraping reviews page {current_page} for {display_name}")

            # Wait for the review cards to be present
            try:
                self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".review-card")))
                consecutive_empty_pages = 0  # reset once reviews are present
            except TimeoutException:
                print(f"No reviews found on page {current_page}")
                consecutive_empty_pages += 1

                # Stop after several empty pages in a row
                if consecutive_empty_pages >= max_empty_pages:
                    print(f"Reached {max_empty_pages} consecutive empty pages, stopping")
                    break

                # Otherwise try to move on to the next page
                try:
                    next_btn = self.driver.find_element(By.ID, "id_next_page")
                    # get_attribute() may return None; `in None` would raise and
                    # discard every review collected so far for this product.
                    next_classes = next_btn.get_attribute("class") or ""
                    if "paging-prev-text-active" in next_classes:
                        self.driver.execute_script("arguments[0].click();", next_btn)
                        current_page += 1
                        time.sleep(self.delay)
                        continue
                    else:
                        break
                except NoSuchElementException:
                    break

            # Extract every review on the current page
            review_elements = self.driver.find_elements(By.CSS_SELECTOR, ".item .review-card")

            if not review_elements:
                print(f"No review elements found on page {current_page}")
                consecutive_empty_pages += 1
                if consecutive_empty_pages >= max_empty_pages:
                    break
            else:
                consecutive_empty_pages = 0

            page_reviews_count = 0
            for review_elem in review_elements:
                try:
                    review_data = self.extract_review_data(review_elem)
                    review_data['page_number'] = current_page
                    product_data['reviews'].append(review_data)
                    page_reviews_count += 1

                    # Stop as soon as the optional review cap is reached
                    if max_reviews and len(product_data['reviews']) >= max_reviews:
                        print(f"Reached maximum reviews limit: {max_reviews}")
                        return product_data

                except Exception as e:
                    print(f"Error extracting review: {str(e)}")
                    continue

            print(f"Extracted {page_reviews_count} reviews from page {current_page}. Total: {len(product_data['reviews'])}")

            has_next_page = False

            # Try several candidate selectors for the "next page" control
            next_selectors = [
                "#id_next_page",
                ".pagination-next",
                ".next-page",
                "a[aria-label='Next']",
                ".paging-next"
            ]

            for selector in next_selectors:
                try:
                    next_btn = self.driver.find_element(By.CSS_SELECTOR, selector)
                    next_classes = next_btn.get_attribute("class") or ""

                    # The control counts as active only when it is enabled, not
                    # marked disabled and carries the site's "active" class.
                    if (next_btn.is_enabled() and
                        "disabled" not in next_classes and
                        "paging-prev-text-active" in next_classes):

                        print(f"Klik button paginasi: {selector}")
                        self.driver.execute_script("arguments[0].click();", next_btn)
                        current_page += 1
                        time.sleep(self.delay)
                        has_next_page = True
                        break

                except NoSuchElementException:
                    continue

            if not has_next_page:
                print("Halaman habis")
                break

            # Report explicitly when the page limit ends the pagination
            if current_page > max_pages:
                print(f"Maksimum limit halaman ({max_pages}), stop paginasi")
                break

        final_count = len(product_data['reviews'])
        print(f"Scraping selesai. Total review diambil: {final_count}")

        return product_data

    def scrape_brand_reviews(self, brand_url, max_products=None, max_reviews_per_product=None, max_pages_per_product=10, output_file=None):
        """Scrape reviews for the products listed on a brand page.

        Args:
            brand_url: URL of the brand listing page.
            max_products: Optional cap on the number of products.
            max_reviews_per_product: Optional cap on reviews per product.
            max_pages_per_product: Maximum review pages visited per product.
            output_file: If given, all reviews are written to this CSV path
                once, after every product has been processed.

        Returns:
            List of per-product dicts as returned by ``get_product_reviews``.
            Products whose scraping raised an exception are skipped.
        """
        print(f"Starting scraping for brand: {brand_url}")

        if max_reviews_per_product is None:
            print(f"Scraping review maksimal {max_pages_per_product} pages per product")
        else:
            print(f"Scraping sampai {max_reviews_per_product} reviews (max {max_pages_per_product} pages) per product")

        print("Mengambil link product...")
        product_links = self.get_product_links(brand_url, max_products)
        print(f"Found {len(product_links)} products")

        all_products_data = []
        total_reviews_scraped = 0

        for i, product_url in enumerate(product_links, 1):
            print(f"\n{'='*60}")
            print(f"Scraping product {i}/{len(product_links)}")
            print(f"URL: {product_url}")
            print(f"{'='*60}")

            try:
                product_data = self.get_product_reviews(product_url, max_reviews_per_product, max_pages_per_product)
                all_products_data.append(product_data)

                reviews_count = len(product_data['reviews'])
                total_reviews_scraped += reviews_count

                print(f"Berhasil scraping {reviews_count} review untuk '{product_data['product_name']}'")
                print(f"Running total: {total_reviews_scraped} review")

                # Progress message only; nothing is written to disk here
                if i % 3 == 0:
                    print(f"Progress: {i} products completed")

                # Extra pause between products to reduce the risk of rate limiting
                time.sleep(self.delay * 2)

            except Exception as e:
                print(f"Error scraping product {product_url}: {str(e)}")
                continue

        # Write the final results to the file name exactly as provided
        if output_file:
            self.save_to_csv(all_products_data, output_file)

        print(f"\nSCRAPING COMPLETED!")
        print(f"Total products: {len(all_products_data)}")
        print(f"Total reviews: {total_reviews_scraped}")

        return all_products_data

    def save_to_json(self, data, filename):
        """Write ``data`` to ``filename`` as UTF-8 JSON (indent 2, non-ASCII kept)."""
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        print(f"JSON progress saved to {filename}")

    def save_to_csv(self, data, filename):
        """Flatten per-product review data into one CSV row per review.

        Args:
            data: List of product dicts as returned by ``get_product_reviews``.
            filename: Destination CSV path (overwritten if it exists).

        No file is written when ``data`` is empty or contains no reviews.
        """
        if not data:
            print("No data to save")
            return

        # One row per review: product-level columns first, then review fields
        csv_data = []
        for product in data:
            for review in product['reviews']:
                row = {
                    'product_url': product['product_url'],
                    'product_name': product['product_name'],
                    'brand_name': product['brand_name'],
                    'overall_rating': product['overall_rating'],
                    'total_reviews': product['total_reviews'],
                    **review
                }
                csv_data.append(row)

        if not csv_data:
            print("No reviews to save")
            return

        # Use the union of all keys in first-seen order. Taking only the first
        # row's keys makes DictWriter raise ValueError when a later row has an
        # extra field; missing fields are written as empty cells.
        fieldnames = list(dict.fromkeys(key for row in csv_data for key in row))

        with open(filename, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(csv_data)
        print(f"Final CSV data saved to {filename} with {len(csv_data)} reviews")

# Example usage: scrape up to 50 Wardah products, visiting at most 10 review
# pages per product (so this does NOT necessarily fetch every review).
if __name__ == "__main__":
    scraper = FemaleDaily_Scraper(headless=False, delay=3)
    
    try:
        brand_url = "https://reviews.femaledaily.com/brands/product/wardah"
        
        print("=" * 60)
        
        results = scraper.scrape_brand_reviews(
            brand_url=brand_url,
            max_products=50,  # author's estimate: 50 products * ~100 reviews each = ~5000 rows
            max_reviews_per_product=None,
            max_pages_per_product=10,
            output_file="female_daily_wardah_reviews.csv"
        )
        
        print("\n" + "=" * 60)
        print("SCRAPING COMPLETED!")
        print("=" * 60)
        print(f"Total product terambil: {len(results)}")
        
        total_reviews = sum(len(product['reviews']) for product in results)
        print(f"Total review: {total_reviews}")
        
    except Exception as e:
        print(f"Error selama scraping: {str(e)}")
        import traceback
        traceback.print_exc()
    finally:
        print("\nCleaning up...")
        # Quit the browser explicitly. `del` only unbinds the name; relying on
        # __del__ for cleanup depends on garbage-collection timing and may
        # leave the Chrome process running.
        scraper.driver.quit()
        del scraper
        print("Done!")

Starting scraping for brand: https://reviews.femaledaily.com/brands/product/wardah
Scraping review maksimal 10 pages per product
Mengambil link product...
Found product: https://reviews.femaledaily.com/products/moisturizer/sun-protection-1/wardah/sun-block-spf-33
Found product: https://reviews.femaledaily.com/products/treatment/skin-soothing-treatment/wardah/aloe-hydramild-multifunction-gel
Found product: https://reviews.femaledaily.com/products/face-2/bb-cc-cream/wardah/c-defense-dd-cream
Found product: https://reviews.femaledaily.com/products/eyes/eyeliner/wardah/eyexpert-optimum-hi-black-liner
Found product: https://reviews.femaledaily.com/products/face-2/bb-cc-cream/wardah/lightening-bb-cream-light
Found product: https://reviews.femaledaily.com/products/lips/lipstick/wardah/exclusive-matte-lip-cream-03-see-you-latte
Found product: https://reviews.femaledaily.com/products/makeup-remover/face/wardah/nature-daily-calm-soothe-micellar-water
Found product: https://reviews.femaledaily.co