In [1]:
import json
import logging
import os
import pickle
import re
import time
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

  from pandas.core import (


In [3]:
# Browser-like headers attached to the HTTP requests made by this notebook
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/111.0.0.0 Safari/537.36'
    ),
    'Accept-Language': 'en-US,en;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
}

In [4]:
# Step 1: Load catalog page data
def load_catalog_data(catalog_path="config/sephora_pages/"):
    """Load Sephora catalog page data from JSON files.

    Parameters:
        catalog_path (str): Directory holding the catalog page ``.json`` files.
            Defaults to the location this notebook has always read from.

    Returns:
        list: One parsed JSON document per ``.json`` file, ordered by file
        name. Empty when the directory contains no ``.json`` files.

    Raises:
        FileNotFoundError: If ``catalog_path`` does not exist.
        json.JSONDecodeError: If a ``.json`` file does not contain valid JSON.
    """
    catalog_data = []

    # os.listdir returns entries in arbitrary order; sorting keeps the page
    # order (and everything derived from it) stable across runs and machines.
    for file_name in sorted(os.listdir(catalog_path)):
        if not file_name.endswith('.json'):
            continue
        # Read as UTF-8 explicitly instead of relying on the platform default.
        with open(os.path.join(catalog_path, file_name), encoding='utf-8') as f:
            catalog_data.append(json.load(f))

    return catalog_data

In [6]:
# Quick check: run the loader and display the list of parsed catalog pages it returns
load_catalog_data()

[]

In [None]:
# Create DataFrame from catalog data
def create_product_df(catalog_data):
    """Create DataFrame containing product information from catalog pages.

    Parameters:
        catalog_data (list): Parsed catalog pages; each page is a mapping with
            a ``'products'`` list of product records.

    Returns:
        pandas.DataFrame: One row per product record, with nested fields
        flattened by ``pd.json_normalize`` and a fresh 0..n-1 index. An empty
        DataFrame is returned when ``catalog_data`` is empty.

    Raises:
        KeyError: If a page has no ``'products'`` entry.
    """
    page_frames = [pd.json_normalize(page['products']) for page in catalog_data]

    # pd.concat raises ValueError on an empty list, which is what happens when
    # no catalog files were found; hand back an empty frame instead.
    if not page_frames:
        return pd.DataFrame()

    # Each page frame is indexed from 0, so renumber to avoid duplicate labels.
    return pd.concat(page_frames, ignore_index=True)

In [None]:
# Step 2: Get product details
def get_product_details(product_url, headers=HEADERS):
    """
    Get detailed product information from product page

    Each field is extracted independently: when one field cannot be found a
    message is printed and that field keeps its default, so a partially
    populated dictionary is still returned.

    Parameters:
        product_url (str): URL of the product page
        headers (dict): HTTP request headers

    Returns:
        dict: Dictionary with the keys ``product_id``, ``size_and_item``,
        ``category``, ``price``, ``love_count``, ``reviews_count``,
        ``swatch_images``, ``sku_color_mapping`` and ``page_content`` (the
        parsed BeautifulSoup document). None is returned if the request fails,
        the server answers with an HTTP error status, or an unexpected error
        occurs.
    """
    product_info = {
        'product_id': None,
        'size_and_item': None,
        'category': None,
        'price': None,
        'love_count': None,
        'reviews_count': None,
        'swatch_images': [],
        'sku_color_mapping': {},
        'page_content': None
    }

    try:
        # Make HTTP request
        response = requests.get(product_url, headers=headers, timeout=15)
        # A 4xx/5xx body is an error page, not a product page; fail here so it
        # is reported by the RequestException handler instead of being parsed.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract Product ID from URL
        product_id_match = re.search(r'P[0-9]{3,6}', product_url)
        if product_id_match:
            product_info['product_id'] = product_id_match.group(0)
        else:
            print(f"Could not extract product ID from {product_url}")

        # Extract Category Path
        try:
            category_element = soup.find_all(attrs={'data-comp': 'ProductBreadCrumbs BreadCrumbs '})[0]
            category_links = category_element.find_all('a')
            # Links whose .string is None (no single text child) are skipped
            category_names = [link.string for link in category_links if link.string]
            product_info['category'] = ' > '.join(category_names)
        except (IndexError, AttributeError):
            print(f"Could not extract category for product {product_info['product_id']}")

        # Extract Size and Item Details
        try:
            size_element = soup.find(attrs={"data-at": "sku_size_label"})
            product_info['size_and_item'] = size_element.get_text() if size_element else None
        except AttributeError:
            print(f"Could not extract size info for product {product_info['product_id']}")

        # Extract Price
        try:
            price_element = soup.find_all(attrs={'data-comp': 'Price '})[0]
            product_info['price'] = price_element.get_text() if price_element else None
        except (IndexError, AttributeError):
            print(f"Could not extract price for product {product_info['product_id']}")

        # Extract Love Count
        try:
            love_element = soup.find('span', attrs={"class": "css-jk94q9"})
            product_info['love_count'] = love_element.get_text() if love_element else None
        except AttributeError:
            print(f"Could not extract love count for product {product_info['product_id']}")

        # Extract Review Count
        try:
            reviews_element = soup.find('span', attrs={'data-at': 'number_of_reviews'})
            product_info['reviews_count'] = reviews_element.get_text() if reviews_element else None
        except AttributeError:
            print(f"Could not extract review count for product {product_info['product_id']}")

        # Extract Swatch Images and Color Mapping
        try:
            swatch_groups = soup.find_all(attrs={'data-comp': 'SwatchGroup '})

            for group in swatch_groups:
                for button in group.find_all('button'):
                    # Get color name from aria-label
                    color_name = button.get('aria-label')

                    # Get image and SKU info
                    img = button.find('img')
                    if img and 'src' in img.attrs:
                        product_info['swatch_images'].append(img['src'])

                        # Extract SKU from image URL: first "s<digits>" run
                        sku_match = re.search(r's[0-9]+', img['src'])
                        if sku_match and color_name:
                            sku_id = sku_match.group(0)[1:]  # Remove 's' prefix
                            product_info['sku_color_mapping'][sku_id] = color_name

        except Exception as e:
            print(f"Could not extract swatch info for product {product_info['product_id']}: {str(e)}")

        # Store full page content for potential future use
        product_info['page_content'] = soup

        return product_info

    except requests.exceptions.RequestException as e:
        print(f"Network error while fetching {product_url}: {str(e)}")
        return None

    except Exception as e:
        print(f"Unexpected error processing {product_url}: {str(e)}")
        return None

In [None]:
# Step 3: Download images 
def download_product_image(image_url, output_dir, headers=HEADERS):
    """Download product image from URL.

    The file is stored in ``output_dir`` under the last path segment of
    ``image_url``. Nothing is downloaded when that file already exists.
    Failures are printed rather than raised so a batch of downloads keeps
    going.

    Parameters:
        image_url (str): URL of the image to download
        output_dir (str): Directory to write the image into (created if needed)
        headers (dict): HTTP request headers

    Returns:
        None
    """
    try:
        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        # Extract filename from URL
        image_name = image_url.split('/')[-1]
        image_path = os.path.join(output_dir, image_name)

        # Skip if image already exists
        if os.path.exists(image_path):
            return

        # Download image; the timeout keeps one stalled connection from
        # blocking the whole download loop indefinitely.
        response = requests.get(image_url, headers=headers, timeout=15)
        if response.status_code == 200:
            with open(image_path, 'wb') as f:
                f.write(response.content)
        else:
            # Report the failure instead of skipping the image silently
            print(f"Error downloading image {image_url}: HTTP {response.status_code}")

    except Exception as e:
        print(f"Error downloading image {image_url}: {str(e)}")

In [None]:
def get_product_reviews(product_id, start_date=None, limit=100, max_retries=3):
    """
    Get product reviews using Bazaarvoice API

    Pages through the reviews endpoint (newest first) until the reported
    total has been collected or a page comes back empty. Request and parsing
    failures are logged and whatever was collected so far is returned.

    Parameters:
        product_id (str): Product ID to get reviews for
        start_date (str): Date string in '%Y-%m-%dT%H:%M:%S.%f%z' format; only
            reviews submitted after it are requested
        limit (int): Number of reviews per request
        max_retries (int): Maximum number of attempts per page request (at
            least one attempt is always made)

    Returns:
        tuple: (product_info, reviews_list) where product_info is a dict
        (empty if the API returned none) and reviews_list is a list of dicts

    Raises:
        ValueError: If start_date does not match the expected format
    """
    logger = logging.getLogger(__name__)

    # API Configuration
    # NOTE(review): the passkey is committed in the notebook. It can now be
    # overridden with the BAZAARVOICE_PASSKEY environment variable; consider
    # dropping the hard-coded fallback.
    API_CONFIG = {
        "host": "api.bazaarvoice.com",
        "token": os.environ.get(
            "BAZAARVOICE_PASSKEY",
            "caHFIcND7how0aLS6wzoJhq0PcvkFllbfQUmnsxU3BMZo",
        ),
        "version": "5.4"
    }

    # Build API URL and base parameters
    url = f"https://{API_CONFIG['host']}/data/reviews.json"
    params = {
        'Filter': [f'ProductId:{product_id}'],
        'Sort': 'SubmissionTime:desc',
        'Limit': limit,
        'Offset': 0,
        'Include': 'Products,Comments',
        'Stats': 'Reviews',
        'passkey': API_CONFIG['token'],
        'apiversion': API_CONFIG['version'],
        'Locale': 'en_US'
    }

    # Add date filter if specified
    if start_date:
        timestamp = int(datetime.strptime(start_date, '%Y-%m-%dT%H:%M:%S.%f%z').timestamp())
        params['Filter'].append(f'SubmissionTime:gt:{timestamp}')

    reviews = []
    product_info = {}
    # Guarantee one attempt so `response` is always bound after the retry loop
    attempts = max(1, max_retries)

    try:
        while True:
            # Update offset for pagination
            params['Offset'] = len(reviews)

            # Make API request with retry logic
            for attempt in range(attempts):
                try:
                    response = requests.get(url, params=params, timeout=15)
                    response.raise_for_status()  # Raise exception for bad status codes
                    break
                except requests.exceptions.RequestException:
                    if attempt == attempts - 1:  # Last attempt
                        raise
                    time.sleep(2 ** attempt)  # Exponential backoff

            # Parse response; JSON decode errors are ValueError subclasses
            try:
                data = response.json()
            except ValueError as e:
                logger.error(f"Failed to decode JSON for product {product_id}: {str(e)}")
                break

            # Check for API errors
            if data.get('HasErrors', False):
                errors = data.get('Errors') or ['Unknown error']
                logger.error(f"API error for product {product_id}: {errors[0]}")
                break

            # Extract product information (only on first page)
            if len(reviews) == 0:
                included_products = (data.get('Includes') or {}).get('Products') or {}
                product_info = included_products.get(product_id, {})

            # Extract reviews
            new_reviews = data.get('Results', [])
            if not new_reviews:
                break

            # Process each review
            for review in new_reviews:
                # Bazaarvoice returns ContextDataValues as an object keyed by
                # id, so iterate its values; a plain list of entries is
                # accepted as well.
                raw_context = review.get('ContextDataValues') or {}
                if isinstance(raw_context, dict):
                    context_items = raw_context.values()
                else:
                    context_items = raw_context

                processed_review = {
                    'review_id': review.get('Id'),
                    'product_id': product_id,
                    'rating': review.get('Rating'),
                    'title': review.get('Title'),
                    'review_text': review.get('ReviewText'),
                    'submission_time': review.get('SubmissionTime'),
                    'last_modified_time': review.get('LastModificationTime'),
                    'author': {
                        'name': review.get('UserNickname'),
                        'location': review.get('UserLocation'),
                    },
                    'is_verified_purchaser': review.get('IsVerifiedPurchaser', False),
                    'total_feedback_count': review.get('TotalFeedbackCount', 0),
                    'total_positive_feedback_count': review.get('TotalPositiveFeedbackCount', 0),
                    'total_negative_feedback_count': review.get('TotalNegativeFeedbackCount', 0),
                    'context_data_values': {
                        item['Id']: item.get('Value')
                        for item in context_items
                        if isinstance(item, dict) and 'Id' in item
                    }
                }

                # Add photos if available
                if 'Photos' in review:
                    processed_review['photos'] = [
                        {
                            'id': photo.get('Id'),
                            'url': photo.get('Sizes', {}).get('normal', {}).get('Url'),
                            'caption': photo.get('Caption')
                        }
                        for photo in review.get('Photos') or []
                    ]

                reviews.append(processed_review)

            # Check if we have all reviews
            total_results = data.get('TotalResults', 0)
            if len(reviews) >= total_results:
                break

            # Rate limiting
            time.sleep(0.2)

        logger.info(f"Retrieved {len(reviews)} reviews for product {product_id}")
        return product_info, reviews

    except Exception as e:
        # Best effort: report the failure and return what was collected
        logger.error(f"Error retrieving reviews for product {product_id}: {str(e)}")
        return product_info, reviews

# Helper function to save reviews
def save_reviews(product_id, product_info, reviews, output_dir='data/raw/sephora_reviews/'):
    """Write one product's info and reviews to ``<product_id>_reviews.json``.

    The output directory is created when missing. The file holds a JSON object
    with the keys ``product_info`` and ``reviews``, written as UTF-8 with
    non-ASCII characters kept as-is and a two-space indent.
    """
    payload = {'product_info': product_info, 'reviews': reviews}

    os.makedirs(output_dir, exist_ok=True)
    target_path = os.path.join(output_dir, f"{product_id}_reviews.json")

    with open(target_path, 'w', encoding='utf-8') as out_file:
        json.dump(payload, out_file, ensure_ascii=False, indent=2)

In [7]:
# Save product details to file
def save_product_details(product_details, output_dir='data/raw/sephora_product_pages/'):
    """Save product details to JSON file.

    The file is written to ``<output_dir>/<product_id>.json``. Nothing is
    written when ``product_details`` is empty/None or has no product id.

    Parameters:
        product_details (dict): Product details; ``page_content`` may hold a
            parsed page object, which is stored as its string form.
        output_dir (str): Directory to write into (created if needed)

    Returns:
        None
    """
    if not product_details or not product_details.get('product_id'):
        return

    os.makedirs(output_dir, exist_ok=True)

    # Work on a shallow copy so the caller's dict keeps its parsed page object
    serializable = dict(product_details)

    # Convert BeautifulSoup object to string to make it JSON serializable;
    # a missing page stays null instead of becoming the text "None".
    page_content = serializable.get('page_content')
    if page_content is not None:
        serializable['page_content'] = str(page_content)

    filename = f"{serializable['product_id']}.json"
    filepath = os.path.join(output_dir, filename)

    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(serializable, f, ensure_ascii=False, indent=2)

In [None]:
# 1. Load catalog data and create product DataFrame
catalog_data = load_catalog_data()
product_df = create_product_df(catalog_data)

# 2. Get product details (one request per distinct product id)
for product_id in tqdm(product_df['productId'].unique()):
    product_url = f'https://www.sephora.com/product/{product_id}'
    product_details = get_product_details(product_url)
    if product_details:
        save_product_details(product_details)

# 3. Download images
# NOTE(review): get_all_image_urls is not defined in the visible part of this
# notebook — confirm it is defined in an earlier cell, otherwise this step
# raises NameError on a fresh kernel.
image_urls = get_all_image_urls(product_df)
for url in tqdm(image_urls):
    download_product_image(url, 'data/raw/sephora_images/')

# 4. Get reviews
# Iterate distinct ids so a product listed on several catalog pages is only
# fetched once, and persist each result instead of discarding it.
for product_id in tqdm(product_df['productId'].unique()):
    product_info, reviews = get_product_reviews(product_id)
    if product_info or reviews:
        save_reviews(product_id, product_info, reviews)