# Preparation: Import packages

In [None]:
import requests
import re
from bs4 import BeautifulSoup
import time
import json
import numpy as np
import pandas as pd

# Preparation: Given the product link, scrape the product info we need.
    - Product info includes product name, product id, price, size, love counts, review counts, etc.
    - Save the product info as a dictionary.

In [None]:
def get_data(product_link):
    """
    Scrape one product page and return its details as a dictionary.

    Args:
        product_link: URL of the product page. The product id (``P``
            followed by 3-6 digits) is read from this URL.

    Returns:
        None when the page contains the 'css-56434t' header (which, per the
        original author's note, marks a removed product). Otherwise a dict
        with the keys 'Product_Id', 'product_name', 'item_num', 'brand',
        'category', 'price', 'size', 'love_count', 'rating',
        'reviews_count' and 'link'. Every field that cannot be located on
        the page is set to the string 'na'.

    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched,
            including when the request times out.
    """
    # Without a timeout a single unresponsive connection can hang a long
    # scraping run forever.
    response = requests.get(product_link, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')

    id_match = re.search(r'P[0-9]{3,6}', product_link)
    product_id = id_match.group(0) if id_match else 'na'

    # Some products may have been removed.
    if soup.find('h1', class_='css-56434t'):
        return None

    # Brand and product name are the first two <span> texts of the title
    # heading. Defaults keep a missing/short heading from leaving the names
    # unbound; as before, the last matching heading wins.
    brand = 'na'
    prd_name = 'na'
    for heading in soup.find_all('h1', class_='css-a1jw00'):
        span_texts = [span.get_text() for span in heading.find_all('span')]
        if len(span_texts) >= 2:
            brand, prd_name = span_texts[:2]

    # Category and price: prefer the embedded JSON-LD, because it gives the
    # sub-categories and the offer price. If that fails, fall back to the
    # breadcrumb list and the price box.
    ld_json_tags = soup.find_all(attrs={"type": "application/ld+json"})
    try:
        breadcrumb = json.loads(ld_json_tags[0].get_text())
        price = json.loads(ld_json_tags[1].get_text())['offers'][0]['price']
        category = '/'.join(
            entry['item']['name'] for entry in breadcrumb['itemListElement'])
    except (IndexError, KeyError, TypeError, ValueError):
        # ValueError covers json.JSONDecodeError.
        crumbs = [crumb.get_text()
                  for crumb in soup.find_all('ol', class_='css-1doqpel')]
        # Join into a string so 'category' has the same type on both paths
        # (a list of length != 1 cannot be placed in a one-row DataFrame
        # built from a dict of scalars).
        category = '/'.join(crumbs) if crumbs else 'na'
        price_box = soup.find('div', attrs={"data-comp": "Price Box"})
        price = price_box.get_text() if price_box is not None else 'na'

    # Item number and size share one box.
    size_and_item = soup.find(attrs={"data-comp": "SizeAndItemNumber Box"})

    # With a size present the item number is the second child of the box;
    # with no size it is the first. If neither works (or the box is
    # missing) the item number stays 'na'.
    item_num = 'na'
    for position in (1, 0):
        try:
            item_num = size_and_item.contents[position].split(' ')[1]
            break
        except (AttributeError, IndexError, TypeError):
            continue

    # If the box carries no size, try the description area instead.
    try:
        size = size_and_item.span.contents[0].split('SIZE ')[1]
    except (AttributeError, IndexError, TypeError):
        try:
            size = soup.find(
                "span", attrs={"class": "css-12wl10d"}).contents[-1]
        except (AttributeError, IndexError):
            size = 'na'

    # Love counts
    love_tag = soup.find('span', attrs={"data-at": "product_love_count"})
    love_counts = love_tag.get_text() if love_tag is not None else 'na'

    # Rating and review count are pulled out of the 'linkJSON' element's
    # text with regular expressions (first occurrence of each key).
    link_json_text = str(soup.find(attrs={"id": "linkJSON"}))
    rating_match = re.search(r'\"rating\"\:(.*?)\,', link_json_text)
    reviews_match = re.search(r'\"reviews\"\:(.*?)\,', link_json_text)
    rating = rating_match.group(1) if rating_match else 'na'
    reviews_count = reviews_match.group(1) if reviews_match else 'na'

    return {
        'Product_Id': product_id,
        'product_name': prd_name,
        'item_num': item_num,
        'brand': brand,
        'category': category,
        'price': price,
        'size': size,
        'love_count': love_counts,
        'rating': rating,
        'reviews_count': reviews_count,
        'link': product_link,
    }

In [None]:
# Test: scrape the first 5 available products and show them as one table.
i = 0
frame = []
# The context manager closes the link file even if a request raises.
with open('product_link.txt') as products:
    for item in products:
        link = item.rstrip()
        if not link:
            # A blank line is not a URL; skip it instead of requesting ''.
            continue
        product_data = get_data(link)
        if product_data:
            # Wrapping the dict in a list always yields exactly one row,
            # even if a field happens to hold a list.
            df = pd.DataFrame([product_data], index=[i])
            frame.append(df)
            i += 1
            # Random pause (0-3 s) between successful requests.
            time.sleep(np.random.random() * 3)
        if i == 5:
            break
# pd.concat raises ValueError on an empty list, so fall back to an empty frame.
result = pd.concat(frame) if frame else pd.DataFrame()
result

In [None]:
# How many products? Count the lines of the link file (one link per line).
# The context manager closes the file once counting is done.
with open('product_link.txt') as products:
    count = sum(1 for _ in products)
print('Product counts: ', count)

**There are over 3000 products on the Sephora website. To avoid being blocked, there are two options: switch VPNs between runs or rotate proxies.**

# 1. Scrape product info with different VPNs.

## First time: use a VPN to scrape data

In [None]:
# 1st time: scrape the first 1500 products.
# Read every link up front; the context manager closes the file afterwards.
product_links = []
with open('product_link.txt') as products:
    for item in products:
        link = item.rstrip()
        if link:  # ignore blank lines, which are not valid URLs
            product_links.append(link)

frame1 = []
i = 0  # number of products scraped so far; also used as the row label
for link in product_links[:1500]:
    print(i, link)
    product_data = get_data(link)
    if product_data:
        # Wrapping the dict in a list always yields exactly one row,
        # even if a field happens to hold a list.
        df = pd.DataFrame([product_data], index=[i])
        frame1.append(df)
        i += 1
        # Random pause (0-3 s) between successful requests.
        time.sleep(np.random.random() * 3)

# pd.concat raises ValueError on an empty list, so fall back to an empty frame.
result1 = pd.concat(frame1) if frame1 else pd.DataFrame()
result1

In [None]:
# Write the first batch to 'result1.csv'; index=False leaves the row labels out.
result1.to_csv('result1.csv', index=False)

## Second time: change the VPN and run the cell below

In [None]:
# 2nd time: scrape the remaining products (everything after the first 1500).
# Uses the product_links list built in the first scraping cell.
frame2 = []
i = 1500  # row labels continue from 1500
for link in product_links[1500:]:
    print(i, link)
    product_data = get_data(link)
    if product_data:
        # Wrapping the dict in a list always yields exactly one row,
        # even if a field happens to hold a list.
        df = pd.DataFrame([product_data], index=[i])
        frame2.append(df)
        i += 1
        # Random pause (0-3 s) between successful requests.
        time.sleep(np.random.random() * 3)
# With 1500 or fewer links (or no successful scrape) frame2 is empty and
# pd.concat would raise ValueError, so fall back to an empty frame.
result2 = pd.concat(frame2) if frame2 else pd.DataFrame()
result2

# 2. Combine the results above and get the result

In [None]:
# Stack the two scraped batches (result1, then result2) into one DataFrame.
result = pd.concat([result1,result2])
# Bare expression so the notebook displays the combined table.
result

In [None]:
# Save the combined result to 'result.csv'; index=False leaves the row labels out.
result.to_csv('result.csv', index=False)