### Please install these libraries below to avoid error when running the scripts

- pip install webdriver-manager
- pip install beautifulsoup4
- pip install lxml
- pip install selenium
- pip install pandas
- pip install matplotlib
- pip install numpy

In [2]:
import requests 
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

### Get Price and Total sale of product
- We will get sales of each product by removing unwanted characters and keeping only numbers. 
- **'k'** character in sale number means 1000, so 20k means 20000 products have been sold 
- Any product does not have sales record will be replaced with 0

In [3]:
def getPrice(prices):
    price_list=[]
    for pri in prices:
        # Remove all special character
        price_product=pri.text
        price_product=price_product.replace('₫','').replace('.','').replace(' ','')
        
        # If product has 2 different price tags then splits and take the lowest
        price_product=price_product.split('-')
        price_product=price_product[0]
        price_list.append(price_product)
    
    # Remove empty element at the end of the list
    if(len(price_list)>1):
        price_list=price_list[0:-1]
    
    # If product has no discounted price (length=1) then add the normal price to it
    if(len(price_list)==1):
        price_list.append(price_list[0])
    
    return (price_list[0],price_list[1])

In [None]:
def getSell(text):
    text=text.replace(',','.')
    if('k' in text):
        text=re.sub('[^\d\.]', '', text)
        text=float(text)*1000
    elif(text==''):
        return 0
    else:
        text=re.sub('[^\d\.]', '', text)
        text=float(text)
    return int(text)

### Main function

In [16]:
def scrape_shopee_products(url):
    # create a new Chrome session
    options = Options()
    options.headless = True
    driver = webdriver.Chrome(ChromeDriverManager().install(),options=options)
    driver.get(url)

    # Get total page
    delay = 10 #seconds
    try:
        myElem = WebDriverWait(driver, delay).until(EC.presence_of_element_located
                                                    ((By.XPATH, "//span[@class='shopee-mini-page-controller__total']")))
        print("Found total page")
    except TimeoutException:
        print("Error")
            
    soup = BeautifulSoup(driver.page_source)
    total_page=soup.select_one('span[class="shopee-mini-page-controller__total"]').text
    total_page=int(total_page)

    # Get url for each shopee page
    url_list=[]
    template=url+'?page={0}&sortBy=pop'
    for i in range(total_page):
        url=template.format(i)
        url_list.append(url)

    print('Total page: ',total_page)

    # Loop through each page and scrape all products
    name_list=[]
    price_list=[]
    discounted_price_list=[]
    sell_list=[]
    url_list=[]
    current_page=1

    for url in url_list:
        driver = webdriver.Chrome(ChromeDriverManager().install(),options=options)
        driver.get(url)

        delay = 10 #seconds
        try:
            myElem = WebDriverWait(driver, delay).until(EC.presence_of_element_located
                                                        ((By.XPATH, "//div[@class='shop-search-result-view__item col-xs-2-4']")))
            print("Page is ready!")
        except TimeoutException:
            print("Loading took too much time!")

        soup = BeautifulSoup(driver.page_source)
        products_selector=soup.select('div[class="shop-search-result-view__item col-xs-2-4"]')
        print("Current page: ",current_page)
        current_page+=1
        print("total products in this page",len(products_selector))

        for prod in products_selector:
            # Get name of product
            name=prod.select_one('div[class="_3Gla5X _2j2K92 _3j20V6"]')

            # Price and discounted price if product has
            prices_selector=prod.select_one('div[class="_3CsOH6"]')
            normal_price,discounted_price=getPrice(prices_selector)

            # Total sell
            total_sell=prod.select_one('div[class="_3UeJ1q"]')

            name_list.append(name.text)
            price_list.append(normal_price)
            discounted_price_list.append(discounted_price)
            sell_list.append(getSell(total_sell.text))

        # Create dataframe based on scarped products
        df = pd.DataFrame(columns=['name','price','discount_price','total_sell'])
        df['name']=name_list
        df['price']=price_list
        df['discount_price']=discounted_price_list
        df['total_sell']=sell_list
        
        # Calculate revenue
        df['price']=df['price'].astype(int)
        df['discount_price']=df['discount_price'].astype(int)
        df['revenue']=df['price']*df['total_sell']
        
        df.to_csv('product.csv',encoding='utf-8-sig',index=False)
    return df

In [17]:
# Change URL to any shopee store you like, the program will automatically scrape it
url = 'https://shopee.vn/coolmate.vn'
df=scrape_shopee_products(url)

  driver = webdriver.Chrome(ChromeDriverManager().install(),options=options)


Found total page
Total page:  8


  driver = webdriver.Chrome(ChromeDriverManager().install(),options=options)


Page is ready!
Current page:  1
total products in this page 30
Page is ready!
Current page:  2
total products in this page 30
Page is ready!
Current page:  3
total products in this page 30
Page is ready!
Current page:  4
total products in this page 30
Page is ready!
Current page:  5
total products in this page 30
Page is ready!
Current page:  6
total products in this page 30
Page is ready!
Current page:  7
total products in this page 30
Page is ready!
Current page:  8
total products in this page 4
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214 entries, 0 to 213
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   name            214 non-null    object
 1   price           214 non-null    int32 
 2   discount_price  214 non-null    int32 
 3   total_sell      214 non-null    int64 
 4   revenue         214 non-null    int64 
dtypes: int32(2), int64(2), object(1)
memory usage: 6.8+ KB


In [None]:
df.info()
df.head(5)