In [None]:
# Customized for crawling Xcite for device prices


# Dependencies

import html
import random
import re
import time
import json

import pandas as pd

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager


# --- Menu crawling ---


# Obtains links from Xcite's navbar

def menu_crawl(main_URL, menu_items_list, menu_button) -> list:
    """Collect the link paths exposed by the site's dropdown navigation menu.

    Opens ``main_URL`` in Chrome, clicks the menu element, then clicks the
    ``<h5>`` header of every category and snapshots the page source after
    each click. All snapshots are scanned for ``"href":"/..."`` entries.

    Args:
        main_URL: Address of the page that hosts the menu.
        menu_items_list: Exact ``<h5>`` texts of the categories to click.
        menu_button: Tag name of the element that opens the menu.

    Returns:
        Sorted list of unique paths captured after the leading slash of
        each ``"href":"/..."`` occurrence.
    """
    # Implement later: check if installed; if so, just declare; if not, install and declare
    # NOTE(review): the driver path is passed positionally; newer Selenium
    # releases expect a Service object instead - confirm against the
    # installed version.
    driver = webdriver.Chrome(ChromeDriverManager().install())

    HTML_master_block = []

    # try/finally so the browser is closed even when an element is missing
    # or a click fails half-way through the menu.
    try:
        driver.get(main_URL)

        # Open menu (By.TAG_NAME keeps this consistent with the XPath lookup
        # below and replaces the removed find_element_by_tag_name helper).
        dropdown_menu = driver.find_element(By.TAG_NAME, menu_button)
        time.sleep(1)
        dropdown_menu.click()
        time.sleep(1)

        # Crawl through menu by h5 header and snapshot each page
        for category in menu_items_list:
            carrot = driver.find_element(By.XPATH, f"//h5[text()='{category}']")
            time.sleep(1)
            carrot.click()
            time.sleep(1)
            # Click the menu element again before taking the snapshot.
            dropdown_menu.click()
            html_content = driver.page_source
            HTML_master_block.append(
                str(BeautifulSoup(html.unescape(html_content), 'html.parser'))
            )
    finally:
        driver.quit()

    HTML_master = ' '.join(HTML_master_block)

    pattern = r'"href":"\/([^"]+)"'  # Regular expression pattern
    matches = re.findall(pattern, HTML_master)

    # Deduplicate; sorting makes the result reproducible between runs.
    return sorted(set(matches))


# --- Product parsing ---

# Backend


# Filters graves matching Xcite's product class inheritance

def filter_graves(graveyard) -> list:
    """Keep only the graves that carry Xcite's product tile class.

    Step 2: every grave containing ``ProductList_tileWrapper__cV7B_`` is
    returned with the ``<li class`` prefix put back in front of it; all
    other graves are dropped.
    """
    product_marker = 'ProductList_tileWrapper__cV7B_'
    return [
        f"<li class{fragment}"
        for fragment in graveyard
        if product_marker in fragment
    ]


# Functional


# Creates the list of graves (page source split on '<li class') for a specific page

def grave_list(crawl_URL) -> list:
    """Load a page in Chrome and split its source into graves.

    Step 1: pull the page source and split it on ``'<li class'``. The
    first element is whatever precedes the first ``<li class`` in the page;
    the remaining elements each start right after that marker.

    Args:
        crawl_URL: Address of the page to load.

    Returns:
        List of string fragments of the parsed page source.
    """
    # Randomised wait (1-2 s) between loading the page and reading its source.
    random_time = round(random.uniform(1, 2), 2)
    driver = webdriver.Chrome(ChromeDriverManager().install())

    # try/finally so a failed page load does not leave a browser running.
    try:
        driver.get(crawl_URL)
        time.sleep(random_time)
        html_content = driver.page_source
    finally:
        driver.quit()

    parsed_source = str(BeautifulSoup(html.unescape(html_content), 'html.parser'))
    list_graves = parsed_source.split('<li class')

    return list_graves


# Crawls through a list of links and creates product class filtered graves for each webpage: HEAVY OPERATION <ONLY RUN WHEN 100% SURE>

def link_crawl(listed_links) -> list:
    """Build one filtered graveyard per link.

    Step 3: each link is passed through ``grave_list`` and the result
    through ``filter_graves``. ``grave_list`` starts a browser for every
    link, so this is a heavy operation.

    Args:
        listed_links: Iterable of page addresses to crawl.

    Returns:
        List with one list of filtered graves per link, in input order.
    """
    # The previous annotation, list(list()), evaluated to an empty list
    # instance rather than a type; plain `list` matches the sibling functions.
    return [filter_graves(grave_list(link)) for link in listed_links]


# Crawls through a cemetery (a list of graveyards of product class filtered graves) and parses product information into a product dictionary

def _fill_text(product, key, soup, *lookups):
    """Store the text of the first tag matched by *lookups* under *key*.

    Each lookup is a ``(tag_name, css_class)`` pair tried in order;
    ``css_class`` is ``None`` to match on the tag name alone. When nothing
    matches, the field keeps its current value and a notice is printed.
    """
    for tag_name, css_class in lookups:
        if css_class is None:
            tag = soup.find(tag_name)
        else:
            tag = soup.find(tag_name, css_class)
        if tag is not None:
            product[key] = tag.get_text(strip=True)
            return
    print(f"No info because: no matching tag found for {key}")


def cemetary_product_parse(filtered_cemetary) -> dict:
    """Parse filtered graves into a dictionary keyed by product name.

    Step 4: every grave is parsed with BeautifulSoup. The text of its first
    ``<p>`` tag is used as the product name (dictionary key); a later grave
    with the same name overwrites the earlier entry. Fields that cannot be
    found stay as empty strings and a notice is printed.

    Args:
        filtered_cemetary: Iterable of graveyards, each an iterable of
            HTML fragments.

    Returns:
        Dict mapping product name to a dict with the keys
        ``product_brand``, ``product_price``, ``product_discount``,
        ``price_before_discount``, ``product_link`` and ``product_image``.
    """
    product_dictionary = {}

    for graveyard in filtered_cemetary:
        for filtered_grave in graveyard:

            soup = BeautifulSoup(filtered_grave, 'html.parser')

            # product_name: without a <p> tag there is no key to store the
            # product under, so skip the grave instead of aborting the parse.
            p_tag = soup.find('p')
            if p_tag is None:
                print("No info because: grave has no <p> tag for the product name")
                continue
            p_text = p_tag.get_text(strip=True)

            product = {
                'product_brand': '',
                'product_price': '',
                'product_discount': '',
                'price_before_discount': '',
                'product_link': '',
                'product_image': '',
            }
            product_dictionary[p_text] = product

            # product_brand
            _fill_text(product, 'product_brand', soup, ('h5', None))

            # product_price: the red price span first, the first <h4> as fallback
            _fill_text(
                product, 'product_price', soup,
                ('span', 'text-2xl text-functional-red-800 block mb-2'),
                ('h4', None),
            )

            # product_discount
            _fill_text(
                product, 'product_discount', soup,
                ('span', 'text-base bg-functional-red-600 text-white px-2 py-[3px] leading-1 align-text-top inline-block font-normal'),
            )

            # price_before_discount
            _fill_text(
                product, 'price_before_discount', soup,
                ('span', 'text-base line-through'),
            )

            # product_link: href of the first <a> tag
            a_tag = soup.find('a')
            href_link = a_tag.get('href') if a_tag is not None else None
            if href_link is not None:
                product['product_link'] = href_link
            else:
                print("No info because: no <a> tag with an href found")

            # product_image: src of the second <img> tag. Looked up fresh for
            # every grave so a product with fewer than two images can never
            # inherit the image of the previously parsed product.
            img_tags = soup.find_all('img')
            src_link = img_tags[1].get('src') if len(img_tags) >= 2 else None
            if src_link is not None:
                product['product_image'] = src_link
            else:
                print("No info because: no second <img> tag with a src found")

    return product_dictionary

In [1]:
# Prompt

# After the definitions are loaded, offer to list the functions meant to be
# called directly. The listing is written as explicit string pieces so its
# significant whitespace (the 4-space-only separator lines) stays visible.
print("Successfully loaded. Print available functions? [Y]")
response = input()
if response in ['Y', 'y']:
    print(
        "\n"
        "    menu_crawl(main_URL, menu_items_list, menu_button)\n"
        "    \n"
        # The stray "def" that used to prefix this entry has been dropped so
        # every entry reads as a call signature.
        "    grave_list(crawl_URL)\n"
        "    \n"
        "    link_crawl(listed_links)\n"
        "    \n"
        "    cemetary_product_parse(filtered_cemetary)\n"
        "    "
    )

Successfully loaded. Print available functions? [Y]
y

    menu_crawl(main_URL, menu_items_list, menu_button)
    
    def grave_list(crawl_URL)
    
    link_crawl(listed_links)
    
    cemetary_product_parse(filtered_cemetary)
    
