In [57]:
"""

MJ Price Scraper Tool v1.0

"""

# List of dispensaries to scrape and preset filters to pick from

dispos = {
    "Berkshire Roots" : "https://dutchie.com/dispensary/berkshire-roots-east-boston",
    "Mission Brookline" : "https://dutchie.com/dispensary/mission-brookline",
    "NETA" : "https://dutchie.com/dispensary/neta-brookline",
    "Western Front" : "https://dutchie.com/dispensary/western-front1",
    "MedMen" : "https://dutchie.com/dispensary/medmen-fenway",
    "Rise" : "https://dutchie.com/dispensary/greenstar-herbals-inc-chelsea",
    "Happy Valley" : "https://dutchie.com/dispensary/happy-valley-east-boston",
    "Pure Oasis" : "https://dutchie.com/dispensary/pure-oasis1",
    "Ayr" : "https://dutchie.com/dispensary/ayr-dispensary-back-bay"
}

filters = {
    "Concentrate" : "/products/concentrates?sortby=pricelowtohigh",
    "Flower" : "/products/flower?potencycbd=0%2C50&potencythc=0%2C50&sortby=pricelowtohigh",
    "Carts" : "/products/vaporizers?sortby=pricelowtohigh"
}

# Change this variable to one of the above choices: (Concentrate, Flower, Carts, ...)

filter = "Flower"


In [58]:
# [1]
# Imports
import requests
import time
import re
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

# Session options
options = Options()
options.page_load_strategy = 'normal'
options = webdriver.ChromeOptions()
#driver = webdriver.Chrome('chromedriver',options=chrome_options)
#options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(options=options)

In [59]:
#[2]
# Product class w/ attrs.
class Product:
  def __init__(self, name, company, type, thc, prices, dispo, dpg):
    self.name = name
    self.company = company
    self.type = type
    self.thc = thc
    self.prices = prices
    self.dispo = dispo
    self.dpg = dpg
    

In [72]:
# Takes in the details and prices elements and parses through to find the info
# Returned as a single product
def parseProductElements(details_element, price_element):

    _weight = ""
    _price = ""
    _oldprice = ""
    _discount = ""
        
    name_element = details_element.find_element(By.CLASS_NAME, "mobile-product-list-item__ProductName-zxgt1n-6")
    name_text = name_element.text 

    try:
        company_element = details_element.find_element(By.CLASS_NAME, "mobile-product-list-item__Brand-zxgt1n-3")
        company_text = company_element.text
    except:
        company_text = "Company Info Not Found"

    try:
        info_element = details_element.find_element(By.CLASS_NAME, "mobile-product-list-item__DetailsContainer-zxgt1n-1")
        info_text = info_element.text
    except:
        info_text = "Info Not Found"
    
    if (price_element.get_attribute("class") == "mobile-product-list-item__MultipleOptionsContainer-zxgt1n-2"): #Multiple Prices
        # prices and weight elements contained inside button inside div container
        div_elements = price_element.find_elements(By.TAG_NAME, "div")
        for div in div_elements:
            button_element = div.find_element(By.CLASS_NAME, "clickable__StyledButton-uqcx8d-0")
            _weight = button_element.find_element(By.CLASS_NAME, "weight-tile__Label-otzu8j-4").text
            prices = button_element.find_elements(By.CLASS_NAME, "weight-tile__PriceText-otzu8j-5")
            if (len(prices) > 1): #product is dicounted
                _discount = button_element.find_element(By.CLASS_NAME, "weight-tile__DiscountLabel-otzu8j-0").text
                for price in prices:
                    element_class = price.get_attribute("class")
                    for c in element_class.split(" "):
                        if (c == "weight-tile__StrikedOutPrice-otzu8j-6"): # discounted element
                            _oldprice = price.text
                        else:
                            _price = price.text

    else: # Only one option/price
        _weight = price_element.find_element(By.CLASS_NAME, "weight-tile__Label-otzu8j-4").text
        prices = price_element.find_elements(By.CLASS_NAME, "weight-tile__PriceText-otzu8j-5")
        if (len(prices) > 1): #product is discounted (new price and old price share the same class)
            _discount = price_element.find_element(By.CLASS_NAME, "weight-tile__DiscountLabel-otzu8j-0").text
            for price in prices:
                element_class = price.get_attribute("class")
                for c in element_class.split(" "):
                    if (c == "weight-tile__StrikedOutPrice-otzu8j-6"): # discounted element
                        _oldprice = price.text
                    else:
                        _price = price.text


    product = Product(
        name=name_text,
        company=company_text,
        type=info_text,
        thc=info_text,
        prices = [
            {
                "weight" : _weight,
                "price" : _price,
                "oldprice" : _oldprice,
                "discount" : _discount
            }
        ],
        dispo="dispo",
        dpg=0
    )

    return product

def oz_to_g(oz):
    if (oz == "1/8"):
        return "3.5"
    elif (oz == "1/4"):
        return "7.0"
    elif (oz == "1/2"):
        return "14.0"
    elif (oz == "1"):
        return "28.0"
    else:
        return "0.01" #N/A

In [73]:
#[3]
# Function to scrape URL and returns list of product elements to later be parsed through
def scrapeURL(driver, dispo, filter): #DONE: Add filter var

    # Connect to URL
    filterURL = dispos[dispo] + filters[filter]

    driver.get(filterURL)
    driver.implicitly_wait(1.5)
    #time.sleep(5)

    # Age verification
    try:
        driver.find_element(By.CLASS_NAME, "age-confirmation-modal__StyledButton-di8wrk-0").click()
    except:
        print("")

    # Smooth scroll to scrub ALL data
    scheight = .1
    while scheight < 9.9:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight/%s);" % scheight)
        scheight += .0075

    # Main element with all products
    main_element = driver.find_element(By.TAG_NAME, "main")
    
    # Searching for the product lists
    element_products = main_element.find_element(By.CLASS_NAME, "product-list__Container-sc-1arkwfu-1")
    element_div = element_products.find_element(By.TAG_NAME, "div")
    element_prod_list = element_div.find_elements(By.CLASS_NAME, "mobile-product-list-item__Container-zxgt1n-0")
    prod_list = element_prod_list[1:len(element_prod_list)-1]

    # Scraping the product details elements
    elements_products_info = []
    ignored_products = 0
    for elem in prod_list:
        # #if zxgt1n-4 exists
        if (len(elem.find_elements(By.CLASS_NAME, "mobile-product-list-item__ProductDetails-zxgt1n-4")) > 0): 
            details_element = elem.find_element(By.CLASS_NAME, "mobile-product-list-item__ProductDetails-zxgt1n-4")
            # Scraping prices (weights, prices, discounts)
            if ( len(elem.find_elements(By.CLASS_NAME, "mobile-product-list-item__MultipleOptionsContainer-zxgt1n-2")) > 0): # product has multiple options/prices
                prices_element = elem.find_element(By.CLASS_NAME, "mobile-product-list-item__MultipleOptionsContainer-zxgt1n-2") 
                elements_products_info.append((details_element, prices_element))
            else:
                prices_element = elem.find_element(By.CLASS_NAME, "clickable__StyledButton-uqcx8d-0")
                elements_products_info.append((details_element, prices_element))
       
        elif (len(elem.find_elements(By.CLASS_NAME, "mobile-product-list-item__ProductInfoContainer-zxgt1n-5")) > 0): #if zxgt1n-5 exists (inner container) && outside product details container
            temp = elem.find_element(By.CLASS_NAME, "mobile-product-list-item__ProductInfoContainer-zxgt1n-5")
            details_element = temp.find_element(By.CLASS_NAME, "mobile-product-list-item__ProductDetails-zxgt1n-4")
            # Scraping prices (weights, prices, discounts)

            if ( len(details_element.find_elements(By.CLASS_NAME, "mobile-product-list-item__MultipleOptionsContainer-zxgt1n-2")) > 0): # product has multiple options/prices
                prices_element = elem.find_element(By.CLASS_NAME, "mobile-product-list-item__MultipleOptionsContainer-zxgt1n-2") 
                elements_products_info.append((details_element, prices_element))
            elif ( len(details_element.find_elements(By.CLASS_NAME, "clickable__StyledButton-uqcx8d-0")) > 0):
                prices_element = elem.find_element(By.CLASS_NAME, "clickable__StyledButton-uqcx8d-0")
                elements_products_info.append((details_element, prices_element))
        
                

        else:
            print("Product scrape failed from ", dispo)
            ignored_products+=1
        
        
    # Parsing the product details elements
    products = []
    for elem in elements_products_info:
        products.append(parseProductElements(elem[0], elem[1]))
    print(len(products), "products scraped from", dispo, "\n", str(ignored_products), "products not scraped")
  
    return products


In [74]:
#[4] Grabbing product elements from urls
products = [] #TODO: Change to "elements"
i=0
for dispo in dispos: #TODO: Fix Happy Valley prices
    dispo_products = scrapeURL(driver, dispo, filter)
    products.extend(dispo_products)
    i+=1
    print("[", i, "/", len(dispos), "] Dispos scraped...")


38 products scraped from Berkshire Roots 
 0 products not scraped
[ 1 / 9 ] Dispos scraped...



NoSuchElementException: Message: no such element: Unable to locate element: {"method":"css selector","selector":".weight-tile__DiscountLabel-otzu8j-0"}
  (Session info: chrome=106.0.5249.119)
Stacktrace:
Backtrace:
	Ordinal0 [0x00D7DF13+2219795]
	Ordinal0 [0x00D12841+1779777]
	Ordinal0 [0x00C2423D+803389]
	Ordinal0 [0x00C53025+995365]
	Ordinal0 [0x00C531EB+995819]
	Ordinal0 [0x00C49531+955697]
	Ordinal0 [0x00C6E844+1108036]
	Ordinal0 [0x00C494B4+955572]
	Ordinal0 [0x00C6EA14+1108500]
	Ordinal0 [0x00C7F192+1175954]
	Ordinal0 [0x00C6E616+1107478]
	Ordinal0 [0x00C47F89+950153]
	Ordinal0 [0x00C48F56+954198]
	GetHandleVerifier [0x01072CB2+3040210]
	GetHandleVerifier [0x01062BB4+2974420]
	GetHandleVerifier [0x00E16A0A+565546]
	GetHandleVerifier [0x00E15680+560544]
	Ordinal0 [0x00D19A5C+1808988]
	Ordinal0 [0x00D1E3A8+1827752]
	Ordinal0 [0x00D1E495+1827989]
	Ordinal0 [0x00D280A4+1867940]
	BaseThreadInitThunk [0x76227BA9+25]
	RtlInitializeExceptionChain [0x7756BB3B+107]
	RtlClearBits [0x7756BABF+191]


In [None]:
#[5] Parsing Elements
prods = []
for element in products:
    prods.append(parseProductElements(element))

In [27]:
#[5] Sorting the products and display function
p = products
p.sort(key=lambda x: x.dpg, reverse=False)

def product_to_string():

        ret = []
        sep = "\n"

        for pr in p:
                parsed_product = pr.name + " [$" + str(pr.price) + "/" + str(pr.weight) + "g] - " + pr.dispo
                ret.append(parsed_product)
                #print(pr.name, pr.price, pr.weight, pr.dispo)
        
        return sep.join(ret)

print(len(p))
print(product_to_string())

183



In [None]:
#[6] Saving to logs

import datetime

date =  datetime.datetime.now()
date_fmt = date.strftime("%Y-%m-%d %I-%M-%p")
file_name = "logs/" + filter + "/" + date_fmt + ".txt"
product_csv = product_to_string()

log = open(file_name, "w")

log.write(date_fmt + "\n\n")
log.write("Dispo Deals\n-----------\n")

#TODO: Add Dispo Home Blurb
log.write("Berkshire Roots:\n")
log.write("Sample block of text\n\n\n")

header = str(len(p)) + " Products (" + filter + "):\n"

log.write(header)

log.write(product_csv)

log.close()