In [27]:
import requests
import re
from bs4 import BeautifulSoup

URL = 'https://www.onedayonly.co.za'
# Send a desktop-browser User-Agent with the request.
# NOTE(review): presumably the site serves different or blocked content to the
# default python-requests agent -- confirm before removing.
headers = {
    'User-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
}
# Bound the wait so a stalled connection cannot hang the kernel forever, and
# fail loudly on an HTTP error instead of parsing an error page as the shop.
page = requests.get(URL, headers=headers, timeout=30)
page.raise_for_status()

# Parse the raw response bytes with the standard-library HTML parser backend.
soup = BeautifulSoup(page.content, 'html.parser')

**Breakdown**

* a-tag, class='gallery-item'
    - div (Parent)
        - div, class='image' --> **Product Image**
    - div, class='css-*-GalleryProduct'
        - h2, color=black --> **Brand**
        - h2, color=darkGrey --> **Product Name**
    - div (Parent)
        - h2, color=black --> **Price**
        - h2, color=darkGrey --> **Retail Price**


In [28]:
# Every product tile on the page is an anchor carrying the 'gallery-item' class.
products = soup.find_all('a', {'class': 'gallery-item'})

### Extract Product Detail

* Brand
* Title

In [29]:
import re

def extract_product_detail(product_tag):
    """Return the ``(brand, title)`` text of one product tile.

    Searches ``product_tag`` for every ``div`` whose class is
    ``css-13ylwge-GalleryProduct`` and, inside each, for the ``h2`` elements
    matched by the ``'div h2'`` selector. The first heading is taken as the
    brand and the second as the title. When several divs contain headings,
    the values from the last such div are returned.

    Args:
        product_tag: a parsed tag exposing ``find_all``; the divs it returns
            must expose ``select``.

    Returns:
        A ``(brand, title)`` tuple of strings, or ``(None, None)`` when no
        matching div contains any ``h2``.

    Raises:
        AssertionError: a matching div contains headings, but not exactly two.
    """
    # NOTE(review): the 'css-13ylwge' prefix looks like a generated hash and
    # may change when the site is redeployed -- confirm.
    product_divs = product_tag.find_all('div', {'class': 'css-13ylwge-GalleryProduct'})

    brand, title = None, None
    for div in product_divs:
        h2_tags = div.select('div h2')
        if h2_tags:
            # The check fires for too few as well as too many headings, so the
            # message reports the actual count rather than claiming "more than 2".
            assert len(h2_tags) == 2, (
                'Expected exactly 2 h2 tags, found {}'.format(len(h2_tags)))
            brand = h2_tags[0].get_text()
            title = h2_tags[1].get_text()

    return brand, title

In [30]:
import numpy as np
# Draw one random position in [0, len(products)) to spot-check the extractors.
idx = np.random.randint(0, len(products))
idx

34

In [31]:
# Spot check: run the brand/title extractor on the randomly chosen product tile.
brand, title = extract_product_detail(products[idx])
brand, title

('Homemark', 'Invisible Strapless Bra')

In [32]:
def str_to_float(s)->float:
    """Convert a price-like string to a float.

    Every character other than a digit, ``.`` or ``-`` is removed (currency
    symbols, thousands separators, whitespace) and the remainder is passed
    to ``float``.

    Raises:
        ValueError: the remaining text is not a valid float literal, e.g. it
            is empty or contains more than one ``.``.
    """
    # Raw string: '\-' inside a normal literal is an invalid escape sequence,
    # which newer Python versions warn about. The regex itself is unchanged.
    return float(re.sub(r'[^0-9.\-]', '', s))

def extract_prices(product_tag):
    """Return the ``(retail, selling)`` prices of one product tile as floats.

    The prices are read from the ``div`` with class ``css-zzliqo`` inside
    ``product_tag``: the ``h2`` with ``color='darkGrey'`` is the retail price
    and the ``h2`` with ``color='black'`` is the selling price. Both headings
    must also carry a class matching ``css-.*-GalleryProduct``.

    Returns:
        ``(retail, selling)``. A price whose heading is not present is
        returned as ``None``; if the prices div itself is missing, a message
        is printed and ``(None, None)`` is returned.

    Raises:
        ValueError: a heading is present but its text cannot be parsed by
            ``str_to_float``.
    """
    prices_div = product_tag.find('div', {'class': 'css-zzliqo'})

    if prices_div is None:
        print('Could not find prices div')
        return None, None

    price_class = re.compile('css-.*-GalleryProduct')

    def _price_for(color):
        # find() yields None when the tile has no heading of this colour
        # (for example, no retail price is shown); report that as a missing
        # price instead of failing with AttributeError on .get_text().
        heading = prices_div.find('h2', {'color': color, 'class': price_class})
        if heading is None:
            return None
        return str_to_float(heading.get_text())

    retail = _price_for('darkGrey')
    selling = _price_for('black')

    return retail, selling

In [33]:
# Spot check: run the price extractor on the same randomly chosen product tile.
retail, selling = extract_prices(products[idx])
retail, selling

(300.0, 199.0)

## Combine and run for all

In [34]:

def extract_product_info(product):
    """Collect the brand, short name and both prices of one product tile.

    Delegates to ``extract_product_detail`` and ``extract_prices`` and returns
    their results as a dict with the keys ``brand``, ``shortname``,
    ``price_retail`` and ``price_selling``.
    """
    brand_name, short_name = extract_product_detail(product)
    retail_price, selling_price = extract_prices(product)

    return {
        'brand': brand_name,
        'shortname': short_name,
        'price_retail': retail_price,
        'price_selling': selling_price,
    }

In [35]:
# Build one record per product tile, skipping links that point at the
# 'onenightonly' section.
product_info = {'data': []}
for product in products:
    if not product:
        continue
    # An anchor without an href makes .get() return None; fall back to ''
    # so the substring test cannot raise TypeError.
    href = product.get('href') or ''
    if 'onenightonly' in href:
        continue
    product_info['data'].append(extract_product_info(product))

In [36]:
import pandas as pd

# One row per scraped product; the columns come from the keys of the dicts
# appended to product_info['data'].
df = pd.DataFrame(product_info['data'])
df.head()

Unnamed: 0,brand,shortname,price_retail,price_selling
0,La Germania,Europa 90cm 5-Burner Oven,16200.0,12499.0
1,Canterbury,Men's Pro Dry Tee,500.0,149.0
2,The Fox Tan,Candy Oil + Sunbed Accelerator,1020.0,899.0
3,Picnic Time,Beach Bats & Balls,200.0,139.0
4,Destiny,Swarovski Birthstone Earrings,250.0,179.0


In [37]:
# Show the whole frame; the rendered output elides the middle rows with '...'.
df

Unnamed: 0,brand,shortname,price_retail,price_selling
0,La Germania,Europa 90cm 5-Burner Oven,16200.0,12499.0
1,Canterbury,Men's Pro Dry Tee,500.0,149.0
2,The Fox Tan,Candy Oil + Sunbed Accelerator,1020.0,899.0
3,Picnic Time,Beach Bats & Balls,200.0,139.0
4,Destiny,Swarovski Birthstone Earrings,250.0,179.0
...,...,...,...,...
82,Leonardo,Set of 2 Perla Glasses,670.0,399.0
83,Leonardo,Serving Platter with Bowls,1050.0,579.0
84,WDR,1080P Dual Camera Dash Cam,800.0,499.0
85,Salus,Camera Backpack,600.0,349.0


In [None]:
# Display the products ordered by retail price; the result is not assigned, so df keeps its order.
df.sort_values(by='price_retail')

In [None]:
# Inspect the raw tag of a single product tile.
# NOTE(review): 38 is a hard-coded position; it raises IndexError when fewer
# than 39 tiles were scraped, and it points at a different product on each scrape.
products[38]

In [None]:
from datetime import datetime

# Capture the current timestamp once; its date part names the CSV export below.
now = datetime.now()

now.date()

In [None]:
from pathlib import Path

# One CSV per scrape date, e.g. ../data/<YYYY-MM-DD>.csv
f = '../data/{}.csv'.format(str(now.date()))
# The export fails if the target folder is missing, so create it first
# (a no-op when it already exists).
Path(f).parent.mkdir(parents=True, exist_ok=True)
print(f)
df.to_csv(f)