In [2]:
import requests
import re
from bs4 import BeautifulSoup

URL = 'https://www.onedayonly.co.za'
# Browser-like User-Agent header sent along with the request.
headers = {
    'User-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
}
# Bound the request so a stalled connection cannot hang the notebook forever,
# and fail loudly on an HTTP error instead of silently parsing an error page.
page = requests.get(URL, headers=headers, timeout=30)
page.raise_for_status()

# Parse the raw response body with the built-in HTML parser.
soup = BeautifulSoup(page.content, 'html.parser')

**Breakdown**

* a-tag, class='gallery-item'
    - div (Parent)
        - div, class='image' --> **Product Image**
    - div, class='css-*-GalleryProduct'
        - h2, color=black --> **Brand**
        - h2, color=darkGrey --> **Product Name**
    - div (Parent)
        - h2, color=black --> **Price**
        - h2, color=darkGrey --> **Retail Price**


In [3]:
# Collect every <a> tag carrying the 'gallery-item' class; each one is treated as a product below.
products = soup.find_all('a', class_='gallery-item')

### Extract Product Detail

* Brand
* Title

In [23]:
import re

def extract_product_detail(product_tag):
    """Extract the brand and product name from a product tag.

    Searches ``product_tag`` for ``div`` elements whose class matches
    ``css-.*-GalleryProduct`` and reads the two ``h2`` tags found in each:
    the first as the brand, the second as the title. If several matching
    divs contain ``h2`` tags, the values from the last one win.

    Args:
        product_tag: A tag-like object exposing ``find_all`` (for example one
            of the ``gallery-item`` anchors).

    Returns:
        A ``(brand, title)`` tuple. Both entries are ``None`` when no
        matching div with ``h2`` tags is found.

    Raises:
        AssertionError: If a matching div holds ``h2`` tags but not exactly
            two of them.
    """
    product_divs = product_tag.find_all('div', {'class': re.compile('css-.*-GalleryProduct')})

    brand, title = None, None
    for div in product_divs:
        h2_tags = div.select('div h2')
        if h2_tags:
            # The check fails for a single h2 as well as for three or more,
            # so the message reports the actual count instead of "more than 2".
            assert len(h2_tags) == 2, 'Expected exactly 2 h2 tags, found {}'.format(len(h2_tags))
            brand = h2_tags[0].get_text()
            title = h2_tags[1].get_text()

    return brand, title

In [28]:
import numpy as np
# Pick a random product index to spot-check the extractors (unseeded, so it differs between runs).
idx = np.random.randint(len(products))
idx

33

In [29]:
# Try the detail extractor on the randomly chosen product and show the result.
brand, title = extract_product_detail(products[idx])
brand, title

('Phoenix', 'Outdoor Sleeping Bag')

In [41]:
def str_to_float(s)->float:
    """Convert a price-like string to a float.

    Every character other than a digit, a dot or a minus sign is dropped
    before conversion, so currency symbols and thousands separators are
    ignored (e.g. ``'R16,200'`` becomes ``16200.0``).

    Args:
        s: Text containing a number.

    Returns:
        The numeric value as a float.

    Raises:
        ValueError: If what remains after stripping is not a valid float
            (for example an empty string).
    """
    # Raw string with '-' placed last in the class: same character set as
    # before, but without the invalid `\-` escape in a normal string literal.
    return float(re.sub(r'[^0-9.-]', '', s))

def _extract_price(prices_div, color):
    """Return the price from the h2 with the given ``color`` attribute, or None if that h2 is absent."""
    h2 = prices_div.find('h2', {'color': color, 'class': re.compile('css-.*-GalleryProduct')})
    if h2 is None:
        return None
    return str_to_float(h2.get_text())


def extract_prices(product_tag):
    """Extract the retail and selling prices from a product tag.

    Looks up the ``div`` with class ``css-zzliqo`` inside ``product_tag`` and
    reads two ``h2`` tags from it: the one with ``color='darkGrey'`` as the
    retail price and the one with ``color='black'`` as the selling price.

    Args:
        product_tag: A tag-like object exposing ``find`` (for example one of
            the ``gallery-item`` anchors).

    Returns:
        A ``(retail, selling)`` tuple of floats. Both are ``None`` (and a
        message is printed) when the prices div is missing. An individual
        entry is ``None`` when only its ``h2`` is missing, instead of the
        lookup failing with ``AttributeError``.

    Raises:
        ValueError: If a price text contains no parsable number.
    """
    prices_div = product_tag.find('div', {'class': 'css-zzliqo'})

    if prices_div is None:
        print('Could not find prices div')
        return None, None

    retail = _extract_price(prices_div, 'darkGrey')
    selling = _extract_price(prices_div, 'black')

    return retail, selling

In [42]:
# Try the price extractor on the same randomly chosen product and show the result.
retail, selling = extract_prices(products[idx])
retail, selling

(400.0, 299.0)

## Combine and run for all products

In [45]:

def extract_product_info(product):
    """Gather the brand, short name and both prices of one product into a dict.

    Args:
        product: The product tag handed to ``extract_product_detail`` and
            ``extract_prices``.

    Returns:
        A dict with the keys ``brand``, ``shortname``, ``price_retail`` and
        ``price_selling``, in that order.
    """
    # Run the two extractors in the same order as before, then assemble the record in one go.
    brand, title = extract_product_detail(product)
    retail, selling = extract_prices(product)

    return {
        'brand': brand,
        'shortname': title,
        'price_retail': retail,
        'price_selling': selling,
    }

In [50]:
# Build one info dict per product, skipping anchors that link to the onenightonly site.
product_info = {'data': []}
for product in products:
    if not product:
        continue
    # An anchor may have no href at all; fall back to '' so the substring
    # test cannot raise TypeError on None.
    href = product.get('href') or ''
    if 'onenightonly' in href:
        continue
    product_info['data'].append(extract_product_info(product))

In [51]:
import pandas as pd

# Build a DataFrame with one row per extracted product dict and preview the first rows.
df = pd.DataFrame(product_info['data'])
df.head()

Unnamed: 0,brand,shortname,price_retail,price_selling
0,La Germania,Europa 90cm 5-Burner Oven,16200.0,12499.0
1,Canterbury,Men's Pro Dry Tee,500.0,149.0
2,The Fox Tan,Candy Oil + Sunbed Accelerator,1020.0,899.0
3,Picnic Time,Beach Bats & Balls,200.0,139.0
4,Destiny,Swarovski Birthstone Earrings,250.0,179.0


In [52]:
# Display the whole frame (the captured output below is truncated with "..." rows).
df

Unnamed: 0,brand,shortname,price_retail,price_selling
0,La Germania,Europa 90cm 5-Burner Oven,16200.0,12499.0
1,Canterbury,Men's Pro Dry Tee,500.0,149.0
2,The Fox Tan,Candy Oil + Sunbed Accelerator,1020.0,899.0
3,Picnic Time,Beach Bats & Balls,200.0,139.0
4,Destiny,Swarovski Birthstone Earrings,250.0,179.0
...,...,...,...,...
82,Leonardo,Set of 2 Perla Glasses,670.0,399.0
83,Leonardo,Serving Platter with Bowls,1050.0,579.0
84,WDR,1080P Dual Camera Dash Cam,800.0,499.0
85,Salus,Camera Backpack,600.0,349.0


In [53]:
# Show the products ordered by retail price; the sorted frame is only displayed, not assigned.
df.sort_values(by='price_retail')

Unnamed: 0,brand,shortname,price_retail,price_selling
20,Glade,Automatic Spray & Holder,130.0,89.0
19,Ziploc,4 Piece Storage Bag Bundle,140.0,99.0
30,Mr Muscle,Kitchen Cleaning Bundle,150.0,109.0
48,Canterbury,Men's Playing Sock,160.0,99.0
29,Little Artist,Glitter Paint Set,200.0,129.0
...,...,...,...,...
71,Tramontina,13-Piece Braai Set,4800.0,2399.0
22,Google,Nest Hub Max Speaker,7499.0,5999.0
56,The Rug Warehouse,Persian Runner,7990.0,2349.0
23,The Rug Warehouse,Afghan Hand Woven Persian Rug,12000.0,5979.0


In [54]:
# Inspect the raw tag at index 38; in the captured output its href points to onenightonly.co.za.
products[38]

<a class="gallery-item" href="https://onenightonly.co.za/?utm_source=OneDayOnly&amp;utm_medium=referral&amp;utm_content=promo_block" rel="noopener noreferrer" target="_blank"><div class="css-1857olo-fadeAnimations-ImageContainer"><div class="css-1kkgipb-ImageContainer"><style data-emotion-css="14j2g1s-ImageBlock">.css-14j2g1s-ImageBlock{display:block;max-width:100%;max-height:100%;min-width:100%;opacity:1;-webkit-transition-duration:512ms;transition-duration:512ms;-webkit-transition-property:opacity,visibility;transition-property:opacity,visibility;-webkit-transition-timing-function:ease-out;transition-timing-function:ease-out;visibility:visible;background-size:cover;background-position:center;background-image:url(https://cdn.onedayonly.co.za/resources/images/media/1378.png?auto=compress&w=800&h=800&bg=fff&fit=fill);}</style><style data-emotion-css="1fxmn25-ImageBlock">.css-1fxmn25-ImageBlock{box-sizing:border-box;-webkit-flex:1;-ms-flex:1;flex:1;display:block;max-width:100%;max-height

In [55]:
from datetime import datetime

# Capture the current local timestamp; its date part names the CSV written at the end.
now = datetime.now()

now.date()

datetime.date(2020, 12, 25)

In [None]:
from pathlib import Path

# Name the CSV after the captured date (str() of a date is its ISO form, YYYY-MM-DD).
f = '../data/{}.csv'.format(str(now.date()))
# Create the target directory if it is missing so to_csv does not fail on a fresh checkout.
Path(f).parent.mkdir(parents=True, exist_ok=True)
print(f)
df.to_csv(f)