In [1]:
import requests
import re
from bs4 import BeautifulSoup

URL = 'https://www.onedayonly.co.za'
headers = {
    'User-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
}
page = requests.get(URL, headers=headers)

soup = BeautifulSoup(page.content, 'html.parser')

**Breakdown**

* a-tag, class='gallery-item'
    - div (Parent)
        - div, class='image' --> **Product Image**
    - div, class='css-*-GalleryProduct
        - h2, color=black --> **Brand**
        - h2, color=darkGrey --> **Product Name**
    - div (Parent)
        - h2, color=black --> **Price**
        - h2, color=darkGrey --> **Retail Price**


In [2]:
products = soup.find_all('a', class_='gallery-item')

### Extract Product Detail

* Brand
* Title

In [3]:
import re

def extract_product_detail(product_tag):

    product_divs = product_tag.find_all('div', {'class': 'css-13ylwge-GalleryProduct'})
    
    brand, title = None, None
    for div in product_divs:
        h2_tags = div.select('div h2')
        if h2_tags:
            assert len(h2_tags) == 2, 'Found more than 2 h2 tags'
            brand = h2_tags[0].get_text()
            title = h2_tags[1].get_text()
            
    return brand, title

In [16]:
import numpy as np
# idx = np.random.randint(len(products))
idx = 30
idx

30

In [17]:
brand, title = extract_product_detail(products[idx])
brand, title

('Guard', 'Reusable Face Masks')

'/products/pack-of-4-reusable-triple-layer-fabric-face-masks-20201215'

In [20]:
def str_to_float(s)->float:
    return float(re.sub('[^0-9.\-]','',s))

def extract_prices(product_tag):
    
    prices_div = product_tag.find('div', {'class': 'css-zzliqo'})
    
    if prices_div is None:
        print('Could not find prices div')
        return None, None
    
    
    try:
        retail = (prices_div
                  .find('h2', {'color': 'darkGrey', 'class': re.compile('css-.*-GalleryProduct')})
                  .get_text())
        retail = str_to_float(retail)
    except AttributeError:
        retail = None
    
    
    try:
        selling = (prices_div
                  .find('h2', {'color': 'black', 'class': re.compile('css-.*-GalleryProduct')})
                  .get_text())
        selling = str_to_float(selling)
    except AttributeError:
        selling = None
    
    
    return retail, selling

In [21]:
retail, selling = extract_prices(products[idx])
retail, selling

(None, 109.0)

## Combine and run for all

In [27]:

def extract_product_info(product):
    
    product_obj = {}
    
    brand, title = extract_product_detail(product)
    
    product_obj['brand'] = brand
    product_obj['shortname'] = title
    
    retail, selling = extract_prices(product)
    
    product_obj['price_retail'] = retail
    product_obj['price_selling'] = selling
    
    product_obj['url'] = 'www.onedayonly.co.za' + product.get('href')
    
    return product_obj

In [28]:
product_info = {'data': []}
for i, product in enumerate(products):
    print(i)
    if not product or ('onenightonly' in product.get('href')):
        continue
    product_info['data'].append(extract_product_info(product))

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81


In [51]:
import pandas as pd

pd.set_option('display.max_colwidth', 1000)

df = pd.DataFrame(product_info['data'])
df.head()

Unnamed: 0,brand,shortname,price_retail,price_selling,url
0,Bose,QuietComfort 35 Series II Wireless Headphones,10000.0,5499.0,www.onedayonly.co.za/products/bose-quietcomfort-35-series-ii-120919-20201209
1,Global,14cm SAI Cooks Knife,1550.0,1149.0,www.onedayonly.co.za/products/14cm-sai-cooks-knife-20201215
2,M&M's,Pack of 24 Candy Bags,670.0,549.0,www.onedayonly.co.za/products/pack-of-24-candy-bags
3,Soul Beauty,Face Roller & Gua Sha Set,700.0,399.0,www.onedayonly.co.za/products/roll-play-100-crystal-face-roller-and-gua-sha-set-20201210
4,O'Neill,Salt Water Solid Bikini Set,700.0,349.0,www.onedayonly.co.za/products/salt-water-solid-bikini-set-3-20201215


In [52]:
df

Unnamed: 0,brand,shortname,price_retail,price_selling,url
0,Bose,QuietComfort 35 Series II Wireless Headphones,10000.0,5499.0,www.onedayonly.co.za/products/bose-quietcomfort-35-series-ii-120919-20201209
1,Global,14cm SAI Cooks Knife,1550.0,1149.0,www.onedayonly.co.za/products/14cm-sai-cooks-knife-20201215
2,M&M's,Pack of 24 Candy Bags,670.0,549.0,www.onedayonly.co.za/products/pack-of-24-candy-bags
3,Soul Beauty,Face Roller & Gua Sha Set,700.0,399.0,www.onedayonly.co.za/products/roll-play-100-crystal-face-roller-and-gua-sha-set-20201210
4,O'Neill,Salt Water Solid Bikini Set,700.0,349.0,www.onedayonly.co.za/products/salt-water-solid-bikini-set-3-20201215
...,...,...,...,...,...
76,Air Scents,Pack of 3 Reed Diffusers,300.0,249.0,www.onedayonly.co.za/products/pack-of-3-100ml-brown-reed-diffuser-with-rattan-sticks-20201215
77,Fine Living,White Laptop Desk Stand,500.0,299.0,www.onedayonly.co.za/products/white-laptop-desk-stand-502764
78,Solar,Dual USB Power Bank,300.0,199.0,www.onedayonly.co.za/products/dual-usb-outdoor-power-bank-multiple-colours-available-16092019-20201215
79,Remedy Health,Earwax Removal Swab's,200.0,129.0,www.onedayonly.co.za/products/earwax-removal-swab-with-16-replacement-heads-502674


In [None]:
df.sort_values(by='price_retail')

In [None]:
products[38]

In [None]:
from datetime import datetime

now = datetime.now()

now.date()

In [None]:
f = '../data/{}.csv'.format(str(now.date()))
print(f)
df.to_csv(f)