In [2]:
import requests
import re
from bs4 import BeautifulSoup

URL = 'https://www.onedayonly.co.za'
headers = {
    'User-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
}
page = requests.get(URL, headers=headers)

soup = BeautifulSoup(page.content, 'html.parser')

* Savings -  span class "savings" / h6 class "label" & "amount"
* image - img class "image"
* Brand - h2 class "brand"
* shortname - h2 class "shortname"
* short_description - p class "name"
* prices: span class "prices"
    - retail  - h6 class "retail" (Retail: R2,000) --> strip "Retail"
    - selling - h3 class "selling"

In [3]:
home = soup.find(id='home')

In [4]:
products = home.find_all('a', class_='new_product_block')

In [5]:
def str_to_float(s)->float:
    return float(re.sub('[^0-9.\-]','',s))

def extract_brand(product):
    
    brand = product.find('h2', class_='brand').text
    
    assert type(brand) == str,'Found non-text data during brand extraction'
    
    return brand


def extract_prices(product):
    
    prices = product.find('span', class_='prices')
    
    retail = prices.find('h6', class_='retail')
    selling = prices.find('h3', class_='selling')
    
    if selling:
        selling = str_to_float(selling.text)
    else:
        selling = None
    
    if retail:
        retail = str_to_float(retail.text)
    else:
        retail = None
    
    return {
        'retail': retail,
        'selling': selling
    }


def extract_savings(product):
    
    savings = product.find("span", class_='savings')
    
    if savings:
        return savings.find('h6', class_='amount').text
    
    else:
        return None


def extract_product_info(product):
    
    product_obj = {}
    
    product_obj['brand'] = extract_brand(product)
    product_obj['image_url'] = product.find('img',class_='image').attrs['src']
    product_obj['shortname'] = product.find('h2',class_='shortname').text
    product_obj['short_desc'] = product.find('p',class_='name').text
    
    prices = extract_prices(product)
    
    product_obj['price_retail'] = prices['retail']
    product_obj['price_selling'] = prices['selling']
    
    product_obj['savings'] = extract_savings(product)
    
    return product_obj

In [6]:
product_info = {'data': [], 'meta': {'date': '2020-05-01'}}
for product in products:
    product_info['data'].append(extract_product_info(product))

In [7]:
import pandas as pd

df = pd.DataFrame(product_info['data'])
df.head()

Unnamed: 0,brand,image_url,shortname,short_desc,price_retail,price_selling,savings
0,Roxy,https://cdni.onedayonly.co.za/catalog/product/...,Roxy 2.2mm Springsuit,Ladies 2.2mm Prologue Short Sleeve Back Zip Sp...,1500.0,799.0,47%
1,Metal,https://cdni.onedayonly.co.za/catalog/product/...,Metal Shelf Unit,4 or 5 Tier Heavy-Duty Metal Shelf Unit (Max. ...,1500.0,749.0,50%
2,Harman Kardon,https://cdni.onedayonly.co.za/catalog/product/...,Harman Kardon Sound Bar & Subwoofer,HK SB20 Bluetooth Sound Bar with Subwoofer,6400.0,4999.0,"R1,400"
3,Michael Beaumont,https://cdni.onedayonly.co.za/catalog/product/...,Michael Beaumont The Accidental Mayor,The Accidental Mayor: Herman Mashaba and the B...,260.0,179.0,31%
4,Segway - Ninebot,https://cdni.onedayonly.co.za/catalog/product/...,Segway - Ninebot ES2 Kickscooter,ES2 Kickscooter,12500.0,10499.0,"R2,000"


In [8]:
df.sort_values(by='savings')

Unnamed: 0,brand,image_url,shortname,short_desc,price_retail,price_selling,savings
122,Princess,https://cdni.onedayonly.co.za/catalog/product/...,Princess Junior Tracksuits,Unisex Full Junior Tracksuits,800.0,599.0,25%
131,Lexmark,https://cdni.onedayonly.co.za/catalog/product/...,Lexmark Printer Cartridges,Printer Cartridges,2400.0,1799.0,25%
114,Nina,https://cdni.onedayonly.co.za/catalog/product/...,Nina Gas Lift Bar Chair,PVC Gas Lift Bar Chair,600.0,449.0,25%
94,Viru-Guard,https://cdni.onedayonly.co.za/catalog/product/...,Viru-Guard Starter Pack,Phone and Surface Disinfectant Starter Pack,400.0,299.0,25%
135,Dromex,https://cdni.onedayonly.co.za/catalog/product/...,Dromex Thermal Jacket,Storm Wetpro or Apollo Reflective Water-Resist...,1100.0,829.0,25%
...,...,...,...,...,...,...,...
24,Aloe Unique,https://cdni.onedayonly.co.za/catalog/product/...,Aloe Unique Serum,30ml Age Defying Serum,,299.0,
28,FZK,https://cdni.onedayonly.co.za/catalog/product/...,FZK Non-Touch Thermometer,Non-Contact Infrared Digital Thermometer,,699.0,
48,Milex,https://cdni.onedayonly.co.za/catalog/product/...,Milex 1.7L Glass Kettle,1.7L New York Cordless Glass Kettle,,799.0,
95,Recova,https://cdni.onedayonly.co.za/catalog/product/...,Recova After Party Drink,Pack of 12 The Ultimate After Party Drink (340...,,299.0,


In [9]:
from datetime import datetime

now = datetime.now()

now.date()

datetime.date(2020, 6, 8)

In [10]:
f = '../data/{}.csv'.format(str(now.date()))
print(f)
df.to_csv(f)

../data/2020-06-08.csv
