In [71]:
import requests
import re
from bs4 import BeautifulSoup

import numpy as np
import pandas as pd

In [72]:
# Landing page of the shop; the product links are collected from this page.
URL = 'https://www.contemporaryceramics.uk/shop/'

# Without a timeout requests waits indefinitely on an unresponsive server,
# which would hang the notebook. raise_for_status() turns an HTTP error
# response into an immediate exception instead of letting the error page be
# parsed as if it were the shop listing.
page = requests.get(URL, timeout=30)
page.raise_for_status()

In [73]:
# Parse the raw bytes of the downloaded page with Python's built-in HTML parser.
soup = BeautifulSoup(page.content, features='html.parser')

In [74]:
# First element whose id attribute is "products_list" (None when absent).
main_page = soup.find(attrs={'id': 'products_list'})

Get all unique URLs contained in the "SHOP" section. We will then loop through these, requesting each page and extracting the relevant parameters for each pot.

In [75]:
# Fail with a clear message when the listing container is missing, rather
# than with an AttributeError on None further down.
if main_page is None:
    raise ValueError("No element with id 'products_list' was found on the shop page")

# Dots are escaped so that they only match a literal "." in the host name.
product_href = re.compile(r"^https://www\.contemporaryceramics\.uk/product/")

# The same product URL can be matched by more than one anchor.
# dict.fromkeys() drops the repeats while keeping first-seen order, so each
# product is fetched (and stored) only once.
links = list(dict.fromkeys(
    anchor.get('href')
    for anchor in main_page.find_all('a', attrs={'href': product_href})
))

Loop through all pots to obtain all features.

In [76]:
# Empty frame that fixes the expected column names before any product is
# scraped; rows are added by the scraping loop.
df = pd.DataFrame(
    columns=[
        # fields read from the fixed parts of a product page
        'ProductCode', 'Price', 'Maker', 'Name', 'Description',
        # fields read from the product's attribute table
        'Decoration', 'Depth', 'Firing Technique', 'Height',
        'Materials', 'Signed', 'Technique', 'Width',
    ]
)

In [77]:
def _clean_product_code(raw_text, prefix="PRODUCT CODE:"):
    """Return the product code with its leading label removed.

    ``str.strip(chars)`` treats ``chars`` as a *set* of characters, so
    ``text.strip("PRODUCT CODE:")`` also removes any leading or trailing code
    characters that happen to be in that set (e.g. "CD12" would become "12").
    Only the exact leading ``prefix`` is removed here, followed by the
    surrounding whitespace.
    """
    text = raw_text.strip()
    if text.startswith(prefix):
        text = text[len(prefix):]
    return text.strip()


# One dict per product. Building the frame once at the end avoids re-copying
# the whole DataFrame on every iteration.
records = []

for product_url in links:
    # Loop-local names are used so the notebook-level URL / page / soup of the
    # shop listing are not overwritten by the last product visited.
    product_page = requests.get(product_url, timeout=30)  # never wait forever
    product_page.raise_for_status()  # surface HTTP errors immediately

    product_soup = BeautifulSoup(product_page.content, 'html.parser')
    result = product_soup.find('div', class_="ProductDetail")

    productCode = _clean_product_code(result.find('span', class_='sku').text)
    price = result.find('span', class_='woocommerce-Price-amount amount').text

    # The maker element is optional; record NaN when the page has none.
    maker_tag = result.find('div', class_='maker')
    maker = maker_tag.text if maker_tag is not None else np.nan

    name = result.find('h1').text
    description = result.find(
        'div', class_='woocommerce-product-details__short-description'
    ).find('p').text

    attributes = {'ProductCode': productCode, 'Price': price, 'Maker': maker,
                  'Name': name, 'Description': description}

    # Each "attribute" div holds a label cell and a value cell.
    for row in result.find_all('div', class_='attribute'):
        label = row.find('div', class_='col-md-6 col-xs-6 NoPadding')
        value = row.find('div', class_='col-md-6 col-xs-6 NoPadding ProductAttributesInput')
        # [:-1] drops the label's last character (presumably a trailing
        # colon -- confirm against the page markup).
        attributes[label.text[:-1]] = value.text

    records.append(attributes)

if records:
    # df.iloc[:0] keeps the column schema but no rows, so re-running this cell
    # replaces the scraped rows instead of appending them a second time.
    # sort=True orders the union of columns alphabetically.
    df = pd.concat([df.iloc[:0], pd.DataFrame(records)], axis=0, sort=True)
    

In [86]:
# Use the product code as the row index; set_index returns a new frame, so
# df itself is left unchanged.
data = df.set_index(keys="ProductCode")


In [87]:
def _strip_to_number(column, *tokens):
    """Remove each literal token from a text column and convert it to numbers.

    A column that is already numeric is returned unchanged, so this cell can
    be re-run safely (the ``.str`` accessor raises on numeric columns).
    ``regex=False`` makes the replacement literal regardless of the pandas
    version's default. ``pd.to_numeric`` still raises if a value cannot be
    parsed after the tokens are removed.
    """
    if pd.api.types.is_numeric_dtype(column):
        return column
    for token in tokens:
        column = column.str.replace(token, '', regex=False)
    return pd.to_numeric(column)


# Dimensions carry a " cm" suffix in the scraped text.
for dimension in ("Depth", "Height", "Width"):
    data[dimension] = _strip_to_number(data[dimension], ' cm')

# Prices carry a pound sign and may contain "," separators.
data["Price"] = _strip_to_number(data["Price"], '£', ',')

In [89]:
# Keep only the listed columns, in this order; any other column is dropped.
data = data.loc[:, [
    "Price",
    "Maker",
    "Name",
    "Description",
    "Height",
    "Width",
    "Depth",
    "Materials",
    "Technique",
    "Signed",
    "Firing Technique",
    "Decoration",
]]

In [91]:
# Write the cleaned table, including its index, to Pots.csv in the current
# working directory.
data.to_csv(path_or_buf="Pots.csv")