In [1]:
import requests
import re
from bs4 import BeautifulSoup

import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Shop landing page; the product links are extracted from this page further down.
URL = 'https://www.contemporaryceramics.uk/shop/'
# A timeout stops the notebook from hanging indefinitely if the server never answers.
page = requests.get(URL, timeout=30)
# Stop here on an HTTP error (4xx/5xx) rather than parsing an error page later on.
page.raise_for_status()

In [3]:
# Parse the downloaded page body using Python's built-in 'html.parser' backend.
soup = BeautifulSoup(page.content, 'html.parser')

In [4]:
# Select the element whose id is 'products_list'; find() returns None when no such element exists.
main_page = soup.find(id='products_list')

Get all unique URLs contained in the "SHOP" section. We will then loop through these, requesting each page and extracting the relevant parameters for each pot.

In [5]:
# Fail early with a clear message if the product list container was not found,
# instead of an opaque AttributeError on None below.
if main_page is None:
    raise ValueError("Could not find the element with id='products_list' on the shop page")

# Only anchors pointing at a product page are of interest. The dots are escaped so
# that they match a literal '.' rather than any character.
product_link_pattern = re.compile(r"^https://www\.contemporaryceramics\.uk/product/")
product_anchors = main_page.find_all('a', attrs={'href': product_link_pattern})

# dict.fromkeys drops repeated URLs while keeping first-seen order, so a product
# that is linked more than once is only requested (and stored) once.
links = list(dict.fromkeys(anchor.get('href') for anchor in product_anchors))

#Inspect to see if the list looks appropriate
print(len(links))
# Show one arbitrary entry as a spot check, guarded so a short list cannot raise IndexError.
sample_index = 123
if len(links) > sample_index:
    print(links[sample_index])

1201
https://www.contemporaryceramics.uk/product/oval-bottle-vase-2/


Load up the pots.csv to append new rows of data

Loop through all pots to obtain all features.

In [6]:
# Columns expected in the scraped table: the core product fields first,
# followed by the per-product attribute fields.
pot_columns = [
    'ProductCode', 'Price', 'Maker', 'Name', 'Description',
    'Decoration', 'Depth', 'Firing Technique', 'Height', 'Materials',
    'Signed', 'Technique', 'Width',
]

# Start from an empty frame that already carries every expected column.
df = pd.DataFrame(columns=pot_columns)

In [7]:
def _text_or_nan(tag):
    """Return ``tag.text``, or NaN when ``tag`` is None (i.e. the element was not found)."""
    return np.nan if tag is None else tag.text


def _scrape_pot(session, product_url):
    """Download one product page and return its scraped fields as a flat dict.

    Parameters
    ----------
    session : requests.Session
        Session used to issue the GET request (reused across pages).
    product_url : str
        URL of the product page.

    Returns
    -------
    dict
        Always contains 'ProductCode', 'Price', 'Maker', 'Name' and 'Description'
        (NaN when the corresponding element is missing from the page), plus one
        entry per attribute block found on the page.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    ValueError
        If the page has no 'ProductDetail' block.
    """
    response = session.get(product_url, timeout=30)
    response.raise_for_status()

    detail = BeautifulSoup(response.content, 'html.parser').find('div', class_="ProductDetail")
    if detail is None:
        raise ValueError("No 'ProductDetail' block found on {}".format(product_url))

    # Scrape product code, price and maker.
    product_code = _text_or_nan(detail.find('span', class_='sku'))
    if isinstance(product_code, str):
        # Remove the label as a prefix. str.strip("PRODUCT CODE:") would treat its
        # argument as a set of characters and also eat matching letters off both
        # ends of the code itself.
        product_code = re.sub(r'^\s*PRODUCT CODE:\s*', '', product_code).strip()

    # Not every element is present on every page (some pots are makerless, for
    # example), so each lookup falls back to NaN instead of raising.
    short_description = detail.find('div', class_='woocommerce-product-details__short-description')
    fields = {
        'ProductCode': product_code,
        'Price': _text_or_nan(detail.find('span', class_='woocommerce-Price-amount amount')),
        'Maker': _text_or_nan(detail.find('div', class_='maker')),
        'Name': _text_or_nan(detail.find('h1')),
        'Description': _text_or_nan(
            short_description.find('p') if short_description is not None else None
        ),
    }

    # Attributes are contained in a separate HTML block. Loop through it and add
    # each label/value pair to the dictionary above.
    for attribute_block in detail.find_all('div', class_='attribute'):
        label = attribute_block.find('div', class_='col-md-6 col-xs-6 NoPadding')
        value = attribute_block.find('div', class_='col-md-6 col-xs-6 NoPadding ProductAttributesInput')
        if label is None or value is None:
            continue
        # [:-1] drops the label's final character (presumably a trailing ':' - TODO confirm).
        fields[label.text[:-1]] = value.text

    return fields


# Collect one dict per pot and build the frame once at the end; concatenating
# inside the loop copies the whole table on every iteration.
records = []
with requests.Session() as session:
    for product_url in links:
        records.append(_scrape_pot(session, product_url))

# Pandas automatically fills in NaN for missing attributes (not all pots have the
# same attributes). Keep every pre-declared column even if no pot supplied it, plus
# any attribute that was not anticipated, in sorted column order. Casting to object
# keeps all-NaN columns usable with the .str accessor in the cleaning step.
scraped = pd.DataFrame(records)
df = scraped.reindex(columns=sorted(set(df.columns) | set(scraped.columns))).astype(object)

Basic data cleaning to ensure the data types are appropriate. This dataset will then be cleaned further in another notebook.

In [9]:
# Work on a copy so the scraped frame `df` is left untouched by the cleaning steps.
data = df.copy()

In [10]:
# Dimensions: remove the ' cm' unit text, then parse what is left as a number.
for dimension in ("Depth", "Height", "Width"):
    data[dimension] = pd.to_numeric(data[dimension].str.replace(' cm', ''))

# Price: remove the pound sign and the comma separators before parsing.
price_text = data["Price"].str.replace('£', '').str.replace(',', '')
data["Price"] = pd.to_numeric(price_text)

In [11]:
# Use the product code as the row index. set_index moves the column into the index,
# so running this cell a second time on the same `data` raises KeyError.
data = data.set_index('ProductCode')

In [12]:
# Reorder for interpretability: price and identity first, then dimensions,
# then the remaining descriptive attributes.
column_order = [
    "Price", "Maker", "Name", "Description",
    "Height", "Width", "Depth",
    "Materials", "Technique", "Signed", "Firing Technique", "Decoration",
]
data = data[column_order]

In [14]:
# Preview the first five rows of the cleaned table.
data.head(5)

Unnamed: 0_level_0,Price,Maker,Name,Description,Height,Width,Depth,Materials,Technique,Signed,Firing Technique,Decoration
ProductCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LK256Y452,280.0,Lisa Katzenstein,gourd vase nasturtiums,This exuberant earthenware vase is slip cast a...,26.0,14.0,,Earthenware,Slipcast,Signed,,
LK256Y456,380.0,Lisa Katzenstein,tall twist vase peapods,This exuberant earthenware vase is slip cast a...,41.0,15.5,7.5,Earthenware,Slipcast,Signed,,
LK256Y455,300.0,Lisa Katzenstein,spear vase succulents,This exuberant earthenware vase is slip cast a...,32.0,13.0,,Earthenware,Slipcast,Signed,,
LK256Y454,300.0,Lisa Katzenstein,spear vase chillies,This exuberant earthenware vase is slip cast a...,32.0,13.0,,Earthenware,Slipcast,Signed,,
LK256Y453,280.0,Lisa Katzenstein,gourd vase honesty,This exuberant earthenware vase is slip cast a...,26.0,14.0,,Earthenware,Slipcast,Signed,,


In [15]:
# Write the cleaned table to "Pots.csv" (a relative path); by default to_csv also writes the index as the first column.
data.to_csv("Pots.csv")