# Scraping Peace With The Wild Website

## Get Product Categories

In [10]:
import requests
from bs4 import BeautifulSoup

url = 'https://www.peacewiththewild.co.uk/product-category/'

categories = []
try:
    response = requests.get(url)

    soup = BeautifulSoup(response.text, 'html.parser')
    categories_list = soup.find('ul', class_='product-categories')

    for category in categories_list.find_all('li'):
        category_link = category.find('a').get('href')
        categories.append(category_link)

except requests.exceptions.RequestException as e:
    print(f"An error occurred: {e}")

# Print the list of categories
for category in categories:
    print(category)

https://www.peacewiththewild.co.uk/product-category/bathroom/
https://www.peacewiththewild.co.uk/product-category/bathroom/dental-care/
https://www.peacewiththewild.co.uk/product-category/bathroom/bath-bombs/
https://www.peacewiththewild.co.uk/product-category/bathroom/bath-bombs/bath-salts/
https://www.peacewiththewild.co.uk/product-category/bathroom/bath-mats/
https://www.peacewiththewild.co.uk/product-category/bathroom/bathroom-cleaning/
https://www.peacewiththewild.co.uk/product-category/bathroom/bathroom-cleaning/toilet-brushes/
https://www.peacewiththewild.co.uk/product-category/bathroom/body-brushes/
https://www.peacewiththewild.co.uk/product-category/bathroom/body-wash/
https://www.peacewiththewild.co.uk/product-category/bathroom/body-wash/body-wash-bars-cubes/
https://www.peacewiththewild.co.uk/product-category/bathroom/body-wash/body-wash-foam-powder/
https://www.peacewiththewild.co.uk/product-category/bathroom/body-wash/body-wash-liquids/
https://www.peacewiththewild.co.uk/p

## Get Product links from each Category

In [None]:
# create a set to store unique product links
product_links = set()

# Loop through each category link
for category_link in categories:
    try:
        # Fetch the content of the category page
        response = requests.get(category_link)
        response.raise_for_status() # Raise an HTTPError for bad responses (4xx or 5xx)

        # Parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')

        # --- Identify the container for the product list ---
        product_list_container = soup.find('ul', class_='products')

        if product_list_container:
            # --- Find all child elements of the product list container ---
            # Using recursive=False to get only direct children
            child_elements = product_list_container.find_all('li', recursive=False) # recursive=False to get only direct children

            # --- Extract and add the product links to the set ---
            for child in child_elements:
                product_link = child.find('a')
                if product_link:
                    # Print the text of the link
                    href = product_link.get('href')
                    product_links.add(href)

        else:
            print("Could not find the product list container.")

    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
    except Exception as e:
        print(f"An error occurred: {e}")

print(len(product_links))


2602


In [13]:
# print 10 product links
for i, link in enumerate(product_links):
    if i < 10:
        print(link)
    else:
        break

https://www.peacewiththewild.co.uk/product/wooden-styling-comb/
https://www.peacewiththewild.co.uk/product/organic-linen-reusable-coffee-filters-no-4-2-pack/
https://www.peacewiththewild.co.uk/product/bamboo-toothbrush-soft-bristles-4-pack/
https://www.peacewiththewild.co.uk/product/wide-neck-baby-glass-bottle-with-sleeve-seafoam-blue-250ml/
https://www.peacewiththewild.co.uk/product/eye-time-caffeinated-probiotic-eye-cream/
https://www.peacewiththewild.co.uk/product/rose-hip-coconut-facial-soap-bar-bain-savon/
https://www.peacewiththewild.co.uk/product/calm-relaxing-plastic-free-tea-bags/
https://www.peacewiththewild.co.uk/product/matcha-spoon-2-5ml/
https://www.peacewiththewild.co.uk/product/shampoo-bar-rosemary-dark-hair-bain-savon/
https://www.peacewiththewild.co.uk/product/olive-oil-soap-strawberry-and-mint-125g/


## Get Product information from the extracted links

In [None]:
import pandas as pd

product_links = list(product_links) # convert set to list
product_list = [] # list to store product detail dictionaries

In [None]:
# loop through the product links and fetch details
for i in range(2595, len(product_links)):
    try:
        # Fetch the content of the product page
        response = requests.get(product_links[i])
        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)

    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        continue
    soup = BeautifulSoup(response.text, 'html.parser')

    # --- Extract product information ---
    title = soup.find('h1', class_='product_title').text

    # Check if the brand is available
    if soup.find('span', itemprop='brand').find('a'):
        brand = soup.find('span', itemprop='brand').find('a').text
    else:
        # If no brand is found, set it to None or an empty string
        brand = None

    categories = [category.text for category in soup.find('nav', class_='woocommerce-breadcrumb').find_all('a')]
    # skip if product is from the category 'Gift Sets' or 'Eco-Friendly Kits'
    if 'Gift Sets' in categories or 'Eco-Friendly Kits' in categories:
        continue

    description = ' '.join([desc.text for desc in soup.find('div', class_='woocommerce-product-details__short-description').find_all('p')])
    
    # Check if the product has tags
    if soup.find('div', class_='product_icons_container'):
        tags = [tag.text for tag in soup.find('div', class_='product_icons_container').find_all('div', class_='product_icon_title')]
    else:
        # If no tags are found, set tags to an empty list
        tags = []

    image_link = soup.find('img', class_='wp-post-image').get('src')

    site_link = product_links[i]

    # Append the product info to the list
    product_list.append({
        'title': title,
        'brand': brand,
        'categories': categories,
        'description': description,
        'tags': tags,
        'image-link': image_link,
        'site-link': site_link
    })
    print(f"Fetched product {i + 1}/{len(product_links)}: {title}")
product_list[0]


Fetched product 2597/2602: Pure Radiance Hair Milk – 250ml
Fetched product 2598/2602: Beeswax Wraps – Sandwich & Big Bowl 2 Pack – Italian Kitchen
Fetched product 2599/2602: Pure Linen Peg Bag – Stripes
Fetched product 2600/2602: Lemongrass & Tea Tree Soap Bar – 100g
Fetched product 2601/2602: Peppa Pig Wash Bar with Rose Water & Aloe Vera – 80g
Fetched product 2602/2602: Lemongrass Shaving Soap – 100g


{'title': 'Wooden Styling Comb',
 'brand': 'Eco Living',
 'categories': ['Haircare', 'Brushes & Combs'],
 'description': 'A beautiful beech wood comb with rounded teeth. Natural or wooden bristles are gentle to the hair structure and avoid damage, would suit thick or curly hair.',
 'tags': ['Natural', 'Plastic Free', 'Biodegradable', 'Sustainable'],
 'image-link': 'https://www.peacewiththewild.co.uk/wp-content/uploads/2020/08/bamboo-comb-large-teeth-600x600.jpg',
 'site-link': 'https://www.peacewiththewild.co.uk/product/wooden-styling-comb/'}

In [98]:
print(len(product_list))

2596


In [103]:
# Code block to debug errors that occur while getting product info and to check the product links for issues
link = "https://www.peacewiththewild.co.uk/product/organic-plain-flour/"

response = requests.get(link)
soup = BeautifulSoup(response.text, 'html.parser')
description = ' '.join([desc.text for desc in soup.find('div', class_='woocommerce-product-details__short-description').find_all('p')])
#categories = [category.text for category in soup.find('nav', class_='woocommerce-breadcrumb').find_all('a')]
print(description)
print(soup.find('div', class_='woocommerce-product-details__short-description'))

Doves Farm Organic plain flour is perfect if you love the finer things in life! Cook with plain flour to make vegan cakes, cookies and sauces guilt free. This organic plain flour is ethically and sustainably grown and sourced in the UK! It is also suitable for vegans & vegetarians. Organic plain flour comes in varying sizes and is available in 500g or 1kg options. All our refill food is packaged in recyclable paper bags for sustainable living! Organic
Vegan
Palm Oil Free
Natural
Plastic Free
View all benefits 

Recyclable
View less benefits 


<div class="woocommerce-product-details__short-description">
<p><span style="font-weight: 400;">Doves Farm </span><span style="font-weight: 400;">Organic plain flour</span><span style="font-weight: 400;"> is perfect if you love the finer things in life! Cook with plain flour to make vegan cakes, cookies and sauces guilt free. </span><span style="font-weight: 400;">This organic plain flour is ethically and sustainably grown and sourced in the UK! 

## Convert to Dataframe and write to CSV

In [100]:
# create a dataframe to store product info
df = pd.DataFrame(data=product_list, columns=['title', 'brand', 'categories', 'description', 'tags', 'image-link', 'site-link'])
df

Unnamed: 0,title,brand,categories,description,tags,image-link,site-link
0,Wooden Styling Comb,Eco Living,"[Haircare, Brushes & Combs]",A beautiful beech wood comb with rounded teeth...,"[Natural, Plastic Free, Biodegradable, Sustain...",https://www.peacewiththewild.co.uk/wp-content/...,https://www.peacewiththewild.co.uk/product/woo...
1,Organic Linen Reusable Coffee Filters No 4 – 2...,Marley's Monsters,"[Kitchen, Kitchen Essentials]",Reusable coffee filters in the cone style by M...,"[Vegan, Natural, Plastic Free, Biodegradable, ...",https://www.peacewiththewild.co.uk/wp-content/...,https://www.peacewiththewild.co.uk/product/org...
2,Bamboo Toothbrush Soft Bristles – 4 Pack,Bambaw,"[Bathroom, Toothbrushes]",A family pack of 4 sustainably sourced bamboo ...,"[Vegan, Sustainable]",https://www.peacewiththewild.co.uk/wp-content/...,https://www.peacewiththewild.co.uk/product/bam...
3,Wide Neck Baby Glass Bottle With Sleeve – Seaf...,Hevea,"[Mama & Baby, Baby Bottles]","HEVEA Wide Neck Baby Glass Bottle, the first D...","[Vegan, Palm Oil Free, Plastic Free, Recyclabl...",https://www.peacewiththewild.co.uk/wp-content/...,https://www.peacewiththewild.co.uk/product/wid...
4,Eye Time – Caffeinated Probiotic Eye Cream,Awake Organics,"[Skincare, Eye Creams]",Eye Time is a lightweight eye cream that glide...,"[Vegan, Natural, Recyclable, Handmade, Made In...",https://www.peacewiththewild.co.uk/wp-content/...,https://www.peacewiththewild.co.uk/product/eye...
...,...,...,...,...,...,...,...
2591,Beeswax Wraps – Sandwich & Big Bowl 2 Pack – I...,Honey Bee Good,[Kitchen],A pack of 2 beeswax wraps made with 100% certi...,"[Natural, Plastic Free, Biodegradable, Handmad...",https://www.peacewiththewild.co.uk/wp-content/...,https://www.peacewiththewild.co.uk/product/bee...
2592,Pure Linen Peg Bag – Stripes,Helen Round,"[For The Home, All Laundry, Laundry Accessories]","This practical peg bag, made from beautifully ...","[Plastic Free, Handmade, Sustainable, Made In UK]",https://www.peacewiththewild.co.uk/wp-content/...,https://www.peacewiththewild.co.uk/product/pur...
2593,Lemongrass & Tea Tree Soap Bar – 100g,Wild Sage & Co,"[Bathroom, Soap Bars, Hand & Body Soap Bars]","Wild Sage & Co lemongrass and tea tree, a fres...","[Vegan, Natural, Plastic Free, Biodegradable, ...",https://www.peacewiththewild.co.uk/wp-content/...,https://www.peacewiththewild.co.uk/product/lem...
2594,Peppa Pig Wash Bar with Rose Water & Aloe Vera...,Good Bubble,"[Mama & Baby, Baby Skincare]","A super gently baby hair and body wash bar, su...","[Vegan, Natural, Plastic Free, Handmade, Made ...",https://www.peacewiththewild.co.uk/wp-content/...,https://www.peacewiththewild.co.uk/product/bab...


### Fixing Descriptions
Some descriptions had tags copied into them from display:none columns made for mobile view. Erroneous description example:

"Nutritious and delicious, flax seeds are a food pantry essential. Great for a vegan diet, flax seeds are a perfect source of minerals, fibre and protein. Chia seeds are a versatile ingredient that can be used in puddings, breakfasts, salads and hot dishes. All our refill dried food is packaged in recyclable paper bags for sustainable living! Vegan

Palm Oil Free

Natural

Plastic Free

Recyclable



"

Goal is to remove the extra tags at the end

In [112]:
# Function to clean the description text
def clean_description(description):
    """
    Removes text after the last end punctuation mark in a string.
    Keeps the original string if no punctuation is found or if input is not a string.
    """
    # handle non-string input
    if not isinstance(description, str):
        return description
    # handle empty string
    if not description.strip():
        return description
    
    # Define the punctuation marks to look for
    punctuation_marks = ".!?"

    # Find the last occurrence of any punctuation mark
    last_punctuation_index = max(description.rfind(mark) for mark in punctuation_marks)
    # If no punctuation mark is found, return the original string
    if last_punctuation_index == -1:
        return description
    # Otherwise, return the substring up to the last punctuation mark
    description = description[:last_punctuation_index + 1]
    # Remove any trailing whitespace
    description = description.rstrip()

    return description

df['description'] = df['description'].apply(clean_description)


In [113]:
df.to_csv('products.csv', index=False)