# ABCH.world Scraping

In [1]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
from random import randint
from tqdm import tqdm
import time, urllib, json, unicodedata

# Headers to imitate browser request
headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.146 Mobile Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'en-US,en;q=0.5',
    'DNT': '1',
    'Referer': 'https://www.google.co.uk',
    'Upgrade-Insecure-Requests': '1'
}



In [2]:
main_URL = 'https://abch.world'
URL_extension_for_products = '/products.json?limit=250&page='
current_page_number = 1

all_URLs_and_JSON = {}
current_product_list = None

while current_product_list != []:
    # make the full json url
    url = main_URL + URL_extension_for_products + str(current_page_number)
    print(url)
    
    # Get the page
    req = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(req) as request:
        s = request.read()
        response = json.loads(s)
        current_product_list = response['products']
    
    time.sleep(randint(4,6))
    
    # construct the URL
    for product in current_product_list:
        product_url = main_URL + '/products/' + product['handle']
        all_URLs_and_JSON[product_url] = product
        
    current_page_number += 1

https://abch.world/products.json?limit=250&page=1
https://abch.world/products.json?limit=250&page=2


In [3]:
print(len(all_URLs_and_JSON))
# all_URLs_and_JSON

162


In [4]:
# Find Each Element

def find_product_name(soup, product_JSON):
    product_title_container = soup.find('div', class_ = 'product__content__intro').find('div')
    product_title = product_title_container.find('p')
    spans = product_title.find_all('span')
    for span in spans:
        span.extract()
    product_title = product_title.text.replace('\n', ' ')
    product_title_pieces = product_title.split(' ')
    product_title_pieces = [item.strip() for item in product_title_pieces if item != '']     
    product_title_text = ' '.join(product_title_pieces)
    return product_title_text.strip()

def find_material(soup, product_JSON):
    materials = {}
    fibre = product_JSON['options'][2]
    if fibre['name'] == 'Fibre' or fibre['name'] == 'Material':
        materials[fibre['values'][0]] = '100%'
    return materials

def find_color(soup, product_JSON):
    return [i.text.strip() for i in soup.find_all('label', class_ = 'radio Color')]

def find_price(soup, product_JSON):
    price_container = soup.find('div', class_ = 'product__content__intro').find('div')  
    price_text = price_container.find('span', class_ = 'product-price__price').text.strip()
    if price_text[0] == '$':
        price_text = price_text[1:] 
    return ('AUD', price_text)

def find_size(soup, product_JSON):
    return [i.text.strip() for i in soup.find_all('label', {'class':'radio Size'})]

def find_image(soup, product_JSON):
    return product_JSON['images'][0]['src']

def find_brand(soup, product_JSON):
    return product_JSON['vendor']

def find_description(soup, product_JSON):
    description_container = soup.find('span', string = 'Description').parent.next_sibling
    description = description_container.next_sibling.text
    description = [unicodedata.normalize("NFKD", description)]
    if description == ['']:
        description = description_container.next_sibling.next_sibling.next_sibling.text
        description = [unicodedata.normalize("NFKD", description)]
    return description

In [5]:
def scrape_single_product(URL, html_soup, product_JSON):
    product = {}
    
    # The price needs to be extracted first before the name. 
    # The name methood extracts (removes) the price from the HTMl so it cannot be retrieved again.
    price = find_price(html_soup, product_JSON)
    product['display_name'] = find_product_name(html_soup, product_JSON)
    product['materials'] = find_material(html_soup, product_JSON)
    product['color'] = find_color(html_soup, product_JSON)
    product['price'] = price
    product['size'] = find_size(html_soup, product_JSON)
    product['url'] = URL
    product['image'] = find_image(html_soup, product_JSON)
    product['brand_name'] = find_brand(html_soup, product_JSON)
    product['description'] = find_description(html_soup, product_JSON)
    
    return product

In [6]:
def is_available(product_JSON):
    variants = product_JSON['variants']
    
    is_available = False
    
    for variant in variants:
        availability = variant['available']
        if availability == True:
            is_available = True
    
    return is_available

In [7]:
# These 3 cells scrape a single product
# URL_to_scrape = 'https://abch.world/products/a-32'
# page = requests.get(URL_to_scrape, headers = headers)
# single_soup = bs(page.content, 'html.parser')

In [8]:
# product_as_json = None
# URL_to_scrape_JSON = URL_to_scrape + ".json"

# single_req = urllib.request.Request(URL_to_scrape_JSON, headers=headers)
# with urllib.request.urlopen(single_req) as request:
#     s = request.read()
#     response = json.loads(s)
#     product_as_json = response['product']

In [9]:
# singleProductResult  = [scrape_single_product(URL_to_scrape, single_soup, product_as_json)]

# print(singleProductResult)

# product_data_frame = pd.DataFrame(data = singleProductResult,)

# product_data_frame.to_csv('single_item.csv', encoding='utf-8-sig')

# pd.DataFrame(data = singleProductResult)

In [10]:
def f(url, the_JSON):
    try:
        if is_available(the_JSON):
            page = requests.get(url, headers = headers)
            soup = bs(page.content, 'html.parser')

            product = scrape_single_product(url, soup, the_JSON)
            all_products.append(product)
            
        else:
            unavailable_products.append(url)

        # time.sleep(randint(5,15))
  
    except:
        failed_urls.append(url)

In [11]:
import dask

In [12]:
%%time
all_products = []
failed_urls = []
unavailable_products = []

# counter = 0
jobs = []
for url, the_JSON in tqdm(all_URLs_and_JSON.items()):
    jobs.append(dask.delayed(f)(url, the_JSON))
    
    
dask.compute(*jobs)

    
#     if counter == 2:
#         break
#     counter += 1

print('\nSummary\n')
print(len(failed_urls), 'Failed URLs: ', failed_urls)
print(len(unavailable_products), 'Unavailable Products: ', unavailable_products)
print('Total products collected: ', len(all_products))
print('Done!')

100%|██████████| 162/162 [00:00<00:00, 347.72it/s]



Summary

8 Failed URLs:  ['https://abch.world/products/fan-crop-t-shirt-white-and-grey-marle', 'https://abch.world/products/redline-cutpiece-denim-jacket-organic-cotton-denim', 'https://abch.world/products/unisex-black-bolt-plaid-apron-skirt', 'https://abch.world/products/organic-cotton-string-tote-black', 'https://abch.world/products/organic-cotton-classic-fan-tshirt', 'https://abch.world/products/delicates-bag', 'https://abch.world/products/laundry-bar', 'https://abch.world/products/abch-virtual-gift-card']
12 Unavailable Products:  ['https://abch.world/products/grey-sweater-tee-zero-waste', 'https://abch.world/products/rainbow-gift-card', 'https://abch.world/products/organic-cotton-sleeveless-skivvy-dress-sage', 'https://abch.world/products/signature-shorts-in-raw-denim', 'https://abch.world/products/classic-cotton-fleece-sweater-sage', 'https://abch.world/products/unisex-raw-denim-signature-shorts', 'https://abch.world/products/organic-raw-denim-tote-edition-bag', 'https://abch.wo

In [13]:
fail_again_URLs = []
for url in failed_urls:
    try:
        req = urllib.request.Request(url, headers=headers)
        
        with urllib.request.urlopen(req) as request:
            page = request.read()
            soup = bs(page, 'html.parser')
            all_products.append(scrape_single_product(url, soup))
            
            time.sleep(randint(5,15)) 
    except:
        fail_again_URLs.append(url)

In [14]:
df = pd.DataFrame(data = all_products)
df.to_csv('abchworld_table.csv', encoding='utf-8-sig')
df

Unnamed: 0,display_name,materials,color,price,size,url,image,brand_name,description
0,Short Sleeve Skivvy,{'Organic Cotton': '100%'},[Navy],"(AUD, 85)","[XS, S, M, L, XL, XXL, C]",https://abch.world/products/organic-cotton-sho...,https://cdn.shopify.com/s/files/1/1637/8509/pr...,A.BCH,[A fan favourite for its timeless appeal and f...
1,Short Sleeve Skivvy,{'Organic Cotton': '100%'},[Cherry],"(AUD, 85)","[XS, S, M, L, XL, XXL, C]",https://abch.world/products/organic-cotton-sho...,https://cdn.shopify.com/s/files/1/1637/8509/pr...,A.BCH,[The A.14 is a fan favourite for its timeless ...
2,Tailored T-shirt,{'Hemp / Organic Cotton': '100%'},[Ivory],"(AUD, 85)","[M, XL]",https://abch.world/products/undyed-tailored-t-...,https://cdn.shopify.com/s/files/1/1637/8509/pr...,A.BCH,[Limited edition hemp + organic cotton blended...
3,Fleecy Trackpants,{'Organic Cotton': '100%'},[Undyed],"(AUD, 185)","[XS, S, M, L, XL, XXL]",https://abch.world/products/organic-cotton-fle...,https://cdn.shopify.com/s/files/1/1637/8509/pr...,A.BCH,"[Extra cosy, made from a brushed fleecy organi..."
4,Mock Neck Skivvy,{'Organic Cotton': '100%'},"[Cherry, Undyed, Grey Marle, Black, Navy]","(AUD, 95)","[XS, S, M, L, XL, XXL, C]",https://abch.world/products/organic-cotton-moc...,https://cdn.shopify.com/s/files/1/1637/8509/pr...,A.BCH,[Our all-time favourite skivvy's older sibling...
...,...,...,...,...,...,...,...,...,...
137,Offcut T-shirt,{'Organic Cotton': '100%'},[Grey Marle],"(AUD, 55)","[XS, S, M, L, XL, C]",https://abch.world/products/organic-cotton-zer...,https://cdn.shopify.com/s/files/1/1637/8509/pr...,A.BCH,"[Off-cuts that are on point, this tee is made ..."
138,Fitted T-shirt,{'Organic Cotton': '100%'},[White],"(AUD, 65)","[XS, M, L, XL]",https://abch.world/products/organic-cotton-fit...,https://cdn.shopify.com/s/files/1/1637/8509/pr...,A.BCH,[Luxuriously soft white t-shirt that's cut for...
139,Long Line T-Shirt Set,{'Organic Cotton': '100%'},[Black + White],"(AUD, 145)","[XS, S, XXL]",https://abch.world/products/organic-cotton-lon...,https://cdn.shopify.com/s/files/1/1637/8509/pr...,A.BCH,[Luxurious crisp white and black t-shirt set (...
140,Skivvy Set,{'Organic Cotton': '100%'},[Navy],"(AUD, 240)","[XS, XL]",https://abch.world/products/organic-cotton-3-p...,https://cdn.shopify.com/s/files/1/1637/8509/pr...,A.BCH,"[3-Piece Set. One long sleeve, short sleeve an..."


In [15]:
df['brand_name'] = 'A.BCH'