# ABCH.world Scraping

In [1]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
from random import randint
import time, urllib, json, unicodedata

In [2]:
# Headers to imitate browser request
headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.146 Mobile Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'en-US,en;q=0.5',
    'DNT': '1',
    'Referer': 'https://www.google.co.uk',
    'Upgrade-Insecure-Requests': '1'
}

In [3]:
main_URL = 'https://abch.world'
URL_extension_for_products = '/products.json?limit=250&page='
current_page_number = 1


all_URLs_and_JSON = {}
current_product_list = None


while current_product_list != []:
    # make the full json url
    url = main_URL + URL_extension_for_products + str(current_page_number)
    print(url)
    
    # Get the page
    req = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(req) as request:
        s = request.read()
        response = json.loads(s)
        current_product_list = response['products']
    
    time.sleep(randint(4,6))
    
    # construct the URL
    for product in current_product_list:
        product_url = main_URL + '/products/' + product['handle']
        all_URLs_and_JSON[product_url] = product
        
    current_page_number += 1

https://abch.world/products.json?limit=250&page=1
https://abch.world/products.json?limit=250&page=2


In [4]:
print(len(all_URLs_and_JSON))
print(all_URLs_and_JSON)

128
{'https://abch.world/products/un-dyed-flannel-split-shirt': {'id': 3592975155305, 'title': 'A.24', 'handle': 'un-dyed-flannel-split-shirt', 'body_html': '<p>SIZE_TABLE_START</p>\n<p class="p1"><span class="s1">Juliette is 5\'8", wears size XS and usually wears Australian women\'s\xa0size 6.<br> </span></p>\n<meta charset="utf-8">\n<p class="p1">A.24 measurements in cm. Please allow a 5% tolerance.</p>\n<table width="100%">\n<tbody>\n<tr style="height: 36px;">\n<td style="height: 36px;">\n<p>Size</p>\n</td>\n<td style="height: 36px;">\xa0XS</td>\n<td style="height: 36px;">S</td>\n<td style="height: 36px;">M</td>\n<td style="height: 36px;">L</td>\n<td style="height: 36px;">\n<p>XL</p>\n</td>\n<td style="height: 36px;">\n<p>XXL</p>\n</td>\n</tr>\n<tr style="height: 36px;">\n<td style="height: 36px;">\n<p class="p1">Circumference -\xa0Chest</p>\n</td>\n<td style="height: 36px;">96</td>\n<td style="height: 36px;">102</td>\n<td style="height: 36px;">108</td>\n<td style="height: 36px;">11

In [5]:
def find_product_name(soup, product_JSON):
    
    product_title_container = soup.find('div', class_ = 'product__content__intro').find('div')
    
    product_title = product_title_container.find('p')
    
    spans = product_title.find_all('span')
    
    for span in spans:
        span.extract()
    
    product_title = product_title.text.replace('\n', ' ')

    product_title_pieces = product_title.split(' ')

    product_title_pieces = [item.strip() for item in product_title_pieces if item != '']
            
    product_title_text = ' '.join(product_title_pieces)
    
    return product_title_text.strip()

In [6]:
def find_material(soup, product_JSON):
    
    materials = {}
    
    fibre = product_JSON['options'][2]
    
    if fibre['name'] == 'Fibre' or fibre['name'] == 'Material':
        materials[fibre['values'][0]] = '100%'

    return materials

In [7]:
def find_color(soup, product_JSON):
    
    color = soup.find('label', class_ = 'radio Color').text.strip().lower()
    
    return color

In [8]:
def find_price(soup, product_JSON):

    price_container = soup.find('div', class_ = 'product__content__intro').find('div')
        
    price_text = price_container.find('span', class_ = 'product-price__price').text.strip()
    
    if price_text[0] == '$':
        price_text = price_text[1:] 
    
    price = ('AUD', price_text)
    
    return price

In [9]:
def find_image(soup, product_JSON):
    
    image_url = product_JSON['images'][0]['src']

    return image_url

In [10]:
def find_brand(soup, product_JSON):
    
    brand = product_JSON['vendor']
    
    return brand

In [11]:
def find_description(soup, product_JSON):
    
    description_container = soup.find('span', string = 'Description').parent.next_sibling
    
    description = description_container.next_sibling.text
    
    description = [unicodedata.normalize("NFKD", description)]
    
    if description == ['']:
        description = description_container.next_sibling.next_sibling.next_sibling.text
        description = [unicodedata.normalize("NFKD", description)]
    
    return description

In [12]:
def find_category(soup, product_JSON):
    
    category = [product_JSON['product_type'].strip().lower()]
    
    return category

In [13]:
def scrape_single_product(URL, html_soup, product_JSON):
    product = {}
    
    # The price needs to be extracted first before the name. 
    # The name methood extracts (removes) the price from the HTMl so it cannot be retrieved again.
    price = find_price(html_soup, product_JSON)
#     print('got price')
    
#     print('got name')
    product['Name'] = find_product_name(html_soup, product_JSON)
    
#     print('getting material')
    product['Material'] = find_material(html_soup, product_JSON)
    
#     print('getting color')
    product['Color'] = find_color(html_soup, product_JSON)
        
#     print('getting price')
    product['Price'] = price
    
#     print('getting URL')
    product['URL'] = URL

#     print('getting image')
    product['Image'] = find_image(html_soup, product_JSON)
    
#     print('getting brand name')
    product['Brand_name'] = find_brand(html_soup, product_JSON)
    
#     print('getting description')
    product['Description'] = find_description(html_soup, product_JSON)
    
#     print('getting category')
    product['Category'] = find_category(html_soup, product_JSON)
    
    return product

In [14]:
def is_available(product_JSON):
    variants = product_JSON['variants']
    
    is_available = False
    
    for variant in variants:
        availability = variant['available']
        if availability == True:
            is_available = True
    
    return is_available

In [15]:
# These 3 cells scrape a single product
# URL_to_scrape = 'https://abch.world/products/a-32'
# page = requests.get(URL_to_scrape, headers = headers)
# single_soup = bs(page.content, 'html.parser')

In [16]:
# product_as_json = None
# URL_to_scrape_JSON = URL_to_scrape + ".json"

# single_req = urllib.request.Request(URL_to_scrape_JSON, headers=headers)
# with urllib.request.urlopen(single_req) as request:
#     s = request.read()
#     response = json.loads(s)
#     product_as_json = response['product']

In [17]:
# singleProductResult  = [scrape_single_product(URL_to_scrape, single_soup, product_as_json)]

# print(singleProductResult)

# product_data_frame = pd.DataFrame(data = singleProductResult,)

# product_data_frame.to_csv('single_item.csv', encoding='utf-8-sig')

# pd.DataFrame(data = singleProductResult)

In [18]:
all_products = []
failed_urls = []
unavailable_products = []

# counter = 0

for url, the_JSON in all_URLs_and_JSON.items():
    try:

        print(url)
        
        if is_available(the_JSON):
            page = requests.get(url, headers = headers)
            soup = bs(page.content, 'html.parser')

            product = scrape_single_product(url, soup, the_JSON)
            print(product)
            all_products.append(product)
            
        else:
            unavailable_products.append(url)

        time.sleep(randint(5,15))
  
    except:
        failed_urls.append(url)
    
#     if counter == 2:
#         break
#     counter += 1

print('\nSummary\n')
print(len(failed_urls), 'Failed URLs: ', failed_urls)
print(len(unavailable_products), 'Unavailable Products: ', unavailable_products)
print('Total products collected: ', len(all_products))
print('Done!')

https://abch.world/products/un-dyed-flannel-split-shirt
https://abch.world/products/organic-cotton-sleeveless-skivvy-sage
https://abch.world/products/organic-cotton-long-sleeve-skivvy-dress-sage
https://abch.world/products/grey-sweater-tee-zero-waste
https://abch.world/products/classic-cotton-fleece-sweater-sage
https://abch.world/products/a-00
{'Name': 'A.BCH Dust Mask', 'Material': {'Organic Cotton': '100%'}, 'Color': 'black', 'Price': ('AUD', '33'), 'URL': 'https://abch.world/products/a-00', 'Image': 'https://cdn.shopify.com/s/files/1/1637/8509/products/A00-Dust-Mask-Black-2jpgweba_32b78c13-bce1-4282-99b8-3366f1b95789.jpg?v=1609674754', 'Brand_name': 'A.BCH', 'Description': ["Our 2 ply 100% organic cotton mask is crafted from a thick, rib outer and light jersey inner. We feel it's the right balance of coverage and breathability for extended wear for civilian/ community protection. Ties comfortably around the head, taking pressure off the ears and finished with a bendable wire nose."

https://abch.world/products/organic-cotton-mock-neck-skivvy-undyed
{'Name': 'Mock Neck Skivvy', 'Material': {'Organic Cotton': '100%'}, 'Color': 'undyed', 'Price': ('AUD', '95'), 'URL': 'https://abch.world/products/organic-cotton-mock-neck-skivvy-undyed', 'Image': 'https://cdn.shopify.com/s/files/1/1637/8509/products/A36-Mens-Skivvy-Undyed-A32-Linen-Front-2jpgwebbu_c3fa287e-0133-41ec-bfb3-6baeb26b7768.jpg?v=1604273549', 'Brand_name': 'portrait-large', 'Description': ["Our all-time favourite skivvy's older brother - with a mock neck in Undyed. The A.36 features an elongated raw hem and sleeves and is made from our signature dye-free GOTS organic cotton rib. Available in 4 colours, the A.36 is your new go-to shirt."], 'Category': ['skivvy']}
https://abch.world/products/organic-cotton-mock-neck-skivvy-cherry
{'Name': 'Mock Neck Skivvy', 'Material': {'Organic Cotton': '100%'}, 'Color': 'cherry', 'Price': ('AUD', '95'), 'URL': 'https://abch.world/products/organic-cotton-mock-neck-skivvy-che

https://abch.world/products/organic-cotton-long-sleeve-skivvy-dress-grey-marle
{'Name': 'Long Sleeve Skivvy Dress', 'Material': {'Organic Cotton': '100%'}, 'Color': 'grey marle', 'Price': ('AUD', '150'), 'URL': 'https://abch.world/products/organic-cotton-long-sleeve-skivvy-dress-grey-marle', 'Image': 'https://cdn.shopify.com/s/files/1/1637/8509/products/A14-Dress-Grey-LS-FrontSML.jpg?v=1601519656', 'Brand_name': 'A.BCH', 'Description': ['The oh-so-elegant A.14 long sleeve skivvy dress is endlessly comfortable, cosy and sophisticated. This cut is fairly relaxed, so for a more fitted look, order down a size. Made from our signature organic, softly structured rib in dreamy grey marle.'], 'Category': ['dress']}
https://abch.world/products/organic-cotton-long-sleeve-skivvy-dress-black
{'Name': 'Long Sleeve Skivvy Dress', 'Material': {'Organic Cotton': '100%'}, 'Color': 'black', 'Price': ('AUD', '150'), 'URL': 'https://abch.world/products/organic-cotton-long-sleeve-skivvy-dress-black', 'Imag

https://abch.world/products/unisex-everyday-joggers-organic-cotton-black-mens
{'Name': 'Everyday Joggers', 'Material': {'Organic Cotton': '100%'}, 'Color': 'black', 'Price': ('AUD', '240'), 'URL': 'https://abch.world/products/unisex-everyday-joggers-organic-cotton-black-mens', 'Image': 'https://cdn.shopify.com/s/files/1/1637/8509/products/A09-Sweater-Grey-Marle-Front-A32-OC-Jogger-Front-3jpgwebh.jpg?v=1602674079', 'Brand_name': 'A.BCH', 'Description': ['Comfy, cool and oh-so-cosy, the Everyday Joggers are constructed from our signature organic cotton french terry. The unisex cut is slightly dropped through the crotch so they can be worn on the waist or hips. Featuring a biodegradable elasticated waistband, false fly front and double cord drawstring.'], 'Category': ['joggers']}
https://abch.world/products/unisex-every-day-jogger-black-organic-cotton
{'Name': 'Everyday Joggers', 'Material': {'Organic Cotton': '100%'}, 'Color': 'black', 'Price': ('AUD', '240'), 'URL': 'https://abch.world/

https://abch.world/products/super-crop-tshirt-australian-supply-chain-cotton
{'Name': 'Super Crop T-shirt', 'Material': {'Australian Cotton': '100%'}, 'Color': 'black', 'Price': ('AUD', '95'), 'URL': 'https://abch.world/products/super-crop-tshirt-australian-supply-chain-cotton', 'Image': 'https://cdn.shopify.com/s/files/1/1637/8509/products/A10_Olive_A30_Black-Tshirt-FrontSML.jpg?v=1601514359', 'Brand_name': 'A.BCH', 'Description': ['The ASC Super Crop T-Shirt is a lush double jersey with a longer back, extra cropped front and gathered long side ties which can be worn out and down, wrapped around the body or tied loosely in front. The fabric is unique with a fully Australian supply chain from cultivation to construction.'], 'Category': ['t-shirt']}
https://abch.world/products/relaxed-hemp-shorts-cream
{'Name': 'Relaxed Hemp Shorts', 'Material': {'Hemp': '100%'}, 'Color': 'ivory', 'Price': ('AUD', '360'), 'URL': 'https://abch.world/products/relaxed-hemp-shorts-cream', 'Image': 'https://

https://abch.world/products/unisex-bolt-plaid-apron-skirt-black
{'Name': 'Bolt Plaid Skirt', 'Material': {'Organic Cotton': '100%'}, 'Color': 'grey/black plaid', 'Price': ('AUD', '355'), 'URL': 'https://abch.world/products/unisex-bolt-plaid-apron-skirt-black', 'Image': 'https://cdn.shopify.com/s/files/1/1637/8509/products/A25-Bolt-Plaid-Skirt-Mens-LR.jpg?v=1601515762', 'Brand_name': 'portrait-large', 'Description': ['Our limited edition take on a unisex skirt, made to be worn over pants (or not). The design honours Japanese plaid fabrication, featuring deep box pleats at the front, darts in the back, fastened with a fitted double corozo button waistband. The two overlapping side splits make this apron-esque skirt both a statement and a staple.'], 'Category': ['skirt']}
https://abch.world/products/unisex-black-bolt-plaid-apron-skirt
https://abch.world/products/black-tencel-lounge-tie-shorts
{'Name': 'Tencel Lounge Shorts', 'Material': {'Lenzing Tencel': '100%'}, 'Color': 'black', 'Price

{'Name': 'Raw Denim Signature Shorts', 'Material': {'Organic Cotton': '100%'}, 'Color': 'dark indigo', 'Price': ('AUD', '195'), 'URL': 'https://abch.world/products/unisex-raw-denim-signature-shorts', 'Image': 'https://cdn.shopify.com/s/files/1/1637/8509/products/ABCH_A16_Shorts_Denim_Front.jpg?v=1601517159', 'Brand_name': 'A.BCH', 'Description': ['Denim, like cookie dough, is best served raw. Un-washed and untreated to minimise industrial water usage, the signature shorts are a structured cut with a paper bag style waist, mock fly front and real (deep) pockets. Featuring contrast stitching and an A.16 label sitting proudly on the rear left. Wash before wear to avoid dye transfer.'], 'Category': ['shorts']}
https://abch.world/products/signature-shorts-in-raw-denim
{'Name': 'Raw Denim Signature Shorts', 'Material': {'Organic Cotton': '100%'}, 'Color': 'dark indigo', 'Price': ('AUD', '195'), 'URL': 'https://abch.world/products/signature-shorts-in-raw-denim', 'Image': 'https://cdn.shopify.

https://abch.world/products/organic-cotton-long-sleeve-skivvy-grey-marle
{'Name': 'Long Sleeve Skivvy', 'Material': {'Organic Cotton': '100%'}, 'Color': 'grey marle', 'Price': ('AUD', '90'), 'URL': 'https://abch.world/products/organic-cotton-long-sleeve-skivvy-grey-marle', 'Image': 'https://cdn.shopify.com/s/files/1/1637/8509/products/A14_LS_Grey_Rib_Skivvy_side.jpg?v=1601520596', 'Brand_name': 'portrait-large', 'Description': ['Drama club sans the drama. The A.14 is a fan favourite for its timeless appeal and flattering cut. The long sleeve skivvy is cosy with a relaxed fit and raw hems - perfect for layering. Made from our signature organic, softly structured rib in a cushy grey marle.'], 'Category': ['skivvy']}
https://abch.world/products/organic-cotton-short-sleeve-skivvy-grey-marle
{'Name': 'Short Sleeve Skivvy', 'Material': {'Organic Cotton': '100%'}, 'Color': 'grey marle', 'Price': ('AUD', '85'), 'URL': 'https://abch.world/products/organic-cotton-short-sleeve-skivvy-grey-marle',

{'Name': 'Signature Sweater', 'Material': {'Organic Cotton': '100%'}, 'Color': 'grey marle', 'Price': ('AUD', '115'), 'URL': 'https://abch.world/products/unisex-organic-cotton-signature-sweater-grey-marle', 'Image': 'https://cdn.shopify.com/s/files/1/1637/8509/products/A09-Grey-Sweater_A32-Jogger-OC-Terry-Side-CroppedSML.jpg?v=1601550521', 'Brand_name': 'A.BCH', 'Description': ['Our signature pullover sweatshirt constructed from GOTS organic yarn-dyed french terry. The cut is relaxed through the neckline and width, coupled with a raw, wrist-bone length sleeve, perfect for layering.'], 'Category': ['sweater']}
https://abch.world/products/unisex-organic-cotton-signature-sweater-black
{'Name': 'Signature Sweater', 'Material': {'Organic Cotton': '100%'}, 'Color': 'black', 'Price': ('AUD', '115'), 'URL': 'https://abch.world/products/unisex-organic-cotton-signature-sweater-black', 'Image': 'https://cdn.shopify.com/s/files/1/1637/8509/products/ABCH_Shoot2048_c8ab89f6-2aac-432a-b4e8-7280eca5d2

https://abch.world/products/unisex-organic-linen-black-button-up-shirt
{'Name': 'Unisex Button Up Linen Shirt', 'Material': {'Organic Linen': '100%'}, 'Color': 'black', 'Price': ('AUD', '275'), 'URL': 'https://abch.world/products/unisex-organic-linen-black-button-up-shirt', 'Image': 'https://cdn.shopify.com/s/files/1/1637/8509/products/170109_ABCH_06_315.jpg?v=1601551836', 'Brand_name': 'A.BCH', 'Description': ["Elegant in the everyday, this lightweight shirt is made with organic linen from a carbon neutral mill. A relaxed look through the body, longer in the back with added details in the side split and back yoke. Featuring our beautiful tonal corozo buttons under the hidden placket, it's as nice to make as it is to wear. Due to the unisex cut, we recommend men, or those with broad shoulders, to order up one - two sizes."], 'Category': ['shirt']}
https://abch.world/products/organic-raw-denim-tote-edition-bag
https://abch.world/products/womens-organic-linen-white-button-up-shirt
{'Name

In [19]:
# fail_again_URLs = []
# for url in failed_urls:
#     try:

#         req = urllib.request.Request(url, headers=headers)
        
#         with urllib.request.urlopen(req) as request:
#             page = request.read()
#             soup = bs(page, 'html.parser')
#             all_products.append(scrape_single_product(url, soup))
            
#             time.sleep(randint(5,15))
            
#     except:
#         fail_again_URLs.append(url)

In [20]:
product_data_frame = pd.DataFrame(data = all_products)

product_data_frame.to_csv('abchworld_table.csv', encoding='utf-8-sig')

pd.DataFrame(data = all_products)

Unnamed: 0,Name,Material,Color,Price,URL,Image,Brand_name,Description,Category
0,A.BCH Dust Mask,{'Organic Cotton': '100%'},black,"(AUD, 33)",https://abch.world/products/a-00,https://cdn.shopify.com/s/files/1/1637/8509/pr...,A.BCH,[Our 2 ply 100% organic cotton mask is crafted...,[dust mask]
1,Linen Joggers,{'Organic Linen': '100%'},black,"(AUD, 310)",https://abch.world/products/a-32,https://cdn.shopify.com/s/files/1/1637/8509/pr...,A.BCH,"[Constructed from our signature, medium weight...",[joggers]
2,Rib T-shirt,{'Organic Cotton': '100%'},grey marle,"(AUD, 80)",https://abch.world/products/organic-cotton-rib...,https://cdn.shopify.com/s/files/1/1637/8509/pr...,A.BCH,[The perfect GOTS organic rib t-shirt in grey ...,[t-shirt]
3,Rib T-shirt,{'Organic Cotton': '100%'},black,"(AUD, 80)",https://abch.world/products/organic-cotton-rib...,https://cdn.shopify.com/s/files/1/1637/8509/pr...,A.BCH,[The perfect GOTS organic rib t-shirt in black...,[t-shirt]
4,Rib T-shirt,{'Organic Cotton': '100%'},undyed,"(AUD, 80)",https://abch.world/products/organic-cotton-rib...,https://cdn.shopify.com/s/files/1/1637/8509/pr...,A.BCH,[The perfect GOTS organic rib t-shirt in undye...,[t-shirt]
...,...,...,...,...,...,...,...,...,...
105,Unisex Button Up Linen Shirt,{'Organic Linen': '100%'},black,"(AUD, 275)",https://abch.world/products/womens-organic-lin...,https://cdn.shopify.com/s/files/1/1637/8509/pr...,A.BCH,"[Elegant in the everyday, this lightweight shi...",[shirt]
106,Boy Cut T-shirt,{'Organic Cotton': '100%'},black,"(AUD, 75)",https://abch.world/products/organic-cotton-boy...,https://cdn.shopify.com/s/files/1/1637/8509/pr...,A.BCH,[Luxuriously soft black t-shirt with a traditi...,[t-shirt]
107,Boy Cut T-shirt,{'Organic Cotton': '100%'},white,"(AUD, 75)",https://abch.world/products/organic-cotton-boy...,https://cdn.shopify.com/s/files/1/1637/8509/pr...,A.BCH,[Luxurious crisp white t-shirt with a traditio...,[t-shirt]
108,Short Sleeve Denim Shirt,{'Organic Cotton': '100%'},indigo,"(AUD, 220)",https://abch.world/products/mens-organic-raw-d...,https://cdn.shopify.com/s/files/1/1637/8509/pr...,A.BCH,[A classic denim chambray shirt with button do...,[shirt]
