# AllSisters Scraping

In [1]:
import json
import time
import unicodedata
import urllib
import urllib.request
from random import randint

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from tqdm import tqdm

# Request headers sent with every HTTP call so the scraper looks like a
# mobile Chrome browser arriving from a Google search.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/88.0.4324.146 Mobile Safari/537.36'
    ),
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/avif,image/webp,image/apng,*/*;q=0.8,'
        'application/signed-exchange;v=b3;q=0.9'
    ),
    'Accept-Language': 'en-US,en;q=0.5',
    'DNT': '1',
    'Referer': 'https://www.google.co.uk',
    'Upgrade-Insecure-Requests': '1',
}



In [2]:
# Walk the site's paginated products.json listing and index every product's
# JSON record by the URL of its product page.
main_URL = 'https://allsisters.com'
URL_extension_for_products = '/products.json?limit=250&page='
current_page_number = 1

all_URLs_and_JSON = {}
current_product_list = None

while True:
    # Build the listing URL for the current page.
    url = main_URL + URL_extension_for_products + str(current_page_number)

    # Fetch and decode the page. The timeout keeps a stalled connection from
    # hanging the notebook indefinitely.
    req = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(req, timeout=30) as request:
        response = json.loads(request.read())
    current_product_list = response['products']

    # An empty product list marks the end of the catalogue. Stop here, before
    # sleeping, so the last request is not followed by a pointless delay.
    if not current_product_list:
        break

    # Map each product's page URL to its JSON record.
    for product in current_product_list:
        product_url = main_URL + '/products/' + product['handle']
        all_URLs_and_JSON[product_url] = product

    current_page_number += 1

    # Pause a few seconds between listing requests.
    time.sleep(randint(4, 6))

In [3]:
# Report how many distinct product URLs were collected from the listing pages.
print(len(all_URLs_and_JSON))
#all_URLs_and_JSON

130


In [4]:
def find_product_name(soup, product_JSON):
    """Return the product title from the JSON record, whitespace-trimmed.

    ``soup`` is not used; it is accepted so all ``find_*`` helpers share the
    same leading parameters.
    """
    title = product_JSON['title']
    return title.strip()

def find_material(soup, product_JSON):
    """Extract the fabric composition from the product page.

    Scans the ``<p>`` elements inside ``<div id="desc">`` for text that starts
    with a digit (for example ``"80% Nylon - 20% Elastane"``). If several
    paragraphs qualify, the last one is used. That text is split on ``-`` and
    each segment is split at its first ``%``.

    Args:
        soup: Parsed product page; must offer BeautifulSoup-style
            ``find`` / ``find_all``.
        product_JSON: Not used; accepted so all ``find_*`` helpers share the
            same leading parameters.

    Returns:
        A dict mapping the lower-cased material name to its percentage
        string, e.g. ``{'nylon': '80%', 'elastane': '20%'}``. Empty when the
        description container or a composition paragraph is missing.
    """
    desc_container = soup.find('div', id='desc')
    if desc_container is None:
        return {}

    composition = ''
    for paragraph in desc_container.find_all('p'):
        text = paragraph.text.strip()
        if text and text[0].isdigit():
            composition = text

    materials = {}
    for segment in composition.split('-'):
        percentage, percent_sign, name = segment.partition('%')
        name = name.strip().lower()
        # Fragments without a '%' or without a material name (e.g. trailing
        # remarks after a dash) cannot form an entry; skip them instead of
        # failing the whole product.
        if not percent_sign or not name:
            continue
        materials[name] = (percentage + percent_sign).strip()

    return materials

def find_color(soup, product_JSON, the_description):
    """Work out the product's colour(s) from the best available source.

    Sources are tried in order; the first that yields a result wins:

    1. Keywords in the product title (``black``, ``white``, ``colors``).
    2. The values of the second product option, when the product has exactly
       two options and the second is named ``Color``.
    3. Keywords in the first description paragraph.

    Args:
        soup: Not used; accepted so all ``find_*`` helpers share the same
            leading parameters.
        product_JSON: Product record with ``title`` and ``options`` keys.
        the_description: List of description paragraphs (may be empty).

    Returns:
        A list of lower-case colour names, or ``None`` when no colour could
        be determined.
    """
    product_title = product_JSON['title'].strip().lower()

    # 1. Title keywords.
    if 'black' in product_title:
        return ['black']
    if 'white' in product_title:
        return ['white']
    if 'colors' in product_title:
        return ['multi-colored']

    # 2. Explicit colour option. When the second option is something else we
    #    fall through to the description instead of leaving the result unset.
    options = product_JSON['options']
    if len(options) == 2 and options[1]['name'] == 'Color':
        return [value.lower() for value in options[1]['values']]

    # 3. Description keywords. Each word is tested on its own: the expression
    #    `'black' and 'white' in text` only ever tests for 'white'.
    first_paragraph = the_description[0].lower() if the_description else ''
    has_black = 'black' in first_paragraph
    has_white = 'white' in first_paragraph
    if has_black and has_white:
        return ['black and white']
    if has_black:
        return ['black']
    if has_white:
        return ['white']

    return None

def find_price(soup, product_JSON):
    """Read the price from the ``<h2 id="price-preview">`` element.

    The element's trimmed text is split on single spaces and the first two
    pieces are returned as a tuple (presumably amount and currency; confirm
    against the live page). ``product_JSON`` is not used.
    """
    price_heading = soup.find('h2', id='price-preview')
    tokens = price_heading.text.strip().split(' ')
    first_token, second_token = tokens[0], tokens[1]
    return (first_token, second_token)

def find_size(soup, product_JSON):
    """Return the size labels shown in the product's variant swatch.

    Collects the trimmed text of every ``<div>`` nested in the swatch
    container except the first one (presumably a label rather than a size;
    confirm against the page markup). ``product_JSON`` is not used.
    """
    swatch = soup.find(
        'div',
        {'class': 'swatch-variant product-type-items swatch clearfix'},
    )
    swatch_divs = swatch.find_all('div')
    return [div.text.strip() for div in swatch_divs[1:]]

def find_image(soup, product_JSON):
    """Return the ``src`` URL of every image in the product's JSON record.

    ``soup`` is not used; the image list comes from ``product_JSON['images']``.
    """
    sources = []
    for image in product_JSON['images']:
        sources.append(image['src'])
    return sources

def find_brand(soup, product_JSON):
    """Return the ``vendor`` field of the product's JSON record.

    ``soup`` is not used.
    """
    vendor = product_JSON['vendor']
    return vendor

def find_description(soup, product_JSON):
    """Collect the descriptive paragraphs from ``<div id="desc">``.

    Paragraphs are read in document order until the first one whose trimmed
    text starts with a digit (the same test ``find_material`` uses to spot
    the composition line). Empty or whitespace-only paragraphs are skipped.

    Args:
        soup: Parsed product page; must offer BeautifulSoup-style
            ``find`` / ``find_all``.
        product_JSON: Not used; accepted so all ``find_*`` helpers share the
            same leading parameters.

    Returns:
        A list of NFKD-normalised paragraph strings; empty when the
        description container is missing or holds no text.
    """
    desc_container = soup.find('div', id='desc')
    if desc_container is None:
        return []

    description = []
    for paragraph in desc_container.find_all('p'):
        # Strip before inspecting: indexing the raw text would fail on an
        # empty <p> and would miss a digit hidden behind leading whitespace.
        text = paragraph.text.strip()
        if not text:
            continue
        if text[0].isdigit():
            break
        description.append(unicodedata.normalize("NFKD", text))

    return description

In [5]:
def scrape_single_product(URL, html_soup, product_JSON):
    """Assemble one product record from its page and its JSON entry.

    Args:
        URL: Address of the product page; stored unchanged in the record.
        html_soup: Parsed HTML of the product page.
        product_JSON: The product's JSON record.

    Returns:
        A dict with the keys ``Name``, ``Material``, ``Color``, ``Price``,
        ``Size``, ``URL``, ``Image``, ``Brand_name`` and ``Description``.
    """
    # The description is extracted first because colour detection reads it.
    the_description = find_description(html_soup, product_JSON)

    return {
        'Name': find_product_name(html_soup, product_JSON),
        'Material': find_material(html_soup, product_JSON),
        'Color': find_color(html_soup, product_JSON, the_description),
        'Price': find_price(html_soup, product_JSON),
        'Size': find_size(html_soup, product_JSON),
        'URL': URL,
        'Image': find_image(html_soup, product_JSON),
        'Brand_name': find_brand(html_soup, product_JSON),
        'Description': the_description,
    }

In [6]:
def is_available(product_JSON):
    """Tell whether at least one variant of the product is available.

    Reads the ``available`` field of every entry in
    ``product_JSON['variants']`` and returns ``True`` if any of them equals
    ``True``; returns ``False`` otherwise, including when there are no
    variants.
    """
    # Collect every flag first so each variant is inspected, then rely on
    # `in`, which matches list members by equality with True.
    availability_flags = [variant['available'] for variant in product_JSON['variants']]
    return True in availability_flags

In [7]:
# These 3 cells scrape a single product
# URL_to_scrape = 'https://allsisters.com/products/serena-leggins'
# page = requests.get(URL_to_scrape, headers = headers)
# single_soup = bs(page.content, 'html.parser')

# product_as_json = None
# URL_to_scrape_JSON = URL_to_scrape + ".json"

# single_req = urllib.request.Request(URL_to_scrape_JSON, headers=headers)
# with urllib.request.urlopen(single_req) as request:
#     s = request.read()
#     response = json.loads(s)
#     product_as_json = response['product']
#     #print(product_as_json)

In [8]:
# singleProductResult  = [scrape_single_product(URL_to_scrape, single_soup, product_as_json)]

# #print(singleProductResult)

# product_data_frame = pd.DataFrame(data = singleProductResult,)

# product_data_frame.to_csv('single_item.csv', encoding='utf-8-sig')

# pd.DataFrame(data = singleProductResult)

In [9]:
# Scrape every available product collected from the listing pages.
all_products = []          # records that were scraped and have materials
failed_urls = []           # URLs whose scrape raised an error (retried later)
unavailable_products = []  # URLs skipped because no variant is available
failure_reasons = {}       # url -> repr() of the error, for diagnosis

# counter = 0

for url, the_JSON in tqdm(all_URLs_and_JSON.items()):
    request_sent = False
    try:
        if not is_available(the_JSON):
            unavailable_products.append(url)
            continue

        request_sent = True
        # The timeout stops a stalled connection from hanging the run, and
        # raise_for_status turns HTTP error pages into explicit failures.
        page = requests.get(url, headers=headers, timeout=30)
        page.raise_for_status()
        soup = bs(page.content, 'html.parser')

        product = scrape_single_product(url, soup, the_JSON)
        # Products without any detected material are dropped.
        if product['Material']:
            all_products.append(product)
    except Exception as error:
        # `Exception` (not a bare except) so Ctrl-C can still stop the run.
        failed_urls.append(url)
        failure_reasons[url] = repr(error)
    finally:
        # Pause after every request, including failed ones, but not for
        # products that were skipped without contacting the server.
        if request_sent:
            time.sleep(randint(2, 5))
    
#     if counter == 20:
#         break
#     counter += 1

# print('\nSummary\n')
# print(len(failed_urls), 'Failed URLs: ', failed_urls)
# print(len(unavailable_products), 'Unavailable Products: ', unavailable_products)
# print('Total products collected: ', len(all_products))
# print('Done!')

100%|██████████| 130/130 [01:30<00:00,  1.44it/s]


In [10]:
# Second attempt for every URL that failed in the main scraping loop.
fail_again_URLs = []
for url in failed_urls:
    try:
        req = urllib.request.Request(url, headers=headers)
        # Read the body and release the connection before parsing or sleeping.
        with urllib.request.urlopen(req, timeout=30) as request:
            page = request.read()

        soup = bs(page, 'html.parser')
        # scrape_single_product requires the product's JSON record as its
        # third argument; without it every retry raised TypeError.
        product = scrape_single_product(url, soup, all_URLs_and_JSON[url])
        # Same rule as the main loop: keep only products with materials.
        if product['Material']:
            all_products.append(product)
    except Exception:
        # `Exception` (not a bare except) so Ctrl-C can still stop the run.
        fail_again_URLs.append(url)
    finally:
        # Pause after every attempt, whether it succeeded or not.
        time.sleep(randint(5, 15))

In [11]:
# Tabulate the scraped products, write them to CSV, and leave the table as
# the cell's final expression so the notebook displays it.
product_data_frame = pd.DataFrame(all_products)
product_data_frame.to_csv('allsisters_table.csv', encoding='utf-8-sig')
product_data_frame