In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException, NoSuchElementException
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from bs4 import BeautifulSoup
import requests
import re
import itertools
import time
from multiprocessing.pool import Pool
from multiprocessing import current_process
from functools import partial
import csv
import concurrent.futures
from tqdm import tqdm



# Main Functions

In [2]:
def brand_sort(all_brand_url):
    # Input: Page with A-Z of brands
    # Output: List of URLs of all brands
    
    options = webdriver.ChromeOptions()
    options.add_argument('--incognito')
    driver = webdriver.Chrome(options=options)
    driver.get(all_brand_url)
        
    elems = driver.find_elements_by_css_selector(".vqk6pTa [href]")
    brands_list = [elem.get_attribute('href') for elem in elems]

    driver.quit()
            
    return brands_list

In [3]:
def brand_itemize(brand_url):
    # Input: URL of each brand
    # Output: List of URLs of all items in each brand
    
    global brand_dict
    HEADERS = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36',
    }
    content = requests.get(brand_url, headers=HEADERS)
    soup = BeautifulSoup(content.text, 'html.parser')
    brand_name = soup.find('div', {"class":"_3FyxBKb"}).h1.get_text()
    

    attempt = 1 
    if brand_name not in brand_dict:
        options = webdriver.ChromeOptions()
        options.add_argument('--incognito')
        driver = webdriver.Chrome(options=options)
        driver.get(brand_url)
        # If more than one page, click "LOAD MORE"
        try:
            WebDriverWait(driver, 5).until(EC.visibility_of_all_elements_located((By.CLASS_NAME, "fWxiz1Y")))
        except:
            pass
        while True:
            try:
                WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//*[@id='plp']/div/div/div[2]/div/a"))).click()
            except TimeoutException:
                break
            except StaleElementReferenceException:
                if attempt == 3:
                    raise
                attempt += 1
        items = driver.find_elements_by_css_selector("._3pQmLlY [href]")
        brand_items = [item.get_attribute('href') for item in items]

        brand_dict[brand_name] = brand_items
        driver.quit()
    
    return brand_dict

In [4]:
def asos_scraper(asos_url_list, url_list, n):
  
    # Input: URL from Asos
    # Output: DF with results
    
    # Define empty lists to store results and log of failed URLs
    failed = []
    results = []
    
    
    for asos_url in tqdm(asos_url_list):
        if asos_url not in url_list:
            # Empty dictionary to store output
            asos_results = {}

            # Beautiful soup driver
            HEADERS = {
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4238.2  Safari/537.36',
            }
            content = requests.get(asos_url, headers=HEADERS)
            soup = BeautifulSoup(content.text, 'html.parser')

            # Selenium driver
            user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4238.2  Safari/537.36'
            path = '/Users/anu/going_headless/chromedriver_mac64/chromedriver'
            options = webdriver.ChromeOptions()
            options.add_argument('--headless')
            options.add_argument("--window-size=1920,1080")
            options.add_argument('--ignore-certificate-errors')
            options.add_argument('--allow-running-insecure-content')
            options.add_argument(f'user-agent={user_agent}')
            options.binary_location = '/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary'
            driver = webdriver.Chrome(executable_path = path, options=options)
            driver.get(asos_url)

            # Check if product is in stock
            try:
                outofstock = driver.find_element_by_xpath('//*[@id="oos-label"]/h3').text
                if NoSuchElementException:
                    pass
                if outofstock=='OUT OF STOCK':
                    failed.append(asos_url)
                    pass

                else:
                    # Name of product
                    asos_name = soup.find('div', {"class":"product-hero"}).h1.get_text()
                    asos_results['Name'] = asos_name

                    # Material of product
                    asos_aboutme = soup.find('div', {"class":"about-me"})
                    asos_info = asos_aboutme.get_text().strip("\n").strip(".").split(": ")[1:]
                    asos_res = []
                    for info in asos_info:
                        asos_res.append(info.split(", "))
                    asos_results['Material'] = list(itertools.chain.from_iterable(asos_res))

                    # Color of product
                    asos_color = driver.find_element_by_xpath('//*[@id="product-colour"]/section/div/div/span').text
                    asos_results['Color'] = asos_color

                    # URL of product
                    asos_results['URL'] = asos_url

                    # Image tag of product
                    asos_image = driver.find_element_by_xpath('//*[@id="product-gallery"]/div[1]/div[2]/div[2]/div/div/div/div[1]/div[1]/div[3]/div/div/img').get_attribute('src')        
                    asos_results['Image'] = asos_image
                    
                    # Description of product
                    asos_description = []
                    for description in soup.find('div', {"class":"product-description"}).find_all('li'):
                        asos_description.append(description.text)
                    asos_results['Description'] = asos_description
                    
                    # Add dict output to list
                    results.append(asos_results)

            except (NoSuchElementException, AttributeError):
                if NoSuchElementException:
                    failed.append(asos_url)
                elif AttributeError:
                    failed.append(asos_url)
                pass
            
            # Write out to file to prevent filling memory
            if len(results) >= n:
                f = open("asos_table.csv", "a")
                writer = csv.DictWriter(
                    f, fieldnames=['Name', 'Material', 'Color', 'URL', 'Image', 'Description'])
                writer.writeheader()
                writer.writerows(results)
                f.close()
                results = []
            if len(failed) >= 10:
                with open("failed.txt", 'a') as ft:
                    for row in failed:
                        ft.write(str(row) + '\n')
                failed = []  

    if len(results) > 0:            
        f = open("asos_table.csv", "a")
        writer = csv.DictWriter(
            f, fieldnames=['Name', 'Material', 'Color', 'URL', 'Image', 'Description'])
        writer.writeheader()
        writer.writerows(results)
        f.close()
        results = []
    if len(failed) > 0:
        with open("failed.txt", 'a') as ft:
            for row in failed:
                ft.write(str(row) + '\n')
        failed = []
        
#     driver.quit()

### Scraping by Brand

In [5]:
with open('brand_dict.json', 'w') as bm:
    bm.write(json.dumps(brand_dict_men))

NameError: name 'brand_dict_men' is not defined

In [None]:
brand_dict = {}

In [None]:
women_url = "https://www.asos.com/us/women/a-to-z-of-brands/cat/?cid=1340&nlid=ww|brands|top+brands"
brands_women = brand_sort(women_url)

In [None]:
len(brands_women)

In [None]:
for brand_url in brands_women:
    brand_dict_women = brand_itemize(brand_url)

In [None]:
men_url = "https://www.asos.com/us/men/a-to-z-of-brands/cat/?cid=1361&nlid=mw|brands|top+brands"
brands_men = brand_sort(men_url)

In [None]:
len(brands_men)

In [None]:
for brand_url in brands_men:
    brand_dict_men = brand_itemize(brand_url)

In [None]:
# Save all brands as text
all_brands = brands_women + brands_men
with open("brands.txt", 'w') as b:
    for row in all_brands:
        b.write(str(row) + '\n')

In [None]:
# Save all brand+items in json 
with open('brand_dict.json', 'w') as bm:
    bm.write(json.dumps(brand_dict_men))

# Scraper

In [None]:
# Load all brand+items
with open('brand_dict.json') as l:
    brand_dict = json.load(l)

In [None]:
items_all = []
brands = list(brand_dict.keys())
for i in range(len(brands)):
    items_all.append(brand_dict[brands[i]])

In [None]:
'''Pooling to optimize'''
def chunkify(lst, n):
    """builds generator for dividing input lst into n chunks"""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

In [None]:
len(items_all)

584

In [None]:
k = 0
for i in range(0,len(items_all)):
    k += len(items_all[i])

In [None]:
k

107282

In [None]:
len(items_all[17])

18810

In [None]:
new = items_all[17][2000:2100]
len(new)

100

In [None]:
failed = open('failed.txt','r').readlines()
len(failed)

3603

In [None]:
asos_df = pd.read_csv('asos_table.csv')
asos_df

Unnamed: 0,Name,Material,Color,URL,Image,Description
0,Abercrombie & Fitch v neck t-shirt,"['60% Cotton', '40% Viscose']",Black,https://www.asos.com/us/abercrombie-fitch/aber...,https://images.asos-media.com/products/abercro...,"['For your daytime thing', 'V-neck', 'Short sl..."
1,Abercrombie & Fitch deep v neck t-shirt,"['60% Cotton', '40% Viscose']",Grey,https://www.asos.com/us/abercrombie-fitch/aber...,https://images.asos-media.com/products/abercro...,['Some serious daytime inspiration right here'...
2,Abercrombie & Fitch eyelash knit slim v-neck s...,"['59% Nylon', '41% Acrylic']",Cream,https://www.asos.com/us/abercrombie-fitch/aber...,https://images.asos-media.com/products/abercro...,"['Cozy never looked so good', 'V-neck', 'Ribbe..."
3,Abercrombie & Fitch longline cozy high neck sw...,"['70% Acrylic', '14% Nylon', '8% Viscose', '8%...",Medium brown,https://www.asos.com/us/abercrombie-fitch/aber...,https://images.asos-media.com/products/abercro...,"['Cozy never looked so good ', 'Roll-neck', 'D..."
4,Abercrombie & Fitch longline cozy high neck sw...,"['70% Acrylic', '14% Nylon', '8% Viscose', '8%...",Charcoal,https://www.asos.com/us/abercrombie-fitch/aber...,https://images.asos-media.com/products/abercro...,"['Cozy never looked so good ', 'Roll-neck', 'D..."
...,...,...,...,...,...,...
19930,Name,Material,Color,URL,Image,Description
19931,ASOS DESIGN 3 pack face covering in polka dot ...,"['100% Cotton', 'Fabric 2', '99% Cotton', '1% ...",Multi,https://www.asos.com/us/asos-design/asos-desig...,https://images.asos-media.com/products/asos-de...,"['Pack of three', 'Mixed pattern prints ', 'Pl..."
19932,ASOS DESIGN cropped shirt in rust,['100% Viscose'],Rust,https://www.asos.com/us/asos-design/asos-desig...,https://images.asos-media.com/products/asos-de...,"['File under: goes with everything ', 'Plain d..."
19933,ASOS DESIGN tracksuit oversized sweat / jogger...,"['55% Polyester', '45% Cotton', 'Trim', '88% C...",Gray marl,https://www.asos.com/us/asos-design/asos-desig...,https://images.asos-media.com/products/asos-de...,"['Co-ordinating made easy', 'Tracksuit set', '..."


In [None]:
url_list = [i for i in asos_df['URL']]
len(url_list)

19935

In [None]:
scraper_partial = partial(asos_scraper,url_list=url_list, n=50)
chunked_basket = list(chunkify(new, 4))
with Pool(4) as p:
    p.map(scraper_partial, chunked_basket)

100%|██████████| 4/4 [00:40<00:00, 10.07s/it]
100%|██████████| 4/4 [00:42<00:00, 10.50s/it]
100%|██████████| 4/4 [00:44<00:00, 11.10s/it]
100%|██████████| 4/4 [00:48<00:00, 12.00s/it]
100%|██████████| 4/4 [00:42<00:00, 10.74s/it]
100%|██████████| 4/4 [00:45<00:00, 11.26s/it]
100%|██████████| 4/4 [00:48<00:00, 12.16s/it]
100%|██████████| 4/4 [00:48<00:00, 12.01s/it]
100%|██████████| 4/4 [00:42<00:00, 10.54s/it]
100%|██████████| 4/4 [00:51<00:00, 12.93s/it]
100%|██████████| 4/4 [00:46<00:00, 11.60s/it]
100%|██████████| 4/4 [00:45<00:00, 11.30s/it]
100%|██████████| 4/4 [00:38<00:00,  9.58s/it]
100%|██████████| 4/4 [00:44<00:00, 11.01s/it]
100%|██████████| 4/4 [00:49<00:00, 12.50s/it]
100%|██████████| 4/4 [00:45<00:00, 11.48s/it]
100%|██████████| 4/4 [00:47<00:00, 11.95s/it]
100%|██████████| 4/4 [00:46<00:00, 11.58s/it]
100%|██████████| 4/4 [00:52<00:00, 13.22s/it]
100%|██████████| 4/4 [00:44<00:00, 11.13s/it]
100%|██████████| 4/4 [00:40<00:00, 10.01s/it]
100%|██████████| 4/4 [00:43<00:00,