In [1]:
# !pip install requests-html selenium arsenic pandas

##  Sync vs Async

The Chess Game Analogy

Consecutive vs Concurrent

In [1]:
%%time

import time

iteration_times = [1, 3, 2, 4]


def sleeper(seconds, i=-1):
    if i != -1:
        print(f"{i}\t{seconds}s")
    time.sleep(seconds)


def run():
    for i, second in enumerate(iteration_times):
        sleeper(second, i=i)
    
run()

0	1s
1	3s
2	2s
3	4s
CPU times: user 8.99 ms, sys: 736 µs, total: 9.72 ms
Wall time: 10 s


In [2]:
start = time.time()
iteration_times = [1, 3, 2, 1]
import asyncio

async def a_sleeper(seconds, i=-1):
    if i != -1:
        print(f"{i}\t{seconds}s")
    await asyncio.sleep(seconds) # coroutine
    
    ellap = time.time() - start
    print(f"{i} done {ellap}")
    return "abc"

async def a_run():
    results = []
    for i, second in enumerate(iteration_times):
        results.append(
            asyncio.create_task(a_sleeper(second, i=i))
        )
    return results
    
results = await a_run()
print(results)
end = time.time() - start

print(end)

[<Task pending name='Task-5' coro=<a_sleeper() running at /tmp/ipykernel_35607/2677998759.py:5>>, <Task pending name='Task-6' coro=<a_sleeper() running at /tmp/ipykernel_35607/2677998759.py:5>>, <Task pending name='Task-7' coro=<a_sleeper() running at /tmp/ipykernel_35607/2677998759.py:5>>, <Task pending name='Task-8' coro=<a_sleeper() running at /tmp/ipykernel_35607/2677998759.py:5>>]
0.0005695819854736328
0	1s
1	3s
2	2s
3	1s


0 done 1.0030004978179932
3 done 1.00321626663208
2 done 2.0025668144226074
1 done 3.0030665397644043


## Blocking & Timeouts

In [3]:
def sleeper(seconds, i=-1):
    if i != -1:
        print(f"{i}\t{seconds}s")
    time.sleep(seconds)

sleeper(12)

In [4]:
async def asleeper(seconds, i=-1):
    # time.sleep(seconds)
    if i != -1:
        print(f"a{i}\t{seconds}s")
    await asyncio.sleep(seconds)
    
await asleeper(12)

In [5]:
print("hello word")

hello word


In [6]:
loop = asyncio.get_event_loop()
# loop = asyncio.new_event_loop()
# aysncio.run()


loop.create_task(asleeper(123))

<Task pending name='Task-10' coro=<asleeper() running at /tmp/ipykernel_35607/3054092388.py:1>>

In [7]:
print("hello word")

hello word


In [8]:
done, pending = await asyncio.wait([asleeper(1), asleeper(123)], timeout=2)
done, pending

({<Task finished name='Task-13' coro=<asleeper() done, defined at /tmp/ipykernel_35607/3054092388.py:1> result=None>},
 {<Task pending name='Task-12' coro=<asleeper() running at /tmp/ipykernel_35607/3054092388.py:5> wait_for=<Future pending cb=[<TaskWakeupMethWrapper object at 0x7f9f10562b80>()]>>})

In [9]:
done

{<Task finished name='Task-13' coro=<asleeper() done, defined at /tmp/ipykernel_35607/3054092388.py:1> result=None>}

In [10]:
pending

{<Task pending name='Task-12' coro=<asleeper() running at /tmp/ipykernel_35607/3054092388.py:5> wait_for=<Future pending cb=[<TaskWakeupMethWrapper object at 0x7f9f10562b80>()]>>}

In [11]:
try:
    await asyncio.wait_for(asleeper(5), timeout=3)
except asyncio.TimeoutError:
    print("Task failed")

Task failed


In [12]:
async def asleeper_timeout(seconds, i=-1, timeout=4):
    # time.sleep(seconds)
    if i != -1:
        print(f"a{i}\t{seconds}s")
    await asyncio.wait_for(asyncio.sleep(seconds), timeout=timeout)
    
# await asleeper_timeout(12, timeout=1)

## Scraping with Selenium - Synchronous
New to selenium and web scraping? Watch [this series](https://kirr.co/dwy90n).

In [4]:
!pip install requests_html pandas selenium webdriver-manager

You should consider upgrading via the '/home/oswualdo.alquisiris/Documentos/RepoPersonal/venv8/bin/python -m pip install --upgrade pip' command.[0m


In [13]:
url = 'https://www.spoonflower.com/en/shop?on=fabric'

In [14]:
import re
import requests
from requests_html import HTML
import pandas as pd

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

In [17]:
def scraper(url):
    driver = webdriver.Chrome(ChromeDriverManager().install())
    driver.get(url)
    return driver.page_source


# /en/fabric/7137786-genevieve-floral-by-crystal_walen
def extract_id_slug(url_path):
    regex = r"^[^\s]+/(?P<id>\d+)-(?P<slug>[\w_-]+)$"
    group = re.match(regex, url_path)
    if not group:
        return None, None
    return group['id'], group['slug']

In [18]:
content = scraper(url)

  driver = webdriver.Chrome(ChromeDriverManager().install())


In [19]:
html_r = HTML(html=content)

fabric_links = [x for x in list(html_r.links) if x.startswith("/en/fabric")]

datas = []
for path in fabric_links:
    id_, slug_ = extract_id_slug(path)
    print(id_, slug_)
    data = {
        "id": id_,
        "slug": slug_,
        "path": path,
        "scraped": 0 # True / False -> 1 / 0 
    }
    datas.append(data)

9544489 daisy-print-fabric-daisies-daisy-fabric-baby-fabric-spring-fabric-baby-girl-earthy-tan-by-charlottewinter
11112826 dragon-fire-rainbow-by-adenaj
2143712 rosey-document-by-peagreengirl
11125049 custom-colorway-groovy-mushroom-garden-rusty-orange-by-yesterdaycollection
8737001 chinoiserie-whimsy-blue-white-large-scale-pattern-by-pattern_garden
10569898 safari-new-by-ktscarlett_
10410859 mushroom-garden-linen-by-mypetalpress
9610524 large-le-jardin-art-nouveau-black-by-hnldesigns
8405290 fable-floral-teal-jumbo-by-nouveau_bohemian
11111052 avery-retro-floral-on-white-medium-scale-by-red_raspberry_design
7307966 art-deco-fleurs-d-or-by-j9design
7463028 seamless-watercolor-larger-leaves-pattern-1-by-daily_miracles
8345787 plain-white-solid-white-plain-unprinted-fabric-by-erin__kendal
9432598 modern-retro-floral-1970-muumuu-large-by-hnldesigns
9353509 mustard-mud-cloth-arrow-cross-dot-mudcloth-home-decor-tribal-lad19-by-littlearrowdecor
5372926 light-blue-ocean-by-lauriekentdesigns
2

In [20]:
df = pd.DataFrame(datas)
df.head()

Unnamed: 0,id,slug,path,scraped
0,9544489,daisy-print-fabric-daisies-daisy-fabric-baby-f...,/en/fabric/9544489-daisy-print-fabric-daisies-...,0
1,11112826,dragon-fire-rainbow-by-adenaj,/en/fabric/11112826-dragon-fire-rainbow-by-adenaj,0
2,2143712,rosey-document-by-peagreengirl,/en/fabric/2143712-rosey-document-by-peagreengirl,0
3,11125049,custom-colorway-groovy-mushroom-garden-rusty-o...,/en/fabric/11125049-custom-colorway-groovy-mus...,0
4,8737001,chinoiserie-whimsy-blue-white-large-scale-patt...,/en/fabric/8737001-chinoiserie-whimsy-blue-whi...,0


In [21]:
df.to_csv("local.csv", index=False)

In [22]:
pd.read_csv("local.csv")

Unnamed: 0,id,slug,path,scraped
0,9544489,daisy-print-fabric-daisies-daisy-fabric-baby-f...,/en/fabric/9544489-daisy-print-fabric-daisies-...,0
1,11112826,dragon-fire-rainbow-by-adenaj,/en/fabric/11112826-dragon-fire-rainbow-by-adenaj,0
2,2143712,rosey-document-by-peagreengirl,/en/fabric/2143712-rosey-document-by-peagreengirl,0
3,11125049,custom-colorway-groovy-mushroom-garden-rusty-o...,/en/fabric/11125049-custom-colorway-groovy-mus...,0
4,8737001,chinoiserie-whimsy-blue-white-large-scale-patt...,/en/fabric/8737001-chinoiserie-whimsy-blue-whi...,0
...,...,...,...,...
79,6625245,sandstone-desert-by-wren_leyland,/en/fabric/6625245-sandstone-desert-by-wren_le...,0
80,10148904,art-deco-swans-navy-12-by-katerhees,/en/fabric/10148904-art-deco-swans-navy-12-by-...,0
81,7748646,emerald-eden-by-catalinakim,/en/fabric/7748646-emerald-eden-by-catalinakim,0
82,12300699,cozy-night-sky-large-full-moon-stars-over-clou...,/en/fabric/12300699-cozy-night-sky-large-full-...,0


## Asynchronous Scraping with `chromedriver` and `arsenic`

[arsenic Docs](https://arsenic.readthedocs.io/en/latest/)

In [45]:
!pip install arsenic

4619.83s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


You should consider upgrading via the '/home/oswualdo.alquisiris/Documentos/RepoPersonal/venv8/bin/python -m pip install --upgrade pip' command.[0m


In [26]:
%%writefile async_scrape.py

import os
import asyncio
from arsenic import get_session, keys, browsers, services
import pandas as pd
from requests_html import HTML
import itertools
import re
import time
import pathlib

path = os.getcwd()


# /en/fabric/7137786-genevieve-floral-by-crystal_walen
async def extract_id_slug(url_path):
    regex = r"^[^\s]+/(?P<id>\d+)-(?P<slug>[\w_-]+)$"
    group = re.match(regex, url_path)
    if not group:
        return None, None
    return group['id'], group['slug']



async def get_links(body_content):
    html_r = HTML(html=body_content)
    fabric_links = [x for x in list(html_r.links) if x.startswith("/en/fabric")]
    datas = []
    for path in fabric_links:
        id_, slug_ = await extract_id_slug(path)
        data = {
            "id": id_,
            "slug": slug_,
            "path": path,
            "scraped": 0 # True / False -> 1 / 0 
        }
        datas.append(data)
    return datas

async def scraper(url):
    service = services.Chromedriver(binary=path+"/chromedriver")
    browser = browsers.Chrome()
    async with get_session(service, browser) as session:
        await session.get(url)
        body = await session.get_page_source()
        # print(body)
        return body


async def store_links_as_df_pickle(datas=[], name='links.pkl'):
    df = pd.DataFrame(datas)
    df.set_index('id', drop=True, inplace=True)
    df.to_pickle(name)
    return df
    
    
async def run(url):
    body_content = await scraper(url)
    links = await get_links(body_content)
    df = await store_links_as_df_pickle(links)
    return links
    
if __name__ == "__main__":
    url = 'https://www.spoonflower.com/en/shop?on=fabric'
    results = asyncio.run(run(url))
    print(results)


Overwriting async_scrape.py


In [27]:
!python async_scrape.py

Starting ChromeDriver 106.0.5249.61 (511755355844955cd3e264779baf0dd38212a4d0-refs/branch-heads/5249@{#569}) on port 57711
Only local connections are allowed.
Please see https://chromedriver.chromium.org/security-considerations for suggestions on keeping ChromeDriver safe.
ChromeDriver was started successfully.
2022-10-12 11:10.53 [info     ] request                        body={"capabilities": {"alwaysMatch": {"browserName": "chrome"}}} header={'Content-Type': 'application/json'} method=POST url=http://localhost:57711/session
2022-10-12 11:10.53 [info     ] response                       body={"capabilities": {"alwaysMatch": {"browserName": "chrome"}}} data={'value': {'capabilities': {'acceptInsecureCerts': False, 'browserName': 'chrome', 'browserVersion': '106.0.5249.103', 'chrome': {'chromedriverVersion': '106.0.5249.61 (511755355844955cd3e264779baf0dd38212a4d0-refs/branch-heads/5249@{#569})', 'userDataDir': '/tmp/.com.google.Chrome.58B1ix'}, 'goog:chromeOptions': {'debuggerAddress'

In [28]:
name = 'links.pkl'
df = pd.read_pickle(name)
df.head()

Unnamed: 0_level_0,slug,path,scraped
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
9353509,mustard-mud-cloth-arrow-cross-dot-mudcloth-hom...,/en/fabric/9353509-mustard-mud-cloth-arrow-cro...,0
5372926,light-blue-ocean-by-lauriekentdesigns,/en/fabric/5372926-light-blue-ocean-by-lauriek...,0
8834187,mid-century-kaleidoscope-by-ceciliamok,/en/fabric/8834187-mid-century-kaleidoscope-by...,0
12441406,large-aara-palm-floral-natural-pink-by-scarlet...,/en/fabric/12441406-large-aara-palm-floral-nat...,0
6782514,eame-s-wildflower-meadow-watercolor-floral-bot...,/en/fabric/6782514-eame-s-wildflower-meadow-wa...,0


In [29]:
df.shape

(84, 3)

## Hide `arsenic` Logs

In [29]:
!pip install structlog



In [30]:
%%writefile async_scrape.py

import os
import asyncio
from arsenic import get_session, keys, browsers, services
import pandas as pd
from requests_html import HTML
import itertools
import re
import time
import pathlib

import logging
import structlog # pip install structlog

from pathlib import Path

path = os.getcwd()

def abspath(file):
    return str(Path(__file__).parent.absolute() / file)

def set_arsenic_log_level(level = logging.WARNING):
    # Create logger
    logger = logging.getLogger('arsenic')

    # We need factory, to return application-wide logger
    def logger_factory():
        return logger

    structlog.configure(logger_factory=logger_factory)
    logger.setLevel(level)


# /en/fabric/7137786-genevieve-floral-by-crystal_walen
async def extract_id_slug(url_path):
    regex = r"^[^\s]+/(?P<id>\d+)-(?P<slug>[\w_-]+)$"
    group = re.match(regex, url_path)
    if not group:
        return None, None
    return group['id'], group['slug']



async def get_links(body_content):
    html_r = HTML(html=body_content)
    fabric_links = [x for x in list(html_r.links) if x.startswith("/en/fabric")]
    datas = []
    for path in fabric_links:
        id_, slug_ = await extract_id_slug(path)
        data = {
            "id": id_,
            "slug": slug_,
            "path": path,
            "scraped": 0 # True / False -> 1 / 0 
        }
        datas.append(data)
    return datas

async def scraper(url):
    service = services.Chromedriver(binary=path+"/chromedriver")
    browser = browsers.Chrome()
    async with get_session(service, browser) as session:
        await session.get(url)
        body = await session.get_page_source()
        # print(body)
        return body


async def store_links_as_df_pickle(datas=[], name='links.pkl'):
    df = pd.DataFrame(datas)
    df.set_index('id', drop=True, inplace=True)
    df.to_pickle(name)
    return df
    
    
async def run(url):
    body_content = await scraper(url)
    links = await get_links(body_content)
    df = await store_links_as_df_pickle(links)
    return links
    
if __name__ == "__main__":
    set_arsenic_log_level()
    url = 'https://www.spoonflower.com/en/shop?on=fabric'
    results = asyncio.run(run(url))
    print(results)


Overwriting async_scrape.py


In [31]:
!python async_scrape.py

Starting ChromeDriver 106.0.5249.61 (511755355844955cd3e264779baf0dd38212a4d0-refs/branch-heads/5249@{#569}) on port 36287
Only local connections are allowed.
Please see https://chromedriver.chromium.org/security-considerations for suggestions on keeping ChromeDriver safe.
ChromeDriver was started successfully.
[{'id': '8834187', 'slug': 'mid-century-kaleidoscope-by-ceciliamok', 'path': '/en/fabric/8834187-mid-century-kaleidoscope-by-ceciliamok', 'scraped': 0}, {'id': '10569898', 'slug': 'safari-new-by-ktscarlett_', 'path': '/en/fabric/10569898-safari-new-by-ktscarlett_', 'scraped': 0}, {'id': '9610524', 'slug': 'large-le-jardin-art-nouveau-black-by-hnldesigns', 'path': '/en/fabric/9610524-large-le-jardin-art-nouveau-black-by-hnldesigns', 'scraped': 0}, {'id': '9544489', 'slug': 'daisy-print-fabric-daisies-daisy-fabric-baby-fabric-spring-fabric-baby-girl-earthy-tan-by-charlottewinter', 'path': '/en/fabric/9544489-daisy-print-fabric-daisies-daisy-fabric-baby-fabric-spring-fabric-baby-gi

## Async Data with Pandas

In [34]:
%%writefile async_scrape.py

import os
import asyncio
from arsenic import get_session, keys, browsers, services
import pandas as pd
from requests_html import HTML
import itertools
import re
import time
import pathlib

import logging
import structlog # pip install structlog

path = os.getcwd()

def set_arsenic_log_level(level = logging.WARNING):
    # Create logger
    logger = logging.getLogger('arsenic')

    # We need factory, to return application-wide logger
    def logger_factory():
        return logger

    structlog.configure(logger_factory=logger_factory)
    logger.setLevel(level)


# /en/fabric/7137786-genevieve-floral-by-crystal_walen
async def extract_id_slug(url_path):
    regex = r"^[^\s]+/(?P<id>\d+)-(?P<slug>[\w_-]+)$"
    group = re.match(regex, url_path)
    if not group:
        return None, None
    return group['id'], group['slug']



async def get_links(body_content):
    html_r = HTML(html=body_content)
    fabric_links = [x for x in list(html_r.links) if x.startswith("/en/fabric")]
    datas = []
    for path in fabric_links:
        id_, slug_ = await extract_id_slug(path)
        data = {
            "id": id_,
            "slug": slug_,
            "path": path,
            "scraped": 0 # True / False -> 1 / 0 
        }
        datas.append(data)
    return datas

async def scraper(url):
    service = services.Chromedriver(binary=path+"/chromedriver")
    browser = browsers.Chrome()
    async with get_session(service, browser) as session:
        await session.get(url)
        body = await session.get_page_source()
        # print(body)
        return body


def store_links_as_df_pickle(datas=[], name='links.pkl'):
    new_df = pd.DataFrame(datas)
    og_df = pd.DataFrame([{'id': 0}])
    if pathlib.Path(name).exists():
        og_df = pd.read_pickle(name) # read_csv
    df = pd.concat([og_df, new_df])
    df.reset_index(inplace=True, drop=False)
    df = df[['id', 'slug', 'path', 'scraped']]
    df = df.loc[~df.id.duplicated(keep='first')]
    # df.set_index('id', inplace=True, drop=True)
    df.dropna(inplace=True)
    df.to_pickle(name)
    return df
    
    
async def run(url):
    body_content = await scraper(url)
    links = await get_links(body_content)
    return links
    
if __name__ == "__main__":
    set_arsenic_log_level()
    url = 'https://www.spoonflower.com/en/shop?on=fabric'
    name = "link.pkl"
    results = asyncio.run(run(url))
    df = store_links_as_df_pickle(results, name=name)
    print(df.head())


Overwriting async_scrape.py


In [35]:
!python async_scrape.py

Starting ChromeDriver 106.0.5249.61 (511755355844955cd3e264779baf0dd38212a4d0-refs/branch-heads/5249@{#569}) on port 44677
Only local connections are allowed.
Please see https://chromedriver.chromium.org/security-considerations for suggestions on keeping ChromeDriver safe.
ChromeDriver was started successfully.
         id  ... scraped
1   8039248  ...     0.0
2  10569898  ...     0.0
3   1503057  ...     0.0
4   8737001  ...     0.0
5   9792368  ...     0.0

[5 rows x 4 columns]


In [36]:
df = pd.read_pickle("link.pkl")
df.shape

(84, 4)

In [37]:
df.head(n=10)

Unnamed: 0,id,slug,path,scraped
1,8039248,forest-animal-hot-air-balloon-night-adventure-...,/en/fabric/8039248-forest-animal-hot-air-ballo...,0.0
2,10569898,safari-new-by-ktscarlett_,/en/fabric/10569898-safari-new-by-ktscarlett_,0.0
3,1503057,william-morris-strawberry-thief-intense-by-pea...,/en/fabric/1503057-william-morris-strawberry-t...,0.0
4,8737001,chinoiserie-whimsy-blue-white-large-scale-patt...,/en/fabric/8737001-chinoiserie-whimsy-blue-whi...,0.0
5,9792368,japanese-waves-by-sveta_aho,/en/fabric/9792368-japanese-waves-by-sveta_aho,0.0
6,8091225,dark-floral-black-roses-on-black-moody-floral-...,/en/fabric/8091225-dark-floral-black-roses-on-...,0.0
7,9432598,modern-retro-floral-1970-muumuu-large-by-hnlde...,/en/fabric/9432598-modern-retro-floral-1970-mu...,0.0
8,7502677,fable-floral-blush-jumbo-by-nouveau_bohemian,/en/fabric/7502677-fable-floral-blush-jumbo-by...,0.0
9,8834187,mid-century-kaleidoscope-by-ceciliamok,/en/fabric/8834187-mid-century-kaleidoscope-by...,0.0
10,11111052,avery-retro-floral-on-white-medium-scale-by-re...,/en/fabric/11111052-avery-retro-floral-on-whit...,0.0


## Prepare to Scrape Multiple URLs

In [38]:
start = time.time()
iteration_times = [1, 3, 2, 1]


async def asleeper_timeout(seconds, i=-1, timeout=4):
    # time.sleep(seconds)
    if i != -1:
        print(f"a{i}\t{seconds}s")
    await asyncio.wait_for(asyncio.sleep(seconds), timeout=timeout)
    ellap = time.time() - start
    print(f"{i} done {ellap}")


async def a_run():
    results = []
    for i, second in enumerate(iteration_times):
        results.append(
            asyncio.create_task(asleeper_timeout(second, i=i))
        )
    return results
    
results = await a_run()
print(results)
end = time.time() - start

print(end)

[<Task pending name='Task-17' coro=<asleeper_timeout() running at /tmp/ipykernel_35607/619623914.py:5>>, <Task pending name='Task-18' coro=<asleeper_timeout() running at /tmp/ipykernel_35607/619623914.py:5>>, <Task pending name='Task-19' coro=<asleeper_timeout() running at /tmp/ipykernel_35607/619623914.py:5>>, <Task pending name='Task-20' coro=<asleeper_timeout() running at /tmp/ipykernel_35607/619623914.py:5>>]
0.0005147457122802734
a0	1s
a1	3s
a2	2s
a3	1s


0 done 1.0053787231445312
3 done 1.0055482387542725
2 done 2.006096601486206
1 done 3.00590181350708


In [42]:
%%writefile async_scrape.py

import os
import asyncio
from arsenic import get_session, keys, browsers, services
import pandas as pd
from requests_html import HTML
import itertools
import re
import time
import pathlib

import logging
import structlog # pip install structlog

path = os.getcwd()




def store_links_as_df_pickle(datas=[], name='links.pkl'):
    new_df = pd.DataFrame(datas)
    og_df = pd.DataFrame([{'id': 0}])
    if pathlib.Path(name).exists():
        og_df = pd.read_pickle(name) # read_csv
    df = pd.concat([og_df, new_df])
    df.reset_index(inplace=True, drop=False)
    df = df[['id', 'slug', 'path', 'scraped']]
    df = df.loc[~df.id.duplicated(keep='first')]
    # df.set_index('id', inplace=True, drop=True)
    df.dropna(inplace=True)
    df.to_pickle(name)
    return df


def set_arsenic_log_level(level = logging.WARNING):
    # Create logger
    logger = logging.getLogger('arsenic')

    # We need factory, to return application-wide logger
    def logger_factory():
        return logger

    structlog.configure(logger_factory=logger_factory)
    logger.setLevel(level)


# /en/fabric/7137786-genevieve-floral-by-crystal_walen
async def extract_id_slug(url_path):
    regex = r"^[^\s]+/(?P<id>\d+)-(?P<slug>[\w_-]+)$"
    group = re.match(regex, url_path)
    if not group:
        return None, None
    return group['id'], group['slug']



async def get_links(body_content):
    html_r = HTML(html=body_content)
    fabric_links = [x for x in list(html_r.links) if x.startswith("/en/fabric")]
    datas = []
    for path in fabric_links:
        id_, slug_ = await extract_id_slug(path)
        data = {
            "id": id_,
            "slug": slug_,
            "path": path,
            "scraped": 0 # True / False -> 1 / 0 
        }
        datas.append(data)
    return datas

async def scraper(url, i=-1, timeout=60, start=None):
    service = services.Chromedriver(binary=path+"/chromedriver")
    browser = browsers.Chrome()
    async with get_session(service, browser) as session:
        try:
            await asyncio.wait_for(session.get(url), timeout=timeout)
        except asyncio.TimeoutError:
            return []
        body = await session.get_page_source()
        links = await get_links(body)
        if start != None:
            end = time.time() - start
            print(f'{i} took {end} seconds')
        # print(body)
        return links


async def run(urls, timeout=60, start=None):
    results = []
    for i, url in enumerate(urls):
        results.append(
            asyncio.create_task(scraper(url, i=i, timeout=60, start=start))
        )
    list_of_links = await asyncio.gather(*results)
    return list_of_links

if __name__ == "__main__":
    set_arsenic_log_level()
    start = time.time()
    urls = ['https://www.spoonflower.com/en/shop?on=fabric', 
            'https://www.spoonflower.com/en/fabric/6444170-catching-fireflies-by-thestorysmith']
    name = "link.pkl"
    results = asyncio.run(run(urls, start=start))
    print(len(results))
    end = time.time() - start
    print(f'total time is {end}')
#     df = store_links_as_df_pickle(results, name=name)
#     print(df.head())


Overwriting async_scrape.py


In [43]:
!python async_scrape.py

Starting ChromeDriver 106.0.5249.61 (511755355844955cd3e264779baf0dd38212a4d0-refs/branch-heads/5249@{#569}) on port 53573
Only local connections are allowed.
Please see https://chromedriver.chromium.org/security-considerations for suggestions on keeping ChromeDriver safe.
ChromeDriver was started successfully.
Starting ChromeDriver 106.0.5249.61 (511755355844955cd3e264779baf0dd38212a4d0-refs/branch-heads/5249@{#569}) on port 57835
Only local connections are allowed.
Please see https://chromedriver.chromium.org/security-considerations for suggestions on keeping ChromeDriver safe.
ChromeDriver was started successfully.
0 took 3.5485942363739014 seconds
1 took 4.997168064117432 seconds
2
total time is 5.101632356643677


## Extract Product Data

In [46]:
import re
import requests
from requests_html import HTML
import pandas as pd
from urllib.parse import urlparse
import time

from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager


def scraper(url):
    driver = webdriver.Chrome(ChromeDriverManager().install())
    driver.get(url)
    time.sleep(5)
    return driver.page_source

def extract_id_slug(url_path):
    path = url_path
    if path.startswith('http'):
        parsed_url = urlparse(path)
        path = parsed_url.path
    regex = r"^[^\s]+/(?P<id>\d+)-(?P<slug>[\w_-]+)$"
    group = re.match(regex, path)
    if not group:
        return None, None, path
    return group['id'], group['slug'], path

In [47]:
url = 'https://www.spoonflower.com/en/fabric/6444170-catching-fireflies-by-thestorysmith'

html_str = scraper(url)
content = HTML(html=html_str)

  driver = webdriver.Chrome(ChromeDriverManager().install())


In [50]:
def get_product_data(url, content):
    id_, slug_, path = extract_id_slug(url)
    titleEl = content.find(".design-title", first=True)
    data = {
        'id': id_,
        'slug': slug_,
        'path': path,
    }
    title = None
    if titleEl == None:
        return data
    title = titleEl.text
    data['title'] = title
    sizeEl = content.find("#fabric-size", first=True)
    size = None
    if sizeEl != None:
        size = sizeEl.text
    data['size'] = size
    price_parent_el = content.find('.b-item-price', first=True)
    price_el = price_parent_el.find('.visuallyhidden', first=True)
    for i in price_el.element.iterchildren():
        attrs = dict(**i.attrib)
        attrs_keys = list(attrs.keys())
        data[i.attrib['itemprop']] = i.attrib[attrs_keys[0]]
    return data

In [51]:
get_product_data(url, content)

{'id': '6444170',
 'slug': 'catching-fireflies-by-thestorysmith',
 'path': '/en/fabric/6444170-catching-fireflies-by-thestorysmith',
 'title': 'Catching Fireflies',
 'size': None,
 'price': 'price',
 'priceCurrency': 'priceCurrency',
 'priceValidUntil': 'priceValidUntil'}

In [112]:
# url2 = "https://www.spoonflower.com/en/fabric/7175195-golden-watercolor-rainbow-rows-5-by-anniemontgomerydesign?fabric=petal_signature_cotton"
# html_str2 = scraper(url2)
# content2 = HTML(html=html_str2)
# get_product_data(url2, content2)


{'id': '7175195',
 'slug': 'golden-watercolor-rainbow-rows-5-by-anniemontgomerydesign',
 'path': '/en/fabric/7175195-golden-watercolor-rainbow-rows-5-by-anniemontgomerydesign',
 'title': 'Golden Watercolor Rainbow Rows .5"'}

## Async Product Data Extraction

In [54]:
%%writefile async_scrape.py

import os
import asyncio
from arsenic import get_session, keys, browsers, services
import pandas as pd
from requests_html import HTML
import itertools
import re
import time
import pathlib
from urllib.parse import urlparse

import logging
import structlog # pip install structlog

path = os.getcwd()


def store_links_as_df_pickle(datas=[], name='links.pkl'):
    new_df = pd.DataFrame(datas)
    og_df = pd.DataFrame([{'id': 0}])
    if pathlib.Path(name).exists():
        og_df = pd.read_pickle(name) # read_csv
    df = pd.concat([og_df, new_df])
    df.reset_index(inplace=True, drop=False)
    df = df[['id', 'slug', 'path', 'scraped']]
    df = df.loc[~df.id.duplicated(keep='first')]
    # df.set_index('id', inplace=True, drop=True)
    df.dropna(inplace=True)
    df.to_pickle(name)
    return df


def set_arsenic_log_level(level = logging.WARNING):
    # Create logger
    logger = logging.getLogger('arsenic')

    # We need factory, to return application-wide logger
    def logger_factory():
        return logger

    structlog.configure(logger_factory=logger_factory)
    logger.setLevel(level)


# /en/fabric/7137786-genevieve-floral-by-crystal_walen
async def extract_id_slug(url_path):
    path = url_path
    if path.startswith('http'):
        parsed_url = urlparse(path)
        path = parsed_url.path
    regex = r"^[^\s]+/(?P<id>\d+)-(?P<slug>[\w_-]+)$"
    group = re.match(regex, path)
    if not group:
        return None, None, path
    return group['id'], group['slug'], path



async def get_product_data(url, content):
    id_, slug_, path = await extract_id_slug(url)
    titleEl = content.find(".design-title", first=True)
    data = {
        'id': id_,
        'slug': slug_,
        'path': path,
    }
    title = None
    if titleEl == None:
        return data
    title = titleEl.text
    data['title'] = title
    sizeEl = content.find("#fabric-size", first=True)
    size = None
    if sizeEl != None:
        size = sizeEl.text
    data['size'] = size
    price_parent_el = content.find('.b-item-price', first=True)
    price_el = price_parent_el.find('.visuallyhidden', first=True)
    for i in price_el.element.iterchildren():
        attrs = dict(**i.attrib)
        attrs_keys = list(attrs.keys())
        data[i.attrib['itemprop']] = i.attrib[attrs_keys[0]]
    return data

async def get_parsable_html(body_html_str):
    return HTML(html=body_html_str)

async def get_links(html_r):
    fabric_links = [x for x in list(html_r.links) if x.startswith("/en/fabric")]
    datas = []
    for path in fabric_links:
        id_, slug_, _ = await extract_id_slug(path)
        data = {
            "id": id_,
            "slug": slug_,
            "path": path,
            "scraped": 0 # True / False -> 1 / 0 
        }
        datas.append(data)
    return datas

async def scraper(url, i=-1, timeout=60, start=None):
    service = services.Chromedriver(binary=path+"/chromedriver")
    browser = browsers.Chrome()
    async with get_session(service, browser) as session:
        try:
            await asyncio.wait_for(session.get(url), timeout=timeout)
        except asyncio.TimeoutError:
            return []
        await asyncio.sleep(10)
        body = await session.get_page_source() # save this locally??
        content = await get_parsable_html(body) 
        links = await get_links(content)
        product_data = await get_product_data(url, content)
        if start != None:
            end = time.time() - start
            print(f'{i} took {end} seconds')
        # print(body)
        dataset = {
            "links": links,
            "product_data": product_data
        }
        return dataset


async def run(urls, timeout=60, start=None):
    results = []
    for i, url in enumerate(urls):
        results.append(
            asyncio.create_task(scraper(url, i=i, timeout=60, start=start))
        )
    list_of_links = await asyncio.gather(*results)
    return list_of_links

if __name__ == "__main__":
    set_arsenic_log_level()
    start = time.time()
    urls = ['https://www.spoonflower.com/en/shop?on=fabric', 
            'https://www.spoonflower.com/en/fabric/6444170-catching-fireflies-by-thestorysmith']
    name = "link.pkl"
    results = asyncio.run(run(urls, start=start))
    print(results)
    end = time.time() - start
    print(f'total time is {end}')
#     df = store_links_as_df_pickle(results, name=name)
#     print(df.head())


Overwriting async_scrape.py


In [55]:
!python async_scrape.py

Starting ChromeDriver 106.0.5249.61 (511755355844955cd3e264779baf0dd38212a4d0-refs/branch-heads/5249@{#569}) on port 47343
Only local connections are allowed.
Please see https://chromedriver.chromium.org/security-considerations for suggestions on keeping ChromeDriver safe.
ChromeDriver was started successfully.
Starting ChromeDriver 106.0.5249.61 (511755355844955cd3e264779baf0dd38212a4d0-refs/branch-heads/5249@{#569}) on port 48207
Only local connections are allowed.
Please see https://chromedriver.chromium.org/security-considerations for suggestions on keeping ChromeDriver safe.
ChromeDriver was started successfully.
0 took 14.407435417175293 seconds
1 took 15.427944660186768 seconds
[{'links': [{'id': '11483815', 'slug': 'updated-large-scale-natural-habitat-bees-moths-wild-grass-flowers-please-read-description-by-ozdebayer', 'path': '/en/fabric/11483815-updated-large-scale-natural-habitat-bees-moths-wild-grass-flowers-please-read-description-by-ozdebayer', 'scraped': 0}, {'id': '7661