# Scraping data generated by JavaScript

In [55]:
# When coding in Jupyter and Spyder, we need to use the class AsyncHTMLSession to make JavaScript work
# In other environments you can use the normal HTMLSession
from requests_html import AsyncHTMLSession

In [56]:
# Create a new asynchronous session
session = AsyncHTMLSession()

# The only difference we will notice between the regular HTMLSession and the asynchronous one
# is the need to write the keyword 'await' in front of some statements

In [57]:
# In this example we're going to use Nike's homepage: https://www.nike.com/
# Several of the links on this page, as well as other elements, are generated by JavaScript
# We will compare the result of scraping those before and after running the JavaScript code

In [58]:
# Since we are using the async session, the request has to be awaited
# With the regular HTMLSession, there is no need for 'await'
r = await session.get("https://www.nike.com/")
# Display the HTTP status code of the response (200 means the request succeeded)
r.status_code

200

In [59]:
# So far, nothing different from our previous example has happened
# The JavaScript code has not yet been executed

In [60]:
# Collect some tags before rendering the JavaScript code, i.e. extracted from the raw HTML
# The same 'find' call is issued once per tag name, in the order div, p, li, a
divs, p, list_items, links = (r.html.find(tag) for tag in ("div", "p", "li", "a"))
# All links found on the page, in absolute form
urls = r.html.absolute_links

In [61]:
# Now, we need to execute the JavaScript code that will generate additional tags

In [62]:
# The requests-html package provides a very simple interface for that - just use the 'render()' method
# ('arender()' when using the async session)
# It runs the JavaScript code, which updates the HTML. This may take a while
# The updated HTML replaces the content of the existing 'r.html' - there is no need to assign the result to a new variable
# As before, the 'await' keyword is needed only because of the async session
await r.html.arender()

In [63]:
# NOTE: The first time you run 'render()' (or 'arender()'), Chromium will be downloaded and installed on your computer

In [64]:
# The HTML has been updated by now, so we search for the same tags once more
# The same 'find' call is issued once per tag name, in the order div, p, li, a
new_divs, new_p, new_li, new_links = (r.html.find(tag) for tag in ("div", "p", "li", "a"))
# All links found on the rendered page, in absolute form
new_urls = r.html.absolute_links

In [65]:
# We can see the difference in the number of elements found before and after the JavaScript was executed

In [66]:
# Number of <div> elements: raw HTML vs. rendered HTML
len(divs), len(new_divs)

(478, 571)

In [67]:
# Number of <p> elements: raw HTML vs. rendered HTML
len(p), len(new_p)

(125, 129)

In [68]:
# Number of <li> elements: raw HTML vs. rendered HTML
len(list_items), len(new_li)

(114, 410)

In [69]:
# Number of <a> elements: raw HTML vs. rendered HTML
len(links), len(new_links)

(550, 822)

In [70]:
# Number of unique absolute URLs: raw HTML vs. rendered HTML
len(urls), len(new_urls)

(469, 480)

In [71]:
# Remember that 'urls' is a set, and not a list?
# Well, there is a useful feature of sets that we will now take advantage of
# It takes two sets and selects only those items from the first set that are not present in the second one

In [72]:
# Keep only the URLs that are in 'new_urls' but not in 'urls'
# (the '-' operator on two sets computes their set difference)
new_urls - urls

{'http://investors.nikeinc.com',
 'https://secure-store.nike.com/US/checkout/mobile/cart.jsp?l=cart&country=US&lang_locale=&site=nikestore&returnURL=https://www.nike.com/bg/',
 'https://www.nike.com/at/en/w/new-womens-nike-by-you-3n82yz5e1x6z6ealh',
 'https://www.nike.com/at/en/w/womens-nike-by-you-5e1x6z6ealh',
 'https://www.nike.com/at/en/w/womens-nike-by-you-basketball-3glsmz5e1x6z6ealh',
 'https://www.nike.com/at/en/w/womens-nike-by-you-football-1gdj0z5e1x6z6ealh',
 'https://www.nike.com/at/en/w/womens-nike-by-you-lifestyle-13jrmz5e1x6z6ealh',
 'https://www.nike.com/at/en/w/womens-nike-by-you-running-37v7jz5e1x6z6ealh',
 'https://www.nike.com/at/en/w/womens-nike-by-you-training-gym-58jtoz5e1x6z6ealh',
 'https://www.nike.com/bg/email-signup',
 'https://www.nike.com/bg/en_gb/c/sustainability',
 'https://www.nike.com/bg/help/a/size-charts-gs',
 'https://www.nike.com/orders/details/'}

In [75]:
# Finally, close the session
session.close()

<coroutine object AsyncHTMLSession.close at 0x00000241AE2E46C8>

In [76]:
# You can check the documentation directly inside Jupyter
# Printing the docstring of 'render()' lists the parameters the method accepts
print(r.html.render.__doc__)

Reloads the response in Chromium, and replaces HTML content
        with an updated version, with JavaScript executed.

        :param retries: The number of times to retry loading the page in Chromium.
        :param script: JavaScript to execute upon page load (optional).
        :param wait: The number of seconds to wait before loading the page, preventing timeouts (optional).
        :param scrolldown: Integer, if provided, of how many times to page down.
        :param sleep: Integer, if provided, of how many long to sleep after initial render.
        :param reload: If ``False``, content will not be loaded from the browser, but will be provided from memory.
        :param keep_page: If ``True`` will allow you to interact with the browser page through ``r.html.page``.

        If ``scrolldown`` is specified, the page will scrolldown the specified
        number of times, after sleeping the specified amount of time
        (e.g. ``scrolldown=10, sleep=1``).

        If just ``sleep