# Exploring the requests-html capabilities

In [None]:
# Official documentation of the requests-html package: https://requests-html.readthedocs.io/en/latest/

### Initial setup

In [1]:
# Loading the necessary packages
from requests_html import HTMLSession

In [2]:
# Open an HTMLSession; the GET request further down is sent through this object
session = HTMLSession()

In [3]:
# Submit a GET request for the Wikipedia article explored in this notebook.
# requests applies no timeout by default, so pass one explicitly: otherwise the
# cell could hang indefinitely if the server never answers.
r = session.get("https://en.wikipedia.org/wiki/Association_football", timeout=10)
# A status code of 200 means the request succeeded
r.status_code

200

In [4]:
# The HTML of the response is exposed through the '.html' attribute
# (it is accessed without parentheses and yields an HTML object, not a string)
r.html

<HTML url='https://en.wikipedia.org/wiki/Association_football'>

### Links

In [5]:
# We can extract all link addresses found on the page directly with the '.links' attribute
# The result is kept in 'urls' so that its type can be inspected later on
urls = r.html.links
urls

{'/wiki/Harrow_School',
 '/wiki/UEFA_European_Under-21_Championship',
 '/wiki/Referee_(association_football)',
 '/wiki/Waboba',
 '/wiki/Roller_in-line_hockey',
 'https://fr.wikipedia.org/wiki/Football',
 'https://www.mediawiki.org/wiki/Special:MyLanguage/How_to_contribute',
 '/wiki/Tejo_(sport)',
 'https://www.fifa.com/flash/lotg/football/en/Laws1_04.htm',
 '/wiki/1960_in_association_football',
 '/wiki/Rugby_sevens',
 '/wiki/Medieval_football',
 '/wiki/Wikipedia:General_disclaimer',
 '/wiki/OCLC_(identifier)',
 'https://uz.wikipedia.org/wiki/Futbol',
 '/wiki/Algarve_Cup',
 '/wiki/1931_in_association_football',
 '/wiki/English_Football_League',
 'https://web.archive.org/web/20071011084145/http://fifa.com/flash/lotg/football/en/Laws1_03.htm',
 '/wiki/CONCACAF_Under-20_Championship',
 '/wiki/Camogie',
 '/wiki/File:Soccer_ball.svg',
 '/wiki/Doping_in_sport',
 '/wiki/Soccer_(disambiguation)',
 '/wiki/CONMEBOL',
 '/wiki/AFC_U-20_Asian_Cup',
 '/wiki/1966_in_association_football',
 '/wiki/1900

In [6]:
# Note that '.links' returns the URLs as they are written in the page, so most of them are relative

In [7]:
# To get every URL in absolute form (scheme and host included),
# use the '.absolute_links' attribute instead of '.links'
full_path_urls = r.html.absolute_links
full_path_urls

{'https://en.wikipedia.org/wiki/Snow_rugby',
 'https://en.wikipedia.org/wiki/Free_kick_(association_football)',
 'https://en.wikipedia.org/wiki/Seven-a-side_football',
 'https://en.wikipedia.org/wiki/Volata',
 'https://en.wikipedia.org/wiki/2012_in_association_football',
 'https://en.wikipedia.org/wiki/Football_at_the_African_Games',
 'https://www.mediawiki.org/wiki/Special:MyLanguage/How_to_contribute',
 'https://fr.wikipedia.org/wiki/Football',
 'https://en.wikipedia.org/wiki/WAFU_Zone_A_Women%27s_Cup',
 'https://www.fifa.com/flash/lotg/football/en/Laws1_04.htm',
 'https://en.wikipedia.org/wiki/Portal:Association_football',
 'https://uz.wikipedia.org/wiki/Futbol',
 'https://web.archive.org/web/20071011084145/http://fifa.com/flash/lotg/football/en/Laws1_03.htm',
 'https://en.wikipedia.org/wiki/Ballon_au_poing',
 'https://en.wikipedia.org/wiki/Oldest_football_competitions',
 'https://en.wikipedia.org/wiki/Sport_climbing',
 'https://en.wikipedia.org/wiki/FIFA_World_Player_of_the_Year',


In [8]:
# An important thing to note: the links (from both '.links' and '.absolute_links')
# are returned in a SET, not a LIST, so they are unordered and cannot be indexed
type(urls)

set

## Searching for elements

In [9]:
# A quick note: requests-html uses CSS selectors for searching
# We will cover them in the next section,
# but here is a more thorough reference: https://www.w3schools.com/cssref/css_selectors.asp

In [10]:
# We can search for elements with the find() method, similarly to Beautiful Soup
# By default it returns a list of ALL matches, i.e. it behaves like Beautiful Soup's find_all()

# find all 'a' tags
links = r.html.find("a")
links

[<Element 'a' id='top'>,
 <Element 'a' href='/wiki/Wikipedia:Featured_articles' title='This is a featured article. Click here for more information.'>,
 <Element 'a' href='/wiki/Wikipedia:Protection_policy#semi' title='This article is semi-protected.'>,
 <Element 'a' href='/wiki/File:Football_(soccer)_Part_One.ogg' title='Listen to this article'>,
 <Element 'a' class=('mw-jump-link',) href='#mw-head'>,
 <Element 'a' class=('mw-jump-link',) href='#searchInput'>,
 <Element 'a' href='/wiki/Soccer_Team_(band)' title='Soccer Team (band)'>,
 <Element 'a' href='/wiki/Soccer_(disambiguation)' class=('mw-disambig',) title='Soccer (disambiguation)'>,
 <Element 'a' href='/wiki/Football' title='Football'>,
 <Element 'a' href='/wiki/File:Football_iu_1996.jpg' class=('image',)>,
 <Element 'a' href='/wiki/Goal_(sport)' class=('mw-redirect',) title='Goal (sport)'>,
 <Element 'a' href='/wiki/Sports_governing_body' title='Sports governing body'>,
 <Element 'a' href='/wiki/FIFA' title='FIFA'>,
 <Element '

In [11]:
# find() returned a list, so a single element can be picked by its index
links[4]

<Element 'a' class=('mw-jump-link',) href='#mw-head'>

In [12]:
# To get the raw HTML of an element use its '.html' attribute
links[4].html

'<a class="mw-jump-link" href="#mw-head">Jump to navigation</a>'

In [13]:
# Check the type of the element's raw HTML
type(links[4].html)

str

In [14]:
# To extract the text inside an element, use '.text', just like in Beautiful Soup
links[4].text

'Jump to navigation'

In [15]:
# To obtain a dictionary of the element's attributes, use '.attrs' (as in Beautiful Soup)
# Note: the value of the 'class' attribute comes back as a tuple
links[10].attrs

{'href': '/wiki/Goal_(sport)',
 'class': ('mw-redirect',),
 'title': 'Goal (sport)'}

In [16]:
# requests-html can also filter the matched tags by the text they hold

# Keep only the 'a' tags whose text (not their 'href' attribute) includes 'wikipedia'
# Note: the match ignores letter case
r.html.find("a", containing="wikipedia")

[<Element 'a' href='//en.wikipedia.org/wiki/Wikipedia:Contact_us'>,
 <Element 'a' href='/wiki/Wikipedia:About' title='Wikipedia:About'>,
 <Element 'a' href='/wiki/Wikipedia:About' title='Learn about Wikipedia and how it works'>,
 <Element 'a' href='/wiki/Category:Wikipedia_indefinitely_semi-protected_pages' title='Category:Wikipedia indefinitely semi-protected pages'>,
 <Element 'a' dir='ltr' href='https://en.wikipedia.org/w/index.php?title=Association_football&oldid=1055852229'>]

In [17]:
# Show the text of each tag matched by the 'containing' filter
[link.text for link in r.html.find("a", containing="wikipedia")]

['Contact Wikipedia',
 'About Wikipedia',
 'About Wikipedia',
 'Wikipedia indefinitely semi-protected pages',
 'https://en.wikipedia.org/w/index.php?title=Association_football&oldid=1055852229']

In [18]:
# find() returns every match by default; pass first=True to get only the first
# matching element (the counterpart of Beautiful Soup's .find())
r.html.find("p", first=True)

<Element 'p' class=('mw-empty-elt',)>

### Searching for text

In [19]:
# The package also offers searching text based on the parse library
# The search() method can be thought of as the opposite of str.format():
# it finds the text instead of inserting it in the specified place

# For further details see https://pypi.org/project/parse/ 
# and https://docs.python.org/3/library/string.html#format-string-syntax

In [20]:
# search() looks for text that matches the template;
# the '{}' placeholder captures whatever lies between 'known' and 'soccer'
r.html.search("known{}soccer")

<Result (' as football field, football ground, ',) {}>

In [21]:
# To access the captured text, index the Result with 0 (the first '{}' placeholder)
r.html.search("known{}soccer")[0]

' as football field, football ground, '

In [22]:
# search() returns only the first match found in the page
# To get all matching strings use search_all(), which returns a list of Results
r.html.search_all("known{}soccer")

[<Result (' as football field, football ground, ',) {}>,
 <Result (' as simply <b>football</b> or <b>',) {}>,
 <Result (' as the <a href="/wiki/Laws_of_the_Game_(association_football)" title="Laws of the Game (association football)">Laws of the Game</a>. The ball is 68–70&#160;cm (27–28&#160;in) in circumference and known as the <a href="/wiki/Ball_(association_football)" title="Ball (association football)">football</a>. The two teams compete to get the ball into the other team\'s goal (between the posts and under the bar), thereby scoring a goal. Players are not allowed to touch the ball with hands or arms while it is in play, except for the <a href="/wiki/Goalkeeper_(association_football)" title="Goalkeeper (association football)">goalkeepers</a> within the <a href="/wiki/Penalty_area" title="Penalty area">penalty area</a>. Players may use any other part of their body to strike or pass the ball, and mainly use their feet. The team that scores more goals at the end of the game is the 

In [23]:
# Count how many matches search_all() found for the template
len(r.html.search_all("known{}soccer"))

23

In [24]:
# Further details at:
# -- https://pypi.org/project/parse/
# -- https://docs.python.org/3/library/string.html#format-string-syntax