In [65]:
from IPython.display import display, HTML

# Widen the classic-notebook layout containers by injecting a <style> block
# into the page (a stray character after the last rule has been removed so
# the stylesheet is valid CSS).
display(HTML(data="""
<style>
    div#notebook-container    { width: 95%; }
    div#menubar-container     { width: 65%; }
    div#maintoolbar-container { width: 99%; }
</style>
"""))

# 7. Web Scraping

Web scraping is the practice of gathering data through any means other than a program interacting with an API (or, obviously, through a human using a web browser). This is most commonly accomplished by writing an automated program that queries a web server, requests data (usually in the form of the HTML and other files that comprise web pages), and then parses that data to extract needed information.

# 7.1 Selenium
Selenium automates browsers. That's it! <br>
Selenium is a Python library and tool used for automating web browsers to do a number of tasks. One such task is web scraping to extract useful data and information that may be otherwise unavailable. <br>
**For this course, we use Chrome.**

## 7.1 Installing Libraries
We need to install these two libraries

In [66]:
# !pip install -r requirements.txt

## 7.2 Calling Libraries

In [67]:
# this library is to manipulate browser
from selenium import webdriver

# it allows you to work with different versions of drivers
# We call ChromeDriver
# from webdriver_manager.chrome import ChromeDriverManager
import re
import time 
from selenium.webdriver.common.by import By

## 7.3 Launch/Set the Driver
This code opens a browser driver (Firefox here; the Chrome alternative is left commented out). We are going to use it to navigate the web.

In [68]:
# driver = webdriver.Chrome()
driver = webdriver.Firefox()
url = "https://resultadoshistorico.onpe.gob.pe/EG2021/"
driver.get(url)

In [69]:
driver.maximize_window()

## Chrome is being controlled by automated test software

![Chrome is controlled by automated software](Images/chrome_automated.png)

## Browser is under remote control (reason: Marionette)

![Browser is under remote control (reason: Marionette)](Images/firefox_automated.png)

In [70]:
# Access to the title
print('Title: ', driver.title)

Title:  Presentación de Resultados Elecciones Generales y Parlamento Andino 2021


In [71]:
# Access the current URL
print('Current Page URL: ', driver.current_url)

Current Page URL:  https://resultadoshistorico.onpe.gob.pe/EG2021/


In [72]:
# Make screenshot of the webpage
driver.save_screenshot('Images/resultados_presidenciales.png')

True

In [73]:
re.search(r'historico', driver.current_url)

<re.Match object; span=(18, 27), match='historico'>

In [74]:
if re.search(r'resultadoshistorico', driver.current_url):
    driver.save_screenshot('Images/resultados_presidenciales.png') #save screenshot with provided name
    print('Resultados Presidenciales saved!')

Resultados Presidenciales saved!


In [75]:
#get cookie information
cookies = driver.get_cookies()
print('Cookies obtained from resultados_presidenciales')
print(cookies)

Cookies obtained from resultados_presidenciales
[]


In [76]:
# Get page source
print(type(driver.page_source))
driver.page_source

<class 'str'>


'<html lang="en" class="hydrated"><head><style>\n    :root {\n      /* Typography */\n      --amplify-font-family: \'Amazon Ember\', \'Helvetica Neue\', \'Helvetica\', \'Arial\', sans-serif;\n\n      --amplify-text-xxs: 0.75rem;\n      --amplify-text-xs: 0.81rem;\n      --amplify-text-sm: 0.875rem;\n      --amplify-text-md: 1rem;\n      --amplify-text-md-sub: 1.15rem;\n      --amplify-text-lg: 1.5rem;\n      --amplify-text-xl: 2rem;\n      --amplify-text-xxl: 2.5rem;\n\n      /* Colors */\n      --amplify-primary-color: #ff9900;\n      --amplify-primary-contrast: var(--amplify-white);\n      --amplify-primary-tint: #ffac31;\n      --amplify-primary-shade: #e88b01;\n\n      --amplify-secondary-color: #152939;\n      --amplify-secondary-contrast: var(--amplify-white);\n      --amplify-secondary-tint: #31465f;\n      --amplify-secondary-shade: #1F2A37;\n\n      --amplify-tertiary-color: #5d8aff;\n      --amplify-tertiary-contrast: var(--amplify-white);\n      --amplify-tertiary-tint: #7da

In [77]:
# Refresh the page - 
driver.refresh() #reload or refresh the browser

In [78]:
# Load the ONPE results page
url_1 = "https://resultadoshistorico.onpe.gob.pe/EG2021/"
driver.get( url_1 )

# Pause so the page stays visible before moving on
time.sleep(5)

# Navigate to a second page with the same driver
url_2 = "https://www.google.com/"
driver.get( url_2 )

time.sleep(3)
# Go one step back in the browser history
driver.back()

In [79]:
driver.close()

In [80]:
driver.quit()

![Quit and Close](Images/quite_close.png)

In [81]:
type(driver)

selenium.webdriver.firefox.webdriver.WebDriver

`driver` is a `selenium.webdriver.firefox.webdriver.WebDriver` object. This object has some attributes that will help us to navigate on the web.

Now, you can see in the driver that we are in [this link](https://resultadoshistorico.onpe.gob.pe/EG2021/).

# Best Practices before working

1. Maximize the browser

In [82]:
# driver = webdriver.Chrome( ChromeDriverManager().install() )

# url = 'https://www.kaspersky.com/resource-center/definitions/cookies'
# driver.get( url )

# driver.maximize_window()

In [83]:
driver = webdriver.Firefox()
url = 'https://www.kaspersky.com/resource-center/definitions/cookies'
driver.get( url )
driver.maximize_window()

2. Set the Browser Zoom Level to 100 percent

In [84]:
driver.execute_script("document.body.style.zoom='100%'")

3. Quit driver when done

In [85]:
driver.quit()

### 7.4.1. HTML
HTML stands for HyperText Markup Language. You can deduce that it’s a language for creating web pages. It’s not a programming language like Python or Java, but it’s a markup language. It describes the elements of a page through tags characterized by angle brackets.

1. The document always begins and ends using `<html>` and `</html>`.
2. `<body></body>` constitutes the visible part of HTML document.
3. `<h1>` to `<h6>` tags are defined for the headings.

#### 7.4.1.1. HTML Headings
HTML headings are defined with the `<h1>` to `<h6>` tags.
`<h1>` defines the most important heading. `<h6>` defines the least important heading.

We can use text cells since markdown reads html tags.

<h1>This is heading 1</h1>
<h2>This is heading 2</h2>
<h3>This is heading 3</h3>

#### 7.4.1.2. HTML Paragraphs
HTML paragraphs are defined with the `<p>` tag.
`<br>` tag is similar to `"\n"`.

<html>
<br>
<p>My first paragraph.</p> <br>
<p>This is another paragraph for this text cell.</p>
</html>

#### 7.4.1.3. HTML Links
HTML links are defined with the `<a>` tag:

<a href="http://bayes.cs.ucla.edu/jp_home.html">This is a link for Judea Pearl's Website</a>

#### 7.4.1.3. Unordered HTML List
An unordered list starts with the `<ul>` tag. Each list item starts with the `<li>` tag.

<ul>
  <li>Coffee</li>
  <li>Tea</li>
  <li>Milk</li>
</ul>

#### 7.4.1.4. Ordered HTML List
An ordered list starts with the `<ol>` tag. Each list item starts with the `<li>` tag.

<ol>
  <li>Coffee</li>
  <li>Tea</li>
  <li>Milk</li>
</ol>

#### 7.4.1.4. HTML Tables

A table in HTML consists of table cells inside rows and columns. Each table cell is defined by a `<td>` and a `</td>` tag. Each table row starts with a `<tr>` and ends with a `</tr>` tag.

<table>
  <tr>
    <th>Manager</th>
    <th>Club</th>
    <th>Nationality</th>
  </tr>
  <tr>
    <td>Mikel Arteta</td>
    <td>Arsenal</td>
    <td>Spain</td>
  </tr>
  <tr>
    <td>Thomas Tuchel</td>
    <td>Chelsea</td>
    <td>Germany</td>
  </tr>
</table>

#### 7.4.1.5. HTML Iframes

An HTML iframe is used to display a web page within a web page.


<!DOCTYPE html>
<html>
  
<head>
    <title>HTML iframe src Attribute</title>
</head>
  
<body style="text-align: center">
    <h1>Diploma</h1>
    <h2>HTML iframe</h2>
    <iframe>
          
        <!DOCTYPE html>
        <html>

        <head>
            <title>New html</title>
        </head>

        <body style="text-align: center">
            <h1>Diploma2</h1>
            <h2>HTML iframe</h2>
            <iframe>

            </iframe>
        </body>

        </html>
    </iframe>
</body>
  
</html>

#### 7.4.1.6. HTML Tags - Key

|Tag|Description|
|---|---|
|`<h1>` to `<h6>`|	Defines HTML headings|
|`<ul>`|	Defines an unordered list|
|`<ol>`|	Defines an ordered list|
|`<p>`|	Defines a paragraph|
|`<a>`|	It is termed as anchor tag and it creates a hyperlink or link.|
|`<div>`|	It defines a division or section within HTML document.|
|`<strong>`|	It is used to define important text.|
|`<table>`|	It is used to present data in tabular form or to create a table within HTML document.|
|`<td>`|	It is used to define cells of an HTML table which contains table data|
|`<iframe>`|	Defines an inline frame|

### 7.4. Identifying elements in a web page

To identify elements of a webpage, we need to inspect the webpage. Open the browser and press `Ctrl`+ `Shift` + `I`.

#### One Element
|Method|Description|
|---|---|
|find_element( By.ID, ... | Use id.|
|find_element( By.NAME, ... | Use name.|
|find_element( By.XPATH, ... | Use Xpath.|
|find_element( By.TAG_NAME, ... | Use HTML tag.|
|find_element( By.CLASS_NAME, ... | Use class name.|
|find_element( By.CSS_SELECTOR, ...| Use css selector.|

#### Multiple  elements
|Method|Description|
|---|---|
|find_elements( By.ID, ... | Use id.|
|find_elements( By.NAME, ... | Use name.|
|find_elements( By.XPATH, ... | Use Xpath.|
|find_elements( By.TAG_NAME, ... | Use HTML tag.|
|find_elements( By.CLASS_NAME, ... | Use class name.|
|find_elements( By.CSS_SELECTOR, ...| Use css selector.|

### 7.4.1. Xpath
XPath in Selenium is an XML path used for navigation through the HTML structure of the page. It is a syntax or language for finding any element on a web page using XML path expression.

The basic format of XPath in selenium is explained below with screen shot.
<img src="../_images/x_path.png">

**DO NOT COMPLICATE!**
Finding the XPath of a element:
1. Go to the element
2. Right click
3. Inspect - You may have to do it twice.
4. Go to the selected line
5. Right click
6. Copy 
7. Copy Full Xpath

**Example**

We are going to click on the link to `Resumen General`. To do this, we must find the element with `find_element`, specify that we are using the XPath by setting the first option with `By.XPATH`, and click.

In [86]:
# driver = webdriver.Chrome( ChromeDriverManager().install() )
# driver.maximize_window()

# url_1 = "https://resultadoshistorico.onpe.gob.pe/EG2021/"
# driver.get( url_1 )
# #time.sleep(3)

In [87]:
driver = webdriver.Firefox()

url_1 = "https://resultadoshistorico.onpe.gob.pe/EG2021/"
driver.get( url_1 )
driver.maximize_window()
resumen_general = driver.find_element( By.XPATH, '/html/body/onpe-root/onpe-home-onpe/div[1]/div/div/div/div[2]/div[1]/div/div/a/div[1]/img' )
resumen_general.click()

ElementNotInteractableException: Message: Element <img src="./assets/imagenes/resumen_general.jpg"> could not be scrolled into view
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.sys.mjs:8:8
WebDriverError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:193:5
ElementNotInteractableError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:353:5
webdriverClickElement@chrome://remote/content/marionette/interaction.sys.mjs:167:11
interaction.clickElement@chrome://remote/content/marionette/interaction.sys.mjs:136:11
clickElement@chrome://remote/content/marionette/actors/MarionetteCommandsChild.sys.mjs:344:29
receiveMessage@chrome://remote/content/marionette/actors/MarionetteCommandsChild.sys.mjs:220:31


Did we get an error? This may be because we tried to find the element when the webpage had not finished loading yet. To prevent this, we can use the `sleep` function from Python's default `time` module. We can "sleep" our program for one second.

In [88]:
driver.get( url_1 )
driver.maximize_window()
time.sleep(1)
resumen_general = driver.find_element( By.XPATH, '/html/body/onpe-root/onpe-home-onpe/div[1]/div/div/div/div[2]/div[1]/div/div/a/div[1]/img' )
resumen_general.click()

We can also find elements by their ID. We can find an element's ID by once again inspecting the page. We must keep in mind that an ID may not always be available, as it must be manually set by the programmer of the webpage. In this case, the dropdown menus for filtering by geography do have IDs available.

In [89]:
driver.find_element( By.ID, 'select_ambito').click()

We can close the dropdown menu by clicking on it again

In [90]:
driver.find_element( By.ID, 'select_ambito').click()

Elements may also have names which we can use to find them.

In [91]:
driver.find_element( By.NAME, 'cod_ambito')

<selenium.webdriver.remote.webelement.WebElement (session="11d68e01-1747-4ebb-ac90-f75c7367371e", element="941720f0-d803-4031-9ed8-c009f334c9a2")>

The best practices regarding finding elements are as follows:

1. Always use the ID of the element when it is available
2. If the ID is not available, use the element's name
3. If neither is available, we should use the XPath

XPaths are always available, because they are constructed by referencing the HTML code itself.

Another way of finding elements is through the `class_name`. However, these classes tend to be used for several objects. Therefore, we should only use the class names when we want to find many objects of the same type.

In [92]:
red_buttons = driver.find_elements( By.CLASS_NAME, 'btn_rojo')
red_buttons

[<selenium.webdriver.remote.webelement.WebElement (session="11d68e01-1747-4ebb-ac90-f75c7367371e", element="dc275188-8a76-4d9d-bfca-fdf4afa0e0ad")>,
 <selenium.webdriver.remote.webelement.WebElement (session="11d68e01-1747-4ebb-ac90-f75c7367371e", element="b97c9262-653e-470b-a200-aedcd5aa7be5")>,
 <selenium.webdriver.remote.webelement.WebElement (session="11d68e01-1747-4ebb-ac90-f75c7367371e", element="5eeea5d1-ff6d-4f3e-b085-33eb894ecbd3")>,
 <selenium.webdriver.remote.webelement.WebElement (session="11d68e01-1747-4ebb-ac90-f75c7367371e", element="9c8402c9-b59d-4278-992a-b466d8aa21a1")>,
 <selenium.webdriver.remote.webelement.WebElement (session="11d68e01-1747-4ebb-ac90-f75c7367371e", element="08741249-56d1-41f7-93f6-15c727ffe2d1")>,
 <selenium.webdriver.remote.webelement.WebElement (session="11d68e01-1747-4ebb-ac90-f75c7367371e", element="808de170-89a9-4584-949b-61e01666c768")>]

### How this works

The `find_element` method returns an object of class `WebElement`. In the case of `red_buttons`, the `find_elements` method returns several of these objects in a list.

The `WebElement` class has several attributes and methods. The following image lists some of the most useful ones.

![Web Element](Images/Web_Elementpng.png)

For example, we can see the text in the buttons by looking at the `text` attribute

In [93]:
for button in red_buttons:
        print(button.text)

Resumen General
Presidencial
Congresistas
Parlamento Andino
Actas
Participación Ciudadana


We can also look at the attributes by using the `get_attribute` method

In [94]:
searchBox = driver.find_element( By.ID, 'select_ambito' )
searchBox.get_attribute('value')

'T'

**Suggestion** <br>
We do not recommend using `tag` at first, since most web pages use nested tags and it is difficult to identify an element using only its HTML tag. However, it is great for finding elements that are inside another located element on the page. Let's see the example.

# Scraping district-level tables

In [95]:
# driver = webdriver.Chrome( ChromeDriverManager().install() )
# Maximize window
driver.maximize_window()

# go to the link
url_1 = "https://resultadoshistorico.onpe.gob.pe/EG2021/"
driver.get( url_1 )

time.sleep(1)

resumen_general = driver.find_element( By.XPATH, '/html/body/onpe-root/onpe-home-onpe/div[1]/div/div/div/div[2]/div[2]/div/div/a/div[1]/img' )
resumen_general.click()

We would like to get the frequency tables for votes in the presidential elections. We can select the presidential elections tab by first getting the elements for the red buttons

In [96]:
red_buttons = driver.find_elements( By.CLASS_NAME, 'btn_rojo')

for index, button in enumerate(red_buttons):
        print(f"{index}:", button.text)

0: Resumen General
1: Presidencial
2: Congresistas
3: Parlamento Andino
4: Actas
5: Participación Ciudadana


And we click on the button that takes us to the presidential elections tab

In [97]:
presidential = red_buttons[1]
presidential.click()

We are only interested in votes from within Peru. Let's first look at the options from the dropdown menu

In [98]:
scopes = driver.find_element(By.ID, "select_ambito")

Inside the "Ámbitos" dropdown menu, we can search by tags to find the option elements

In [99]:
scope_options = scopes.find_elements(By.TAG_NAME, "option")
for index, scope in enumerate(scope_options):
        print(f"{index}:", scope.text)

0: TODOS
1: PERÚ
2: EXTRANJERO


Now we choose the option for Peru

In [100]:
scope_options[1].click()

## Presidential results at the national level

Notice that in the page we can see a table that summarizes the results for the presidential elections at the national level. Because the table is an HTML object, we can scrape it for its data using `pandas`. We first find the table element.

In [101]:
table_element = driver.find_element(By.ID, "table1")

We can look at the HTML source for this element.

In [102]:
table_html = table_element.get_attribute("outerHTML")

For now this is not very useful. However, we can use `pandas`'s `read_html` function to parse this into a `DataFrame`

In [103]:
import pandas as pd
from io import StringIO

table_html = table_element.get_attribute("outerHTML")
table = pd.read_html(StringIO(table_html))
table

[              TOTAL DE VOTOS          TOTAL DE VOTOS.1  \
 0   ORGANIZACIONES POLÍTICAS  ORGANIZACIONES POLÍTICAS   
 1                        NaN                       NaN   
 2                        NaN                       NaN   
 3                        NaN                       NaN   
 4                        NaN                       NaN   
 5                        NaN                       NaN   
 6                        NaN                       NaN   
 7                        NaN                       NaN   
 8                        NaN                       NaN   
 9                        NaN                       NaN   
 10                       NaN                       NaN   
 11                       NaN                       NaN   
 12                       NaN                       NaN   
 13                       NaN                       NaN   
 14                       NaN                       NaN   
 15                       NaN                       NaN 

Pandas returns a list with one `DataFrame`, in this case.

In [104]:
table = table[0]
table

Unnamed: 0,TOTAL DE VOTOS,TOTAL DE VOTOS.1,TOTAL DE VOTOS.2,TOTAL DE VOTOS.3,TOTAL DE VOTOS.4,TOTAL DE VOTOS.5
0,ORGANIZACIONES POLÍTICAS,ORGANIZACIONES POLÍTICAS,ORGANIZACIONES POLÍTICAS,TOTAL,%VÁLIDOS,%EMITIDOS
1,,,PARTIDO NACIONALISTA PERUANO,228955,1.608%,1.309%
2,,,"EL FRENTE AMPLIO POR JUSTICIA, VIDA Y LIBERTAD",64217,0.451%,0.367%
3,,,PARTIDO MORADO,319176,2.242%,1.825%
4,,,PERU PATRIA SEGURA,54341,0.382%,0.311%
5,,,VICTORIA NACIONAL,802957,5.639%,4.592%
6,,,ACCION POPULAR,1294681,9.092%,7.404%
7,,,AVANZA PAIS - PARTIDO DE INTEGRACION SOCIAL,1652682,11.607%,9.452%
8,,,PODEMOS PERU,808559,5.678%,4.624%
9,,,JUNTOS POR EL PERU,1111407,7.805%,6.356%


We should get rid of the rows and columns that do not contain information we are interested in

In [105]:
table = table.iloc[:19, 2:]
table

Unnamed: 0,TOTAL DE VOTOS.2,TOTAL DE VOTOS.3,TOTAL DE VOTOS.4,TOTAL DE VOTOS.5
0,ORGANIZACIONES POLÍTICAS,TOTAL,%VÁLIDOS,%EMITIDOS
1,PARTIDO NACIONALISTA PERUANO,228955,1.608%,1.309%
2,"EL FRENTE AMPLIO POR JUSTICIA, VIDA Y LIBERTAD",64217,0.451%,0.367%
3,PARTIDO MORADO,319176,2.242%,1.825%
4,PERU PATRIA SEGURA,54341,0.382%,0.311%
5,VICTORIA NACIONAL,802957,5.639%,4.592%
6,ACCION POPULAR,1294681,9.092%,7.404%
7,AVANZA PAIS - PARTIDO DE INTEGRACION SOCIAL,1652682,11.607%,9.452%
8,PODEMOS PERU,808559,5.678%,4.624%
9,JUNTOS POR EL PERU,1111407,7.805%,6.356%


Finally, we fix the column names

In [106]:
table = table.rename(columns = dict(zip(table.columns, table.iloc[0, :])))
table = table[1:].reset_index(drop = True)
table

Unnamed: 0,ORGANIZACIONES POLÍTICAS,TOTAL,%VÁLIDOS,%EMITIDOS
0,PARTIDO NACIONALISTA PERUANO,228955,1.608%,1.309%
1,"EL FRENTE AMPLIO POR JUSTICIA, VIDA Y LIBERTAD",64217,0.451%,0.367%
2,PARTIDO MORADO,319176,2.242%,1.825%
3,PERU PATRIA SEGURA,54341,0.382%,0.311%
4,VICTORIA NACIONAL,802957,5.639%,4.592%
5,ACCION POPULAR,1294681,9.092%,7.404%
6,AVANZA PAIS - PARTIDO DE INTEGRACION SOCIAL,1652682,11.607%,9.452%
7,PODEMOS PERU,808559,5.678%,4.624%
8,JUNTOS POR EL PERU,1111407,7.805%,6.356%
9,PARTIDO POPULAR CRISTIANO - PPC,282007,1.981%,1.613%


We can aggregate this whole procedure into a function for later

In [107]:
from selenium.webdriver.remote.webelement import WebElement

def parse_table_from_element(table_element: WebElement, n_rows: int = 19, first_col: int = 2) -> pd.DataFrame:
    """Turn an HTML table element into a DataFrame with proper column names.

    The element's ``outerHTML`` is parsed with ``pandas.read_html`` and the
    first parsed table is used. The table is trimmed to its first ``n_rows``
    rows and to the columns from position ``first_col`` onward; the first
    remaining row supplies the column names and is then dropped.

    Parameters
    ----------
    table_element : WebElement
        Element whose ``outerHTML`` contains the table to parse.
    n_rows : int, default 19
        Number of leading rows to keep, counting the row that holds the
        column names. The default matches the slice used for the results
        table earlier in this notebook.
    first_col : int, default 2
        Position of the first column to keep. The default matches the slice
        used for the results table earlier in this notebook.

    Returns
    -------
    pd.DataFrame
        The trimmed table. The row index is not reset, so it starts at 1.
    """
    table_html = table_element.get_attribute("outerHTML")
    # read_html returns a list with one DataFrame per parsed table
    parsed = pd.read_html(StringIO(table_html))[0]
    trimmed = parsed.iloc[:n_rows, first_col:]
    # The first remaining row carries the real column names
    new_names = dict(zip(trimmed.columns, trimmed.iloc[0, :]))
    return trimmed.rename(columns = new_names)[1:]

### Get all elements from all options

Now we get ready to loop over all districts. We can list the region options as before. First, we select the dropdown menu

In [108]:
regions = driver.find_element( By.ID, 'select_departamento')
regions

<selenium.webdriver.remote.webelement.WebElement (session="11d68e01-1747-4ebb-ac90-f75c7367371e", element="7a56aff6-d311-4f8a-ba01-f738790a5230")>

In [109]:
print(regions.text)

--TODOS--
AMAZONAS
ANCASH
APURIMAC
AREQUIPA
AYACUCHO
CAJAMARCA
CALLAO
CUSCO
HUANCAVELICA
HUANUCO
ICA
JUNIN
LA LIBERTAD
LAMBAYEQUE
LIMA
LORETO
MADRE DE DIOS
MOQUEGUA
PASCO
PIURA
PUNO
SAN MARTIN
TACNA
TUMBES
UCAYALI


We can then work with each option individually

In [110]:
# Query the dropdown's <option> elements once and reuse the list, instead of
# repeating the same lookup for every printed line.
region_options = regions.find_elements(By.TAG_NAME, "option")

# Print the options at positions 1 to 4
for option in region_options[1:5]:
    print(option.text)


AMAZONAS
ANCASH
APURIMAC
AREQUIPA


In [111]:
num_dep =  len(regions.find_elements(By.TAG_NAME,  "option"))
num_dep

26

In [112]:
for index, region in enumerate(regions.find_elements(By.TAG_NAME, "option")):
        print(f"{index}:", region.text)

0: --TODOS--
1: AMAZONAS
2: ANCASH
3: APURIMAC
4: AREQUIPA
5: AYACUCHO
6: CAJAMARCA
7: CALLAO
8: CUSCO
9: HUANCAVELICA
10: HUANUCO
11: ICA
12: JUNIN
13: LA LIBERTAD
14: LAMBAYEQUE
15: LIMA
16: LORETO
17: MADRE DE DIOS
18: MOQUEGUA
19: PASCO
20: PIURA
21: PUNO
22: SAN MARTIN
23: TACNA
24: TUMBES
25: UCAYALI


We have to be careful since every time we make a click, the XPath changes, and the elements we got before we clicked become invalid ("stale").

### Loop over all departments

We start by building the necessary functions to loop over each geographical subdivision and get the tables for every district. The final step of this algorithm is to parse the table and add it to a collection of tables that we will later concatenate with `pandas.concat()`. We will ignore the "-- SELECCIONE --" option, as it does not contain district-level information.

In [113]:
from selenium.webdriver.remote.webdriver import WebDriver

def scrape_district_table(driver: WebDriver, all_tables: list[pd.DataFrame], region_name: str, province_name: str, district_id: int):
    """Select one district option and append its results table to ``all_tables``.

    The ``cod_dist`` dropdown is looked up again on every call and the option
    at position ``district_id`` is picked. The "-- SELECCIONE --" option is
    skipped. Otherwise the option is clicked, the ``table1`` element is parsed
    with ``parse_table_from_element``, the region, province and district names
    are added as columns, and the table is appended to ``all_tables``.

    Returns ``None``; the result is delivered by mutating ``all_tables``.
    """
    option = driver.find_element(By.ID, "cod_dist").find_elements(By.TAG_NAME, "option")[district_id]
    label = option.text

    if label != "-- SELECCIONE --":
        option.click()
        # NOTE(review): the table is read right after the click with no
        # explicit wait -- confirm the page has refreshed it by this point.
        results = parse_table_from_element(driver.find_element(By.ID, "table1"))
        location_columns = (
            ("DEPARTAMENTO", region_name),
            ("PROVINCIA", province_name),
            ("DISTRITO", label),
        )
        for column, value in location_columns:
            results[column] = value
        all_tables.append(results)




Now we make a function that will loop over all districts within each province. We must keep in mind that every time we click on a new district, we must reload the list of district options, as it becomes stale. We do this in the function for the district level. In the function for the province level, we only load the list of district options to get the exact number of districts we must loop over. We also ignore the "--TODOS--" option.

In [114]:
from tqdm.notebook import tqdm

def scrape_province_tables(driver: WebDriver, all_tables: list[pd.DataFrame], region_name: str, province_id: int):
    """Select one province option and scrape every district listed under it.

    The ``cod_prov`` dropdown is looked up again on every call and the option
    at position ``province_id`` is picked. The "--TODOS--" option is skipped.
    Otherwise the option is clicked, the district option labels are read once
    (they give the loop length and the progress-bar captions), and
    ``scrape_district_table`` is called for each district position.

    Returns ``None``; results accumulate in ``all_tables``.
    """
    option = driver.find_element(By.ID, "cod_prov").find_elements(By.TAG_NAME, "option")[province_id]
    province_name = option.text
    if province_name == "--TODOS--":
        return

    option.click()
    district_select = driver.find_element(By.ID, "cod_dist")
    labels = [entry.text for entry in district_select.find_elements(By.TAG_NAME, "option")]

    progress = tqdm(range(len(labels)), leave = False)
    for district_id in progress:
        # Show the district currently being scraped on the progress bar
        progress.set_description(labels[district_id])
        scrape_district_table(driver, all_tables, region_name, province_name, district_id)

Finally, we define the procedure at the region level, which is very similar to the procedure for provinces.

In [115]:
def scrape_region_tables(driver: WebDriver, all_tables: list[pd.DataFrame], region_id: int):
    """Select one region option and scrape every province listed under it.

    The ``select_departamento`` dropdown is looked up and the option at
    position ``region_id`` is picked. The "--TODOS--" option is skipped.
    Otherwise the option is clicked, the province option labels are read once
    (they give the loop length and the progress-bar captions), and
    ``scrape_province_tables`` is called for each province position.

    Returns ``None``; results accumulate in ``all_tables``.
    """
    option = driver.find_element(By.ID, "select_departamento").find_elements(By.TAG_NAME, "option")[region_id]
    region_name = option.text
    if region_name == "--TODOS--":
        return

    option.click()
    province_select = driver.find_element(By.ID, "cod_prov")
    labels = [entry.text for entry in province_select.find_elements(By.TAG_NAME, "option")]

    progress = tqdm(range(len(labels)), leave = False)
    for province_id in progress:
        # Show the province currently being scraped on the progress bar
        progress.set_description(labels[province_id])
        scrape_province_tables(driver, all_tables, region_name, province_id)

We can now apply our functions. Scraping all regions would take a long time, so we only do it for the first 5 dropdown options (the `--TODOS--` placeholder, which is skipped, plus the first four regions).

In [None]:
# Collects one DataFrame per scraped district
all_tables = []

# Read the region labels once; they are only used as progress-bar captions
regions = driver.find_element(By.ID, "select_departamento")
region_names = [option.text for option in regions.find_elements(By.TAG_NAME, "option")]

# Visit only the first 5 dropdown positions to keep the run short
pb = tqdm(range(5), leave = False)
for region_id in pb:
    pb.set_description(region_names[region_id])
    scrape_region_tables(driver, all_tables, region_id)

print("FINISHED!")

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

In [117]:
master = pd.concat(all_tables).reset_index(drop = True)
master

Unnamed: 0,ORGANIZACIONES POLÍTICAS,TOTAL,%VÁLIDOS,%EMITIDOS,DEPARTAMENTO,PROVINCIA,DISTRITO
0,PARTIDO NACIONALISTA PERUANO,356,9.327%,6.389%,AMAZONAS,BAGUA,ARAMANGO
1,"EL FRENTE AMPLIO POR JUSTICIA, VIDA Y LIBERTAD",13,0.341%,0.233%,AMAZONAS,BAGUA,ARAMANGO
2,PARTIDO MORADO,41,1.074%,0.736%,AMAZONAS,BAGUA,ARAMANGO
3,PERU PATRIA SEGURA,4,0.105%,0.072%,AMAZONAS,BAGUA,ARAMANGO
4,VICTORIA NACIONAL,151,3.956%,2.710%,AMAZONAS,BAGUA,ARAMANGO
...,...,...,...,...,...,...,...
7969,RENACIMIENTO UNIDO NACIONAL,3,1.648%,1.154%,AREQUIPA,LA UNION,TORO
7970,PARTIDO DEMOCRATICO SOMOS PERU,5,2.747%,1.923%,AREQUIPA,LA UNION,TORO
7971,PARTIDO POLITICO NACIONAL PERU LIBRE,64,35.165%,24.615%,AREQUIPA,LA UNION,TORO
7972,DEMOCRACIA DIRECTA,0,0.000%,0.000%,AREQUIPA,LA UNION,TORO


In [118]:
# Create the output folder first so the export also works on a fresh checkout
# where "_data_results" does not exist yet.
import os

os.makedirs("_data_results", exist_ok = True)
master.to_csv("_data_results/presidential_election_results.csv", index = False)

The following is a version of the code that achieves the same thing but without aggregating the nested loops into functions. For the sake of having clean and understandable code, it is advised to aggregate nested loops into functions to get a clearer picture of the steps our algorithms follow. A useful rule of thumb is "each function should do one thing," derived from the Unix philosophy's principle "each program should do one thing and do it well." This is only a guideline and can be broken in some circumstances, but it is very helpful to keep it in mind when working with complex algorithms with nested control sequences.

In [119]:
# Store all_tables
all_tables = []

driver.get(url_1)

time.sleep(1)

resumen_general = driver.find_element( By.XPATH, '/html/body/onpe-root/onpe-home-onpe/div[1]/div/div/div/div[2]/div[2]/div/div/a/div[1]/img' )
resumen_general.click()

driver.find_elements( By.CLASS_NAME, 'btn_rojo')[1].click()

opt_peru = driver.find_element( By.XPATH, '/html/body/onpe-root/onpe-layout-container/onpe-onpe-epres-re/div[1]/div[3]/div[1]/div[1]/div/div/div/select/option[2]')
opt_peru.click()

dept_0 = driver.find_element(By.ID, "select_departamento")
num_regions = len(dept_0.find_elements( By.TAG_NAME, "option"))

num_regions

26

In [120]:
from selenium.webdriver.support.ui import WebDriverWait

# Collects one DataFrame per scraped district
all_tables = []

# Walk the first 3 positions of the region dropdown
for region_id in range(3):
    # Click the scope option again before touching the region dropdown
    opt_peru = driver.find_element(By.XPATH, '/html/body/onpe-root/onpe-layout-container/onpe-onpe-epres-re/div[1]/div[3]/div[1]/div[1]/div/div/div/select/option[2]')
    opt_peru.click()
    dept_select = driver.find_element(By.ID, "select_departamento")
    region = dept_select.find_elements( By.TAG_NAME, "option")[region_id]
    dpt_name = region.text

    # The placeholder option carries no region of its own
    if dpt_name == "--TODOS--":
        continue

    region.click()
    provinces = driver.find_element(By.ID, "cod_prov")
    num_province_options = len(provinces.find_elements(By.TAG_NAME, "option"))

    for province_id in range(num_province_options):
        # Look the dropdown up again on every pass instead of reusing old elements
        provinces = driver.find_element(By.ID, "cod_prov")
        province = provinces.find_elements(By.TAG_NAME, "option")[province_id]
        province_name = province.text

        if province_name == "--TODOS--":
            continue

        province.click()
        districts = driver.find_element(By.ID, "cod_dist")
        num_district_options = len(districts.find_elements(By.TAG_NAME, "option"))

        for district_id in range(num_district_options):
            districts = driver.find_element(By.ID, "cod_dist")
            district = districts.find_elements(By.TAG_NAME, "option")[district_id]
            district_name = district.text

            if district_name == "-- SELECCIONE --":
                continue

            district.click()
            table_element = driver.find_element(By.ID, "table1")
            table = parse_table_from_element(table_element)
            # Tag every row with the place it belongs to
            table["DEPARTAMENTO"] = dpt_name
            table["PROVINCIA"] = province_name
            table["DISTRITO"] = district_name
            all_tables.append(table)
                

In [121]:
pd.concat(all_tables)

Unnamed: 0,ORGANIZACIONES POLÍTICAS,TOTAL,%VÁLIDOS,%EMITIDOS,DEPARTAMENTO,PROVINCIA,DISTRITO
1,PARTIDO NACIONALISTA PERUANO,356,9.327%,6.389%,AMAZONAS,BAGUA,ARAMANGO
2,"EL FRENTE AMPLIO POR JUSTICIA, VIDA Y LIBERTAD",13,0.341%,0.233%,AMAZONAS,BAGUA,ARAMANGO
3,PARTIDO MORADO,41,1.074%,0.736%,AMAZONAS,BAGUA,ARAMANGO
4,PERU PATRIA SEGURA,4,0.105%,0.072%,AMAZONAS,BAGUA,ARAMANGO
5,VICTORIA NACIONAL,151,3.956%,2.710%,AMAZONAS,BAGUA,ARAMANGO
...,...,...,...,...,...,...,...
14,RENACIMIENTO UNIDO NACIONAL,59,0.687%,0.489%,ANCASH,YUNGAY,YUNGAY
15,PARTIDO DEMOCRATICO SOMOS PERU,114,1.328%,0.946%,ANCASH,YUNGAY,YUNGAY
16,PARTIDO POLITICO NACIONAL PERU LIBRE,3696,43.062%,30.659%,ANCASH,YUNGAY,YUNGAY
17,DEMOCRACIA DIRECTA,16,0.186%,0.133%,ANCASH,YUNGAY,YUNGAY


In [122]:
final_data = pd.concat( all_tables).reset_index( drop = True )

In [123]:
# Create the output folder first so the export also works on a fresh checkout
# where "_data_results" does not exist yet.
import os

os.makedirs("_data_results", exist_ok = True)
final_data.to_excel( '_data_results/presidential_election_results.xlsx' , index = False )

Now that we are all done, we quit the driver

In [124]:
driver.quit()