# Web Scraping con Python

Este script muestra algunas posibilidades de la biblioteca *Selenium*, que nos permite obtener automáticamente datos de páginas web dinámicas, es decir, páginas en las que hay que interactuar (pulsar botones, etc.) para acceder a los datos.

Selenium está pensado realmente para automatizar pruebas de entornos web, pero a nosotros nos servirá para nuestro propósito de hacer web scraping.

En particular, lo que vamos a hacer es arrancar Google Chrome y manejarlo automáticamente desde Python.

Para ello necesitamos 3 cosas:

1.- Tener Google Chrome

2.- Instalar la biblioteca selenium

3.- Tener un fichero controlador; para ello utilizaremos la librería chromedriver_autoinstaller



Empezamos asegurándonos de que Selenium está instalado

In [1]:
# Requirement strings (pip syntax) for the packages this notebook depends on.
modules = [
    "selenium",
    "chromedriver_autoinstaller",
]


import sys
import os.path
from subprocess import check_call
import importlib
import os

def instala(modules):
    """Make sure every requested package is importable, installing it with pip if not.

    Args:
        modules: Iterable of pip requirement strings. Extras (``pkg[extra]``)
            and version pins (``pkg==1.2``) are accepted; they are stripped
            only to derive the module name used for the import check, while
            the full string is what gets passed to pip.

    Returns:
        None. Progress and failures are reported on stdout; a failed
        installation is reported and does not raise.
    """
    # Imported explicitly: a plain ``import importlib`` does not guarantee that
    # the ``util`` submodule has been loaded.
    import importlib.util
    from subprocess import CalledProcessError

    # pip normally refuses ``--user`` installs inside a virtual environment,
    # so only request a per-user install when not running in one.
    en_venv = sys.prefix != getattr(sys, "base_prefix", sys.prefix)
    pip_cmd = [sys.executable, "-m", "pip", "install"]
    if not en_venv:
        pip_cmd.append("--user")

    print("Instalando módulos")
    for m in modules:
        # Name for the import check: drop "[...]" extras and "==..." pins.
        mi = m.split("[", 1)[0].split("==", 1)[0]
        try:
            encontrado = importlib.util.find_spec(mi) is not None
        except (ImportError, ValueError):
            # find_spec raises for a dotted name whose parent package is
            # missing, or for an empty name; treat both as "not installed".
            encontrado = False

        if encontrado:
            print(m," encontrado")
            continue

        print(m," No encontrado, instalando...",end="")
        try:
            check_call(pip_cmd + [m])
            # Let the running interpreter notice the freshly installed package.
            importlib.invalidate_caches()
            print("¡hecho!")
        except (CalledProcessError, OSError):
            # Only installation failures are handled here; KeyboardInterrupt
            # and programming errors are no longer silently swallowed.
            print("¡Problema al instalar ",m,"! ¿seguro que el módulo existe?",sep="")

    print("¡Terminado!")

# Check the packages listed in `modules`; any that cannot be imported is installed with pip.
instala(modules)  

Instalando módulos
selenium  encontrado
chromedriver_autoinstaller  encontrado
¡Terminado!



Ahora abrimos el navegador

In [2]:

import sys

import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
import chromedriver_autoinstaller

# Make a chromedriver available to Selenium (handled by chromedriver_autoinstaller).
chromedriver_autoinstaller.install()

# Browser options. The flags below are left disabled; enable them as needed.
chrome_options = webdriver.ChromeOptions()
#chrome_options.add_argument('--headless') # ensure GUI is off
#chrome_options.add_argument('--no-sandbox')
#chrome_options.add_argument('--disable-dev-shm-usage')

# Start Chrome under Selenium's control with those options.
driver = webdriver.Chrome(options=chrome_options)

# Banco de Santander

In [4]:
# Banco Santander equity page on investing.com; load it in the controlled browser.
url = 'https://www.investing.com/equities/banco-santander'
driver.get(url)



Simulamos un click en la página para aceptar las cookies


In [5]:
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import time

from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Wait (up to 10 s) for the consent button to become clickable instead of
# looking it up immediately: the banner may only appear some time after
# driver.get() returns, in which case an immediate lookup would miss it.
try:
    cookies = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler"))
    )
    cookies.click()
    print("Cookies aceptadas.")
except Exception as e:
    # Best effort: a timeout (no banner shown) or a failed click is reported
    # and the notebook carries on.
    print("No se encontró el botón de aceptar cookies o ocurrió un error:", e)


Cookies aceptadas.


In [6]:

# Locate the element carrying the data-test="instrument-price-last" attribute.
try:
    elemento = driver.find_element(By.CSS_SELECTOR, '[data-test="instrument-price-last"]')
except Exception as e:
    # The message now names the element actually being searched for; it was
    # previously a copy of the cookie-button message.
    print("No se encontró el elemento del precio o ocurrió un error:", e)


In [9]:
# Visible text of the element located above (the last price, judging by its data-test attribute).
elemento.text

'3.8345'

Obtenemos el elemento que incluye toda la tabla

In [10]:
# Fetch the container marked data-test="key-info", which holds the summary table.
try:
    tabla = driver.find_element(
        by=By.CSS_SELECTOR,
        value='[data-test="key-info"]',
    )
except Exception as err:
    print("No se encontró el elemento tabla", err)

In [11]:
# Visible text of the whole key-info container: labels and values separated by newlines.
tabla.text

"Prev. Close\n3.8345\nOpen\n3.8305\nDay's Range\n3.7915-3.848\n52 wk Range\n2.942-3.9695\nVolume\n22,389,316\nAverage Vol. (3m)\n33,798,965\n1-Year Change\n8.93%\nShares Outstanding\n15,814,437,167\nFair Value\nUnlock\nDividends Payment\nStreak\nUnlock\nMarket Cap\n60.64B\nRevenue\n44.76B\nP/E Ratio\n5.90\nEPS\n0.65\nDividend (Yield)\n0.0656\n(1.71%)\nBeta\n1.34\nNext Earnings Date\nApr 30, 2024\nBook Value / Share\nUnlock\nEV/EBITDA\nUnlock\nISIN\nES0113900J37"

In [15]:
# CSS class of the value cells inside the key-info container.
# NOTE(review): the "__ZQFIs" suffix looks build-generated and may change when
# the site is redeployed - confirm against the live page if this returns nothing.
clase = "key-info_dd-numeric__ZQFIs"

try:
    els = tabla.find_elements(by=By.CLASS_NAME, value=clase)
except Exception as err:
    print("No se encontró el elemento:", err)
        

In [16]:
# Display the list of matched WebElement objects (their repr, not their text).
els

[<selenium.webdriver.remote.webelement.WebElement (session="feb7c52af742044e592b176b4d87fc5f", element="BFB1B80842EB6DD2CFD2922DF53DEA6D_element_7291")>,
 <selenium.webdriver.remote.webelement.WebElement (session="feb7c52af742044e592b176b4d87fc5f", element="BFB1B80842EB6DD2CFD2922DF53DEA6D_element_7294")>,
 <selenium.webdriver.remote.webelement.WebElement (session="feb7c52af742044e592b176b4d87fc5f", element="BFB1B80842EB6DD2CFD2922DF53DEA6D_element_7295")>,
 <selenium.webdriver.remote.webelement.WebElement (session="feb7c52af742044e592b176b4d87fc5f", element="BFB1B80842EB6DD2CFD2922DF53DEA6D_element_7296")>,
 <selenium.webdriver.remote.webelement.WebElement (session="feb7c52af742044e592b176b4d87fc5f", element="BFB1B80842EB6DD2CFD2922DF53DEA6D_element_7297")>,
 <selenium.webdriver.remote.webelement.WebElement (session="feb7c52af742044e592b176b4d87fc5f", element="BFB1B80842EB6DD2CFD2922DF53DEA6D_element_7298")>,
 <selenium.webdriver.remote.webelement.WebElement (session="feb7c52af742044e

In [14]:
# Print the first three matched elements, one after another.
for idx in range(3):
    el = els[idx]
    print(el)

a
b
c


In [18]:
# Text of every matched value cell, in page order.
valores = [el.text for el in els]
len(valores),valores

(17,
 ['3.8345',
  '3.8305',
  '3.7915',
  '3.848',
  '2.942',
  '3.9695',
  '22,389,316',
  '33,798,965',
  '8.93%',
  '15,814,437,167',
  '60.64B',
  '44.76B',
  '5.90',
  '0.65',
  '0.0656',
  '1.71%',
  '1.34'])

Ahora los nombres de los datos 

In [19]:
# Column labels, one per scraped value and in the same order. Fields that carry
# two numbers on the page (ranges, dividend) get a "1"/"2" suffix.
nombres = [
    "Prev. Close",
    "Open",
    "Day's Range 1",
    "Day's Range 2",
    "52 wk Range 1",
    "52 wk Range 2",
    "Volume",
    "Average Vol. (3m)",
    "1-Year Change",
    "Shares Outstanding",
    "Market Cap",
    "Revenue",
    "P/E Ratio",
    "EPS",
    "Dividend (Yield) 1",
    "Dividend (Yield) 2",
    "Beta",
]
    
import pandas as pd
# One-row frame: the scraped values become a single record under the given labels.
df = pd.DataFrame(data=[valores], columns=nombres)
df

Unnamed: 0,Prev. Close,Open,Day's Range 1,Day's Range 2,52 wk Range 1,52 wk Range 2,Volume,Average Vol. (3m),1-Year Change,Shares Outstanding,Market Cap,Revenue,P/E Ratio,EPS,Dividend (Yield) 1,Dividend (Yield) 2,Beta
0,3.8345,3.8305,3.7915,3.848,2.942,3.9695,22389316,33798965,8.93%,15814437167,60.64B,44.76B,5.9,0.65,0.0656,1.71%,1.34


In [20]:
# Save the single-row table to an Excel file, without the index column.
df.to_excel("santander.xlsx", index=False)
# quit() instead of close(): close() only closes the current window, while
# quit() ends the whole WebDriver session, so no browser/driver process is
# left running when the next section starts a new one.
driver.quit()

### IBEX 


In [21]:

# Make a chromedriver available to Selenium (handled by chromedriver_autoinstaller).
chromedriver_autoinstaller.install()

# Browser options. The flags below are left disabled; enable them as needed.
chrome_options = webdriver.ChromeOptions()
#chrome_options.add_argument('--headless') # ensure GUI is off
#chrome_options.add_argument('--no-sandbox')
#chrome_options.add_argument('--disable-dev-shm-usage')

# Start a new Chrome session and load the IBEX 35 index page.
driver = webdriver.Chrome(options=chrome_options)
url = 'https://es.investing.com/indices/spain-35'
driver.get(url)
          

In [22]:

# Find the block marked data-test="most-active-stocks-table", then the <table> inside it.
try:
    elemento = driver.find_element(
        by=By.CSS_SELECTOR,
        value='[data-test="most-active-stocks-table"]',
    )
    table = elemento.find_element(by=By.TAG_NAME, value="table")
    print("Tabla seleccionada")
except Exception as err:
    print("No se encontró el elemento:", err)
            

Tabla seleccionada


Vamos fila a fila

In [23]:

# Collect every <tr> of the table (find_elements, plural: several rows are expected)
# and report how many were found.
try:
    filas = table.find_elements(by=By.TAG_NAME, value="tr")
    print(len(filas))
except Exception as err:
    print("No se encontró el elemento:", err)

6


Cabeceras

In [25]:
# Header cells (<th>) of the first row, reduced to their text.
cabecera_els = filas[0].find_elements(by=By.TAG_NAME, value="th")
cabecera = [cab.text for cab in cabecera_els]
len(cabecera),cabecera

(10,
 ['Nombre',
  'Último',
  'Anterior',
  'Máximo',
  'Mínimo',
  '% Var.',
  'Vol.',
  'Hora',
  '',
  ''])

Echamos un vistazo a las filas una a una

In [26]:
# Build `tabla` as a list of rows, each row being the list of its cell texts.
# Note: this rebinds `tabla`, which earlier in the notebook held the key-info element.
tabla = []
# filas[0] is the header row, so data rows start at index 1 (i counts from 0 here).
for i,fila in enumerate(filas[1:]):
    print("Fila ",i,end=" ")
    fdatos = []
    try:
        cols = fila.find_elements(By.TAG_NAME, "td") # find_elements (plural): a row has several cells
        print("Columnas ",len(cols))
        for j,col in enumerate(cols):
            print("Columna ",j,col.text)
            fdatos.append(col.text)
    except Exception as e:
        print("No se encontró el elemento:", e)
    # The row is appended even after an error, so it may be empty or incomplete.
    tabla.append(fdatos)
    print("="*100)

Fila  0 Columnas  9
Columna  0 SAN
Banco Santander S.A.
Columna  1 3,8345
Columna  2 38.345,00
Columna  3 3,848
Columna  4 3,7915
Columna  5 -0,12%
Columna  6 22,39M
Columna  7 
Columna  8 23/02
Fila  1 Columnas  9
Columna  0 SABE
Banco de Sabadell SA
Columna  1 1,1935
Columna  2 11.935,00
Columna  3 1,2005
Columna  4 1,1785
Columna  5 +0,25%
Columna  6 13,66M
Columna  7 
Columna  8 23/02
Fila  2 Columnas  9
Columna  0 ICAG
International Consolidated Airlines Group S.A.
Columna  1 1,784
Columna  2 1,784
Columna  3 1,794
Columna  4 1,75
Columna  5 -0,70%
Columna  6 9,37M
Columna  7 
Columna  8 23/02
Fila  3 Columnas  9
Columna  0 IBE
Iberdrola S.A.
Columna  1 10,63
Columna  2 10,63
Columna  3 10,735
Columna  4 10,57
Columna  5 -1,12%
Columna  6 9,1M
Columna  7 
Columna  8 23/02
Fila  4 Columnas  9
Columna  0 TEF
Telefónica S.A.
Columna  1 3,776
Columna  2 37.760,00
Columna  3 3,776
Columna  4 3,71
Columna  5 +0,88%
Columna  6 8,53M
Columna  7 
Columna  8 23/02


In [27]:
# Show the scraped rows: one list of cell texts per table row.
tabla

[['SAN\nBanco Santander S.A.',
  '3,8345',
  '38.345,00',
  '3,848',
  '3,7915',
  '-0,12%',
  '22,39M',
  '',
  '23/02'],
 ['SABE\nBanco de Sabadell SA',
  '1,1935',
  '11.935,00',
  '1,2005',
  '1,1785',
  '+0,25%',
  '13,66M',
  '',
  '23/02'],
 ['ICAG\nInternational Consolidated Airlines Group S.A.',
  '1,784',
  '1,784',
  '1,794',
  '1,75',
  '-0,70%',
  '9,37M',
  '',
  '23/02'],
 ['IBE\nIberdrola S.A.',
  '10,63',
  '10,63',
  '10,735',
  '10,57',
  '-1,12%',
  '9,1M',
  '',
  '23/02'],
 ['TEF\nTelefónica S.A.',
  '3,776',
  '37.760,00',
  '3,776',
  '3,71',
  '+0,88%',
  '8,53M',
  '',
  '23/02']]

In [28]:
# Labels for the nine cells scraped per row, in order.
# 'Abreviatura' names the eighth cell; it is used as the ticker column later on.
columnas = [
    "Nombre",
    "Último",
    "Anterior",
    "Máximo",
    "Mínimo",
    "% Var.",
    "Vol.",
    "Abreviatura",
    "Hora",
]

Convertimos en un dataframe

In [32]:
# One frame row per scraped table row, labelled with `columnas`.
df = pd.DataFrame(data=tabla, columns=columnas)
df

Unnamed: 0,Nombre,Último,Anterior,Máximo,Mínimo,% Var.,Vol.,Abreviatura,Hora
0,SAN\nBanco Santander S.A.,38345,"38.345,00",3848,37915,"-0,12%","22,39M",,23/02
1,SABE\nBanco de Sabadell SA,11935,"11.935,00",12005,11785,"+0,25%","13,66M",,23/02
2,ICAG\nInternational Consolidated Airlines Grou...,1784,1784,1794,175,"-0,70%","9,37M",,23/02
3,IBE\nIberdrola S.A.,1063,1063,10735,1057,"-1,12%","9,1M",,23/02
4,TEF\nTelefónica S.A.,3776,"37.760,00",3776,371,"+0,88%","8,53M",,23/02


In [33]:
# "Nombre" holds "TICKER\nCompany name"; split it into its two parts.
# n=1 caps the result at two columns even if a name contains further line breaks.
partes = df["Nombre"].str.split("\n", n=1, expand=True)

# Only assign when the split really produced two columns. If this cell is run
# again, "Nombre" no longer contains "\n", the split yields a single column and
# the assignment is skipped instead of raising.
if partes.shape[1] == 2:
    df[["Abreviatura", "Nombre"]] = partes
df

Unnamed: 0,Nombre,Último,Anterior,Máximo,Mínimo,% Var.,Vol.,Abreviatura,Hora
0,Banco Santander S.A.,38345,"38.345,00",3848,37915,"-0,12%","22,39M",SAN,23/02
1,Banco de Sabadell SA,11935,"11.935,00",12005,11785,"+0,25%","13,66M",SABE,23/02
2,International Consolidated Airlines Group S.A.,1784,1784,1794,175,"-0,70%","9,37M",ICAG,23/02
3,Iberdrola S.A.,1063,1063,10735,1057,"-1,12%","9,1M",IBE,23/02
4,Telefónica S.A.,3776,"37.760,00",3776,371,"+0,88%","8,53M",TEF,23/02


In [34]:
# Save the table to an Excel file, without the index column.
df.to_excel("activos.xlsx",index=False)
# Scraping is finished: end the WebDriver session so the browser and its driver
# process do not stay running after the notebook completes.
driver.quit()