# NOTEBOOK WEB SCRAPING
### This notebook contains the web scraping of the 'ComparaOnline' website. At the end, you'll have a dataset of credit cards.
### The credit cards in that dataset are from Colombia

In [9]:
!pip install selenium

Collecting selenium
  Downloading selenium-4.34.2-py3-none-any.whl.metadata (7.5 kB)
Collecting urllib3~=2.5.0 (from urllib3[socks]~=2.5.0->selenium)
  Downloading urllib3-2.5.0-py3-none-any.whl.metadata (6.5 kB)
Collecting trio~=0.30.0 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting certifi>=2025.6.15 (from selenium)
  Downloading certifi-2025.8.3-py3-none-any.whl.metadata (2.4 kB)
Collecting typing_extensions~=4.14.0 (from selenium)
  Downloading typing_extensions-4.14.1-py3-none-any.whl.metadata (3.0 kB)
Collecting attrs>=23.2.0 (from trio~=0.30.0->selenium)
  Downloading attrs-25.3.0-py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.30.0->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.12.2->selenium)
  Downloading wsproto-1

#### At first, I tried to perform the scraping with BeautifulSoup, but the web page is dynamically generated with JavaScript code. So in this case
#### the best option was Selenium, because on this kind of website the HTML structure is built after the page loads and its scripts run, which makes it hard for static parsers like BeautifulSoup to capture the actual content

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

In [7]:
# Static-HTML probe: download the listing with requests and inspect the markup that
# BeautifulSoup receives, i.e. the page as served before any JavaScript runs.
url = 'https://www.comparaonline.com.co/tarjeta-de-credito?tipo-de-tarjeta=nacional'

# Browser-like User-Agent header sent with the request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115 Safari/537.36'
}

# Explicit timeout: without one, requests can wait indefinitely and hang the cell
# if the server never answers.
response = requests.get(url, headers=headers, timeout=30)
# Fail loudly on a 4xx/5xx answer instead of parsing an error page as the listing
response.raise_for_status()

soup = BeautifulSoup(response.text, 'lxml')

# Print the first 5000 characters of the prettified HTML to search it manually
print(soup.prettify()[:5000])

<!DOCTYPE html>
<html dir="ltr" lang="es">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1, shrink-to-fit=no" name="viewport"/>
  <meta content="#fff" name="theme-color"/>
  <script type="application/ld+json">
   {"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Inicio","item":"https://www.comparaonline.com.co/"},{"@type":"ListItem","position":2,"name":"Tarjeta de Crédito","item":"https://www.comparaonline.com.co/tarjeta-de-credito"}]}
  </script>
  <title>
   Tarjeta de Crédito | ¡Solicítala online! - ComparaOnline
  </title>
  <link data-testid="canonical" href="https://www.comparaonline.com.co/tarjeta-de-credito?tipo-de-tarjeta=nacional" rel="canonical"/>
  <meta content="¿Quieres solicitar tu Tarjeta de Crédito en línea? Selecciona entre Visa, Mastercard o American Express y pídela en pocos minutos con la compañía que desees." data-testid="description" name="description"/>
  <

## Selenium libraries

In [67]:
# Import the required libraries for web scraping with Selenium
# Selenium WebDriver: main interface to automate browser actions
from selenium import webdriver
# Service: allows managing the ChromeDriver executable
from selenium.webdriver.chrome.service import Service
# By: provides different locating strategies (id, class, xpath, etc.)
from selenium.webdriver.common.by import By
# Options: to configure browser options (headless mode, disable notifications, etc.)
from selenium.webdriver.chrome.options import Options
# WebDriverWait & expected_conditions: help to wait until certain elements
# or conditions are met before proceeding (avoid errors due to page load delays)
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Time: to add explicit waits (e.g., sleep), although WebDriverWait is usually preferred
import time

# Scraping national credit cards

In [124]:
# Location of the ChromeDriver binary (its version has to match the installed Chrome)
ruta_driver = r"C:\Users\santi\chromedriver\chromedriver-win64\chromedriver-win64\chromedriver.exe"

# Browser options: open the window maximized
chrome_options = Options()
chrome_options.add_argument("--start-maximized")

# Launch Chrome through a Service bound to the driver path
service = Service(executable_path=ruta_driver)
driver = webdriver.Chrome(service=service, options=chrome_options)

# Load the national credit cards listing (ComparaOnline - Colombia)
driver.get("https://www.comparaonline.com.co/tarjeta-de-credito?tipo-de-tarjeta=nacional")

# Block for at most 15 seconds until the card containers are present in the DOM
localizador_tarjetas = (By.CSS_SELECTOR, "div.w-full.mr-3.px-0.py-8.md\\:mr-8")
tarjetas_presentes = EC.presence_of_all_elements_located(localizador_tarjetas)
espera = WebDriverWait(driver, 15)
espera.until(tarjetas_presentes)

# Fixed extra pause so the dynamic content can finish loading
time.sleep(5)



### Loop for scraping first credit cards 

In [125]:
# Scrape every card container currently rendered in `driver` into a list of
# dictionaries: one dictionary per card, holding its name plus each
# "attribute: value" pair listed under it.
tarjetas = driver.find_elements(By.CSS_SELECTOR, "div.w-full.mr-3.px-0.py-8.md\\:mr-8")
# List to store information about all credit cards
lista_info_tarjetas = []

for tarjeta in tarjetas:
    try:
        # The card name is the text of the first <a> tag inside the container
        nombre_tarjeta = tarjeta.find_element(By.TAG_NAME, "a").text
    except Exception:
        # `except Exception` (not a bare except) so that interrupting the kernel
        # still stops the loop; a missing name only falls back to a default value
        nombre_tarjeta = "nombre no encontrado"

    # Print card name in console
    print(f"\n {nombre_tarjeta}")

    # Find the <ul> element that contains the attributes of the card
    ul_elements = tarjeta.find_element(By.TAG_NAME, 'ul')
    atributos = ul_elements.find_elements(By.TAG_NAME, "li")

    # Initialize dictionary with the card name
    tarjetas_dict = {'nombre_tarjeta': nombre_tarjeta}

    # Loop through each attribute inside the list
    for atributo in atributos:
        try:
            # Read the <p> text once per attribute (a single DOM lookup)
            texto = atributo.find_element(By.TAG_NAME, 'p').text.strip()
        except Exception:
            # The <p> could not be read: skip that attribute
            continue

        # Split on the FIRST ':' only, so a value that itself contains ':' is
        # kept whole instead of being cut at its second colon
        nombre, separador, valor = texto.partition(':')
        if not separador:
            # Text without the "name: value" shape: skip that attribute
            continue

        # Add attribute to dictionary
        tarjetas_dict[nombre] = valor

        # Print attribute for debugging
        print(f'{nombre}:{valor}')

    # Append card dictionary to the list
    lista_info_tarjetas.append(tarjetas_dict)



    


 Tarjeta de Crédito Banco de Bogotá Mastercard Aliada
Tasa de interés mensual: 2.31%
Ingreso mínimo: 1 y 1.5 SMMLV
Cuota de manejo: $26.010
Marca: Mastercard
Solicitar: Online
Costo de avance en efectivo: $5.560
Tipo de tarjeta: Nacional

 Tarjeta de Crédito Banco de Bogotá Visa Aliada
Tasa de interés mensual: 2.31%
Ingreso mínimo: 1 y 1.5 SMMLV
Cuota de manejo: $26.010
Marca: Visa
Solicitar: Online
Costo de avance en efectivo: $5.560
Tipo de tarjeta: Nacional

 Tarjeta de Crédito AV Villas Portal 80 Clásica
Tasa de interés mensual: 2,29% M.V
Ingreso mínimo: 1 SMMLV.000
Cuota de manejo: $26,990 M.V
Marca: Visa
Solicitar: Online
Costo de avance en efectivo: $14,900

 Tarjeta de Crédito AV Villas Auteco Clásica
Tasa de interés mensual: 2.31%
Ingreso mínimo: 1 SMMLV
Cuota de manejo: $26,990 M.V
Marca: Mastercard
Solicitar: Online
Costo de avance en efectivo: $14,900

 Tarjeta de Crédito AV Villas Mastercard Facilísima
Tasa de interés mensual: 2,29% M.V
Ingreso mínimo: No informado
Cuota 

In [135]:
# Build a pandas DataFrame from the scraped records:
# one row per credit card, one column per attribute name
df_tarjetas = pd.DataFrame(data=lista_info_tarjetas)
# Leave the DataFrame as the last expression so the notebook renders it
df_tarjetas

Unnamed: 0,nombre_tarjeta,Tasa de interés mensual,Ingreso mínimo,Cuota de manejo,Marca,Solicitar,Costo de avance en efectivo,Tipo de tarjeta
0,Tarjeta de Crédito Banco de Bogotá Mastercard ...,2.31%,1 y 1.5 SMMLV,$26.010,Mastercard,Online,$5.560,Nacional
1,Tarjeta de Crédito Banco de Bogotá Visa Aliada,2.31%,1 y 1.5 SMMLV,$26.010,Visa,Online,$5.560,Nacional
2,Tarjeta de Crédito AV Villas Portal 80 Clásica,"2,29% M.V",1 SMMLV.000,"$26,990 M.V",Visa,Online,"$14,900",
3,Tarjeta de Crédito AV Villas Auteco Clásica,2.31%,1 SMMLV,"$26,990 M.V",Mastercard,Online,"$14,900",
4,Tarjeta de Crédito AV Villas Mastercard Facilí...,"2,29% M.V",No informado,"$28,990 M.V",Mastercard,Online,"$14,900",
5,Tarjeta de Crédito AV Villas Portal 80 Gold,"2,29% M.V",No informado,"$27,990 M.V",Visa,Online,"$14,900",
6,Tarjeta de Crédito AV Villas Plaza de Las Amér...,"2,29% M.V",No informado,"$27,990 M.V",Mastercard,Online,"$14,900",
7,Tarjeta de Crédito AV Villas Plaza de Las Amér...,"2,29% M.V",No informado,"$26,990 M.V",Mastercard,Online,"$14,900",
8,Tarjeta de Crédito AV Villas Auteco Gold,"2,29% M.V",No informado,"$27,990 M.V",Mastercard,Online,"$14,900",
9,Tarjeta de Crédito Banco Popular Diamante para...,"31,5100% E.A",No informado,$27.400,Visa,Sí,$ 0,Nacional


In [None]:
# Display the raw list of card dictionaries built by the scraping loop
lista_info_tarjetas

## At this point, I realized that the number of credit cards wasn't enough. With so few credit cards, I don't think the clustering model could separate the clusters, so I decided to scrape all the credit cards from this website, including the international ones

# Scraping all credit cards from page 1

In [121]:
# Open a second browser session on the general credit cards listing and wait
# until its card containers are available for scraping.

# Path to the ChromeDriver executable (same as before)
ruta_driver_inter = r"C:\Users\santi\chromedriver\chromedriver-win64\chromedriver-win64\chromedriver.exe"

# Configure Chrome options
chrome_options = Options()
chrome_options.add_argument("--start-maximized")

# Create a Service object for this new scraping session
service_inter = Service(executable_path=ruta_driver_inter)
# Start Chrome with this session's own Service (not the one created for the
# national-cards session), so the cell does not depend on that earlier cell
driver_inter = webdriver.Chrome(service=service_inter, options=chrome_options)

# Open the general credit cards page (this time including international cards)
driver_inter.get("https://www.comparaonline.com.co/tarjeta-de-credito")

# Wait until all card containers are loaded in the DOM (max 15s)
WebDriverWait(driver_inter, 15).until(
    EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div.w-full.mr-3.px-0.py-8.md\\:mr-8"))
)

# Extra delay to ensure all dynamic content has finished loading
time.sleep(5)


## Loop for scraping all credit cards

In [144]:
# Scrape every card container currently rendered in `driver_inter` into a list of
# dictionaries: one dictionary per card, holding its name plus each
# "attribute: value" pair listed under it.
tarjetas_all = driver_inter.find_elements(By.CSS_SELECTOR, "div.w-full.mr-3.px-0.py-8.md\\:mr-8")
# Initialize an empty list to store info about each card
lista_info_tarjetas_inter = []

# Loop through each card container
for tarjeta in tarjetas_all:
    try:
        # Extract the card name from the <a> tag
        nombre_tarjeta = tarjeta.find_element(By.TAG_NAME, "a").text
    except Exception:
        # `except Exception` (not a bare except) so that interrupting the kernel
        # still stops the loop; a missing name only falls back to a default value
        nombre_tarjeta = "nombre no encontrado"

    print(f"\n {nombre_tarjeta}")

    # Find the <ul> containing the card attributes
    ul_elements = tarjeta.find_element(By.TAG_NAME, 'ul')
    atributos = ul_elements.find_elements(By.TAG_NAME, "li")

    # Create a dictionary to store the card's information
    tarjetas_dict_inter = {'nombre_tarjeta': nombre_tarjeta}

    # Loop through all attributes (li elements inside ul)
    for atributo in atributos:
        try:
            # Read the <p> text once per attribute (a single DOM lookup)
            texto = atributo.find_element(By.TAG_NAME, 'p').text.strip()
        except Exception:
            # The <p> could not be read: skip that attribute
            continue

        # Split on the FIRST ':' only, so a value that itself contains ':' is
        # kept whole instead of being cut at its second colon
        nombre, separador, valor = texto.partition(':')
        if not separador:
            # Skip attributes that don't follow the expected "name: value" format
            continue

        # Add to the dictionary
        tarjetas_dict_inter[nombre] = valor
        print(f'{nombre}:{valor}')

    # Append the dictionary of this card to the list
    lista_info_tarjetas_inter.append(tarjetas_dict_inter)





 Tarjeta de Crédito Bancolombia Mastercard Joven
Tasa de interés mensual: 2.01%
Ingreso mínimo: $200.000
Cuota de manejo: $15.490
Marca: Mastercard
Solicitar: Online
Costo de avance en efectivo: $6.900
Tipo de tarjeta: Internacional

 Tarjeta de Crédito Banco de Occidente Mastercard Joven
Tasa de interés mensual: No informado
Ingreso mínimo: $500.000
Cuota de manejo: $ 16.300
Marca: Mastercard
Solicitar: Online
Costo de avance en efectivo: $6,800
Tipo de tarjeta: Internacional

 Tarjeta de Crédito Banco de Occidente Visa Joven
Tasa de interés mensual: No informado
Ingreso mínimo: $500.000
Cuota de manejo: $ 16.300
Marca: Visa
Solicitar: Online
Costo de avance en efectivo: $6,800
Tipo de tarjeta: Internacional

 Tarjeta de Crédito Davivienda Visa Liviana
Tasa de interés mensual: 2,16%
Ingreso mínimo: 1 SMLV
Cuota de manejo: $20.000
Marca: Visa
Solicitar: Presencial
Tipo de tarjeta: Internacional

 Tarjeta de Crédito Davivienda Diners Club Liviana
Tasa de interés mensual: 2,16%
Ingreso 

In [145]:
# Build a pandas DataFrame from the records scraped in the previous loop
# (one row per credit card, one column per attribute name)
df_tarjetas_inter = pd.DataFrame(data=lista_info_tarjetas_inter)
# Leave the DataFrame as the last expression so the notebook renders it
df_tarjetas_inter

Unnamed: 0,nombre_tarjeta,Tasa de interés mensual,Ingreso mínimo,Cuota de manejo,Marca,Solicitar,Costo de avance en efectivo,Tipo de tarjeta
0,Tarjeta de Crédito Bancolombia Mastercard Joven,2.01%,$200.000,$15.490,Mastercard,Online,$6.900,Internacional
1,Tarjeta de Crédito Banco de Occidente Masterca...,No informado,$500.000,$ 16.300,Mastercard,Online,"$6,800",Internacional
2,Tarjeta de Crédito Banco de Occidente Visa Joven,No informado,$500.000,$ 16.300,Visa,Online,"$6,800",Internacional
3,Tarjeta de Crédito Davivienda Visa Liviana,"2,16%",1 SMLV,$20.000,Visa,Presencial,,Internacional
4,Tarjeta de Crédito Davivienda Diners Club Liviana,"2,16%",1 SMLV,$20.000,Diners Club,Presencial,,Internacional
5,Tarjeta de Crédito Davivienda Mastercard Liviana,"2,16%",1 SMLV,$20.000,Mastercard,Presencial,,Internacional
6,Tarjeta de Crédito Banco de Bogotá Biomax Clásica,2.31%,1 SMMLV,$30.790,Visa,Online,$5.560,Internacional
7,Tarjeta de Crédito Banco de Bogotá Movistar Al...,2.31%,1 SMMLV,$26.010,Mastercard,Online,$5.560,Internacional
8,Tarjeta de Crédito Banco de Bogotá Visa Digital,2.31%,1 SMMLV,$24.710,Visa,Online,$5.560,Internacional
9,Tarjeta de Crédito Bancoomeva Coomeva Mastercard,Disponible en la web de Bancoomeva,No informado,Cero cuota de manejo por un año ($0),Mastercard,Online,$6.545,Internacional


## 29 credit cards still weren't enough. In the end, the best decision I could make was to scrape all the available credit card pages

# Complete scraping: collecting credit cards from all pages

In [150]:
# Scrape listing pages 2 to 8 in a dedicated browser session.
# The cards found here are collected in `lista_info_tarjetas_inter_pages` and then
# added to `lista_info_tarjetas_inter`, the list the DataFrame cell below reads.
# NOTE: re-running this cell without first re-running the cell that creates
# `lista_info_tarjetas_inter` adds these pages to that list a second time.

# Path to the ChromeDriver executable
ruta_driver_inter = r"C:\Users\santi\chromedriver\chromedriver-win64\chromedriver-win64\chromedriver.exe"

# Configure Chrome options (start with the window maximized)
chrome_options = Options()
chrome_options.add_argument("--start-maximized")

# Initialize the Chrome driver with the defined service and options
service_inter_pages = Service(executable_path=ruta_driver_inter)
driver_inter_pages = webdriver.Chrome(service=service_inter_pages, options=chrome_options)

# Cards collected by this cell only (pages 2 to 8)
lista_info_tarjetas_inter_pages = []

try:
    # Loop through credit card pages from 2 to 8
    for n in range(2, 9):
        # Open the page
        driver_inter_pages.get(f"https://www.comparaonline.com.co/tarjeta-de-credito?page={n}")
        # Wait (max 15s) until credit card elements are present in the DOM
        WebDriverWait(driver_inter_pages, 15).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div.w-full.mr-3.px-0.py-8.md\\:mr-8"))
        )
        # Extra waiting time to ensure full rendering of dynamic content
        time.sleep(5)

        # Find all credit card containers on the current page
        tarjetas_all = driver_inter_pages.find_elements(By.CSS_SELECTOR, "div.w-full.mr-3.px-0.py-8.md\\:mr-8")

        # Loop through each credit card container
        for tarjeta in tarjetas_all:
            try:
                # Extract the credit card name
                nombre_tarjeta = tarjeta.find_element(By.TAG_NAME, "a").text
            except Exception:
                # `except Exception` (not a bare except) so that interrupting the
                # kernel still stops the scraping; a missing name gets a default
                nombre_tarjeta = "nombre no encontrado"

            print(f"\n {nombre_tarjeta}")

            # Extract attributes listed under the card (like fees, interest, etc.)
            ul_elements = tarjeta.find_element(By.TAG_NAME, 'ul')
            atributos = ul_elements.find_elements(By.TAG_NAME, "li")

            # Dictionary to store card name and attributes
            tarjetas_dict_inter = {'nombre_tarjeta': nombre_tarjeta}

            # Loop through attributes and split into "name: value"
            for atributo in atributos:
                try:
                    # Read the <p> text once per attribute (a single DOM lookup)
                    texto = atributo.find_element(By.TAG_NAME, 'p').text.strip()
                except Exception:
                    # The <p> could not be read: skip that attribute
                    continue

                # Split on the FIRST ':' only, so a value that itself contains
                # ':' is kept whole instead of being cut at its second colon
                nombre, separador, valor = texto.partition(':')
                if not separador:
                    # Text without the "name: value" shape: skip that attribute
                    continue

                tarjetas_dict_inter[nombre] = valor
                print(f'{nombre}:{valor}')

            # Append the extracted dictionary to this cell's own list
            lista_info_tarjetas_inter_pages.append(tarjetas_dict_inter)
finally:
    # Always close the browser and stop ChromeDriver, even if a page fails
    driver_inter_pages.quit()

# Add the cards from pages 2 to 8 to the list used to build the final DataFrame
lista_info_tarjetas_inter.extend(lista_info_tarjetas_inter_pages)





 Tarjeta de Crédito BBVA Visa Oro
Tasa de interés mensual: 2,31%
Ingreso mínimo: No Informado
Cuota de manejo: $34.900
Marca: Visa
Solicitar: Online
Costo de avance en efectivo: $10.500

 Tarjeta de Crédito BBVA Visa Clásica
Tasa de interés mensual: 2,31%
Ingreso mínimo: No Informado
Cuota de manejo: $29.600
Marca: Visa
Solicitar: Online
Costo de avance en efectivo: $10.500

 Tarjeta de Crédito BBVA Visa Congelada
Tasa de interés mensual: 2,31%
Ingreso mínimo: No Informado
Cuota de manejo: $17.900
Marca: Visa
Solicitar: Online
Costo de avance en efectivo: $10.500

 Tarjeta de Crédito BBVA World Vision Standard
Tasa de interés mensual: 2,31%
Ingreso mínimo: No Informado
Cuota de manejo: $29.600
Marca: Mastercard
Solicitar: Online
Costo de avance en efectivo: $10.500

 Tarjeta de Crédito BBVA Mastercard Black
Tasa de interés mensual: 2,31%
Ingreso mínimo: No Informado
Cuota de manejo: $48.900
Marca: Mastercard
Solicitar: Online
Costo de avance en efectivo: $10.500

 Tarjeta de Crédito B

In [160]:
# Build the DataFrame from lista_info_tarjetas_inter, the list that accumulates
# the card dictionaries appended by the scraping loops above
df_tarjetas_inter_pages = pd.DataFrame(data=lista_info_tarjetas_inter)
# Count the missing values of every column and show the result
nulos_por_columna = df_tarjetas_inter_pages.isnull().sum()
nulos_por_columna

nombre_tarjeta                  0
Tasa de interés mensual         0
Ingreso mínimo                  1
Cuota de manejo                 0
Marca                           0
Solicitar                       0
Costo de avance en efectivo    53
Tipo de tarjeta                93
dtype: int64

In [167]:
# Display the DataFrame built from the scraped credit cards
df_tarjetas_inter_pages

Unnamed: 0,nombre_tarjeta,Tasa de interés mensual,Ingreso mínimo,Cuota de manejo,Marca,Solicitar,Costo de avance en efectivo,Tipo de tarjeta
0,Tarjeta de Crédito Bancolombia Mastercard Joven,2.01%,$200.000,$15.490,Mastercard,Online,$6.900,Internacional
1,Tarjeta de Crédito Banco de Occidente Masterca...,No informado,$500.000,$ 16.300,Mastercard,Online,"$6,800",Internacional
2,Tarjeta de Crédito Banco de Occidente Visa Joven,No informado,$500.000,$ 16.300,Visa,Online,"$6,800",Internacional
3,Tarjeta de Crédito Davivienda Visa Liviana,"2,16%",1 SMLV,$20.000,Visa,Presencial,,Internacional
4,Tarjeta de Crédito Davivienda Diners Club Liviana,"2,16%",1 SMLV,$20.000,Diners Club,Presencial,,Internacional
...,...,...,...,...,...,...,...,...
229,Tarjeta de Crédito Fácil CODENSA MasterCard,2.31%,No informado,"$26,700",MasterCard,Online,"$8,571",
230,Tarjeta de Crédito Fácil CODENSA Verde - Semilla,2.31%,No informado,"$27,700",Tarjetas privadas,Online,"$8,571",
231,Tarjeta de Crédito Scotiabank Colpatria PriceS...,2.31%,No informado,"$27,660",Tarjetas privadas,Online,"$8,571",
232,Tarjeta de Crédito Scotiabank Colpatria Terpel,2.31%,"1,5 SMLV / 2 SMLV","$31,990",Tarjetas privadas,Online,"$8,571",


In [169]:
# Number of missing values in each column of the scraped DataFrame
conteo_nulos = df_tarjetas_inter_pages.isnull().sum()
conteo_nulos

nombre_tarjeta                  0
Tasa de interés mensual         0
Ingreso mínimo                  1
Cuota de manejo                 0
Marca                           0
Solicitar                       0
Costo de avance en efectivo    53
Tipo de tarjeta                93
dtype: int64

In [171]:
# Write the credit cards DataFrame to a CSV file.
# index=False keeps the DataFrame index from being saved as an extra column.
archivo_salida = 'df_credit_cards.csv'
df_tarjetas_inter_pages.to_csv(path_or_buf=archivo_salida, index=False)

## The scraping is complete. As we can see, there are some null values; however, these values will be treated in another notebook focused on data wrangling