# Web Scraping

## Setup

### Format and Style

In [None]:
# Notebook formatting
%load_ext jupyter_black

### Configuration

In [None]:
# Page opened first by scrap_first_level_links(); its "country-list" element
# supplies the per-country links.
ROOT_URL = "https://swift-codes.org/all-country/"
# Sample detail-page URL. NOTE(review): not referenced by any cell visible in
# this notebook -- presumably kept for manual testing; confirm before removing.
EXAMPLE_URL = (
    "https://swift-codes.org/ireland/bank-of-america-dublin-swift-codes-bofaie3xxxx/"
)

### Imports

In [None]:
import os
from io import StringIO
from pathlib import Path

import pandas as pd
import selenium

# Load selenium components
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait, Select
from webdriver_manager.chrome import ChromeDriverManager

### Global Constants and Variables

In [None]:
# Location of the chromedriver executable. The default is specific to one
# machine, so it can be overridden by setting the CHROMEDRIVER_PATH
# environment variable before starting the notebook.
DRIVER_PATH = os.environ.get(
    "CHROMEDRIVER_PATH", "/Users/danielcoll/Library/Drivers/chromedriver"
)

### Functions

### Main

In [None]:
# Start a Chrome session driven by the chromedriver binary at DRIVER_PATH.
# The path is handed over through a Service object: Selenium 4 deprecated the
# positional executable-path argument of webdriver.Chrome and later removed it.
# No page is opened here; navigation happens inside the scraping functions.
driver = webdriver.Chrome(service=Service(DRIVER_PATH))

In [None]:
def scrap_first_level_links():
    """Return the href of the first anchor in every country list item.

    Navigates the module-level ``driver`` to ``ROOT_URL`` and reads the
    ``<li>`` entries found under the element with class ``country-list``.

    Returns:
        list: One href value per ``<li>``, in document order.
    """
    driver.get(ROOT_URL)
    # Site-specific selectors: these must be customized for a different page layout
    container = driver.find_element(By.CLASS_NAME, "country-list")
    return [
        item.find_element(By.TAG_NAME, "a").get_attribute("href")
        for item in container.find_elements(By.TAG_NAME, "li")
    ]


# Collect the links from the ROOT_URL page; the bare name on the last line
# makes the notebook display the resulting list.
first_level_links = scrap_first_level_links()
first_level_links

In [None]:
# # Retrieve bic list from country link
# test_url = "https://swift-codes.org/list/algeria/"

# driver.get(test_url)

In [None]:
def scrap_second_level_links(first_level_links, timeout=10):
    """Collect the links found inside the post titles of each given page.

    Each URL is opened with the module-level ``driver``. The function then
    waits for the post-title elements themselves (the elements it is about to
    read) instead of an unrelated placeholder id, so it proceeds as soon as
    they are present rather than always exhausting the timeout.

    Args:
        first_level_links: Iterable of page URLs to visit.
        timeout: Maximum number of seconds to wait for the post titles to
            appear on each page.

    Returns:
        list: The href of every ``<a>`` found inside an element carrying both
        the ``post-title`` and ``entry-title`` classes, in visiting order.
        Pages that fail to load, or show no such element within ``timeout``,
        contribute nothing.
    """
    second_level_links = []
    for first_level_link in first_level_links:
        try:
            driver.get(first_level_link)
            # Site-specific selector: this must be customized for another layout.
            # A compound class needs a CSS selector rather than By.CLASS_NAME.
            post_titles = WebDriverWait(driver, timeout).until(
                EC.presence_of_all_elements_located(
                    (By.CSS_SELECTOR, ".post-title.entry-title")
                )
            )
        except WebDriverException:
            # Covers load errors and wait timeouts (TimeoutException is a
            # subclass). Skip the page so the previously loaded one is not
            # scraped a second time.
            print("Something went wrong loading the page")
            continue
        second_level_links.extend(
            anchor.get_attribute("href")
            for post_title in post_titles
            for anchor in post_title.find_elements(By.TAG_NAME, "a")
        )

    return second_level_links


# Only the first two first-level links are processed here (slice [0:2]);
# the bare name on the last line makes the notebook display the result.
second_level_links = scrap_second_level_links(first_level_links[0:2])
second_level_links

In [None]:
# # Retrieve bics - This must be customized
# test_url = "https://swift-codes.org/bulgaria/bulgarian-bank-swift-code-bulgaria/"

# driver.get(test_url)

In [None]:
# Scratch expression (evaluates to 2); no visible cell uses the result.
len([1, 2])

In [None]:
# Scratch expression (evaluates to [0, 1]); no visible cell uses the result.
list(range(2))

In [None]:
def scrap_bic_dataset_from_html_table(
    links, output_dir="../data/intermediary", timeout=10
):
    """Save the first HTML table of each page as ``<index>.csv``.

    Each URL is opened with the module-level ``driver``. The function waits
    for a ``<table>`` element, parses it with pandas using the first row as
    the header, and writes it to ``output_dir``. A page that fails to load or
    shows no table within ``timeout`` is reported and skipped, so a single
    bad link no longer aborts the whole run.

    Args:
        links: Sequence of page URLs. Each link's position in the sequence
            is used as the CSV file name, so skipped pages leave gaps in the
            numbering.
        output_dir: Directory that receives the CSV files. It is created if
            it does not exist.
        timeout: Maximum number of seconds to wait for the table on each page.

    Returns:
        None. The result is the set of CSV files written to ``output_dir``.
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    for index, link in enumerate(links):
        print(index, link)
        try:
            driver.get(link)
            # Site-specific selector: this must be customized for another layout
            table = WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((By.TAG_NAME, "table"))
            )
        except WebDriverException:
            # Covers load errors and wait timeouts (TimeoutException is a
            # subclass); there is nothing to parse, so move on.
            print("Something went wrong loading the page")
            continue
        # TODO Retrieve data when page has a different html format - Some links are not extracting the data
        table_html = table.get_attribute("outerHTML")
        # Wrap the markup in StringIO: passing literal HTML strings to
        # pd.read_html is deprecated in recent pandas releases.
        df_bic = pd.read_html(StringIO(table_html), header=0)[0]
        df_bic.to_csv(target_dir / f"{index}.csv", index=False)


# Write one CSV per second-level link collected above.
scrap_bic_dataset_from_html_table(second_level_links)

In [None]:
# End the WebDriver session started earlier in the notebook.
driver.quit()