# Web Scraping

## Setup

### Format and Style

In [None]:
# Notebook formatting
# Load the jupyter_black IPython extension for this notebook session.
# This is an IPython magic, so it only runs inside Jupyter/IPython.
%load_ext jupyter_black

### Configuration

In [None]:
# Filesystem path of the chromedriver executable passed to webdriver.Chrome
# in the Main cell. It is an absolute, machine-specific path: adjust it
# before running the notebook on another machine.
DRIVER_PATH = "/Users/danielcoll/Library/Drivers/chromedriver"
# Entry page opened by scrap_first_level_links to collect the country links.
ROOT_URL = "https://swift-codes.org/all-country/"

### Imports

In [39]:
import pandas as pd
import selenium
import json

# Load selenium components
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from webdriver_manager.chrome import ChromeDriverManager

### Global Constants and Variables

### Functions

In [None]:
def scrap_first_level_links(driver):
    """Collect the first level (per-country) links from the root page.

    Loads ``ROOT_URL`` in ``driver``, locates the element with class
    ``country-list`` and returns, in page order, the ``href`` of the first
    ``<a>`` found inside each of its ``<li>`` items.
    """
    driver.get(ROOT_URL)
    # Every <li> of the country list wraps one anchor to a country page
    country_items = driver.find_element(By.CLASS_NAME, "country-list").find_elements(
        By.TAG_NAME, "li"
    )
    return [
        item.find_element(By.TAG_NAME, "a").get_attribute("href")
        for item in country_items
    ]


def scrap_second_level_links(driver, first_level_links):
    """Collect the second level links found on every first level page.

    Args:
        driver: Selenium WebDriver used to load the pages.
        first_level_links: Iterable of URLs as returned by
            ``scrap_first_level_links``.

    Returns:
        A flat list with the ``href`` of every ``<a>`` nested in an element
        matching the ``post-title.entry-title`` class locator, in visiting
        order. Pages that cannot be loaded are skipped.
    """
    second_level_links = []
    for first_level_link in first_level_links:
        # A failed navigation leaves the previous page in the browser; skip
        # the link so that page's anchors are not collected a second time.
        try:
            driver.get(first_level_link)
        except Exception as exc:
            print(f"Something went wrong loading the page: {first_level_link} ({exc})")
            continue

        # Best-effort wait for the page to settle. `except Exception` keeps
        # the scraper going on a timeout but no longer swallows
        # KeyboardInterrupt the way a bare `except:` did.
        # NOTE(review): "myDynamicElement" looks like a placeholder id; if the
        # site never renders it, every page costs the full 10 s timeout.
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.ID, "myDynamicElement"))
            )
        except Exception:
            print("Something went wrong loading the page")

        # Retrieve list of second level bic country links
        post_titles = driver.find_elements(By.CLASS_NAME, "post-title.entry-title")
        for post_title in post_titles:
            second_level_links.extend(
                anchor.get_attribute("href")
                for anchor in post_title.find_elements(By.TAG_NAME, "a")
            )

    return second_level_links


def scrap_bic_dataset_from_html_table(driver, links):
    """Scrape the HTML tables of each link and store them as one csv per link.

    For every link, all ``<table>`` elements of the page are parsed with
    pandas. The first table provides the header row; the following tables
    are given the same column names and stacked below it. The result is
    written to ``../data/intermediary/{index}.csv`` where ``index`` is the
    position of the link in ``links``.

    Pages that cannot be loaded, contain no table, or fail to parse are
    reported and skipped, so the csv numbering may contain gaps.

    Args:
        driver: Selenium WebDriver used to load the pages.
        links: Sequence of second level URLs to scrape.

    Returns:
        None. The function works through its side effects (csv files, prints).
    """
    # Local import keeps the block self-contained. pandas >= 2.1 deprecates
    # passing literal HTML strings to read_html; a file-like wrapper works on
    # old and new versions alike.
    from io import StringIO

    for index, link in enumerate(links):
        print(index, link)

        # A failed navigation leaves the previous page loaded; skip the link
        # instead of saving that page's tables under this index.
        try:
            driver.get(link)
        except Exception as exc:
            print(f"Skipping {link}: page could not be loaded ({exc})")
            continue

        # Best-effort wait for the page to settle; a timeout is not fatal.
        # NOTE(review): "myDynamicElement" looks like a placeholder id; if the
        # site never renders it, every page costs the full 10 s timeout.
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.ID, "myDynamicElement"))
            )
        except Exception:
            print("Something went wrong loading the page")

        try:
            # Retrieve bic datasets. State is rebuilt for every page so data
            # from a previous page can never leak into this page's csv.
            frames = []
            column_names = None
            for table in driver.find_elements(By.TAG_NAME, "table"):
                table_html = StringIO(table.get_attribute("outerHTML"))
                if column_names is None:
                    # First table: its first row is the header
                    frame = pd.read_html(table_html, header=0)[0]
                    column_names = frame.columns.tolist()
                    print(column_names)
                else:
                    # Later tables have no header; reuse the first one's
                    frame = pd.read_html(table_html)[0]
                    print(frame)
                    frame.columns = column_names
                frames.append(frame)

            if not frames:
                print(f"Skipping {link}: no tables found")
                continue

            # pd.concat replaces DataFrame.append, which pandas 2.0 removed
            pd.concat(frames).to_csv(
                f"../data/intermediary/{index}.csv", index=False
            )
        except Exception as exc:
            # Keep the run going, but say which page was lost and why
            print(f"Skipping {link}: {exc}")

### Main

In [None]:
# Establish chrome driver and go to report site URL
# NOTE(review): passing the driver path positionally is the Selenium 3 style;
# Selenium 4 deprecated it and later 4.x releases appear to have removed it in
# favour of webdriver.Chrome(service=Service(DRIVER_PATH)) — confirm against
# the installed Selenium version before upgrading.
driver = webdriver.Chrome(DRIVER_PATH)
try:
    # Extract first level links
    first_level_links = scrap_first_level_links(driver)
    # Extract second level links
    second_level_links = scrap_second_level_links(driver, first_level_links)
    # Extract data from link endpoints
    scrap_bic_dataset_from_html_table(driver, second_level_links)
finally:
    # Close driver even when a scraping step raises, so no browser process
    # is left running in the background
    driver.quit()

In [75]:
# Retrieve bic numbers from csv files
# Substrings that mark a column as holding BIC / SWIFT codes
BIC_KEYWORDS = ("BIC", "bic", "SWIFT", "swift")
# Number of intermediary csv files to scan (indices 0 .. 369)
N_INTERMEDIARY_FILES = 370


def extract_bic_values(df):
    """Return the non-null values of every column whose name contains a BIC keyword.

    Columns are visited in their DataFrame order and the values of each
    matching column are appended in row order.
    """
    values = []
    for column in df.columns:
        if any(keyword in str(column) for keyword in BIC_KEYWORDS):
            values += df[column].dropna().tolist()
    return values


bic_numbers = []
for i in range(N_INTERMEDIARY_FILES):
    csv_path = f"../data/intermediary/{i}.csv"
    try:
        df = pd.read_csv(csv_path)
    except FileNotFoundError:
        # Missing indices are skipped silently, as the original cell did
        continue
    except Exception as exc:
        # Unreadable file: report it instead of hiding the problem
        print(f"Skipping {csv_path}: {exc}")
        continue
    bic_numbers += extract_bic_values(df)
bic_numbers[0:5]

['HSBCDZA1XXX', 'PRBAARBABAH', 'PRBAARBADIV', 'PRBAARBALPT', 'PRBAARBAMP1']

In [78]:
# Serialise the collected bic numbers and persist them as a JSON array
output_path = "../data/final/bic_numbers.json"
json_object = json.dumps(bic_numbers)
with open(output_path, mode="w") as json_file:
    json_file.write(json_object)