## Importing Libraries

In [1]:
#Basic Imports
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm

#Text Preprocessing
import re
import json
from json2html import *

# Web Scraping and parsing
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as BraveService
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.core.utils import ChromeType
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By

## Setting up Selenium Webdriver

### Brave

In [2]:
# Location of the Brave executable that Selenium should launch instead of Chrome.
brave_path = "C:/Program Files/BraveSoftware/Brave-Browser/Application/brave.exe"

# Brave is Chromium-based, so it is configured through ChromeOptions.
options = webdriver.ChromeOptions()
options.binary_location = brave_path

# Start the automated Brave session; webdriver_manager installs a driver
# binary for the BRAVE chrome type and hands its path to the service.
driver = webdriver.Chrome(
    service=BraveService(
        ChromeDriverManager(chrome_type=ChromeType.BRAVE).install()
    ),
    options=options,
)

### Chrome

In [3]:
# options = webdriver.ChromeOptions()
# options.add_argument("--start-maximized")
# options.add_argument('--log-level=3')

# # Create new automated instance of Chrome
# driver = webdriver.Chrome(executable_path="chromedriver", chrome_options=options)

## Functions to collect links to all the landing pages

In [4]:
# Links that must never be followed while crawling a configuration guide.
black_listed = [
    "http://www.cisco.com/c/en/us/td/docs/general/whatsnew/whatsnew.html",
    "http://www.cisco.com/go/cfn",
    "https://cfnng.cisco.com/",
    "https://tools.cisco.com/Support/CLILookup",
    "https://www.cisco.com/c/en/us/td/docs/ios-xml/ios/mcl/allreleasemcl/all-book.html",
]

def search_page(link, landing_links):
    """Load a page and report whether it is a landing page plus the links it offers.

    Parameters
    ----------
    link : str or None
        URL to open with the module-level ``driver``. When ``None`` no
        navigation happens and the page currently shown by the driver is
        inspected instead.
    landing_links : list of str
        Links already recorded as landing pages; they are left out of the
        returned link list.

    Returns
    -------
    tuple of (bool, list of str)
        ``landing_page`` is True when the page has a ``<tr>`` element and at
        least one element whose text contains "step" (case-insensitive).
        ``further_links`` holds the distinct ``href`` values found inside the
        element with id ``pageContentDiv``, in page order, excluding ``link``
        itself, black-listed links, links containing ``#`` and links already
        present in ``landing_links``.
    """
    if link is not None:
        driver.get(link)

    further_links = []
    seen = set()  # hrefs already collected from this page, to avoid duplicates
    try:
        page_search_area = driver.find_element(By.ID, "pageContentDiv")
        for anchor in page_search_area.find_elements(By.TAG_NAME, "a"):
            href = anchor.get_attribute("href")
            if href is None or href == link or '#' in href:
                continue
            if href in black_listed or href in landing_links or href in seen:
                continue
            seen.add(href)
            further_links.append(href)
    except Exception:
        # Best effort: a page without the content container simply yields the
        # links gathered so far. Only ``Exception`` is caught so Ctrl+C
        # (KeyboardInterrupt) can still stop a long crawl.
        pass

    try:
        first_row = driver.find_element(By.TAG_NAME, "tr")
        # NOTE: the expression starts with "//", so it is evaluated against the
        # whole document, not only the descendants of ``first_row``. The row
        # lookup therefore only acts as a "page has a table" gate.
        switch_term = first_row.find_elements(By.XPATH, "//*[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'step')]")
    except Exception:
        switch_term = []

    return bool(switch_term), further_links


def get_landing_links(start_page_link):
    """Crawl level by level from ``start_page_link`` and collect landing pages.

    Every link is handed to ``search_page``. Pages it reports as landing pages
    are recorded; for all other pages the links it returns are queued for the
    next level. Each link is visited at most once, so index pages that point
    at each other cannot keep the crawl running forever and no landing page is
    recorded twice.

    Parameters
    ----------
    start_page_link : str
        Link of the page where the crawl starts.

    Returns
    -------
    list of str
        Landing-page links in the order in which they were discovered.
    """
    landing_page_links = []
    visited = set()
    index_links = [start_page_link]

    while index_links:
        new_links = []
        for link in index_links:
            if link in visited:
                continue
            visited.add(link)

            landing_page, further_links = search_page(link, landing_page_links)
            if landing_page:
                landing_page_links.append(link)
            else:
                new_links.extend(further_links)

        index_links = new_links

    return landing_page_links

## Parsing Function to scrape the landing pages

In [10]:
def _closest_heading(element):
    """Return the heading that titles ``element``, or None when there is none.

    For every level ``<h1>``..``<h6>`` the closest preceding heading is looked
    up and the one placed lowest on the page (largest ``y``) is chosen. While
    the chosen heading's text contains "procedure" or "before" the search
    steps back to the next heading before it.
    """
    # One slot per heading level; -1 marks "no such heading". Keeping a slot
    # for absent levels is what makes ``index + 1`` equal to the real level.
    offsets = []
    for level in range(1, 7):
        try:
            offsets.append(element.find_element(By.XPATH, f"./preceding::h{level}[1]").location["y"])
        except Exception:
            offsets.append(-1)

    if max(offsets) < 0:
        return None
    level = offsets.index(max(offsets)) + 1
    heading = element.find_element(By.XPATH, f"./preceding::h{level}[1]")

    while 'procedure' in heading.text.lower() or 'before' in heading.text.lower():
        # Replace the rejected heading's slot by the previous heading of the
        # same level (if any), then pick the closest candidate again.
        try:
            offsets[level - 1] = heading.find_element(By.XPATH, f"./preceding::h{level}[1]").location["y"]
        except Exception:
            offsets[level - 1] = -1
        if max(offsets) < 0:
            return None
        level = offsets.index(max(offsets)) + 1
        heading = heading.find_element(By.XPATH, f"./preceding::h{level}[1]")

    return heading


def parsePage(links):
    """Scrape configuration steps from each landing page in ``links``.

    Each page is opened with the module-level ``driver``. If the page has
    elements whose text contains "summary steps" (case-insensitive), the
    ``<li>`` texts of the first ``<ol>`` sibling following each such element
    are stored. Otherwise every ``<table>`` whose HTML mentions "step" at
    least three times is read row by row (first row skipped, rows starting
    with "note" skipped) and the lower-cased text of the second cell, cut at
    the first "example" and stripped of newlines, is stored.

    Elements without a usable heading, "summary steps" elements without a
    following ``<ol>`` and table rows with fewer than two ``<td>`` cells are
    skipped instead of aborting the whole scrape.

    Parameters
    ----------
    links : iterable of str
        Landing-page URLs to scrape.

    Returns
    -------
    dict
        Maps heading text to the list of step strings found under it. A later
        entry with the same heading text replaces an earlier one.
    """
    feature_configs = dict()

    for link in links:
        driver.get(link)
        summaries = driver.find_elements(By.XPATH, "//*[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'summary steps')]")

        if summaries:
            for element in summaries:
                heading = _closest_heading(element)
                if heading is None:
                    continue
                # Assumption: the steps are the first <ol> sibling after the marker.
                ordered_lists = element.find_elements(By.XPATH, "./following-sibling::ol[1]")
                if not ordered_lists:
                    continue
                items = ordered_lists[0].find_elements(By.CSS_SELECTOR, "li")
                feature_configs[heading.text] = [item.text for item in items]
            continue

        # No "summary steps" marker on this page: fall back to step tables.
        for table in driver.find_elements(By.CSS_SELECTOR, "table"):
            if table.get_attribute("outerHTML").lower().count('step') < 3:
                continue
            heading = _closest_heading(table)
            if heading is None:
                continue

            steps = []
            for row in table.find_elements(By.CSS_SELECTOR, "tr")[1:]:
                if row.text.lower().split(' ')[0] == 'note':
                    continue
                cells = row.find_elements(By.CSS_SELECTOR, "td")
                if len(cells) < 2:
                    continue
                command = cells[1].text.lower().split("example")[0]
                steps.append(command.replace("\n", ""))
            feature_configs[heading.text] = steps

    return feature_configs

## Final Pipeline

There are two ways to update the database: start from the link to a config guide (essentially a book that leads to several landing pages) and collect its landing pages, or pass the link of a single landing page directly.

In [11]:
# CONFIG GUIDE METHOD
# Uncomment these two lines to crawl a whole config guide for its landing pages.
# url = "" #Link to a config guide
# links = get_landing_links(url)

# LANDING PAGE METHOD
# Scrape one landing page directly, without crawling.
url = "https://www.cisco.com/c/en/us/td/docs/switches/lan/catalyst9400/software/release/16-6/configuration_guide/ip/b_166_ip_9400_cg/b_165_ip_9300_9500_cg_chapter_00.html" #Link to a landing page
links = [url]

# parsePage returns a dict keyed by heading text, each value a list of step strings.
feature_configs = parsePage(links)

In [14]:
# Serialise the scraped configurations once; the same JSON text feeds both
# the HTML rendering and the JSON file.
json_object = json.dumps(feature_configs, indent=4)
html_code = json2html.convert(json=json_object)

# The ``with`` statement closes each file on exit, so no explicit close() is
# needed. UTF-8 is requested explicitly so the write does not depend on the
# platform's default encoding, which may be unable to encode scraped text.
with open("example.html", "w", encoding="utf-8") as f:
    f.write(html_code)

with open("example.json", "w", encoding="utf-8") as f:
    f.write(json_object)