In [2]:
# Install Selenium once, using %pip so the package lands in the environment
# of the running kernel rather than whichever `pip` is first on PATH.
%pip install selenium

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [1]:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException, ElementNotInteractableException, TimeoutException, WebDriverException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.action_chains import ActionChains

from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup as bs
from user_agent import generate_user_agent, generate_navigator
from selenium_stealth import stealth

import polars as pl

from tqdm.auto import tqdm
import time
import os


In [3]:
### Utility and Function Setup
def maindriver(url):
    """Start a Chrome WebDriver session with anti-detection settings and open ``url``.

    Parameters
    ----------
    url : str
        Address loaded right after the browser has been started.

    Returns
    -------
    selenium.webdriver.Chrome
        The running driver with ``url`` loaded. The caller owns the session
        and is responsible for calling ``quit()`` on it.

    Raises
    ------
    Exception
        Any error raised while applying ``stealth`` or loading ``url`` is
        re-raised after the freshly started browser has been shut down, so a
        caller that retries does not accumulate orphaned Chrome processes.
    """
    # A new user-agent string is generated on every call.
    useragent_rotate = generate_user_agent(navigator='chrome')

    # Adding Selenium WebDriver Option
    options = Options()
    # options.add_argument('--headless=new')
    options.add_argument('--start-maximized')
    options.add_argument('--incognito')
    options.add_argument('--disable-extensions')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--disk-cache-size=0')
    # options.add_argument('--blink-settings=imagesEnabled=false')
    options.add_argument('user-agent='+useragent_rotate)
    # Hide the usual automation fingerprints exposed by ChromeDriver.
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)
    # The Chromium switch is plural; the singular spelling is silently ignored.
    options.add_argument('--disable-notifications')
    options.add_argument('--disable-geolocation')

    driver = webdriver.Chrome(
        options=options,
        service=ChromeService(ChromeDriverManager().install())
    )
    # driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")

    try:
        # Using Selenium Stealth to avoid bot detection
        stealth(driver,
            languages=["en-US", "en"],
            vendor="Google Inc.",
            platform="Win64",
            webgl_vendor="Intel Inc.",
            renderer="Intel Iris OpenGL Engine",
            fix_hairline=True,
        )

        driver.get(url)
    except Exception:
        # The browser is already running at this point; close it before
        # propagating the error so the failed attempt does not leak a process.
        driver.quit()
        raise

    return driver

In [4]:
# Locations used for the scraped output
dataset_dir = './datasets'
skipped_log = './datasets/skipped.json'

# Output folder: created only when nothing exists at that path yet
dir_missing = not os.path.exists(dataset_dir)
if dir_missing:
    os.makedirs(dataset_dir, exist_ok=True)

# Zero-byte placeholder for the skipped-pages record; an existing file is left untouched
log_missing = not os.path.exists(skipped_log)
if log_missing:
    open(skipped_log, 'w').close()

In [34]:
# Base listing URL. `level` is the risk level and `i` the page number; both
# are formatted with `d` (plain integer) rather than the locale-aware `n`, so
# no grouping separator can ever end up inside the query string.
url_base = "https://www.fortiguard.com/encyclopedia?type=ips&risk={level:d}&page={i:d}"
# URL opened when the Selenium driver is first started (risk level 1, page 1)
url_driver = url_base.format(level=1, i=1)

In [37]:
# Loading Selenium WebDriver
# Try to start the browser a limited number of times. Each failure is reported
# together with its cause; if every attempt fails the cell stops with an error
# instead of continuing with a missing (or stale, left-over) `driver`.
max_attempts = 15
last_error = None
for attempt in range(1, max_attempts + 1):
    try:
        driver = maindriver(url_driver)
    except Exception as e:
        last_error = e  # kept for the final error, `e` is cleared after the except block
        print('Connecting Bot Fail, Try again....')
        print(f'  attempt {attempt}/{max_attempts}: {e!r}')
        time.sleep(2)  # wait for 2 seconds before trying to fetch the data again
    else:
        break
else:
    # Reached only when the loop was never left through `break`.
    raise RuntimeError(
        f'Could not start the Selenium driver after {max_attempts} attempts'
    ) from last_error

# Give the first load time to settle, then reload the page once.
time.sleep(2.5)
driver.refresh()
time.sleep(2)
print('Bot Connected')

Mozilla/5.0 (X11; Ubuntu; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.117 Safari/537.36
Bot Connected


In [39]:
# Scrape the title and link of every listing row, one CSV per risk level.
#
# For each risk level the first page is opened to read the page count from the
# pagination bar, then every page is visited in turn. Pages whose content does
# not show up in time, or whose rows cannot be read completely, are recorded
# as skipped instead of aborting the run (which would lose the whole level,
# because the CSV is only written once the level is finished).

# CSS selectors used on the listing pages
pagination_last_css = "#full-page > section.table-body > div > nav > ul.pagination.pagination-desktop > li:nth-last-child(2)"
container_css = "#full-page > section.table-body > div.container"
article_rows_css = "#full-page > section.table-body > div > div.row"
article_title_css = "div.col-lg > b"

# Column layout shared by the per-page and per-level dataframes
schema_column = {
    'title': pl.String,
    'link': pl.String,
}

# risk level -> tuple of page numbers that could not be scraped
skipped_page_dict = {}

total_level = 5
for level in range(1, total_level+1):
    # Load url of the risk level's first page
    first_page_url = url_base.format(level=level, i=1)
    if driver.current_url != first_page_url:
        driver.get(first_page_url)
        time.sleep(0.5)

    # Get total page for each risk level: the wait returns the located
    # pagination item, whose text is the number of the last page
    last_page_item = WebDriverWait(driver, 30).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, pagination_last_css))
    )
    total_page = int(last_page_item.text)
    print(f"Total Page for Risk Level {level}: {total_page}")

    # Create empty dataframe of each risk level
    df_risk_level = pl.DataFrame(schema=schema_column)
    # Pages of this level that had to be skipped
    list_skipped_page = []

    # Iterate Each Page
    for page in range(1, total_page+1):
        page_url = url_base.format(level=level, i=page)
        # Request the page until the driver reports its url (at most 10 tries)
        for _ in range(10):
            if driver.current_url == page_url:
                time.sleep(0.5)
                break
            driver.get(page_url)
            time.sleep(0.5)

        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, container_css))
            )
        except TimeoutException:
            list_skipped_page.append(page)
            continue

        page_dict = {
            'title': [],
            'link': [],
        }

        # Read every row of the page; any row that cannot be read completely
        # marks the whole page as skipped so it can be retried later instead
        # of silently storing a partial page.
        page_complete = True
        try:
            for article in driver.find_elements(By.CSS_SELECTOR, article_rows_css):
                title = article.find_element(By.CSS_SELECTOR, article_title_css).text

                # The target path is the last whitespace-separated token of
                # the row's onclick attribute; [1:-1] drops its first and last
                # character (presumably the surrounding quotes).
                onclick = article.get_attribute('onclick')
                if not onclick:
                    page_complete = False
                    break
                link = "www.fortiguard.com" + onclick.split()[-1][1:-1]

                page_dict['title'].append(title)
                page_dict['link'].append(link)
        except (NoSuchElementException, StaleElementReferenceException):
            page_complete = False

        if not page_complete:
            list_skipped_page.append(page)
            continue
        time.sleep(0.25)

        # Append the page to the level dataframe. The explicit schema keeps
        # the column dtypes as String even when the page has no rows, so the
        # vstack below never sees mismatching dtypes.
        df_page = pl.DataFrame(page_dict, schema=schema_column)
        df_risk_level = df_risk_level.vstack(df_page)

    # Store the collected rows of this risk level to csv
    df_risk_level.write_csv(f'./datasets/forti_lists_{level}.csv')
    skipped_page_dict[level] = tuple(list_skipped_page)

    # n_unique() counts distinct rows, so duplicated rows are counted once
    print(f"Total data gathered: {df_risk_level.n_unique()}")
    print(f"List of skipped page: {tuple(list_skipped_page)}")
    time.sleep(0.5)

Total Page for Risk Level 1: 7
Total data gathered: 125
List of skipped page: ()
Total Page for Risk Level 2: 34
Total data gathered: 663
List of skipped page: ()
Total Page for Risk Level 3: 171
Total data gathered: 3400
List of skipped page: (171,)
Total Page for Risk Level 4: 396
Total data gathered: 7911
List of skipped page: ()
Total Page for Risk Level 5: 251
Total data gathered: 5000
List of skipped page: (251,)
