In [1]:
from SpiderV2 import Spider
from IndexV2 import RawInfoIndex, Index, Database
from Tokenize import Tokenize

import concurrent.futures
import threading
import time

# Module-level objects shared by the cells below.
# Index object that receives crawl results via modify_index() and is later
# written out with save_to_database().
raw_storage = RawInfoIndex()
# Tokenizer applied to each page's raw text inside generate_and_crawl().
tokenizer = Tokenize()
# Database handle handed to raw_storage.save_to_database().
# NOTE(review): "ForSpiderTest" presumably selects a test database — confirm.
db = Database(SearchEngine="ForSpiderTest")

In [2]:
def generate_and_crawl(spider:Spider, raw_index:RawInfoIndex, thread_num:int, crawl_lock:threading.Lock, index_lock:threading.Lock):
    """Crawl the URL at the front of the shared Spider queue and index the result.

    Args:
        spider: Spider instance used for this crawl step.
        raw_index: Index that receives the crawl result via ``modify_index``.
        thread_num: Worker number supplied by the caller. Currently unused;
            kept so existing callers keep working.
        crawl_lock: Lock held while ``spider.crawl`` runs.
        index_lock: Lock held while ``raw_index`` is modified.

    Returns:
        None. Returns early, without touching ``raw_index``, when the queue
        has no entry at ``Spider.queue_front``, when ``generate_next_soup``
        raises, or when it returns a falsy value.
    """
    try:
        # Remember which URL this step is about to process before the spider
        # moves on.
        # NOTE(review): this read is not protected by a lock, so with several
        # workers `url` may differ from the page the spider actually loads —
        # confirm how Spider advances queue_front.
        url = Spider.queue[Spider.queue_front]
        # Explicit check instead of `assert`: assert statements are removed
        # under `python -O`, which would skip this call entirely.
        soup_ready = spider.generate_next_soup()
    except Exception:
        # Best-effort: an empty queue or a failed page load skips this step.
        # KeyboardInterrupt/SystemExit are intentionally not swallowed.
        return
    if not soup_ready:
        return

    # `with` guarantees the lock is released even if crawl() raises;
    # otherwise every other worker would block on it forever.
    with crawl_lock:
        raw_text, links, page_hash = spider.crawl({})

    # NOTE(review): the token counter is computed but never stored or used.
    # The calls are kept in case Tokenize has side effects — confirm and remove.
    tokens = tokenizer.tokenize(raw_text)
    tokens = tokenizer.filter(tokens)
    tokenizer.make_counter(tokens)

    with index_lock:
        raw_index.modify_index(url, raw_text, links, page_hash)

def main(raw_index:RawInfoIndex, start_url:str="https://iot-kmutnb.github.io/blogs", base_domains=None, max_depth:int=2, extra_spiders:int=7):
    """Seed the crawl from ``start_url`` and drain the shared Spider queue with a thread pool.

    Args:
        raw_index: Index that collects the crawl results.
        start_url: URL given to the first Spider.
        base_domains: List passed to ``Spider.set_base_domains``. Defaults to
            ``["https://iot-kmutnb.github.io/"]``.
        max_depth: Value assigned to ``Spider.max_depth``.
        extra_spiders: Number of additional argument-less Spider instances
            created next to the seed spider.

    Returns:
        None. Results are written into ``raw_index`` by ``generate_and_crawl``.
    """
    if base_domains is None:
        base_domains = ["https://iot-kmutnb.github.io/"]

    crawl_lock = threading.Lock()
    index_lock = threading.Lock()

    spider = Spider(start_url)
    Spider.set_base_domains(base_domains)
    Spider.max_depth = max_depth

    # Crawl the start page once on the calling thread before any worker runs.
    generate_and_crawl(spider, raw_index, 1, crawl_lock, index_lock)

    spider_nest = [Spider() for _ in range(extra_spiders)]
    spider_nest.append(spider)
    print(Spider.queue, end = "\n------------------------------------------\n")
    print(Spider.crawled)

    # One pool for the whole crawl instead of building and tearing down a
    # pool on every round; one worker thread per spider.
    with concurrent.futures.ThreadPoolExecutor(max_workers=len(spider_nest)) as executor:
        while Spider.queue_front != len(Spider.queue):
            futures = [
                executor.submit(generate_and_crawl, worker, raw_index, thread_num, crawl_lock, index_lock)
                for thread_num, worker in enumerate(spider_nest, start=1)
            ]
            # Barrier: finish the whole round before re-checking the queue,
            # so each spider is only ever used by one task at a time.
            concurrent.futures.wait(futures)
            for future in futures:
                error = future.exception()
                if error is not None:
                    # Report failures instead of dropping them silently;
                    # the crawl itself stays best-effort.
                    print(f"Crawl task failed: {error!r}")

if __name__ == "__main__":
    # Run the crawl, persist the index, and report the elapsed time in seconds.
    # perf_counter() is monotonic, so the measured duration is not affected
    # by system clock adjustments the way time.time() is.
    start = time.perf_counter()
    main(raw_storage)
    raw_storage.save_to_database(db)
    print(raw_storage.index)
    stop = time.perf_counter()
    print(stop-start)

['https://iot-kmutnb.github.io/blogs', 'https://iot-kmutnb.github.io/blogs/electronics/ac_circuit_analysis', 'https://iot-kmutnb.github.io/blogs/esp32/esp_at_firmware', 'https://iot-kmutnb.github.io/blogs/rpi-rp2040/rp2040_wifi_esp-at', 'https://iot-kmutnb.github.io/blogs/seeed_xiao/xiao_samd21', 'https://iot-kmutnb.github.io/blogs/mbedos/mbed-stm32-part-3', 'https://iot-kmutnb.github.io/blogs/arduino/avr_gcc_part-1', 'https://iot-kmutnb.github.io/blogs/arduino/avr_gcc_part-3', 'https://iot-kmutnb.github.io/blogs/training/c_tutorial_part-2', 'https://iot-kmutnb.github.io/blogs/electronics/rlc_circuits', 'https://iot-kmutnb.github.io/blogs/electronics/mesh_and_nodal_analysis', 'https://iot-kmutnb.github.io/blogs/sensors/hc-sr04', 'https://iot-kmutnb.github.io/blogs/mbedos/vscode-pio-blackpill_mbed', 'https://iot-kmutnb.github.io/blogs/rpi/rpi4b_headless', 'https://iot-kmutnb.github.io/blogs/electronics/oscilloscopes', 'https://iot-kmutnb.github.io/blogs/tinkercad', 'https://iot-kmutnb.g

In [3]:
# Show the class-level Spider.unaccessible_urls list after the crawl cell has run.
# NOTE(review): presumably these are URLs the spider skipped or could not
# fetch — confirm in SpiderV2. The recorded output contains repeated entries
# (e.g. https://www.mkdocs.org/), so the list is not de-duplicated.
print(Spider.unaccessible_urls)

['https://www.mkdocs.org/', 'https://think-embedded.gitbook.io/micropython/', 'https://fb.me/iot.kmutnb', 'https://code.visualstudio.com/', 'https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.vscode-remote-extensionpack', 'https://www.mkdocs.org/', 'https://code.visualstudio.com/download', 'https://code.visualstudio.com/docs/editor/intellisense', 'https://www.arduino.cc/reference/en/language/functions/analog-io/analogwrite/', 'https://espressif-docs.readthedocs-hosted.com/projects/arduino-esp32/en/latest/api/ledc.html', 'https://www.mkdocs.org/', 'https://wokwi.com/projects/', 'https://github.com/espressif/arduino-esp32', 'https://github.com/WeActStudio/WeActStudio.ESP32C3CoreBoard', 'https://www.mkdocs.org/', 'https://docs.espressif.com/projects/arduino-esp32/en/latest/getting_started.html', 'https://docs.espressif.com/projects/esptool/en/latest/esp32c3/advanced-topics/boot-mode-selection.html', 'https://www.mkdocs.org/', 'https://www.arduino.cc/en/software', 'https: