# In [1]:
import pandas as pd
import pickle
# Read nvd_cve_df_sample4x25 from the pickle file.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
# The loaded object is used further down via `.CVE.tolist()`, so it is
# presumably a pandas DataFrame with a `CVE` column — TODO confirm.
with open('random_sampled_100CVEs_names.pkl', 'rb') as file:
    nvd_cve_df_sample4x25 = pickle.load(file)

from duckduckgo_search import DDGS

def search_urls(query, mode='D', max_results=20):
    """Search the web for ``query`` and collect the result URLs.

    Args:
        query: Search string handed to the search backend.
        mode: Backend selector; only ``'D'`` (DuckDuckGo) is supported.
        max_results: Maximum number of results requested from the backend.

    Returns:
        A dict with two keys: ``"only_url"`` (the ``href`` value of every
        result record that has one) and ``"full_result"`` (the list of raw
        result records). On any failure, including an unsupported ``mode``,
        the error is logged and both lists are empty, so callers can always
        index the return value by key.
    """
    # Imported locally so the function does not rely on a module-level
    # ``logging`` import having run before the first call.
    import logging

    try:
        if mode != 'D':
            raise ValueError("Invalid mode. Use 'D' for DuckDuckGo ")
        # Materialise the results once so they can be both scanned for URLs
        # and returned in full, even if the backend yields them lazily.
        results = list(DDGS().text(query, safesearch='off', max_results=max_results))
        urls = [item['href'] for item in results if 'href' in item]
        return {"only_url": urls, 'full_result': results}
    except Exception as e:
        logging.error(f'Error searching URLs: {e}')
        # Same shape as the success path (previously a bare list, which broke
        # callers that index by key).
        return {"only_url": [], "full_result": []}
    
import pandas as pd
import time
from concurrent.futures import ProcessPoolExecutor, as_completed
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException

def download_urls(urls, driver_path='/usr/bin/chromedriver', render_wait_time=20, filename='downloaded_urls'):
    """Download the content of every URL in batches and save it to Excel.

    Each batch picks up to 18 random URLs that still have no content and
    fetches them in parallel worker processes through
    ``fetch_url_content_with_selenium_and_jinareader``. Batches start when
    the wall-clock second-of-minute matches a trigger value fixed at call
    time (about three seconds after the call). A URL is retried while its
    ``attempt`` counter is <= 3, i.e. it is tried at most four times.

    Args:
        urls: Iterable of URL strings to download.
        driver_path: Path of the chromedriver binary passed to the fetcher.
        render_wait_time: Seconds the fetcher waits for the page to render.
        filename: Suffix used in the names of the output Excel files.

    Returns:
        A DataFrame with the columns ``url``, ``content`` (``None`` where the
        download never succeeded) and ``attempt`` (number of tries made).

    Side effects:
        Writes ``df_download_urls_temp_<filename>.xlsx`` after every batch and
        ``df_download_urls_<filename>.xlsx`` once at the end.
    """
    retry_limit = 3   # rows stay eligible while attempt <= retry_limit
    batch_size = 18   # maximum number of URLs fetched per batch

    temp_file = f'df_download_urls_temp_{filename}.xlsx'
    final_file = f'df_download_urls_{filename}.xlsx'
    df = pd.DataFrame(urls, columns=['url'])
    df['content'] = None
    df['attempt'] = 0

    def pending_rows():
        # Rows that have no content yet and still have retries left.
        return df[(df['content'].isnull()) & (df['attempt'] <= retry_limit)]

    def job():
        pending = pending_rows()
        if pending.empty:
            print("All URLs with attempt <= 3 have been processed, stopping the task.")
            return

        print(f"There are currently {len(pending)} URLs pending processing.")

        sample_urls = pending.sample(min(batch_size, len(pending)))
        df.loc[df['url'].isin(sample_urls['url']), 'attempt'] += 1
        print("Selected URLs for this batch:", sample_urls['url'].tolist())

        with ProcessPoolExecutor() as executor:
            future_to_url = {
                executor.submit(fetch_url_content_with_selenium_and_jinareader, url, driver_path, render_wait_time): url
                for url in sample_urls['url']
            }
            for future in as_completed(future_to_url):
                url = future_to_url[future]
                try:
                    url, content = future.result()
                    # Failure markers are not stored, so the row stays
                    # pending and is retried in a later batch.
                    if "failed fetch_url_content_with_selenium_and_jinareader" not in content:
                        df.loc[df['url'] == url, 'content'] = content
                except Exception as e:
                    print(f"Error occurred while processing {url}: {e}")

    start_time = time.time()
    # Batches fire when the wall-clock second-of-minute equals this value:
    # roughly 3 s from now, then again on later minutes.
    trigger_second = int((start_time + 3) % 60)

    print("Start time:", time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(start_time)))
    while True:
        if pending_rows().empty:
            print("All URLs with attempt <= 3 have been processed, stopping the task.")
            break
        now = time.time()
        if int(now) % 60 != trigger_second:
            # Sleep until the trigger second instead of spinning on the clock.
            # The loop re-checks afterwards, so a slightly early wake-up only
            # causes one more very short sleep.
            time.sleep((trigger_second - now % 60) % 60)
            continue
        print("Starting this batch of work.")
        job()
        df.to_excel(temp_file, index=False)
        print("Number of URLs processed so far:", len(df[df['content'].notnull()]))
        print("Saving intermediate file.")
    df.to_excel(final_file, index=False)
    print("Saving final file.")
    return df

def fetch_url_content_with_selenium_and_jinareader(source_url, driver_path, render_wait_time):
    """Fetch ``source_url`` through the r.jina.ai prefix with headless Chrome.

    Args:
        source_url: URL whose content is wanted; it is appended to
            ``https://r.jina.ai/`` to build the address actually loaded.
        driver_path: Path of the chromedriver binary.
        render_wait_time: Maximum seconds to wait for the ``html`` element.

    Returns:
        A ``(source_url, content)`` tuple. ``content`` is the page source on
        success; otherwise it is a string starting with
        ``'failed fetch_url_content_with_selenium_and_jinareader'`` followed
        by the failure reason (blocked, rate limited, too fast, incomplete,
        time out, unknown error).

    Raises:
        Whatever ``webdriver.Chrome`` raises if the browser cannot be started;
        errors after that point are converted into failure strings.
    """
    # Bind Selenium's timeout class under its own name: the module-level name
    # ``TimeoutException`` is redefined later in this file, which would make a
    # plain ``except TimeoutException`` miss Selenium timeouts.
    from selenium.common.exceptions import TimeoutException as SeleniumTimeoutException

    full_url = "https://r.jina.ai/" + source_url
    print("fetch_url_content_with_selenium_and_jinareader starting work on:", full_url)
    start_time = time.time()

    options = webdriver.ChromeOptions()
    for flag in (
        '--headless',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
        '--disable-extensions',
        '--ignore-certificate-errors',
    ):
        options.add_argument(flag)
    service = Service(driver_path)
    driver = webdriver.Chrome(service=service, options=options)

    try:
        driver.get(full_url)
        WebDriverWait(driver, render_wait_time).until(
            EC.presence_of_element_located((By.TAG_NAME, "html"))
        )
        # Extra fixed pause after the html element is present, before the
        # page source is read.
        time.sleep(3)
        content = driver.page_source

        # Both phrases must be present; the previous form
        # ``'blocked until' and '...' in content`` only tested the second one.
        if 'blocked until' in content and 'due to previous abuse found on' in content:
            print("Alert! Domain has been blocked. Content:", content)
            return source_url, 'failed fetch_url_content_with_selenium_and_jinareader get blocked'
        if "Per IP rate limit exceeded" in content:
            print("Alert! IP rate limit exceeded. Content:", content)
            return source_url, 'failed fetch_url_content_with_selenium_and_jinareader IP rate limit exceeded'
        if 'Slow down, turbo' in content:
            print("Alert! Speed too fast. Content:", content)
            return source_url, 'failed fetch_url_content_with_selenium_and_jinareader speed too fast'
        if '</body></html>' not in content:
            print("Alert! Content not complete. Content:", content)
            return source_url, 'failed fetch_url_content_with_selenium_and_jinareader content not complete'
        print("fetch_url_content_with_selenium_and_jinareader successfully fetched content.")
        return source_url, content

    except SeleniumTimeoutException as te:
        print("fetch_url_content_with_selenium_and_jinareader timed out:", te)
        return source_url, 'failed fetch_url_content_with_selenium_and_jinareader time out'

    except Exception as e:
        print("fetch_url_content_with_selenium_and_jinareader encountered an error:", e)
        return source_url, 'failed fetch_url_content_with_selenium_and_jinareader unknown error'

    finally:
        # Always report the runtime and close the browser, whatever happened.
        end_time = time.time()
        print("fetch_url_content_with_selenium_and_jinareader runtime:", end_time - start_time)
        driver.quit()

import sys, pickle, os, json, re, time, random, logging, pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns, scipy, sklearn, networkx as nx, importlib;

import pandas as pd  # type: ignore
import tools
import time
import signal

class TimeoutException(Exception):
    """Raised by the SIGALRM ``handler`` when a crawl exceeds its time budget."""
    # NOTE(review): this rebinds the module-level name that was earlier
    # imported from selenium.common.exceptions; code that looks the name up
    # after this point gets this class, not Selenium's.

def handler(signum, frame):
    """Signal callback that aborts the interrupted code with TimeoutException.

    Both arguments (signal number and current stack frame) are supplied by
    the ``signal`` module and are not used.
    """
    raise TimeoutException


# Install the callback for SIGALRM, the signal delivered when a
# ``signal.alarm`` timer expires.
signal.signal(signal.SIGALRM, handler)

def domain_set_count(urls, blacklist=None):
    """Count the distinct network locations (``netloc``) among ``urls``.

    Args:
        urls: Iterable of URL strings.
        blacklist: Optional iterable of netloc strings to leave out of the
            count. Matching is exact against ``urlparse(url).netloc``.
            ``None`` or an empty iterable excludes nothing.

    Returns:
        Number of unique, non-blacklisted netlocs.
    """
    from urllib.parse import urlparse

    excluded = set(blacklist) if blacklist else set()
    return len({urlparse(url).netloc for url in urls} - excluded)

def domain_count(urls, blacklist=None):
    """Count the URLs in ``urls``, one per entry, duplicates included.

    Args:
        urls: Iterable of URL strings.
        blacklist: Optional iterable of netloc strings; URLs whose
            ``urlparse(url).netloc`` exactly matches an entry are not
            counted. ``None`` or an empty iterable excludes nothing.

    Returns:
        Number of URLs whose netloc is not blacklisted. Unlike
        ``domain_set_count`` this does not de-duplicate.
    """
    from urllib.parse import urlparse

    excluded = set(blacklist) if blacklist else set()
    return sum(1 for url in urls if urlparse(url).netloc not in excluded)

# Driver script: for each selected CVE, search for its quoted name, log the
# result statistics, download every result URL, and keep both the search
# metadata and the downloaded content in memory.
cve_to_crawl = nvd_cve_df_sample4x25.CVE.tolist()
# Only the first three CVEs are processed.
cve_to_crawl = cve_to_crawl[0:3]

cve_search_results = {}   # CVE name -> search metadata dict
cve_search_content = {}   # CVE name -> DataFrame returned by download_urls
for single_cve in cve_to_crawl:
    try:
        print('working on', single_cve)
        # Arm a 300 s watchdog; when it expires, SIGALRM makes `handler`
        # raise TimeoutException inside whatever is running.
        signal.alarm(300)
        try:
            # The name is wrapped in double quotes inside the query string.
            search_results = search_urls(f'"{single_cve}"', max_results=40)
            if not isinstance(search_results, dict) or 'only_url' not in search_results:
                # search_urls reports failure with a value that cannot be
                # indexed by key; record the CVE and move on instead of
                # crashing the whole run.
                with open("error_elements.txt", "a") as error_file:
                    error_file.write(f"{single_cve}\n")
                print(f"{single_cve} search failed, skipping.")
                continue

            found_urls = search_results['only_url']
            cve_content = {
                "full name": single_cve,
                "urls result": found_urls,
                "full search results": search_results['full_result'],
                "domain count": domain_count(found_urls),
                "domain set count": domain_set_count(found_urls)
            }
            with open("crawler_realtime_data.txt", "a") as file:
                file.write(f"Current time: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))}\n")
                file.write(f"Completed search for {single_cve}, found {len(found_urls)} URLs, "
                           f"with {cve_content['domain count']} unique domains, and {cve_content['domain set count']} unique domain sets\n")
                file.write(f"Search result URLs: {found_urls}\n")
            cve_search_results[single_cve] = cve_content

            # Start crawling
            df_content = download_urls(found_urls, filename=single_cve)

            cve_search_content[single_cve] = df_content
            with open("crawler_realtime_data.txt", "a") as file:
                file.write(f"Completed downloading for {single_cve}, downloaded {len(df_content)} files\n")
            print(f'End time: {time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))}')
        finally:
            # Cancel the timer on every exit path, so a pending alarm can
            # never fire later in unrelated code.
            signal.alarm(0)

        # Wait for 10 seconds
        time.sleep(10)

    except TimeoutException:
        with open("error_elements.txt", "a") as error_file:
            error_file.write(f"{single_cve}\n")
        print(f"{single_cve} exceeded time limit, skipping.")
        signal.alarm(0)