In [1]:
# Standard-library modules
import sys
import os

# Import the crawler classes (one per crawl_tools module)
from crawl_tools.acm_crawler import ACMCrawler
from crawl_tools.arxiv_crawler import ArxivCrawler
from crawl_tools.ieee_crawler import IEEECrawler
from crawl_tools.mdpi_crawler import MDPICrawler
from crawl_tools.science_direct_crawler import ScienceDirectCrawler
from crawl_tools.springer_crawler import SpringerCrawler

In [2]:
# Search phrases grouped by topic. The crawler cells in this notebook pass this
# mapping to each crawler through the `keyword_sets` argument.
#
# Only the 't2sql' group is active. Disabled entries kept for reference:
#   't2sql' (extra phrases): '"natural language to sql"',
#                            '"semantic parsing to sql"', '"nl to sql"'
#   'security': '"security"', '"access control"', '"injection"',
#               '"prompt injection"', '"defense"', '"attack"', '"vulnerability"'
#   'llm':      '"llm"', '"large language model"'
keyword_sets = dict(
    t2sql=['"text-to-sql"', '"nl2sql"', '"t2sql"', '"text2sql"'],
)

In [None]:
# Settings shared by every crawler cell in this notebook.
CRAWL_CONFIG = dict(
    # Original note: "run the browser hidden". The value is False, which
    # presumably keeps the browser window visible -- confirm against the crawlers.
    headless=False,
    # Upper bound on threads (kept low to avoid getting blocked).
    max_threads=4,
    # Whether to use multithreading; forwarded to crawl_complete().
    use_multithreading=True,
)

# Per-crawler settings keyed by crawler name. Each entry currently holds only
# its output directory, which is always output/<name>.
CRAWLERS = {
    name: {'output_dir': f'output/{name}'}
    for name in ('acm', 'arxiv', 'ieee', 'mdpi', 'science_direct', 'springer')
}

print(f"📋 Đã cấu hình {len(CRAWLERS)} crawler tools")


In [None]:
# arXiv crawl: open the crawler as a context manager, run the complete crawl
# for every keyword set, and report how many results came back.
with ArxivCrawler(
    keyword_sets=keyword_sets,
    output_dir=CRAWLERS['arxiv']['output_dir'],
    # headless / max_threads are taken unchanged from the shared CRAWL_CONFIG
    **{option: CRAWL_CONFIG[option] for option in ('headless', 'max_threads')},
) as crawler:
    results = crawler.crawl_complete(
        use_multithreading=CRAWL_CONFIG['use_multithreading'],
    )
    print(f"🎉 Crawling completed! Results: {len(results)} keyword searches")

In [None]:
# 🎯 ACM Digital Library crawler (original note: with automatic cookie handling)
print("🚀 Starting ACM Digital Library crawling...")

with ACMCrawler(
    keyword_sets=keyword_sets,
    output_dir=CRAWLERS['acm']['output_dir'],
    # headless / max_threads are taken unchanged from the shared CRAWL_CONFIG
    **{option: CRAWL_CONFIG[option] for option in ('headless', 'max_threads')},
) as crawler:
    results = crawler.crawl_complete(
        use_multithreading=CRAWL_CONFIG['use_multithreading'],
    )
    print(f"🎉 ACM Crawling completed! Results: {len(results)} keyword searches")


In [None]:
# 🎯 IEEE Xplore crawler: announce the run, crawl every keyword set, then
# report the number of results returned by crawl_complete().
print("🚀 Starting IEEE Xplore crawling...")

with IEEECrawler(
    keyword_sets=keyword_sets,
    output_dir=CRAWLERS['ieee']['output_dir'],
    # headless / max_threads are taken unchanged from the shared CRAWL_CONFIG
    **{option: CRAWL_CONFIG[option] for option in ('headless', 'max_threads')},
) as crawler:
    results = crawler.crawl_complete(
        use_multithreading=CRAWL_CONFIG['use_multithreading'],
    )
    print(f"🎉 IEEE Crawling completed! Results: {len(results)} keyword searches")


In [None]:
# 🎯 MDPI crawler: announce the run, crawl every keyword set, then report the
# number of results returned by crawl_complete().
print("🚀 Starting MDPI crawling...")

with MDPICrawler(
    keyword_sets=keyword_sets,
    output_dir=CRAWLERS['mdpi']['output_dir'],
    # headless / max_threads are taken unchanged from the shared CRAWL_CONFIG
    **{option: CRAWL_CONFIG[option] for option in ('headless', 'max_threads')},
) as crawler:
    results = crawler.crawl_complete(
        use_multithreading=CRAWL_CONFIG['use_multithreading'],
    )
    print(f"🎉 MDPI Crawling completed! Results: {len(results)} keyword searches")


In [None]:
# 🎯 Science Direct crawler: announce the run, crawl every keyword set, then
# report the number of results returned by crawl_complete().
print("🚀 Starting Science Direct crawling...")

with ScienceDirectCrawler(
    keyword_sets=keyword_sets,
    output_dir=CRAWLERS['science_direct']['output_dir'],
    # headless / max_threads are taken unchanged from the shared CRAWL_CONFIG
    **{option: CRAWL_CONFIG[option] for option in ('headless', 'max_threads')},
) as crawler:
    results = crawler.crawl_complete(
        use_multithreading=CRAWL_CONFIG['use_multithreading'],
    )
    print(f"🎉 Science Direct Crawling completed! Results: {len(results)} keyword searches")


In [None]:
# 🎯 Springer crawler (original note: with automatic cookie handling)
print("🚀 Starting Springer crawling...")

with SpringerCrawler(
    keyword_sets=keyword_sets,
    output_dir=CRAWLERS['springer']['output_dir'],
    # headless / max_threads are taken unchanged from the shared CRAWL_CONFIG
    **{option: CRAWL_CONFIG[option] for option in ('headless', 'max_threads')},
) as crawler:
    results = crawler.crawl_complete(
        use_multithreading=CRAWL_CONFIG['use_multithreading'],
    )
    print(f"🎉 Springer Crawling completed! Results: {len(results)} keyword searches")


In [None]:
# MDPI crawl: announce the run, crawl every keyword set, then report the number
# of results returned by crawl_complete().
# NOTE(review): this cell repeats the earlier MDPI cell in this notebook;
# consider removing one of the two.
print("🚀 Starting MDPI crawling...")
with MDPICrawler(
    headless=CRAWL_CONFIG['headless'],
    # Output directories are stored in CRAWLERS, not CRAWL_CONFIG. The previous
    # lookup, CRAWL_CONFIG['springer'], raised KeyError because CRAWL_CONFIG has
    # no 'springer' key. An MDPI crawl writes to the 'mdpi' directory.
    output_dir=CRAWLERS['mdpi']['output_dir'],
    max_threads=CRAWL_CONFIG['max_threads'],
    keyword_sets=keyword_sets
) as crawler:
    results = crawler.crawl_complete(use_multithreading=CRAWL_CONFIG['use_multithreading'])
    print(f"🎉 MDPI Crawling completed! Results: {len(results)} keyword searches")