In [None]:
import pandas as pd
import os
from crawl_tools.acm_crawler import ACMCrawler
from crawl_tools.arxiv_crawler import ArxivCrawler
from crawl_tools.ieee_crawler import IEEECrawler
from crawl_tools.science_direct_crawler import ScienceDirectCrawler
from crawl_tools.springer_crawler import SpringerCrawler
from filter_tools.content_analysis import ContentAnalyzer
from filter_tools.download_papers import PaperDownloader
from filter_tools.keywords_filter_paper import KeywordFilter

In [None]:
# ---------------------------------------------------------------------------
# Notebook-wide configuration: result folder, search vocabulary, crawler options.
# ---------------------------------------------------------------------------

# Root folder used by the filtering / download / analysis cells; created up front.
# NOTE(review): the crawlers below are pointed at 'output/<source>', not at this
# folder, while the keyword filter reads from `output_dir` — confirm this is intended.
output_dir = "crawl_results"
os.makedirs(output_dir, exist_ok=True)

# Search terms grouped by topic. Each term carries its own double quotes inside
# the string; how the groups are combined into queries is decided by the crawlers.
keyword_sets = {
    'functional testing': [
        '"Software Testing"',
        '"Testing"',
        '"Test Automation"',
        '"Test Case Generation"',
        '"Test Script Generation"',
        '"Test Data Generation"',
        '"Test Oracle Generation"',
        '"Test Repair"',
    ],
    'functional': [
        '"Functional Testing"',
        '"System Testing"',
        '"End-to-End Testing"',
        '"GUI Testing"',
        '"UI Testing"',
        '"Web Testing"',
        '"Mobile Testing"',
        '"Agent"',
        '"AI Agent"',
        '"Autonomous Agent"',
        '"Prompt Engineering"',
        '"Chain-of-Thought"',
        '"Retrieval-Augmented Generation"',
    ],
    'llm': [
        '"llm"',
        '"large language model"',
    ],
}

# Options shared by every crawler cell in this notebook.
CRAWL_CONFIG = dict(
    headless=False,
    max_threads=4,
    use_multithreading=True,
)

# Per-source settings: every source writes to 'output/<source>'.
# NOTE(review): 'mdpi' is configured here, but no MDPI crawler is imported in
# the notebook's import cell — confirm whether it is still needed.
CRAWLERS = {
    source: {'output_dir': f'output/{source}'}
    for source in ('acm', 'arxiv', 'ieee', 'mdpi', 'science_direct', 'springer')
}

print(f"📋 Đã cấu hình {len(CRAWLERS)} crawler tools")


In [None]:
# Crawl arXiv using the shared browser/threading options from CRAWL_CONFIG and
# the notebook-wide keyword_sets. Mirrors the other crawler cells: a start
# banner, then a completion message that names the source.
print("🚀 Starting arXiv crawling...")

with ArxivCrawler(
    headless=CRAWL_CONFIG['headless'],
    output_dir=CRAWLERS['arxiv']['output_dir'],
    max_threads=CRAWL_CONFIG['max_threads'],
    keyword_sets=keyword_sets
) as crawler:
    # The crawler is used as a context manager, so it is closed on exit
    # even if the crawl raises.
    results = crawler.crawl_complete(use_multithreading=CRAWL_CONFIG['use_multithreading'])
    # len(results) is reported as the number of keyword searches performed.
    print(f"🎉 arXiv Crawling completed! Results: {len(results)} keyword searches")

In [None]:
# Crawl the ACM Digital Library with the shared CRAWL_CONFIG options and the
# notebook-wide keyword_sets; results are written under CRAWLERS["acm"]["output_dir"].
print("🚀 Starting ACM Digital Library crawling...")

with ACMCrawler(
    headless=CRAWL_CONFIG["headless"],
    output_dir=CRAWLERS["acm"]["output_dir"],
    max_threads=CRAWL_CONFIG["max_threads"],
    keyword_sets=keyword_sets,
) as crawler:
    # Context-managed so the crawler is released when the block exits.
    results = crawler.crawl_complete(
        use_multithreading=CRAWL_CONFIG["use_multithreading"],
    )
    # len(results) is reported as the number of keyword searches performed.
    print(f"🎉 ACM Crawling completed! Results: {len(results)} keyword searches")


In [None]:
# Crawl IEEE Xplore with the shared CRAWL_CONFIG options and the notebook-wide
# keyword_sets; results are written under CRAWLERS["ieee"]["output_dir"].
print("🚀 Starting IEEE Xplore crawling...")

with IEEECrawler(
    headless=CRAWL_CONFIG["headless"],
    output_dir=CRAWLERS["ieee"]["output_dir"],
    max_threads=CRAWL_CONFIG["max_threads"],
    keyword_sets=keyword_sets,
) as crawler:
    # Context-managed so the crawler is released when the block exits.
    results = crawler.crawl_complete(
        use_multithreading=CRAWL_CONFIG["use_multithreading"],
    )
    # len(results) is reported as the number of keyword searches performed.
    print(f"🎉 IEEE Crawling completed! Results: {len(results)} keyword searches")


In [None]:
# Crawl Science Direct with the shared CRAWL_CONFIG options and the notebook-wide
# keyword_sets; results are written under CRAWLERS["science_direct"]["output_dir"].
print("🚀 Starting Science Direct crawling...")

with ScienceDirectCrawler(
    headless=CRAWL_CONFIG["headless"],
    output_dir=CRAWLERS["science_direct"]["output_dir"],
    max_threads=CRAWL_CONFIG["max_threads"],
    keyword_sets=keyword_sets,
) as crawler:
    # Context-managed so the crawler is released when the block exits.
    results = crawler.crawl_complete(
        use_multithreading=CRAWL_CONFIG["use_multithreading"],
    )
    # len(results) is reported as the number of keyword searches performed.
    print(f"🎉 Science Direct Crawling completed! Results: {len(results)} keyword searches")


In [None]:
# Crawl Springer with the shared CRAWL_CONFIG options and the notebook-wide
# keyword_sets; results are written under CRAWLERS["springer"]["output_dir"].
print("🚀 Starting Springer crawling...")

with SpringerCrawler(
    headless=CRAWL_CONFIG["headless"],
    output_dir=CRAWLERS["springer"]["output_dir"],
    max_threads=CRAWL_CONFIG["max_threads"],
    keyword_sets=keyword_sets,
) as crawler:
    # Context-managed so the crawler is released when the block exits.
    results = crawler.crawl_complete(
        use_multithreading=CRAWL_CONFIG["use_multithreading"],
    )
    # len(results) is reported as the number of keyword searches performed.
    print(f"🎉 Springer Crawling completed! Results: {len(results)} keyword searches")


In [None]:
# Keyword-filtering stage: run KeywordFilter over the crawl results and keep
# the papers it reports as relevant. Produces `filtered_papers`, which the
# download cell consumes.
print("🔎 Starting keyword filtering...")

# Terms handed to the filter (lower-case, unquoted — unlike keyword_sets).
filter_keywords = [
    "software testing",
    "test automation",
    "llm",
    "large language model",
    "ai agent",
    "prompt engineering",
]

# Filter output goes to <output_dir>/filtered, created if missing.
filtered_output_dir = os.path.join(output_dir, "filtered")
os.makedirs(filtered_output_dir, exist_ok=True)

# NOTE(review): `input_csv` receives the directory `output_dir`, whereas the
# crawler cells are configured to write under 'output/<source>' — confirm the
# filter really finds the crawl results here.
keyword_filter = KeywordFilter(input_csv=output_dir, output_dir=filtered_output_dir)

# min_keyword_occurrences=2 — presumably the minimum number of keyword hits a
# paper needs to be kept; confirm against KeywordFilter.filter_papers.
filtered_papers = keyword_filter.filter_papers(
    keywords=filter_keywords,
    min_keyword_occurrences=2,
)
print(f"✅ Filtered {len(filtered_papers)} papers containing relevant keywords")


In [15]:
# 📥 Download stage: fetch the papers selected by the keyword-filtering cell.
# Prerequisite: `filtered_papers` must already exist in the kernel (it is
# assigned by the keyword-filtering cell); running this cell without it
# raises NameError.
print("📥 Starting paper download process...")

# Downloads land in <output_dir>/downloaded_papers, created if missing.
download_dir = os.path.join(output_dir, "downloaded_papers")
os.makedirs(download_dir, exist_ok=True)

# max_papers=50 limits how many papers are downloaded in one run.
downloader = PaperDownloader(input_files=filtered_papers, output_dir=download_dir, max_papers=50)

# Run the download and report how many papers came back.
downloaded_papers = downloader.download_papers()
print(f"📚 Downloaded {len(downloaded_papers)} papers for analysis")


📥 Starting paper download process...


NameError: name 'filtered_papers' is not defined

In [None]:
# 📊 Analyzing paper content
# Runs ContentAnalyzer over the downloaded papers, prints a short summary and
# saves the tabulated results to <output_dir>/analysis_summary.csv.
# Prerequisite: `download_dir` from the download cell.
print("🔬 Starting content analysis...")

# Create the analysis output folder up front, as the filtering and download
# cells do for theirs, rather than relying on the analyzer to create it.
analysis_dir = os.path.join(output_dir, "analysis")
os.makedirs(analysis_dir, exist_ok=True)

# Initialize the content analyzer
analyzer = ContentAnalyzer(
    input_dir=download_dir,
    output_dir=analysis_dir
)

# Define analysis parameters (all passed as keyword flags to analyze_papers)
analysis_params = {
    "extract_titles": True,
    "extract_abstracts": True,
    "extract_keywords": True,
    "summarize_content": True,
    "perform_sentiment_analysis": True,
    "extract_methodology": True
}

# Perform analysis
analysis_results = analyzer.analyze_papers(**analysis_params)

# Display summary of analysis
print(f"✅ Completed analysis of {len(analysis_results)} papers")

# Create a summary dataframe for visualization.
# Use an explicit length check: plain truthiness raises for DataFrame/array
# results, while len() is already relied on above.
if len(analysis_results) > 0:
    # assumes analysis_results is a collection of per-paper records — TODO confirm
    summary_df = pd.DataFrame(analysis_results)
    print("\n📈 Analysis Summary:")
    print(f"- Total papers analyzed: {len(summary_df)}")
    # The optional columns below are only summarized when the analyzer produced them.
    if 'methodology' in summary_df.columns:
        methodology_counts = summary_df['methodology'].value_counts()
        print(f"- Top methodologies: {methodology_counts.head(3).to_dict()}")
    if 'sentiment' in summary_df.columns:
        # assumes 'sentiment' holds numeric scores — TODO confirm
        sentiment_avg = summary_df['sentiment'].mean()
        print(f"- Average sentiment score: {sentiment_avg:.2f}")

    # Save analysis results
    summary_df.to_csv(os.path.join(output_dir, "analysis_summary.csv"), index=False)
    print("💾 Analysis summary saved to analysis_summary.csv")
