In [2]:
import pandas as pd
import os
import json
import importlib
from pathlib import Path
from crawl_tools.acm_crawler import ACMCrawler
from crawl_tools.arxiv_crawler import ArxivCrawler
from crawl_tools.ieee_crawler import IEEECrawler
from crawl_tools.mdpi_crawler import MDPICrawler
from crawl_tools.science_direct_crawler import ScienceDirectCrawler
from crawl_tools.springer_crawler import SpringerCrawler
from filter_tools.content_analysis import ContentAnalyzer
from filter_tools.download_papers import PaperDownloader
from filter_tools.keywords_filter_paper import KeywordFilter

# ======== GLOBAL CONFIGURATION VARIABLES ========
# These variables can be modified to test specific parts of the pipeline

# Directory structure
BASE_OUTPUT_DIR = "output"
CRAWL_SOURCE = "arxiv"  # Change this to test different crawlers
INPUT_DIR = os.path.join(BASE_OUTPUT_DIR, CRAWL_SOURCE)
FILTERED_OUTPUT_DIR = os.path.join(BASE_OUTPUT_DIR, f"filtered_{CRAWL_SOURCE}")
DOWNLOAD_DIR = os.path.join(FILTERED_OUTPUT_DIR, "downloaded_papers")
ANALYSIS_DIR = os.path.join(FILTERED_OUTPUT_DIR, "analysis")
SUMMARY_DIR = os.path.join(FILTERED_OUTPUT_DIR, "summary")

# Ensure directories exist
for directory in [INPUT_DIR, FILTERED_OUTPUT_DIR, DOWNLOAD_DIR, ANALYSIS_DIR, SUMMARY_DIR]:
    os.makedirs(directory, exist_ok=True)

# Keywords for crawling
KEYWORD_SETS = {
    'functional_testing': ['"Functional Testing"',
                          '"Software Testing"',
                          '"Test Case Generation"',
                          '"Test Data Generation"',
                          '"Test Automation Frameworks"',
                          '"UI (User Interface) Testing"',
                          '"API (Application Programming Interface) Testing"',
                          '"Test Oracle Problem"',
                          '"Test Coverage"',
                          '"Test Maintenance"',
                          '"Bug Detection"',
                          '"Software Quality Assurance" (SQA)',
                          '"Regression Testing"'],
    'llm': ['"llm"', '"large language model"']
}

# Keywords for filtering
FILTER_KEYWORDS = [
    "software testing", "test automation", "llm", "large language model",
    "ai agent", "prompt engineering", "test case generation", "functional testing",
    "test oracle", "test coverage", "bug detection", "regression testing"
]

# API keys for content analysis (leave empty to use environment variable)
API_KEYS = []

# Similarity thresholds for keyword matching
SIMILARITY_THRESHOLDS = {
    "exact": 100,
    "high": 90,
    "medium": 75,
    "low": 65
}

# Paper download limits
MAX_PAPERS_TO_DOWNLOAD = 20
PAPERS_START_DATE = "2024-01-01"  # Focus on recent papers

# Crawler configuration
CRAWL_CONFIG = {
    'headless': False,     
    'max_threads': 4,          
    'use_multithreading': True 
}

# File paths for specific crawl outputs
ALL_PAPERS_CSV = os.path.join(INPUT_DIR, f"all_{CRAWL_SOURCE}_papers.csv")
SUMMARY_FILE = os.path.join(SUMMARY_DIR, "functional_testing_llm_true.csv")

# Helper variables for simulation when cells aren't run
filtered_papers_fallback = [f for f in os.listdir(INPUT_DIR) 
                          if f.endswith('.csv') and f.startswith('crawl_by_')] if os.path.exists(INPUT_DIR) else []
filtered_papers_fallback = [os.path.join(INPUT_DIR, f) for f in filtered_papers_fallback]

# Helper function to check and get variables without errors
def get_var(var_name, default=None):
    """Helper function to safely get variables"""
    if var_name in globals():
        return globals()[var_name]
    else:
        return default

In [3]:
output_dir = "crawl_results"
max_threads = 4
headless = False
keyword_sets = {
    'functional testing': ['"Functional Testing"',
'"Software Testing"',
'"Test Case Generation"',
'"Test Data Generation"',
'"Test Automation Frameworks"',
'"UI (User Interface) Testing"',
'"API (Application Programming Interface) Testing"',
'"Test Oracle Problem"',
'"Test Coverage"',
'"Test Maintenance"',
'"Bug Detection"',
'"Software Quality Assurance" (SQA)',
'"Regression Testing"'],
    'llm': ['"llm"', '"large language model"']
}

os.makedirs(output_dir, exist_ok=True)
CRAWL_CONFIG = {
    'headless': False,     
    'max_threads': 4,          
    'use_multithreading': True 
}
CRAWLERS = {
    'acm': {
        'output_dir': 'output/acm'
    },
    'arxiv': {
        'output_dir': 'output/arxiv'
    },
    'ieee': {
        'output_dir': 'output/ieee'
    },
    'mdpi': {
        'output_dir': 'output/mdpi'
    },
    'science_direct': {
        'output_dir': 'output/science_direct'
    },
    'springer': {
        'output_dir': 'output/springer'
    }
}

print(f"📋 Đã cấu hình {len(CRAWLERS)} crawler tools")


📋 Đã cấu hình 6 crawler tools


In [None]:
with ArxivCrawler(
            headless=CRAWL_CONFIG['headless'],
            output_dir=CRAWLERS['arxiv']['output_dir'],
            max_threads=CRAWL_CONFIG['max_threads'],
            keyword_sets=keyword_sets
        ) as crawler:
            results = crawler.crawl_complete(use_multithreading=CRAWL_CONFIG['use_multithreading'])
            print(f"🎉 Crawling completed! Results: {len(results)} keyword searches")

In [None]:
# 🎯 ACM Digital Library Crawler (với xử lý cookie tự động)
print("🚀 Starting ACM Digital Library crawling...")

with ACMCrawler(
    headless=CRAWL_CONFIG['headless'],
    output_dir=CRAWLERS['acm']['output_dir'],
    max_threads=CRAWL_CONFIG['max_threads'],
    keyword_sets=keyword_sets
) as crawler:
    results = crawler.crawl_complete(use_multithreading=CRAWL_CONFIG['use_multithreading'])
    print(f"🎉 ACM Crawling completed! Results: {len(results)} keyword searches")


In [None]:
# 🎯 IEEE Xplore Crawler
print("🚀 Starting IEEE Xplore crawling...")

with IEEECrawler(
    headless=CRAWL_CONFIG['headless'],
    output_dir=CRAWLERS['ieee']['output_dir'],
    max_threads=CRAWL_CONFIG['max_threads'],
    keyword_sets=keyword_sets
) as crawler:
    results = crawler.crawl_complete(use_multithreading=CRAWL_CONFIG['use_multithreading'])
    print(f"🎉 IEEE Crawling completed! Results: {len(results)} keyword searches")


In [None]:
# 🎯 Science Direct Crawler
print("🚀 Starting Science Direct crawling...")

with ScienceDirectCrawler(
    headless=CRAWL_CONFIG['headless'],
    output_dir=CRAWLERS['science_direct']['output_dir'],
    max_threads=CRAWL_CONFIG['max_threads'],
    keyword_sets=keyword_sets
) as crawler:
    results = crawler.crawl_complete(use_multithreading=CRAWL_CONFIG['use_multithreading'])
    print(f"🎉 Science Direct Crawling completed! Results: {len(results)} keyword searches")


In [None]:
# 🎯 Springer Crawler (với xử lý cookie tự động)
print("🚀 Starting Springer crawling...")

with SpringerCrawler(
    headless=CRAWL_CONFIG['headless'],
    output_dir=CRAWLERS['springer']['output_dir'],
    max_threads=CRAWL_CONFIG['max_threads'],
    keyword_sets=keyword_sets
) as crawler:
    results = crawler.crawl_complete(use_multithreading=CRAWL_CONFIG['use_multithreading'])
    print(f"🎉 Springer Crawling completed! Results: {len(results)} keyword searches")


In [None]:
# 🔍 Using KeywordFilter to filter papers by keywords
print("🔎 Starting keyword filtering...")

# Use our global configuration variables
print(f"Input Directory: {INPUT_DIR}")
print(f"Output Directory: {FILTERED_OUTPUT_DIR}")

# Initialize the keyword filter with the configured directories
keyword_filter = KeywordFilter(
    input_dir=INPUT_DIR,
    output_dir=FILTERED_OUTPUT_DIR
)

# Set similarity thresholds from our configuration
keyword_filter.set_similarity_thresholds(
    exact=SIMILARITY_THRESHOLDS["exact"],
    high=SIMILARITY_THRESHOLDS["high"], 
    medium=SIMILARITY_THRESHOLDS["medium"],
    low=SIMILARITY_THRESHOLDS["low"]
)

# Show the keywords we're using
print(f"Using {len(FILTER_KEYWORDS)} keywords for filtering:")
print(f"- {', '.join(FILTER_KEYWORDS[:5])}... and {len(FILTER_KEYWORDS)-5} more")

# Run the filtering - this is the main function we're testing
try:
    # If the input directory doesn't exist or is empty, we'll simulate the output
    if not os.path.exists(INPUT_DIR) or len(os.listdir(INPUT_DIR)) == 0:
        print(f"⚠️ Input directory {INPUT_DIR} doesn't exist or is empty. Using simulated data.")
        filtered_papers = []
    else:
        # Apply filtering with configurable minimum occurrences
        filtered_papers = keyword_filter.filter_papers(
            keywords=FILTER_KEYWORDS, 
            min_keyword_occurrences=1
        )
    
    print(f"✅ Filtered {len(filtered_papers)} papers containing relevant keywords")
    
    # Also process all papers if the file exists
    if os.path.exists(ALL_PAPERS_CSV):
        print(f"\n🔍 Running analysis on all papers dataset: {ALL_PAPERS_CSV}")
        
        # Process the entire dataset
        summaries = keyword_filter.run_pipeline(
            input_csv=ALL_PAPERS_CSV,
            output_dir=SUMMARY_DIR,
            min_keyword_occurrences=1
        )
        
        print(f"📊 Complete analysis saved to {SUMMARY_DIR}")
    else:
        print(f"⚠️ All papers CSV not found at {ALL_PAPERS_CSV}")
        print("You can run the crawler first or manually place a CSV file there.")
except Exception as e:
    print(f"❌ Error during filtering: {e}")
    # Use our fallback values
    print("Using fallback filtered papers...")
    filtered_papers = filtered_papers_fallback


In [4]:
# 📥 Downloading papers for further analysis
print("📥 Starting paper download process...")

print(f"Download Directory: {DOWNLOAD_DIR}")

# Initialize the paper downloader with the output directory
downloader = PaperDownloader(output_dir=DOWNLOAD_DIR)

# Get filtered papers from previous step or use fallback
filtered_papers_to_use = get_var('filtered_papers', filtered_papers_fallback)

def download_from_csv_files(csv_files):
    """Helper function to download papers from multiple CSV files"""
    if not csv_files or len(csv_files) == 0:
        print("⚠️ No CSV files provided for downloading")
        return []
        
    filtered_dfs = []
    for paper_path in csv_files:
        try:
            if os.path.exists(paper_path):
                paper_df = pd.read_csv(paper_path)
                filtered_dfs.append(paper_df)
                print(f"Loaded {len(paper_df)} papers from {os.path.basename(paper_path)}")
            else:
                print(f"File not found: {paper_path}")
        except Exception as e:
            print(f"Error loading {paper_path}: {e}")
    
    downloaded_results = []
    if filtered_dfs:
        try:
            # Combine all DataFrames
            combined_df = pd.concat(filtered_dfs, ignore_index=True)
            
            # Remove duplicates based on pdf_link
            if 'pdf_link' in combined_df.columns:
                # Get papers with valid PDF links
                valid_links_df = combined_df[combined_df['pdf_link'].notna()]
                unique_df = valid_links_df.drop_duplicates(subset=['pdf_link'])
                
                if len(unique_df) > 0:
                    # Apply date filtering if 'submitted' column exists
                    if 'submitted' in unique_df.columns:
                        try:
                            unique_df['submitted'] = pd.to_datetime(unique_df['submitted'], errors='coerce')
                            unique_df = unique_df.sort_values(by='submitted', ascending=False)
                            unique_df = unique_df[unique_df['submitted'] >= pd.to_datetime(PAPERS_START_DATE)]
                            print(f"Filtered to {len(unique_df)} papers published after {PAPERS_START_DATE}")
                        except Exception as e:
                            print(f"Date filtering error: {e}")
                
                    # Limit to avoid too many downloads
                    if len(unique_df) > MAX_PAPERS_TO_DOWNLOAD:
                        print(f"Limiting to {MAX_PAPERS_TO_DOWNLOAD} most recent papers")
                        unique_df = unique_df.head(MAX_PAPERS_TO_DOWNLOAD)
                        
                    # Download the papers
                    if len(unique_df) > 0:
                        print(f"Starting download of {len(unique_df)} papers...")
                        downloaded_results = downloader.download_papers(unique_df, pdf_link_column='pdf_link')
                        print(f"📚 Downloaded {len(downloaded_results)} papers")
                    else:
                        print("⚠️ No papers to download after filtering")
                else:
                    print("⚠️ No unique PDF links found")
            else:
                print("⚠️ No 'pdf_link' column found in papers")
        except Exception as e:
            print(f"Error processing DataFrames: {e}")
    return downloaded_results

try:
    # First try to use filtered papers from previous step
    if filtered_papers_to_use and len(filtered_papers_to_use) > 0:
        print(f"Using {len(filtered_papers_to_use)} filtered paper files from previous step")
        downloaded_papers = download_from_csv_files(filtered_papers_to_use)
        
        # If we didn't get any downloads, try the summary file
        if len(downloaded_papers) == 0:
            print(f"\nAttempting to use summary file instead...")
            if os.path.exists(SUMMARY_FILE):
                print(f"Found summary file: {SUMMARY_FILE}")
                downloaded_papers = download_from_csv_files([SUMMARY_FILE])
            else:
                print(f"⚠️ Summary file not found at: {SUMMARY_FILE}")
    else:
        print("No filtered papers available from previous step")
        
        # Try using the summary file if it exists
        if os.path.exists(SUMMARY_FILE):
            print(f"Using summary file: {SUMMARY_FILE}")
            downloaded_papers = download_from_csv_files([SUMMARY_FILE])
        # Otherwise try using any CSV files in the input directory
        elif os.path.exists(INPUT_DIR):
            csv_files = [os.path.join(INPUT_DIR, f) for f in os.listdir(INPUT_DIR) 
                        if f.endswith('.csv') and 'crawl_by_' in f]
            if csv_files:
                print(f"Using {len(csv_files)} CSV files found in {INPUT_DIR}")
                downloaded_papers = download_from_csv_files(csv_files)
            else:
                print(f"⚠️ No suitable CSV files found in {INPUT_DIR}")
                downloaded_papers = []
        else:
            print(f"⚠️ No input sources available. Please run keyword filtering first or manually place CSV files.")
            downloaded_papers = []

except Exception as e:
    print(f"❌ Error during paper download: {e}")
    downloaded_papers = []

# Save the variable for use in the next cell
print(f"\n📝 Total downloaded papers: {len(downloaded_papers)}")
if len(downloaded_papers) > 0:
    # Check if any PDFs were actually saved to disk
    pdf_files = [f for f in os.listdir(DOWNLOAD_DIR) if f.lower().endswith('.pdf')] if os.path.exists(DOWNLOAD_DIR) else []
    print(f"📁 PDF files in download directory: {len(pdf_files)}")
    
    # If no PDFs are in the directory, we need to create a dummy example for testing
    if len(pdf_files) == 0 and len(downloaded_papers) > 0:
        print("⚠️ No PDF files found in download directory despite successful downloads.")
        print("This might be due to download errors or permission issues.")


📥 Starting paper download process...
Download Directory: output\filtered_arxiv\downloaded_papers
Using 21 filtered paper files from previous step
Loaded 1500 papers from crawl_by_large_language_model.csv
Loaded 38 papers from crawl_by_large_language_model_AND_Bug_Detection.csv
Loaded 8 papers from crawl_by_large_language_model_AND_Functional_Testing.csv
Loaded 7 papers from crawl_by_large_language_model_AND_Regression_Testing.csv
Loaded 2 papers from crawl_by_large_language_model_AND_Software_Quality_Assurance_(SQA).csv
Loaded 74 papers from crawl_by_large_language_model_AND_Software_Testing.csv
Loaded 55 papers from crawl_by_large_language_model_AND_Test_Case_Generation.csv
Loaded 28 papers from crawl_by_large_language_model_AND_Test_Coverage.csv
Loaded 3 papers from crawl_by_large_language_model_AND_Test_Data_Generation.csv
Loaded 1 papers from crawl_by_large_language_model_AND_Test_Maintenance.csv
Loaded 350 papers from crawl_by_llm.csv
Loaded 46 papers from crawl_by_llm_AND_Bug_Det

  unique_df['submitted'] = pd.to_datetime(unique_df['submitted'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_df['submitted'] = pd.to_datetime(unique_df['submitted'], errors='coerce')


In [5]:
# 📊 Analyzing paper content
print("🔬 Starting content analysis...")
print(f"Analysis Directory: {ANALYSIS_DIR}")

# Get reference to downloaded papers or create empty list for testing
downloaded_papers_to_use = get_var('downloaded_papers', [])

def analyze_pdf_directory(pdf_dir, output_dir, use_api=True):
    """Helper function to analyze PDFs in a directory with fallback options"""
    results = {'analyzed_files': [], 'analysis_data': []}
    
    # Check if directory exists and contains PDF files
    if not os.path.exists(pdf_dir):
        print(f"⚠️ PDF directory not found: {pdf_dir}")
        return results
        
    pdf_files = [f for f in os.listdir(pdf_dir) if f.lower().endswith('.pdf')]
    if not pdf_files:
        print(f"⚠️ No PDF files found in {pdf_dir}")
        return results
        
    print(f"Found {len(pdf_files)} PDF files to analyze")
    
    # Check if we should try to use the Gemini API
    if use_api:
        try:
            # Check if the google.genai module is available
            import importlib.util
            genai_spec = importlib.util.find_spec("google.genai")
            has_genai = genai_spec is not None
            
            if has_genai:
                # Try to get API key from config or environment
                api_keys_to_use = API_KEYS
                if not api_keys_to_use:
                    import os
                    env_key = os.environ.get('GOOGLE_API_KEY')
                    if env_key:
                        api_keys_to_use = [env_key]
                        print("Using API key from GOOGLE_API_KEY environment variable")
                
                if api_keys_to_use:
                    # Initialize analyzer and process files
                    print("Using Gemini API for content analysis")
                    analyzer = ContentAnalyzer(api_keys=api_keys_to_use)
                    results['analyzed_files'] = analyzer.process_directory(pdf_dir)
                    
                    # If successful, extract data from the JSON files
                    if results['analyzed_files']:
                        print(f"✅ Successfully analyzed {len(results['analyzed_files'])} papers with Gemini API")
                        
                        for json_file in results['analyzed_files']:
                            try:
                                with open(json_file, 'r', encoding='utf-8') as f:
                                    data = json.load(f)
                                    paper_info = {
                                        'title': data.get('paper_identification', {}).get('title', 'Unknown'),
                                        'authors': data.get('paper_identification', {}).get('authors', 'Unknown'),
                                        'publication': data.get('paper_identification', {}).get('publication_venue_year', 'Unknown'),
                                        'problem': data.get('abstract_analysis', {}).get('problem_statement', {}).get('analysis', ''),
                                        'methodology': data.get('methodology_analysis', {}).get('ai_techniques_used', {}).get('analysis', ''),
                                        'results': data.get('results_analysis', {}).get('quantitative_results', {}).get('analysis', '')
                                    }
                                    results['analysis_data'].append(paper_info)
                            except Exception as e:
                                print(f"Error processing {json_file}: {e}")
                        
                        return results
            
            print("Gemini API not available or not configured. Falling back to basic extraction.")
        except Exception as e:
            print(f"Error with Gemini API: {e}")
            print("Falling back to basic extraction")
    
    # Basic extraction fallback
    try:
        import PyPDF2
        print(f"Performing basic extraction from {len(pdf_files)} PDF files")
        
        for pdf_file in pdf_files:
            pdf_path = os.path.join(pdf_dir, pdf_file)
            try:
                with open(pdf_path, 'rb') as file:
                    reader = PyPDF2.PdfReader(file)
                    
                    # Get text from first page
                    text = ""
                    if reader.pages:
                        text = reader.pages[0].extract_text() or ""
                    
                    # Try to extract title and other info from text
                    lines = text.split('\n')
                    probable_title = lines[0] if lines else pdf_file
                    
                    file_info = {
                        'title': probable_title[:100],
                        'filename': pdf_file,
                        'authors': ' '.join(lines[1:3]) if len(lines) > 1 else 'Unknown',
                        'publication': 'Unknown',
                        'pages': len(reader.pages),
                        'problem': text[:200] + "..." if len(text) > 200 else text,
                        'methodology': 'Basic extraction only',
                        'results': 'Basic extraction only'
                    }
                    results['analysis_data'].append(file_info)
                    # Save the extraction path
                    results['analyzed_files'].append(pdf_path)
            except Exception as e:
                print(f"Error extracting from {pdf_file}: {e}")
    except ImportError:
        print("⚠️ PyPDF2 module not found. Cannot perform basic extraction.")
        print("To install: pip install PyPDF2")
        
    return results

try:
    # First check if we have actual downloaded papers from the previous step
    if downloaded_papers_to_use and len(downloaded_papers_to_use) > 0:
        print(f"Using {len(downloaded_papers_to_use)} downloaded papers from previous step")
    
    # Try to analyze PDFs in the download directory
    if os.path.exists(DOWNLOAD_DIR):
        analysis_results = analyze_pdf_directory(DOWNLOAD_DIR, ANALYSIS_DIR)
        
        # Create summary dataframe if we have data
        if analysis_results['analysis_data']:
            analysis_data = analysis_results['analysis_data']
            summary_df = pd.DataFrame(analysis_data)
            print(f"\n📈 Analysis Summary:")
            print(f"- Total papers analyzed: {len(summary_df)}")
            
            # Save analysis results
            summary_csv = os.path.join(ANALYSIS_DIR, "analysis_summary.csv")
            summary_df.to_csv(summary_csv, index=False)
            print(f"💾 Analysis summary saved to {summary_csv}")
        else:
            print("⚠️ No analysis data was collected")
    else:
        print(f"⚠️ Download directory not found: {DOWNLOAD_DIR}")
        
        # Try to create test data for demonstration
        print("\n🔍 Creating demo analysis data for testing...")
        demo_data = [
            {
                'title': 'Example Paper 1: Using LLMs for Test Generation',
                'authors': 'Jane Smith, John Doe',
                'publication': 'Conference on AI Testing, 2023',
                'problem': 'This paper addresses the challenge of automating test case generation using LLMs',
                'methodology': 'The authors use a fine-tuned GPT model with prompt engineering techniques',
                'results': 'The approach achieved 85% accuracy in generating valid test cases'
            },
            {
                'title': 'Example Paper 2: AI-based Test Oracle Generation',
                'authors': 'Alex Johnson, Maria Garcia',
                'publication': 'Journal of Software Testing, 2022',
                'problem': 'Creating reliable test oracles remains challenging for complex software systems',
                'methodology': 'The paper proposes a novel chain-of-thought approach for test oracle generation',
                'results': 'The proposed method reduced false positives by 40% compared to baseline approaches'
            }
        ]
        
        # Save demo data
        demo_df = pd.DataFrame(demo_data)
        demo_csv = os.path.join(ANALYSIS_DIR, "demo_analysis.csv")
        demo_df.to_csv(demo_csv, index=False)
        print(f"💾 Demo analysis data saved to {demo_csv}")

except Exception as e:
    print(f"❌ Error during content analysis: {e}")


🔬 Starting content analysis...
Analysis Directory: output\filtered_arxiv\analysis
❌ Error during content analysis: cannot access local variable 'os' where it is not associated with a value
