In [1]:
from DrissionPage import ChromiumPage, ChromiumOptions
import time
import pandas as pd
import random

In [9]:
def extract_paper_details(list_item: ChromiumPage) -> dict:
    """
    Extracts details from a single Science Direct search result.

    Args:
        list_item: The result element (one ``li.ResultItem`` entry of the
            search page) to read the DOI, title, article link and PDF link from.

    Returns:
        dict: Keys ``link``, ``pdf_link``, ``title`` and ``doi``. Any field that
        could not be found is ``None``; ``doi`` is returned as a full
        ``https://doi.org/...`` URL when present.
    """
    # Give every field a default first: the original only bound `title`/`link`
    # when the title anchor existed, so the final `return` could raise NameError.
    doi = None
    title = None
    link = None
    pdf_link = None

    try:
        doi = list_item.attr('data-doi') or None

        # Look each element up once; a missing element is falsy.
        title_anchor = list_item.ele('css:a.anchor.result-list-title-link')
        if title_anchor:
            title = title_anchor.text or None
            link = title_anchor.attr('href') or None

        pdf_anchor = list_item.ele('css:a.anchor.download-link')
        if pdf_anchor:
            pdf_link = pdf_anchor.attr('href') or None

        print(f"Extracted DOI: {doi}")
        print(f"Extracted Title: {title}")
        print(f"Extracted Link: {link}")
        print(f"Extracted PDF Link: {pdf_link}")

    except Exception as e:
        # Best effort: keep whatever was extracted before the failure.
        print(f"Error extracting details: {e}")

    return {
        'link': link,
        'pdf_link': pdf_link,
        'title': title,
        'doi': f'https://doi.org/{doi}' if doi else None,
    }

In [10]:
def paper_crawl(keywords: str) -> list:
    """
    Crawls the Science Direct search results for the given keywords.

    Opens a new browser, loads the search page, extracts every result on the
    page and follows the "next page" link until none is left.

    Args:
        keywords (str): The search query, inserted as-is into the ``qs`` URL
            parameter (the caller is expected to have URL-encoded it).

    Returns:
        list: A list of dictionaries containing paper details. If an error
        occurs mid-crawl, the results collected so far are returned.
    """
    # List of paper data
    papers_data = []

    # Construct the search URL
    search_url = f'https://www.sciencedirect.com/search?qs={keywords}'
    print(search_url)

    # Initialize the page (default options, i.e. a visible browser window)
    page = ChromiumPage()
    try:
        page.get(search_url)
        cnt = 1
        while True:
            # Wait for the result list to be displayed
            page.wait.ele_displayed('css:li.ResultItem.col-xs-24.push-m', timeout=3)
            print(f"Page {cnt} loaded, extracting data...")

            # Target the result elements
            list_item = page.eles('css:li.ResultItem.col-xs-24.push-m')

            # Loop through each result and extract details
            for result in list_item:
                try:
                    papers_data.append(extract_paper_details(result))
                except Exception as e:
                    print(f"Error extracting data from a result: {e}")

            # Follow the "next page" link if there is one
            next_button = page.ele('css:a[data-aa-name="srp-next-page"]')
            if next_button:
                next_button.click()
                cnt += 1
                print("Loading next page...")
            else:
                print("No more pages to load.")
                break

    except Exception as e:
        print(f"Error: {e}")

    finally:
        # Only clean up here. A `return` inside `finally` would swallow
        # KeyboardInterrupt and make the crawl impossible to stop.
        page.quit()

    return papers_data

In [11]:
def save_to_csv(data: list, filename: str) -> None:
    """
    Save the extracted data to a CSV file inside the ``science_direct`` folder.

    Rows are de-duplicated on the ``link`` column (first occurrence kept) when
    that column exists. Nothing is written when ``data`` is empty. Errors are
    reported on stdout rather than raised.

    Args:
        data (list): List of dictionaries, one per paper.
        filename (str): File name (not a path) of the CSV to write.
    """
    import os  # local import: this notebook has no top-level `import os`

    if not data:
        # An empty frame has no 'link' column and would produce an unreadable CSV.
        print(f"No data to save for {filename}")
        return

    try:
        output_dir = "science_direct"
        # Create the target folder on first use instead of failing on a fresh checkout.
        os.makedirs(output_dir, exist_ok=True)

        df = pd.DataFrame(data)
        if 'link' in df.columns:
            df = df.drop_duplicates(subset=['link'])
        df.to_csv(os.path.join(output_dir, filename), index=False)
        print(f"Data saved to {filename}")
    except Exception as e:
        print(f"Error saving data to CSV: {e}")

In [12]:
import itertools

def combination_keywords(sets):
    """Build query strings from a list of keyword groups.

    Takes the Cartesian product of the groups and joins each resulting tuple
    with ' AND '. Returns an empty list when no groups are given.
    """
    if not sets:
        return []
    queries = []
    for picked in itertools.product(*sets):
        queries.append(' AND '.join(picked))
    return queries

def generate_all_combinations(t2sql, security, llm):
    """Build the full list of keyword queries for all required cases.

    The cases, in output order, are: t2sql alone, t2sql + security,
    t2sql + llm, and t2sql + security + llm.
    """
    # Each entry lists the keyword groups that are combined for one case.
    keyword_cases = (
        [t2sql],
        [t2sql, security],
        [t2sql, llm],
        [t2sql, security, llm],
    )

    # Flatten the per-case query lists into a single list, preserving case order.
    return [
        query
        for groups in keyword_cases
        for query in combination_keywords(groups)
    ]

In [13]:
def crawl_science_direct():
    """
    Entry point of the search pipeline.

    Generates every keyword query, crawls Science Direct for each one, writes
    one CSV per query that returned results, and finally writes a combined CSV.
    """

    # Each keyword is wrapped in double quotes so it is searched as an exact phrase
    t2sql = [
        '"text-to-sql"', '"nl2sql"', '"t2sql"', '"text2sql"',
        '"natural language to sql"', '"semantic parsing to sql"', '"nl to sql"',
    ]
    security = [
        '"security"', '"access control"', '"injection"', '"prompt injection"',
        '"defense"', '"attack"', '"vulnerability"',
    ]
    llm = ['"llm"', '"large language model"']

    keywords = generate_all_combinations(t2sql, security, llm)

    all_papers = []

    for keyword in keywords:
        print(f"Searching for papers related to: {keyword}")

        # Spaces are percent-encoded for the URL; everything else is sent as-is
        encoded_query = keyword.strip().replace(" ", "%20")
        papers_data = paper_crawl(encoded_query)

        if papers_data:
            # File-name-friendly form of the query: no quotes, underscores for spaces
            file_tag = keyword.replace('"', '').replace(' ', '_')
            save_to_csv(papers_data, f"crawl_by_{file_tag}.csv")

        all_papers.extend(papers_data)

    if not all_papers:
        return

    # Save the combined results of every query
    save_to_csv(all_papers, "all_science_direct_papers.csv")
    print(f"Extracted {len(all_papers)} papers.")


In [14]:
# Run the search crawl: one browser session per generated keyword query,
# writing per-query CSVs and the combined CSV via save_to_csv().
crawl_science_direct()

Searching for papers related to: "text-to-sql"
https://www.sciencedirect.com/search?qs="text-to-sql"
Page 1 loaded, extracting data...
Extracted DOI: 10.1016/j.patrec.2025.04.016
Extracted Title: SPS-SQL: Enhancing Text-to-SQL generation on small-scale LLMs with pre-synthesized queries
Extracted Link: https://www.sciencedirect.com/science/article/pii/S0167865525001497
Extracted PDF Link: None
Extracted DOI: 10.1016/j.patcog.2025.111800
Extracted Title: Graph-empowered Text-to-SQL generation on Electronic Medical Records
Extracted Link: https://www.sciencedirect.com/science/article/pii/S0031320325004601
Extracted PDF Link: None
Extracted DOI: 10.1016/j.nlp.2025.100135
Extracted Title: Fine-tuning text-to-SQL models with reinforcement-learning training objectives
Extracted Link: https://www.sciencedirect.com/science/article/pii/S2949719125000111
Extracted PDF Link: https://www.sciencedirect.com/science/article/pii/S2949719125000111/pdfft?md5=435c497abba7c389db6999717a31c263&pid=1-s2.0-S2

In [2]:
def _text_or_none(element):
    """Return the element's text, or None when the element was not found or has no text."""
    return (element.text or None) if element else None


def get_paper_details(link: str, page: ChromiumPage) -> tuple:
    """
    Get paper details from a specific Science Direct article page.

    Every field is looked up independently, so one missing element no longer
    discards the fields that were found.

    Args:
        link (str): URL of the article page.
        page (ChromiumPage): An already-open browser page, reused across calls.

    Returns:
        tuple: ``(title, fullname, doi, abstract_text, dates)``. Each item is a
        string or ``None`` when not found. All five are ``None`` if the page
        could not be processed at all.
    """

    try:
        page.get(link)
        time.sleep(3)

        # Each element is looked up once; a missing element is falsy.
        title = _text_or_none(page.ele('css:.title-text'))

        # ele() yields the first match — presumably the first listed author.
        given_name = _text_or_none(page.ele('css:.given-name'))
        sur_name = _text_or_none(page.ele('css:.text.surname'))
        fullname = given_name + ' ' + sur_name if given_name and sur_name else None

        # Guard on the same element that is dereferenced (the original checked
        # '.anchor-text' but then read the href of the DOI anchor).
        doi_anchor = page.ele('css:.anchor.doi.anchor-primary')
        doi = (doi_anchor.attr('href') or None) if doi_anchor else None

        abstract_text = None
        abstract_element = page.ele('css:.abstract.author')
        if abstract_element:
            abstract_text = _text_or_none(abstract_element.ele('css:.u-margin-s-bottom'))

        # Step 1: Click the "Show More" button, if the page has one
        show_more = page.ele("#show-more-btn")
        if show_more:
            show_more.click()
            # Step 2: Give the dates paragraph time to render after the click
            time.sleep(1)

        # Step 3: Select the banner element by its ID
        dates = None
        banner = page.ele("#banner")
        if banner:
            # Step 4: The dates sit in a <p>, with a <div> tried as fallback
            dates_paragraph = (
                banner.ele("css:p.u-margin-s-bottom", index=1)
                or banner.ele("css:div.u-margin-s-bottom", index=1)
            )
            # Step 5: Extract the text with the dates
            dates = _text_or_none(dates_paragraph)

        return title, fullname, doi, abstract_text, dates
    except Exception as e:
        print(f"Error: {e}")
        return None, None, None, None, None

In [3]:
# Load the combined search results that crawl_science_direct() saved via save_to_csv().
df = pd.read_csv("science_direct/all_science_direct_papers.csv")

In [4]:
# Per-paper fields scraped from each article page, in the same order as df['link'].
title_lst = []
fullname_lst = []
doi_lst = []
abstract_lst = []
dates = []

# NOTE(review): `option` is configured for headless mode but is never passed to
# ChromiumPage(), so the browser below starts with default settings — confirm
# whether headless mode was intended before wiring it in.
option = ChromiumOptions()
option.headless(on_off=True)
# Initialize Chromium browser (one session reused for every article)
page = ChromiumPage()

try:
    for link in df['link']:
        print(f"Processing link: {link}")
        title, fullname, doi, abstract_text, date = get_paper_details(link=link, page=page)
        title_lst.append(title)
        fullname_lst.append(fullname)
        doi_lst.append(doi)
        abstract_lst.append(abstract_text)
        dates.append(date)
finally:
    # Always close the browser, even if the loop is interrupted.
    page.quit()

print("All paper details extracted.")

df['title_re'] = title_lst
df['fullname_re'] = fullname_lst
df['doi_re'] = doi_lst
df['abstract_re'] = abstract_lst
df['submitted'] = dates
# NOTE(review): this writes to the working directory, not to science_direct/
# where the input was read from — confirm that is intended.
df.to_csv("all_science_direct_papers.csv", index=False)

Processing link: https://www.sciencedirect.com/science/article/pii/S0167865525001497
Processing link: https://www.sciencedirect.com/science/article/pii/S0031320325004601
Processing link: https://www.sciencedirect.com/science/article/pii/S2949719125000111
Processing link: https://www.sciencedirect.com/science/article/pii/S0925231224020642
Processing link: https://www.sciencedirect.com/science/article/pii/S0950705124013315
Processing link: https://www.sciencedirect.com/science/article/pii/S0306457325000780
Processing link: https://www.sciencedirect.com/science/article/pii/S0306457324003376
Processing link: https://www.sciencedirect.com/science/article/pii/S2291969421000843
Processing link: https://www.sciencedirect.com/science/article/pii/S2352340922004152
Processing link: https://www.sciencedirect.com/science/article/pii/S092523122101345X
Processing link: https://www.sciencedirect.com/science/article/pii/S089360802100277X
Processing link: https://www.sciencedirect.com/science/article/pi