In [1]:
from DrissionPage import ChromiumPage, ChromiumOptions
import time
import pandas as pd
from urllib.parse import urljoin

In [2]:
def crawl_link(keyword: str, page: ChromiumPage, initial_delay: float = 2, page_delay: float = 5):
    """Collect article URLs from every result page of a Springer Link search.

    Args:
        keyword: Search query, inserted verbatim into the ``query`` URL
            parameter (no URL-encoding is applied here).
        page: Browser page used to load the search and click through the
            pagination.
        initial_delay: Seconds to sleep after the first page load.
        page_delay: Seconds to sleep before scraping each result page.

    Returns:
        Absolute article URLs in the order they were found (duplicates are
        kept). An empty list if loading the search page fails; if a later
        result page fails, the links gathered up to that point.
    """
    base_url = "https://link.springer.com"
    links = []

    try:
        # Initial search URL
        page.get(f'{base_url}/search?new-search=true&query={keyword}')

        # Wait for the page to load
        time.sleep(initial_delay)

        page_number = 1
        while True:
            try:
                # Wait for the current result page to render
                time.sleep(page_delay)

                # Every <a> inside a result card points at one article
                for anchor in page.eles("css:li.app-card-open a.app-card-open__link"):
                    href = anchor.attr('href')
                    if not href:
                        # An anchor without href must not abort the whole crawl
                        continue
                    # urljoin resolves relative paths and leaves absolute URLs as-is
                    links.append(urljoin(base_url, href))

                # No "next" link means this was the last result page
                next_button = page.ele("css:a.eds-c-pagination__link[rel='next']")
                if not next_button:
                    print(f"Reached last page for keyword: {keyword}")
                    break

                next_button.click()
                page_number += 1

            except Exception as e:
                print(f"Error processing page {page_number}: {str(e)}")
                break

    except Exception as e:
        print(f"Error searching for keyword {keyword}: {str(e)}")
        return []

    return links

In [3]:
import itertools

def combination_keywords(sets):
    """Build one query string per element of the Cartesian product of ``sets``.

    Each combination picks one term from every keyword set and joins the
    chosen terms with ' AND '. An empty (or falsy) ``sets`` yields an empty list.
    """
    queries = []
    if sets:
        for chosen_terms in itertools.product(*sets):
            queries.append(' AND '.join(chosen_terms))
    return queries

def generate_all_combinations(t2sql, security, llm):
    """Return the query strings for every required grouping of keyword sets.

    Groupings are expanded in this order: t2sql alone, t2sql + security,
    t2sql + llm, and t2sql + security + llm. Each grouping is expanded by
    ``combination_keywords`` and the results are concatenated.
    """
    groupings = (
        [t2sql],                  # t2sql only
        [t2sql, security],        # t2sql + security
        [t2sql, llm],             # t2sql + llm
        [t2sql, security, llm],   # t2sql + security + llm
    )
    return [
        query
        for grouping in groupings
        for query in combination_keywords(grouping)
    ]

In [4]:
def crawl_springer():
    """Crawl Springer Link for every keyword combination and save the links.

    For each query built by ``generate_all_combinations`` the article links
    returned by ``crawl_link`` are de-duplicated and, when non-empty, written
    to ``springer/crawl_by_<query>.csv``. The de-duplicated union of all links
    is written to ``springer/all_springer_papers.csv``.

    The output directory is created if missing, and the browser is always
    closed, even when a query raises.
    """
    import os

    output_dir = 'springer'
    # to_csv does not create missing parent directories
    os.makedirs(output_dir, exist_ok=True)

    # Terms are wrapped in double quotes so each one is sent as a quoted phrase
    t2sql = ['"text-to-sql"', '"nl2sql"', '"t2sql"', '"text2sql"', '"natural language to sql"',
             '"semantic parsing to sql"', '"nl to sql"']
    security = ['"security"', '"access control"', '"injection"', '"prompt injection"',
                '"defense"', '"attack"', '"vulnerability"']
    llm = ['"llm"', '"large language model"']

    keywords = generate_all_combinations(t2sql, security, llm)

    all_links = []
    browser = ChromiumPage()
    try:
        for keyword in keywords:

            print(f'{keyword} is proccessed......')

            # Spaces become '+' for the search URL's query parameter
            query = keyword.strip().replace(" ", "+")
            links = crawl_link(query, browser)

            partly = pd.DataFrame({'link': links}).drop_duplicates(subset=['link'])
            if not partly.empty:
                # File-system friendly name: no quotes, underscores for spaces
                file_stem = keyword.replace('"', '').replace(' ', '_')
                partly.to_csv(f'{output_dir}/crawl_by_{file_stem}.csv', index=False)

            all_links.extend(links)
    finally:
        # Close the browser even if a query failed part-way through
        browser.quit()

    full = pd.DataFrame({'link': all_links}).drop_duplicates(subset=['link'])
    full.to_csv(f'{output_dir}/all_springer_papers.csv', index=False)

    print(f"Total papers collected: {len(full)}")

In [5]:
# Run the link crawl; results are written as CSV files under springer/.
crawl_springer()

"text-to-sql" is proccessed......
Reached last page for keyword: "text-to-sql"
"nl2sql" is proccessed......
Reached last page for keyword: "nl2sql"
"t2sql" is proccessed......
Reached last page for keyword: "t2sql"
"text2sql" is proccessed......
Reached last page for keyword: "text2sql"
"natural language to sql" is proccessed......
Reached last page for keyword: "natural+language+to+sql"
"semantic parsing to sql" is proccessed......
Reached last page for keyword: "semantic+parsing+to+sql"
"nl to sql" is proccessed......
Reached last page for keyword: "nl+to+sql"
"text-to-sql" AND "security" is proccessed......
Reached last page for keyword: "text-to-sql"+AND+"security"
"text-to-sql" AND "access control" is proccessed......
Reached last page for keyword: "text-to-sql"+AND+"access+control"
"text-to-sql" AND "injection" is proccessed......
Reached last page for keyword: "text-to-sql"+AND+"injection"
"text-to-sql" AND "prompt injection" is proccessed......
Reached last page for keyword: "t

In [6]:
def extract_detail(page: ChromiumPage, link: str, delay: float = 2):
    """Open an article page and scrape its metadata.

    Args:
        page: Browser page used to load ``link`` and query its elements.
        link: URL of the article page.
        delay: Seconds to sleep after loading the page.

    Returns:
        Tuple ``(title, authors, pdf_link, abstract, doi_link, submitted_date)``.
        Any field whose element is not found is ``None``. ``pdf_link`` is
        also ``None`` when the download URL's path does not end in ``.pdf``.
    """
    from urllib.parse import urljoin, urlparse

    page.get(link)
    time.sleep(delay)

    # PDF link
    pdf_link_element = page.ele("css:a.c-pdf-download__link")
    pdf_link = pdf_link_element.attr('href') if pdf_link_element else None
    if pdf_link:
        pdf_link = urljoin("https://link.springer.com", pdf_link)
        # Check only the URL path so a query string after ".pdf" is accepted
        if not urlparse(pdf_link).path.lower().endswith(".pdf"):
            pdf_link = None
    else:
        # Missing anchor or empty href: no PDF available
        pdf_link = None
    print(f"PDF: {pdf_link}")

    # Title
    title_element = page.ele("css:h1.c-article-title", timeout=10)
    title = title_element.text if title_element else None
    print(f"Title: {title}")

    # Authors
    authors_element = page.ele("css:p.c-article-author-affiliation__authors-list")
    authors = authors_element.text if authors_element else None
    print(f"Authors: {authors}")

    # Abstract
    abstract_element = page.ele("css:div.c-article-section__content#Abs1-content")
    abstract = abstract_element.text if abstract_element else None
    print(f'Abstract: {abstract}')

    # Submitted date: the datetime attribute of the first <time> element
    # found inside the bibliographic information values
    submitted_element = page.ele("css:span.c-bibliographic-information__value time")
    submitted_date = submitted_element.attr('datetime') if submitted_element else None
    print(f"Submitted_date: {submitted_date}")

    # DOI: first bibliographic value whose text contains the "10.1007" prefix
    doi_link = None
    for ele in page.eles("css:span.c-bibliographic-information__value"):
        text = ele.text
        if text and "10.1007" in text:
            doi_link = text.strip()
            break
    if doi_link and not doi_link.startswith('http'):
        doi_link = "https://doi.org/" + doi_link
    print(f"DOI link: {doi_link}")

    return title, authors, pdf_link, abstract, doi_link, submitted_date

In [7]:
import pandas as pd
from DrissionPage import ChromiumPage

# Load the de-duplicated article links produced by the link crawl.
paper = pd.read_csv('springer/all_springer_papers.csv')

# One entry per link, kept in the same order as paper['link'].
titles = []
authors = []
pdf_links = []
abstracts = []
dois = []
submitted_dates = []

page = ChromiumPage()
for link in paper['link']:

    print(f"Processing link: {link}")

    try:
        title, author, pdf_link, abstract, doi, submitted_date = extract_detail(page, link)
    except Exception as e:
        # One bad page must not abort the run and discard everything
        # gathered so far; record empty values so rows stay aligned.
        print(f"Error extracting details from {link}: {str(e)}")
        title = author = pdf_link = abstract = doi = submitted_date = None

    titles.append(title)
    authors.append(author)
    pdf_links.append(pdf_link)
    abstracts.append(abstract)
    dois.append(doi)
    submitted_dates.append(submitted_date)

# Attach the extracted details as new columns next to each link.
paper['pdf_link'] = pdf_links
paper['title'] = titles
paper['authors'] = authors
paper['abstract'] = abstracts
paper['submitted'] = submitted_dates
paper['doi'] = dois
# Save the enriched table to a new CSV file in the working directory
# (the input file under springer/ is not overwritten).
paper.to_csv('all_springer_papers.csv', index=False)

Processing link: https://link.springer.com/article/10.1007/s42979-025-03662-6
PDF: None
Title: LLM-Based Text-to-SQL for Real-World Databases
Authors: Eduardo R. Nascimento, Grettel García, Yenier T. Izquierdo, Lucas Feijó, Gustavo M. C. Coelho, Melissa Lemos & Marco A. Casanova
Abstract: Text-to-SQL refers to the task defined as “given a relational databaseDand a natural language sentenceSthat describes a question on D, generate an SQL query QoverDthat expressesS”. Several LLM-based text-to-SQL tools, that is, text-to-SQL tools that explore Large Language Models (LLMs), emerged that outperformed previous approaches on well-known benchmarks. This article first shows that the performance of a selected set of LLM-based text-to-SQL tools is, however, significantly less when run on two challenging databases with a large number of tables, columns, and foreign keys. A closer analysis reveals that one of the problems lie in that the relational schema is an inappropriate specification of the d

In [8]:
# Close the browser session held by `page` once detail extraction is done.
page.quit()