In [7]:
import time
from pathlib import Path
from urllib.parse import quote

import pandas as pd
from DrissionPage import ChromiumPage, ChromiumOptions

In [8]:
def crawl_title_link(keyword: str, page: ChromiumPage):
    """Collect result links from every IEEE Xplore search page for ``keyword``.

    Args:
        keyword: Search expression, e.g. ``'"text-to-sql" AND "security"'``.
            It is percent-encoded before being placed in the query string, so
            quotes, spaces and characters such as ``&`` are passed through
            as part of the query text.
        page: Browser page used to load the results and click through the
            pagination.

    Returns:
        The ``href`` values of the ``a.fw-bold`` anchors that have both an
        href and text, in page order (duplicates are not removed). If a
        results page fails midway, the links gathered so far are returned;
        if the initial search page cannot be opened, an empty list is returned.
    """
    links = []

    try:
        # Open the IEEE Xplore search results for the encoded keyword.
        encoded_keyword = quote(keyword, safe='')
        search_url = f'https://ieeexplore.ieee.org/search/searchresult.jsp?queryText={encoded_keyword}'
        page.get(search_url)

        # Fixed pause to let the first page load.
        time.sleep(2)

        cnt = 1
        while True:
            try:
                # Fixed pause to let the current results page load.
                time.sleep(5)

                # Only the a.fw-bold anchors are treated as results. There is
                # deliberately no fallback to arbitrary anchors: page.ele()
                # yields a single element, not a list of results.
                tags = page.eles('css:a.fw-bold')
                if not tags:
                    print(f"No results found on page {cnt}")
                    break

                for a_tag in tags:
                    href = a_tag.attr('href')
                    title = a_tag.text
                    if href and title:
                        links.append(href)
                        print(f'Page {cnt} - Title: {title}')

                # A falsy lookup result means there is no further page.
                next_button = page.ele('css:.next-btn')
                if not next_button:
                    print(f"Reached last page for keyword: {keyword}")
                    break

                next_button.click()
                cnt += 1

            except Exception as e:
                # Best effort: stop paginating but keep what was collected.
                print(f"Error processing page {cnt}: {str(e)}")
                break

    except Exception as e:
        print(f"Error searching for keyword {keyword}: {str(e)}")
        return []

    return links

In [9]:
import itertools

def combination_keywords(sets):
    """Build query strings from the Cartesian product of keyword groups.

    Args:
        sets: A sequence of keyword groups (each an iterable of strings).

    Returns:
        One string per combination, with the chosen keyword from each group
        joined by ``' AND '``. An empty or missing ``sets`` yields ``[]``.
    """
    queries = []
    # Guard first: itertools.product() with no groups would yield one empty
    # combination, which would otherwise turn into a single '' query.
    if sets:
        for combo in itertools.product(*sets):
            queries.append(' AND '.join(combo))
    return queries

def generate_all_combinations(t2sql, security, llm):
    """Build every search query for the required keyword-group pairings.

    Queries are produced, in this order, for: ``t2sql`` alone,
    ``t2sql`` + ``security``, ``t2sql`` + ``llm``, and
    ``t2sql`` + ``security`` + ``llm``.

    Args:
        t2sql: Text-to-SQL keyword group.
        security: Security keyword group.
        llm: LLM keyword group.

    Returns:
        A flat list of query strings as produced by ``combination_keywords``
        for each pairing.
    """
    groupings = (
        [t2sql],                   # t2sql only
        [t2sql, security],         # t2sql + security
        [t2sql, llm],              # t2sql + llm
        [t2sql, security, llm],    # t2sql + security + llm
    )

    # Flatten the per-pairing results into one list, preserving order.
    return [
        query
        for grouping in groupings
        for query in combination_keywords(grouping)
    ]

In [10]:
def crawl_ieee():
    """Crawl IEEE Xplore for every keyword combination and save the links.

    For each query built by ``generate_all_combinations`` the result links are
    collected with ``crawl_title_link``. Non-empty results are de-duplicated
    and written to ``ieee/crawl_by_<keyword>.csv``; the de-duplicated union of
    all links is written to ``ieee/all_ieee_papers.csv``.

    The ``ieee`` output directory is created when missing, and the browser is
    closed even if the crawl raises.
    """
    output_dir = Path('ieee')
    output_dir.mkdir(parents=True, exist_ok=True)

    # Each keyword is wrapped in double quotes so it is searched as an exact phrase.
    t2sql = ['"text-to-sql"', '"nl2sql"', '"t2sql"', '"text2sql"', '"natural language to sql"',
             '"semantic parsing to sql"', '"nl to sql"']
    security = ['"security"', '"access control"', '"injection"', '"prompt injection"', '"defense"', '"attack"', '"vulnerability"']
    llm = ['"llm"', '"large language model"']

    keywords = generate_all_combinations(t2sql, security, llm)

    # Links from every keyword, duplicates included until the final step.
    all_links = []

    # NOTE(review): headless mode would be configured through
    # ChromiumOptions().headless(on_off=True); how the options object is wired
    # into the page should be confirmed against the DrissionPage docs.
    browser = ChromiumPage()
    try:
        for keyword in keywords:
            print(f'{keyword} is proccessed......')

            links = crawl_title_link(keyword, browser)

            partly = pd.DataFrame({'link': links}).drop_duplicates(subset=['link'])
            if not partly.empty:
                # File-name form of the query: quotes removed, spaces -> underscores.
                file_stem = keyword.replace('"', '').replace(' ', '_')
                partly.to_csv(output_dir / f'crawl_by_{file_stem}.csv', index=False)

            all_links.extend(links)

        full = pd.DataFrame({'link': all_links}).drop_duplicates(subset=['link'])
        full.to_csv(output_dir / 'all_ieee_papers.csv', index=False)
    finally:
        # Always release the browser, including on errors or interruption.
        browser.quit()

    print(f"Total papers collected: {len(full)}")

In [11]:
# Run the full crawl: one search per keyword combination, writing the
# per-keyword CSVs and the de-duplicated ieee/all_ieee_papers.csv.
crawl_ieee()

"text-to-sql" is proccessed......
Page 1 - Title: Evaluating Text-to-SQL Model Failures on Real-World Data
Page 1 - Title: Sequential Feature Augmentation for Robust Text-to-SQL
Page 1 - Title: Lifting the Answer: Reranking Candidates on Data Augmented Text-to-SQL
Page 1 - Title: Application of Noise Filter Mechanism for T5-Based Text-to-SQL Generation
Page 1 - Title: Enhancing Text-to-SQL Conversion in Turkish: An Analysis of LLMs with Schema Context
Page 1 - Title: Review of question answering technology based on Text to SQL
Page 1 - Title: TUR2SQL: A Cross-Domain Turkish Dataset For Text-to-SQL
Page 1 - Title: Comparing Accuracy and Consistency: LLMs vs. SOTA Deep Learning Models in Text-to-SQL
Page 1 - Title: Conversational Text-to-SQL: An Odyssey into State-of-the-Art and Challenges Ahead
Page 1 - Title: KG-SQL: Hybrid Knowledge-Guided Semantic Understanding for Text-to-SQL
Page 1 - Title: RH-SQL: Refined Schema and Hardness Prompt for Text-to-SQL
Page 1 - Title: Performance Evalu

In [14]:
def extract_detail(page: ChromiumPage, link: str):
    """Open a paper page and scrape its metadata, printing each field.

    Args:
        page: Browser page used to load ``link`` and query its elements.
        link: URL of the paper's document page.

    Returns:
        A tuple ``(title, authors, pdf_link, abstract, doi, submitted_date)``.
        ``authors`` is a list of strings (empty when none are found); every
        other field is ``None`` when its element is missing. ``pdf_link`` is
        also ``None`` when the href is the ``javascript:void()`` placeholder.
        ``submitted_date`` is the text after the first ``": "`` of the
        conference-date element, or the whole text when there is no ``": "``.
    """
    page.get(link)
    # Fixed pause to let the document page load.
    time.sleep(2)

    # Title: look the element up once and reuse it.
    title_element = page.ele('css:h1.document-title')
    title = title_element.text if title_element else None
    print("Title:", title)

    # Authors
    authors_elements = page.eles('css:span.authors-info span span')
    authors = [author.text for author in authors_elements]
    print("Authors:", ", ".join(authors))

    # PDF link; the placeholder href is treated as "no link".
    pdf_link_element = page.ele('css:a.xpl-btn-pdf')
    pdf_link = pdf_link_element.attr('href') if pdf_link_element else None
    if pdf_link == "javascript:void()":
        pdf_link = None
    print("PDF Link:", pdf_link)

    # Abstract, with the "Abstract:" label stripped.
    abstract_div = page.ele('css:div[xplmathjax]')
    abstract = abstract_div.text.replace("Abstract:", "").strip() if abstract_div else None
    print("Abstract:", abstract)

    # DOI, expanded into a resolvable URL.
    doi_element = page.ele('css:div.stats-document-abstract-doi a')
    doi = f'https://doi.org/{doi_element.text}' if doi_element else None
    print("DOI:", doi)

    # Conference date, e.g. "Date of Conference: 13-16 May 2024" -> "13-16 May 2024".
    conf_date_div = page.ele('css:div.doc-abstract-confdate')
    submitted_date = None
    if conf_date_div:
        conf_date_text = conf_date_div.text
        _label, separator, value = conf_date_text.partition(": ")
        if separator:
            submitted_date = value.strip()
        else:
            # No "label: value" shape; keep the raw text instead of failing.
            submitted_date = conf_date_text.strip() or None
    print("Submitted Date:", submitted_date)

    return title, authors, pdf_link, abstract, doi, submitted_date

In [15]:
import pandas as pd
import time

# Enrich the crawled link list with per-paper metadata and write it back to
# the same CSV. Each link is processed independently so that one failing page
# does not discard the details already scraped.
paper = pd.read_csv('ieee/all_ieee_papers.csv')

titles = []
authors = []
pdf_links = []
abstracts = []
dois = []
submitted_dates = []

# The browser is left open after this cell; a later cell calls page.quit().
page = ChromiumPage()
for link in paper['link']:

    print(f"Processing link: {link}")

    try:
        title, author, pdf_link, abstract, doi, submitted_date = extract_detail(page, link)
    except Exception as e:
        # Log and record an empty row so the columns stay aligned with `paper`.
        print(f"Error extracting details from {link}: {str(e)}")
        title, author, pdf_link, abstract, doi, submitted_date = None, [], None, None, None, None

    titles.append(title)
    authors.append(author)
    pdf_links.append(pdf_link)
    abstracts.append(abstract)
    dois.append(doi)
    submitted_dates.append(submitted_date)

# Attach the extracted details as new columns (one value per link).
paper['pdf_link'] = pdf_links
paper['title'] = titles
paper['authors'] = authors
paper['abstract'] = abstracts
paper['submitted'] = submitted_dates
paper['doi'] = dois
# Save the enriched DataFrame over the link-only CSV it was read from.
paper.to_csv('ieee/all_ieee_papers.csv', index=False)

Processing link: https://ieeexplore.ieee.org/document/10598154/
Title: Evaluating Text-to-SQL Model Failures on Real-World Data
Authors: Manasi Ganti, Laurel Orr, Sen Wu
PDF Link: None
Abstract: Text-to-SQL generation models, capable of converting natural language prompts into SQL queries, offer significant potential for streamlining data analytics tasks. Despite state-of-the-art performance on popular academic benchmarks such as Spider [1], recent large language models, such as GPT-4, exhibit a considerable performance degradation on real-world applications with longer, more convoluted schemas [2]. This disparity raises questions about what factors contribute to this drop and whether existing academic benchmarks are effective for representing real-world challenges. To determine these factors, we first examine Text-to-SQL model failures on customer logs. We find that accuracy on customer logs was on average 30% lower than accuracy on Spider. We identify three main challenges in real-wo

In [16]:
# Shut down the browser that was opened for the detail-extraction loop.
page.quit()