In [13]:
import os
import random
import time
from urllib.parse import urljoin

import pandas as pd
from DrissionPage import ChromiumPage, ChromiumOptions

In [14]:
def crawl_link(keyword: str, page: ChromiumPage):
    """Collect the result links of one ACM Digital Library search.

    Loads the search results for ``keyword`` and keeps clicking the "next"
    pagination button, gathering the link of every result title, until a page
    has no results, no next button is found, or a page raises an error.

    Args:
        keyword: Query text inserted verbatim into the ``AllField`` URL
            parameter (``crawl_acm`` replaces spaces with ``+`` before calling).
        page: Browser page used to load and inspect the result pages.

    Returns:
        list[str]: Absolute result URLs in the order they were encountered
        (duplicates are not removed here). Links gathered before a per-page
        error are still returned; an empty list is returned when the initial
        search page itself fails to load.
    """
    base_url = "https://dl.acm.org"
    links = []
    try:
        # Open the ACM Digital Library search results for this query
        search_input = f'https://dl.acm.org/action/doSearch?AllField={keyword}'
        page.get(search_input)

        # Wait for the page to load
        time.sleep(2)

        cnt = 1
        while True:
            try:
                # Random pause between pages so requests are not fired back-to-back
                time.sleep(random.randint(2, 5))

                # Find the <a> elements that hold each result's link
                link_elements = page.eles("css:.issue-item__title a")

                if not link_elements:
                    print(f"No results found on page {cnt}")
                    break

                for link in link_elements:
                    relative_link = link.attr("href")
                    # urljoin(base_url, None) returns base_url itself, so the
                    # emptiness check has to be made on the raw href; otherwise
                    # the bare site root would be stored as a paper link.
                    if not relative_link:
                        continue
                    # Build the absolute link by combining it with the base URL
                    links.append(urljoin(base_url, relative_link))

                # Check for next button
                next_button = page.ele("css:a.pagination__btn--next")
                if not next_button:
                    print(f"Reached last page for keyword: {keyword}")
                    break

                next_button.click()
                cnt += 1

            except Exception as e:
                # Stop paginating but keep the links collected so far
                print(f"Error processing page {cnt}: {str(e)}")
                break

    except Exception as e:
        print(f"Error searching for keyword {keyword}: {str(e)}")
        return []

    return links

In [15]:
import itertools

def combination_keywords(sets):
    """Build one search string per element of the Cartesian product of ``sets``.

    Args:
        sets: Sequence of keyword lists; one keyword is taken from each list.

    Returns:
        list[str]: Every combination joined with ``' AND '``, in
        ``itertools.product`` order. An empty ``sets`` yields an empty list.
    """
    if sets:
        return [' AND '.join(terms) for terms in itertools.product(*sets)]
    return []

def generate_all_combinations(t2sql, security, llm):
    """Build the search strings for every required grouping of keyword sets.

    The groupings are, in order: text-to-SQL terms alone, text-to-SQL with
    security terms, text-to-SQL with LLM terms, and all three together.

    Args:
        t2sql: Text-to-SQL keyword list.
        security: Security keyword list.
        llm: LLM keyword list.

    Returns:
        list[str]: The strings from ``combination_keywords`` for each grouping,
        concatenated in the grouping order above.
    """
    groupings = [
        [t2sql],
        [t2sql, security],
        [t2sql, llm],
        [t2sql, security, llm],
    ]
    return [
        query
        for grouping in groupings
        for query in combination_keywords(grouping)
    ]

In [16]:
def crawl_acm():
    """Crawl ACM Digital Library result links for every keyword combination.

    For each query produced by ``generate_all_combinations`` the result links
    are collected with ``crawl_link`` and, when any were found, written
    de-duplicated to ``../raw_crawl_papers/acm/crawl_by_<keyword>.csv``.
    After all queries, the de-duplicated union of all links is written to
    ``../raw_crawl_papers/acm/all_acm_papers.csv`` and the total is printed.

    Returns:
        None. Results are delivered through the CSV files.
    """
    output_dir = '../raw_crawl_papers/acm'
    # to_csv does not create missing directories; make sure the target exists
    # up front so a long crawl is not lost at the moment of saving.
    os.makedirs(output_dir, exist_ok=True)

    # Each term is wrapped in double quotes inside the string
    t2sql = ['"text-to-sql"', '"nl2sql"', '"t2sql"', '"text2sql"', '"natural language to sql"',
             '"semantic parsing to sql"', '"nl to sql"']
    security = ['"security"', '"access control"', '"injection"', '"prompt injection"', '"defense"', '"attack"', '"vulnerability"']
    llm = ['"llm"', '"large language model"']

    keywords = generate_all_combinations(t2sql, security, llm)

    # Links from every query, duplicates included until the final save
    all_links = []

    browser = ChromiumPage()
    try:
        for keyword in keywords:

            print(f'{keyword} is proccessed......')

            # Spaces become '+' for use in the search URL
            query = keyword.strip().replace(" ", "+")
            links = crawl_link(query, browser)

            partly = pd.DataFrame({'link': links}).drop_duplicates(subset=['link'])

            if not partly.empty:
                # File name: the keyword without quotes, spaces as underscores
                file_stem = keyword.replace('"', '').replace(' ', '_')
                partly.to_csv(f'{output_dir}/crawl_by_{file_stem}.csv', index=False)

            all_links.extend(links)
    finally:
        # Close the browser even when a query raises, so no browser process is left behind
        browser.quit()

    full = pd.DataFrame({'link': all_links}).drop_duplicates(subset=['link'])
    full.to_csv(f'{output_dir}/all_acm_papers.csv', index=False)

    print(f"Total papers collected: {len(full)}")

In [10]:
# Run the full crawl defined above: writes one CSV per keyword combination plus
# the combined, de-duplicated all_acm_papers.csv under ../raw_crawl_papers/acm/.
crawl_acm()

"text-to-sql" is proccessed......
Reached last page for keyword: "text-to-sql"
"nl2sql" is proccessed......
Reached last page for keyword: "nl2sql"
"t2sql" is proccessed......
No results found on page 1
"text2sql" is proccessed......
Reached last page for keyword: "text2sql"
"natural language to sql" is proccessed......
Reached last page for keyword: "natural+language+to+sql"
"semantic parsing to sql" is proccessed......
No results found on page 1
"nl to sql" is proccessed......
Reached last page for keyword: "nl+to+sql"
"text-to-sql" AND "security" is proccessed......
Reached last page for keyword: "text-to-sql"+AND+"security"
"text-to-sql" AND "access control" is proccessed......
Reached last page for keyword: "text-to-sql"+AND+"access+control"
"text-to-sql" AND "injection" is proccessed......
Reached last page for keyword: "text-to-sql"+AND+"injection"
"text-to-sql" AND "prompt injection" is proccessed......
Reached last page for keyword: "text-to-sql"+AND+"prompt+injection"
"text-t

In [17]:
def extract_detail(page: ChromiumPage, link: str):
    """Open one paper page and scrape its metadata.

    Each extracted field is also printed as it is read.

    Args:
        page: Browser page used to load ``link``.
        link: URL of the paper page.

    Returns:
        tuple: ``(title, authors, pdf_link, abstract, doi_link, submission_date)``.
        ``authors`` is a list of strings; any field whose element is not found
        on the page is ``None``.
    """
    page.get(link)
    time.sleep(2)

    # Every selector is looked up exactly once and the result reused; querying
    # it a second time just to test for existence doubles the page lookups.

    # PDF link: the <a> tag with classes "btn btn--pdf red"
    pdf_element = page.ele("css:a.btn.btn--pdf.red")
    pdf_link = pdf_element.attr("href") if pdf_element else None
    print(f"PDF Link: {pdf_link}")

    # Title
    title_element = page.ele("css:h1[property='name']")
    title = title_element.text if title_element else None
    print(f'Title:{title}')

    # Author list
    # NOTE(review): the recorded run printed names such as "YuanTian" with no
    # space between the name parts; the anchor's .text appears to merge them -
    # confirm against the page markup.
    author_spans = page.eles("css:span[property='author']")
    authors = None
    if author_spans:
        authors = []
        for author_span in author_spans:
            anchor = author_span.ele("css:a")
            # Guarded like the other lookups: use the span's own text when it
            # has no nested <a>.
            authors.append(anchor.text if anchor else author_span.text)
    if authors:
        print(f"Authors: {', '.join(authors)}")
    else:
        print('Author: None')

    # Abstract: all paragraph blocks inside #abstracts, joined with spaces
    abstract_section = page.ele("css:#abstracts")
    abstract_paragraphs = abstract_section.eles("css:div[role='paragraph']") if abstract_section else None
    abstract = " ".join([p.text for p in abstract_paragraphs]) if abstract_paragraphs else None
    print(f"Abstract: {abstract}")

    # Date (read from the element with class core-date-published)
    date_element = page.ele("css:span.core-date-published")
    submission_date = date_element.text if date_element else None
    print(f"Submission Date: {submission_date}")

    # DOI link
    doi_element = page.ele("css:a[property='sameAs']")
    doi_link = doi_element.attr("href") if doi_element else None
    print(f"DOI Link: {doi_link}")

    return title, authors, pdf_link, abstract, doi_link, submission_date

In [19]:
import pandas as pd
from DrissionPage import ChromiumPage

# Load the de-duplicated link list saved by crawl_acm()
paper = pd.read_csv('../raw_crawl_papers/acm/all_acm_papers.csv')

# One accumulator per metadata field, filled in the same order as paper['link']
titles, authors, pdf_links, abstracts, dois, submitted_dates = [], [], [], [], [], []

page = ChromiumPage()
for link in paper['link']:

    print(f"Processing link: {link}")

    title, author, pdf_link, abstract, doi, submitted_date = extract_detail(page, link)
    pdf_links.append(pdf_link)
    titles.append(title)
    authors.append(author)
    abstracts.append(abstract)
    submitted_dates.append(submitted_date)
    dois.append(doi)

# Attach the scraped fields as new columns next to each link
paper = paper.assign(
    pdf_link=pdf_links,
    title=titles,
    authors=authors,
    abstract=abstracts,
    submitted=submitted_dates,
    doi=dois,
)

# Save the enriched table to a new CSV file
paper.to_csv('../all_acm_papers.csv', index=False)

Processing link: https://dl.acm.org/doi/10.1145/3708359.3712083
PDF Link: https://dl.acm.org/doi/pdf/10.1145/3708359.3712083
Title:Text-to-SQL Domain Adaptation via Human-LLM Collaborative Data Annotation
Authors: YuanTian, DanielLee, FeiWu, TungMai, KunQian, SiddharthaSahai, TianyiZhang, YunyaoLi
Abstract: Text-to-SQL models, which parse natural language (NL) questions to executable SQL queries, are increasingly adopted in real-world applications. However, deploying such models in the real world often requires adapting them to the highly specialized database schemas used in specific applications. We find that existing text-to-SQL models experience significant performance drops when applied to new schemas, primarily due to the lack of domain-specific data for fine-tuning. This data scarcity also limits the ability to effectively evaluate model performance in new domains. Continuously obtaining high-quality text-to-SQL data for evolving schemas is prohibitively expensive in real-world s

In [20]:
# Close the browser that was opened for the detail-extraction loop.
page.quit()