In [8]:
import random
import re
import time
from pathlib import Path

import pandas as pd
from DrissionPage import ChromiumPage

In [9]:
def extract_article_info(keyword):

    url = f"https://www.mdpi.com/search?q={keyword}"

    # Tạo đối tượng ChromiumPage
    page = ChromiumPage()
    page.get(url)
    
    # Đợi trang tải hoàn tất
    time.sleep(random.randint(2,3))

    # Tìm tất cả các cụm bài báo
    article_blocks = page.eles("css:div.generic-item.article-item")
    
    articles_info = []
    
    while True:
        for block in article_blocks:

            # 1. Trích xuất liên kết bài báo và tiêu đề bài báo
            link_element = block.ele("css:a.title-link")
            link = link_element.attr('href') if link_element else None
            title = link_element.text.strip() if link_element else None
            if link and not link.startswith('http'):
                link = "https://www.mdpi.com" + link
            
            # 2. Trích xuất liên kết PDF
            pdf_link_element = block.ele("css:a.UD_Listings_ArticlePDF")
            pdf_link = pdf_link_element.attr('href') if pdf_link_element else None
            if pdf_link and not pdf_link.startswith('http'):
                pdf_link = "https://www.mdpi.com" + pdf_link
            
            # 3. Trích xuất tác giả
            authors_element = block.ele("css:div.authors")
            authors = authors_element.text if authors_element else None
            if authors:
                authors = authors.replace("by", "").strip()

            
            # 4. Trích xuất tóm tắt đầy đủ
            abstract_full_element = block.ele("css:div.abstract-full")
            abstract = abstract_full_element.text if abstract_full_element else None
            if abstract:
                # Loại bỏ phần "Full article" ở cuối
                abstract = re.sub(r'\s*Full article$', '', abstract).strip()


            # 5. Trích xuất ngày gửi bài
            submitted_element = block.ele("css:div.color-grey-dark")
            submitted_text = submitted_element.text if submitted_element else None
            submitted_date = None
            if submitted_text:
                # Tìm ngày có định dạng như "19 Mar 2025"
                match = re.search(r'\d{1,2} \w{3} \d{4}', submitted_text)
                if match:
                    submitted_date = match.group(0)
            

            
            # 6. Trích xuất DOI
            doi_element = block.ele("css:a[href^='https://doi.org']")
            doi = doi_element.attr('href') if doi_element else None

            
            # Lưu thông tin vào dictionary
            articles_info.append({
                "link": link,
                "pdf_link": pdf_link,
                "title": title,
                "authors": authors,
                "abstract": abstract,
                "submitted_date": submitted_date,
                "doi": doi
            })

        # Tìm liên kết chuyển trang
        next_page_link = page.ele("css:a[href*='page_no'] i.material-icons:contains('chevron_right')")
        if next_page_link:
            # Nhấp vào liên kết chuyển trang
            next_page_link.click()
        else:
            # Không còn liên kết chuyển trang, thoát vòng lặp
            break
    return articles_info

In [10]:
def save_to_csv(data: list, filename: str) -> None:
    """
    Save the extracted data to a CSV file.
    """
    try:
        df = pd.DataFrame(data)
        df.drop_duplicates(subset=['link'], inplace=True)
        df.to_csv(f'mdpi/{filename}', index=False)
        print(f"Data saved to {filename}")
    except Exception as e:
        print(f"Error saving data to CSV: {e}")

In [11]:
import itertools

def combination_keywords(sets):
    """Tạo danh sách các chuỗi từ khóa từ một danh sách các bộ từ khóa."""
    if not sets:
        return []
    combinations = itertools.product(*sets)
    return [' AND '.join(combo) for combo in combinations]

def generate_all_combinations(t2sql, security, llm):
    """Tạo danh sách tất cả các tổ hợp từ khóa theo các trường hợp yêu cầu."""
    # Định nghĩa các trường hợp cần tạo tổ hợp
    cases = [
        [t2sql],                    # Chỉ t2sql
        [t2sql, security],          # t2sql + security
        [t2sql, llm],               # t2sql + llm
        [t2sql, security, llm]      # t2sql + security + llm
    ]
    
    # Tạo và hợp nhất tất cả các tổ hợp
    all_combinations = []
    for case in cases:
        all_combinations.extend(combination_keywords(case))
    
    return all_combinations

In [12]:
def crawl_mdpi():
    """
    Main function to run the pipeline.
    """
    # # Get keywords from user
    # keywords = input("Enter keywords to search for: ")
    # keywords = keywords.strip()
    # keywords = keywords.replace(" ", "+")

    #Thêm ngoặc chính xác bộ keywords
    t2sql = ['"text-to-sql"', '"nl2sql"', '"t2sql"', '"text2sql"', '"natural language to sql"', 
             '"semantic parsing to sql"', '"nl to sql"']
    security = ['"security"', '"access control"', '"injection"', '"prompt injection"', '"defense"', '"attack"', '"vulnerability"']
    llm = ['"llm"', '"large language model"']

    keywords = generate_all_combinations(t2sql, security, llm)
    tmp = []

    # Crawl papers
    for keyword in keywords:
        print(f"Searching for papers related to: {keyword}")
        keyword_1 = keyword.strip()
        keyword_1 = keyword_1.replace(" ", "+")
        papers_data = extract_article_info(keyword_1)
        if len(papers_data) > 0:
            keyword = keyword.replace('"', '')
            keyword = keyword.replace(' ', '_')
            save_to_csv(papers_data,f"crawl_by_{keyword}.csv")
        tmp.extend(papers_data)

    if tmp:
        # Save data to CSV
        filename = "all_mdpi_papers"
        save_to_csv(tmp, f"{filename}.csv")
        print(f"Extracted {len(tmp)} papers.")

In [13]:
# Run the full crawl with crawl_mdpi's built-in keyword sets; the text below is its printed progress.
crawl_mdpi()

Searching for papers related to: "text-to-sql"
Data saved to crawl_by_text-to-sql.csv
Searching for papers related to: "nl2sql"
Data saved to crawl_by_nl2sql.csv
Searching for papers related to: "t2sql"
Searching for papers related to: "text2sql"
Data saved to crawl_by_text2sql.csv
Searching for papers related to: "natural language to sql"
Data saved to crawl_by_natural_language_to_sql.csv
Searching for papers related to: "semantic parsing to sql"
Data saved to crawl_by_semantic_parsing_to_sql.csv
Searching for papers related to: "nl to sql"
Data saved to crawl_by_nl_to_sql.csv
Searching for papers related to: "text-to-sql" AND "security"
Data saved to crawl_by_text-to-sql_AND_security.csv
Searching for papers related to: "text-to-sql" AND "access control"
Searching for papers related to: "text-to-sql" AND "injection"
Data saved to crawl_by_text-to-sql_AND_injection.csv
Searching for papers related to: "text-to-sql" AND "prompt injection"
Searching for papers related to: "text-to-sql" 