In [28]:
from tqdm import tqdm 
import requests
import time
from bs4 import BeautifulSoup
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
import re
def get_par_papers(award_id):
    url = f"https://par.nsf.gov/search/term:{award_id}"
    headers = {"User-Agent": "Mozilla/5.0"}

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        papers = []
        for article_div in soup.select("div.article.item.document"):
            # Title
            title_tag = article_div.select_one("div.title a[itemprop='url'] span[itemprop='name']")
            title = title_tag.get_text(strip=True) if title_tag else None

            # Raw paper link
            link_tag = article_div.select_one("div.title a[itemprop='url']")
            raw_link = link_tag["href"] if link_tag and "href" in link_tag.attrs else None

            # Extract just /biblio/xxxxx part
            match = re.search(r"/biblio/\d+", raw_link if raw_link else "")
            clean_link = "https://par.nsf.gov" + match.group(0) if match else None

            # DOI
            doi_tag = article_div.select_one("div.title + div a[href*='doi.org']")
            doi = doi_tag["href"] if doi_tag and "href" in doi_tag.attrs else None

            # Authors
            authors_tags = article_div.select("div.metadata span.authors span.author[itemprop='author']")
            authors = "; ".join([a.get_text(strip=True) for a in authors_tags]) if authors_tags else None

            # Date published
            date_tag = article_div.select_one("div.metadata span.year time[itemprop='datePublished']")
            date_published = date_tag.get_text(strip=True) if date_tag else None

            # Journal name
            journal = None
            year_span = article_div.select_one("div.metadata span.year")
            if year_span:
                text = year_span.get_text(separator=" ", strip=True)
                match = re.search(r"\((?:.*?)\,\s*(.*?)\)$", text)
                if match:
                    journal = match.group(1)

            papers.append((award_id, title, clean_link, doi, authors, date_published, journal))

        return papers if papers else []

    except Exception as e:
        print(f"Error for award {award_id}: {e}")
        return []


def par(file_path):
    df = pd.read_csv(file_path)
    award_ids = df["awd_id"].astype(str).tolist()

    all_papers = []
    total = len(award_ids)

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = {executor.submit(get_par_papers, aid): aid for aid in award_ids}
        for future in tqdm(as_completed(futures), total=total, desc="🔎 Scraping papers"):
            papers = future.result()
            if papers:
                all_papers.extend(papers)

    papers_df = pd.DataFrame(all_papers, columns=[
        "award_id", "paper_title", "paper_link", "doi", "authors",
        "date_published", "journal_name"
    ])

    # Drop rows with missing or empty paper_title
    papers_df = papers_df.dropna(subset=["paper_title"])
    papers_df = papers_df[papers_df["paper_title"].str.strip() != ""]

    return papers_df



In [24]:
def accuracy(df,origin):

    df_awards = pd.read_csv(origin)

   
    original_award_ids = set(df_awards['awd_id'].astype(str))
    matched_award_ids = set(df['award_id'].astype(str))
    unmatched_award_ids = original_award_ids - matched_award_ids
    results = len(matched_award_ids )/ len(original_award_ids)


    print(f"Total awards in original file: {len(original_award_ids)}")
    print(f"Total awards matched in results: {len(matched_award_ids)}")
    print(f"Awards with no matched papers: {len(unmatched_award_ids)}")
    print(f"rate of found is {results}")
    print()



In [11]:

if __name__ == "__main__":
    start = time.time()
    for year in range(2017,2025):
        print(f" Year: {year}")
        file_name = f"awards/{year}.csv" 

        df = par(file_name)
        df.to_csv(f"PAR/par{year}.csv", index=False)

        print(f"✅ Done in {time.time() - start:.2f} seconds. Total papers: {len(df)}")
        accuracy(df, file_name)


 Year: 2018


  df = pd.read_csv(file_path)
🔎 Scraping papers: 100%|██████████| 12684/12684 [43:29<00:00,  4.86it/s] 


✅ Done in 2618.71 seconds. Total papers: 50042


  df_awards = pd.read_csv(origin)


Total awards in original file: 12684
Total awards matched in results: 8481
Awards with no matched papers: 4203
rate of found is 0.6686376537369915

 Year: 2019


  df = pd.read_csv(file_path)
🔎 Scraping papers: 100%|██████████| 12180/12180 [22:33<00:00,  9.00it/s]


✅ Done in 3978.34 seconds. Total papers: 47284


  df_awards = pd.read_csv(origin)


Total awards in original file: 12180
Total awards matched in results: 8093
Awards with no matched papers: 4087
rate of found is 0.6644499178981937

 Year: 2020


  df = pd.read_csv(file_path)
🔎 Scraping papers: 100%|██████████| 13041/13041 [19:10<00:00, 11.34it/s]


✅ Done in 5134.22 seconds. Total papers: 46518


  df_awards = pd.read_csv(origin)


Total awards in original file: 13041
Total awards matched in results: 8844
Awards with no matched papers: 4197
rate of found is 0.6781688520818956

 Year: 2021


  df = pd.read_csv(file_path)
🔎 Scraping papers: 100%|██████████| 12161/12161 [17:31<00:00, 11.56it/s]


✅ Done in 6191.23 seconds. Total papers: 37861


  df_awards = pd.read_csv(origin)


Total awards in original file: 12161
Total awards matched in results: 7716
Awards with no matched papers: 4445
rate of found is 0.6344872954526766

 Year: 2022


  df = pd.read_csv(file_path)
🔎 Scraping papers: 100%|██████████| 11907/11907 [20:10<00:00,  9.84it/s] 


✅ Done in 7405.56 seconds. Total papers: 26363


  df_awards = pd.read_csv(origin)


Total awards in original file: 11907
Total awards matched in results: 6433
Awards with no matched papers: 5474
rate of found is 0.540270429159318

 Year: 2023


  df = pd.read_csv(file_path)
🔎 Scraping papers: 100%|██████████| 12022/12022 [15:45<00:00, 12.72it/s]


✅ Done in 8354.98 seconds. Total papers: 17193


  df_awards = pd.read_csv(origin)


Total awards in original file: 12022
Total awards matched in results: 5022
Awards with no matched papers: 7000
rate of found is 0.4177341540509067

 Year: 2024


  df = pd.read_csv(file_path)
🔎 Scraping papers: 100%|██████████| 11687/11687 [13:39<00:00, 14.26it/s]


✅ Done in 9177.48 seconds. Total papers: 5965
Total awards in original file: 11687
Total awards matched in results: 2247
Awards with no matched papers: 9440
rate of found is 0.19226490972875845



  df_awards = pd.read_csv(origin)
