In [1]:
import logging
import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
import requests
from tqdm import tqdm

def search_crossref(award_id, pgm_name, rows=25):
    """Look up Crossref works that acknowledge a given award number.

    Two-stage, best-effort search against ``https://api.crossref.org/works``:

    1. Request works with the ``award.number`` filter and keep only those
       whose funder records list exactly ``award_id``.
    2. If stage 1 keeps nothing, run the free-text query ``"NSF <award_id>"``
       and keep items whose stringified metadata contains the award id
       (case-insensitive substring match, so this stage is deliberately loose).

    Args:
        award_id: Award identifier to search for; coerced to ``str``.
        pgm_name: Program element name, copied unchanged into every result
            row under ``pgm_ele_0_pgm_ele_name``.
        rows: Maximum number of items requested from Crossref per call.

    Returns:
        A list of dicts with keys ``award_id``, ``pgm_ele_0_pgm_ele_name``,
        ``title``, ``doi``, ``url`` and ``publisher``. Returns ``[]`` when
        nothing matches or when the request/parsing fails; failures are
        logged as warnings instead of being raised.
    """
    base_url = "https://api.crossref.org/works"
    # NOTE(review): the mailto address is a placeholder — presumably it should
    # be replaced with a real contact address; confirm before large runs.
    headers = {
        "User-Agent": "NSF-DOI-Fetcher (mailto:your_email@example.com)"
    }
    award_id = str(award_id)

    def has_award(item):
        # Crossref work records carry award numbers as plain strings under
        # each entry of "funder" (funder[].award), not under a top-level
        # "award" key.
        for funder in item.get("funder") or []:
            if award_id in (funder.get("award") or []):
                return True
        return False

    def fetch_items(params):
        response = requests.get(base_url, params=params, headers=headers, timeout=8)
        response.raise_for_status()
        return response.json()["message"]["items"]

    try:
        # Stage 1: structured filter on the award number.
        matched_items = [
            item
            for item in fetch_items({"filter": f"award.number:{award_id}", "rows": rows})
            if has_award(item)
        ]

        if not matched_items:
            # Stage 2: free-text fallback. Both sides are lowercased so ids
            # containing letters can still match.
            needle = award_id.lower()
            matched_items = [
                item
                for item in fetch_items({"query": f"NSF {award_id}", "rows": rows})
                if needle in str(item).lower()
            ]

        results = []
        for item in matched_items:
            doi = item.get("DOI", "")
            # "title" may be missing or an empty list; never index blindly.
            title = (item.get("title") or [""])[0]
            results.append({
                "award_id": award_id,
                "pgm_ele_0_pgm_ele_name": pgm_name,
                "title": title,
                "doi": doi,
                "url": f"https://doi.org/{doi}" if doi else "",
                "publisher": item.get("publisher", ""),
            })
        return results

    except Exception as exc:
        # Best-effort by design: one failed award must not abort a batch run,
        # but the failure should be visible rather than silently dropped.
        logging.getLogger(__name__).warning(
            "Crossref lookup failed for award %s: %s", award_id, exc
        )
        return []
def fetch_all_papers_parallel(csv_path, max_workers=3):
    """Run ``search_crossref`` for every award in a CSV and collect the papers.

    Args:
        csv_path: Path to a CSV containing the columns ``awd_id`` and
            ``pgm_ele_0_pgm_ele_name`` (only these two are read).
        max_workers: Number of threads issuing lookups concurrently.

    Returns:
        A DataFrame with one row per unique DOI. Rows are assembled in the
        order the awards appear in the CSV, so when several awards share a
        DOI the first award in the file keeps it. When no papers are found an
        empty DataFrame with the expected result columns is returned.
    """
    result_columns = [
        "award_id", "pgm_ele_0_pgm_ele_name", "title", "doi", "url", "publisher",
    ]
    df_awards = pd.read_csv(csv_path, usecols=["awd_id", "pgm_ele_0_pgm_ele_name"])

    failed = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Map each future to (input position, award id) so results can be put
        # back in input order and failures can be reported by award.
        futures = {
            executor.submit(search_crossref, str(awd_id), pgm_name): (position, awd_id)
            for position, (awd_id, pgm_name) in enumerate(
                zip(df_awards["awd_id"], df_awards["pgm_ele_0_pgm_ele_name"])
            )
        }
        per_award = [[] for _ in futures]

        for future in tqdm(as_completed(futures), total=len(futures), desc="🔍 Fetching papers"):
            position, awd_id = futures[future]
            try:
                per_award[position] = future.result()
            except Exception as exc:
                # Keep the batch going, but remember which award failed.
                failed.append((awd_id, exc))

    if failed:
        preview = ", ".join(str(awd_id) for awd_id, _ in failed[:10])
        print(f"Warning: {len(failed)} award lookup(s) raised and were skipped: {preview}")

    all_results = [record for records in per_award for record in records]
    if not all_results:
        # A frame built from an empty list has no "doi" column, which would
        # make drop_duplicates(subset=["doi"]) raise KeyError.
        print("✅ Removed 0 duplicate DOIs")
        return pd.DataFrame(columns=result_columns)

    df_results = pd.DataFrame(all_results)
    before = len(df_results)
    df_results = df_results.drop_duplicates(subset=["doi"])
    after = len(df_results)
    print(f"✅ Removed {before - after} duplicate DOIs")
    return df_results

In [2]:
def accuracy(df, origin):
    """Report what fraction of the original awards have at least one paper.

    Args:
        df: Results DataFrame; its ``award_id`` column lists the awards for
            which papers were found. A frame without that column is treated
            as "nothing found".
        origin: Path to the awards CSV; its ``awd_id`` column defines the
            full set of awards.

    Returns:
        The match rate as a float in ``[0, 1]`` (``0.0`` when the awards file
        has no rows). The same figures are also printed.
    """
    df_awards = pd.read_csv(origin, usecols=["awd_id"])

    original_award_ids = set(df_awards["awd_id"].astype(str))
    found_award_ids = set(df["award_id"].astype(str)) if "award_id" in df.columns else set()

    # Only count ids that really belong to the original file, so the rate
    # cannot exceed 1 if the results contain foreign award ids.
    matched_award_ids = original_award_ids & found_award_ids
    unmatched_award_ids = original_award_ids - matched_award_ids

    # Guard against an awards file with no rows.
    results = len(matched_award_ids) / len(original_award_ids) if original_award_ids else 0.0

    print(f"Total awards in original file: {len(original_award_ids)}")
    print(f"Total awards matched in results: {len(matched_award_ids)}")
    print(f"Awards with no matched papers: {len(unmatched_award_ids)}")
    print(f"rate of found is {results}")
    print()
    return results


In [None]:
if __name__ == "__main__":
    # Input: awards CSV with the columns "awd_id" and "pgm_ele_0_pgm_ele_name".
    # TODO: change both paths before running.
    file_name = "awards.csv"
    output_name = "crossref_papers.csv"

    start = time.time()
    df = fetch_all_papers_parallel(file_name)
    print(f"✅ Done in {time.time() - start:.2f} seconds. Total papers: {len(df)}")

    if df.empty:
        print(f"No DOIs found in {file_name}.")
    else:
        # to_csv() without a path only returns a string; pass a path so the
        # results are actually written to disk.
        df.to_csv(output_name, index=False)
        print(f"Saved {len(df)} papers to {output_name}")
        accuracy(df, file_name)
