In [49]:
import pandas as pd

def _clean_text(values, lower=False, integer_like=False):
    """Return ``values`` as stripped text, leaving missing/blank entries as NaN.

    A plain ``astype(str)`` turns NaN into the literal string ``'nan'`` and
    renders IDs from a float-typed column as ``'1234567.0'``; both corrupt
    join keys built from the result.

    Parameters
    ----------
    values : pandas.Series
        Column to normalize.
    lower : bool, default False
        Lower-case the text after stripping.
    integer_like : bool, default False
        Drop a trailing ``.0`` from all-digit values (float parsing artifact).
    """
    is_missing = values.isna()
    text = values.astype(str).str.strip()
    if integer_like:
        # Only touch pure "<digits>.0" values so free-text IDs are left alone.
        text = text.str.replace(r'^(\d+)\.0$', r'\1', regex=True)
    if lower:
        text = text.str.lower()
    return text.mask(is_missing | text.eq(''))


def _publications(frame, title_col):
    """Reduce a source table to normalized (award_id, title, doi) rows that carry an award ID."""
    table = pd.DataFrame({
        'award_id': _clean_text(frame['award_id'], integer_like=True),
        'title': _clean_text(frame[title_col], lower=True),
        'doi': frame['doi'],
    })
    # A record without an award ID cannot be attributed to any award.
    return table.dropna(subset=['award_id'])


def _rate(part, whole):
    """Return ``part / whole`` rounded to 4 decimals, or 0 when ``whole`` is not positive."""
    return round(part / whole, 4) if whole > 0 else 0


def cro_par_merge(year, dic):
    """Merge one year's Crossref and PAR publications onto the awards table.

    Reads ``Cross_All/Crossref/{year}.csv``, ``PAR/par{year}.csv`` and
    ``awards/{year}.csv``. Publications from the two sources are matched on
    (award ID, stripped lower-cased title); the result is left-joined onto the
    awards by ``awd_id`` and written to ``Merged/merge{year}.csv`` with the
    columns ``title``, ``doi``, ``found_by_cross`` and ``found_by_par`` placed
    right after the first awards column.

    Parameters
    ----------
    year : int or str
        Year used to build the input and output file names.
    dic : dict
        Updated in place: ``dic[year]`` is set to a dict with the keys
        ``'awards_with_doi'``, ``'dois_found_by_cross'`` and
        ``'dois_found_by_par'`` (rates rounded to 4 decimals, 0 when the
        denominator is 0).

    Returns
    -------
    dict
        The same stats dict stored in ``dic[year]``.

    Raises
    ------
    Whatever ``pandas.read_csv`` raises for a missing/unreadable input file,
    and ``KeyError`` if an expected column is absent.
    """
    import os

    cro = pd.read_csv(f"Cross_All/Crossref/{year}.csv")
    par = pd.read_csv(f"PAR/par{year}.csv")
    origin = pd.read_csv(f"awards/{year}.csv")

    origin['awd_id'] = _clean_text(origin['awd_id'], integer_like=True)
    cross_pubs = _publications(cro, 'title')
    par_pubs = _publications(par, 'paper_title')

    # pandas treats NaN join keys as equal, so untitled records are kept out
    # of the title match and appended afterwards under their own source only.
    cross_titled = cross_pubs['title'].notna()
    par_titled = par_pubs['title'].notna()

    matched = pd.merge(
        cross_pubs[cross_titled],
        par_pubs[par_titled],
        on=['award_id', 'title'],
        how='outer',
        suffixes=('_c', '_par'),
        indicator=True,
    )
    # Prefer the Crossref DOI; fall back to the PAR DOI when Crossref has none.
    matched['doi'] = matched['doi_c'].combine_first(matched['doi_par'])
    side = matched['_merge'].astype(str)
    matched['found_by_cross'] = side.ne('right_only').map({True: 'yes', False: ''})
    matched['found_by_par'] = side.ne('left_only').map({True: 'yes', False: ''})

    result_cols = ['award_id', 'doi', 'title', 'found_by_cross', 'found_by_par']
    pieces = [
        matched[result_cols],
        cross_pubs[~cross_titled].assign(found_by_cross='yes', found_by_par='')[result_cols],
        par_pubs[~par_titled].assign(found_by_cross='', found_by_par='yes')[result_cols],
    ]
    non_empty = [piece for piece in pieces if not piece.empty]
    results = pd.concat(non_empty or pieces[:1], ignore_index=True)
    results = results.rename(columns={'award_id': 'awd_id'})

    merged = pd.merge(origin, results, on='awd_id', how='left')

    # Move the publication columns to positions 1-4, right after the first column.
    lead = ['title', 'doi', 'found_by_cross', 'found_by_par']
    rest = [col for col in merged.columns if col not in lead]
    merged = merged[rest[:1] + lead + rest[1:]]

    with_doi = merged.dropna(subset=['doi'])
    total_awards = merged['awd_id'].nunique()
    awards_found = with_doi['awd_id'].nunique()
    total_unique_dois = with_doi['doi'].nunique()
    dois_found_by_cross = with_doi.loc[with_doi['found_by_cross'] == 'yes', 'doi'].nunique()
    dois_found_by_par = with_doi.loc[with_doi['found_by_par'] == 'yes', 'doi'].nunique()

    # Create the output folder on demand so a fresh checkout does not fail here.
    os.makedirs("Merged", exist_ok=True)
    output_file = f"Merged/merge{year}.csv"
    merged.to_csv(output_file, index=False)

    dic[year] = {
        'awards_with_doi': _rate(awards_found, total_awards),
        'dois_found_by_cross': _rate(dois_found_by_cross, total_unique_dois),
        'dois_found_by_par': _rate(dois_found_by_par, total_unique_dois)
    }
    return dic[year]


In [51]:
def found_rate(df):
    """Print and return the share of rows flagged by Crossref and by PAR.

    A row counts as flagged for a source when its ``found_by_cross`` /
    ``found_by_par`` value is present and not blank after stripping.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``found_by_cross`` and ``found_by_par``.

    Returns
    -------
    tuple
        ``(cro_rate, par_rate)``, each rounded to 2 decimals; ``(0, 0)`` for
        an empty frame instead of dividing by zero.
    """
    def _flagged_rows(column):
        # Missing values are dropped first so they are never counted as flags.
        values = df[column].dropna().astype(str).str.strip()
        return int(values.ne('').sum())

    total = len(df)
    cro = _flagged_rows('found_by_cross')
    par = _flagged_rows('found_by_par')

    cro_rate = round(cro / total, 2) if total > 0 else 0
    par_rate = round(par / total, 2) if total > 0 else 0
    print(f" Row-level: Crossref rate = {cro_rate}, PAR rate = {par_rate}")
    return cro_rate, par_rate

def id_found_rate(df):
    """Print and return the share of awards with at least one flagged row per source.

    Rows are grouped by the string form of ``awd_id``; an award counts for a
    source when any of its rows has a present, non-blank ``found_by_cross`` /
    ``found_by_par`` value. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``awd_id``, ``found_by_cross`` and ``found_by_par``.

    Returns
    -------
    tuple
        ``(cro_rate, par_rate)``, each rounded to 2 decimals; ``(0, 0)`` when
        there are no awards instead of dividing by zero.
    """
    def _is_flagged(column):
        values = df[column]
        return values.notna() & values.astype(str).str.strip().ne('')

    if len(df) > 0:
        # Group on a separate key array rather than overwriting df['awd_id'],
        # so the caller's frame keeps its original dtype.
        award_keys = df['awd_id'].astype(str).to_numpy()
        row_flags = pd.DataFrame({
            'found_by_cross': _is_flagged('found_by_cross'),
            'found_by_par': _is_flagged('found_by_par'),
        })
        per_award = row_flags.groupby(award_keys).any()
        total = len(per_award)
        cro = int(per_award['found_by_cross'].sum())
        par = int(per_award['found_by_par'].sum())
    else:
        total = cro = par = 0

    cro_rate = round(cro / total, 2) if total > 0 else 0
    par_rate = round(par / total, 2) if total > 0 else 0

    print(f"Award-level: Crossref rate = {cro_rate}, PAR rate = {par_rate}")
    return cro_rate, par_rate


In [53]:
# Build the per-year merged files; each successful year adds its rates to `summary`.
summary = dict()
for years in range(2017, 2025):
    try:
        cro_par_merge(years, summary)
    except Exception as err:
        # Best effort: report the failing year and carry on with the next one.
        print(f"Error processing year {years}: {err}")

# One row per processed year, with the year (the dict key) as the first column.
summary_df = (
    pd.DataFrame.from_dict(summary, orient='index')
    .reset_index()
    .rename(columns={'index':'year'})
)
summary_df.to_csv("2017-2024.csv", index=False)
print("finished merging")

  origin = pd.read_csv(f"awards/{year}.csv")
  origin = pd.read_csv(f"awards/{year}.csv")
  origin = pd.read_csv(f"awards/{year}.csv")
  origin = pd.read_csv(f"awards/{year}.csv")
  origin = pd.read_csv(f"awards/{year}.csv")
  origin = pd.read_csv(f"awards/{year}.csv")
  origin = pd.read_csv(f"awards/{year}.csv")
  origin = pd.read_csv(f"awards/{year}.csv")


finished merging
