In [None]:
import pandas as pd
from tqdm import tqdm

# Rate the papers

In [None]:
def _read_results(csv_path):
    """Read one results CSV and case-fold its 'title' column.

    Case-folding lets the same title be matched across the three result
    files regardless of capitalisation.
    """
    frame = pd.read_csv(csv_path)
    frame['title'] = frame['title'].str.casefold()
    return frame


# One frame per results file; all three are later joined on ('area', 'title').
gs_df = _read_results('./data/gs_results.csv')
pc_df = _read_results('./data/pc_results.csv')
ss_df = _read_results('./data/ss_results.csv')

In [None]:
# Preview the first rows of gs_df to check the loaded columns.
gs_df.head()

In [None]:
# Preview the first rows of pc_df to check the loaded columns.
pc_df.head()

In [None]:
# Preview the first rows of ss_df to check the loaded columns.
ss_df.head()

In [None]:
# Rank cut-offs separating the rating tiers (rank <= TIER1 is the best tier).
TIER1 = 40
TIER2 = 100
TIER3 = 200


def _tier_rate(rank, beyond_tier3=3):
    """Map a rank to a rating: 5 within TIER1, 4 within TIER2, 3 within TIER3.

    Ranks past TIER3 (or NaN ranks, for which every comparison is False)
    get ``beyond_tier3``.
    """
    if rank <= TIER1:
        return 5
    if rank <= TIER2:
        return 4
    if rank <= TIER3:
        return 3
    return beyond_tier3


# gs results are cut at TIER3, so the `beyond_tier3=2` rating below is
# currently unreachable; it is kept so the mapping stays correct if the
# cut-off is ever relaxed.
gs_rate_df = gs_df.loc[gs_df['rank'] <= TIER3, ['area', 'rank', 'title']].copy()
# NOTE(review): pc_df is carried over with all of its columns, unlike gs/ss
# which keep only area/rank/title — confirm this is intentional.
pc_rate_df = pc_df.copy()
ss_rate_df = ss_df[['area', 'rank', 'title']].copy()

gs_rate_df['rate'] = gs_rate_df['rank'].apply(lambda r: _tier_rate(r, beyond_tier3=2))
pc_rate_df['rate'] = pc_rate_df['rank'].apply(_tier_rate)
ss_rate_df['rate'] = ss_rate_df['rank'].apply(_tier_rate)

# Outer-join the three sources on (area, title) so a paper found by only
# some of them is still kept; rank/rate get a per-source suffix.
rate_df = gs_rate_df.merge(pc_rate_df, on=['area', 'title'], how='outer', suffixes=('_gs', '_pc'))
rate_df = rate_df.merge(ss_rate_df, on=['area', 'title'], how='outer')
rate_df = rate_df.rename(columns={'rank': 'rank_ss', 'rate': 'rate_ss'})

# A paper missing from a source gets the lowest rate (1) and a placeholder
# rank for that source. The columns are reassigned rather than filled with
# `inplace=True` on a column selection: that chained form warns on pandas 2.x
# and silently does nothing under Copy-on-Write, which would leave NaNs in
# the aggregates below.
# NOTE(review): gs uses 401 as placeholder while pc/ss use 201 — confirm.
missing_fill = {
    'rate_ss': 1,
    'rank_ss': 201,
    'rate_pc': 1,
    'rank_pc': 201,
    'rate_gs': 1,
    'rank_gs': 401,
}
for column, filler in missing_fill.items():
    rate_df[column] = rate_df[column].fillna(filler)

# Weighted aggregate over the three sources (gs 0.3, pc 0.4, ss 0.3).
rate_df['agg_rate'] = (rate_df['rate_gs']*0.3 + rate_df['rate_pc']*0.4 + rate_df['rate_ss']*0.3)
rate_df['agg_rank'] = (rate_df['rank_gs']*0.3 + rate_df['rank_pc']*0.4 + rate_df['rank_ss']*0.3)

# Within each area: highest aggregate rate first, ties broken by the lowest
# aggregate rank. 'rank' is then the 1-based position inside the area.
rate_df = rate_df.sort_values(by=['area', 'agg_rate', 'agg_rank'], ascending=[True, False, True]).reset_index(drop=True)
rate_df['rank'] = rate_df.groupby('area').cumcount() + 1

In [None]:
# Preview the aggregated ratings (sorted by area, then best paper first).
rate_df.head()

# Build the total dataframe with the PDF link (download information), abstract and citation metadata

In [None]:
import requests

In [None]:
# Enrich the rated papers with the PDF link from gs_df and with the abstract
# and citation metadata from ss_df. Both are left joins on (area, title), so
# every row of rate_df is kept even when a source has no match.
merge_keys = ['area', 'title']
ss_meta_cols = ['abstract', 'citationCount', 'referenceCount', 'influentialCitationCount', 'fieldsOfStudy']

gs_links = gs_df[merge_keys + ['pdf_link']]
ss_meta = ss_df[merge_keys + ss_meta_cols]

total_df = (
    rate_df
    .merge(gs_links, on=merge_keys, how='left')
    .merge(ss_meta, on=merge_keys, how='left')
)

In [None]:
# Use '' as the "missing" marker for the text columns (the loop that
# back-fills abstracts tests for `abstract == ''`).
# The columns are reassigned instead of calling fillna(..., inplace=True) on
# a column selection: that chained form warns on pandas 2.x and has no
# effect on total_df under Copy-on-Write.
total_df['pdf_link'] = total_df['pdf_link'].fillna('')
total_df['abstract'] = total_df['abstract'].fillna('')

In [None]:
# Back-fill the abstract and citation counts from the Semantic Scholar search
# API for every paper whose abstract is still empty.
SS_SEARCH_URL = 'https://api.semanticscholar.org/graph/v1/paper/search'
SS_SEARCH_FIELDS = 'abstract,referenceCount,citationCount,influentialCitationCount'

for idx, paper in tqdm(total_df.iterrows(), total=len(total_df)):
    if paper['abstract'] != '':
        continue

    # Pass the title through `params` so requests URL-encodes it; titles
    # containing '&', '#', '?' etc. would otherwise corrupt the query string.
    # The timeout keeps one stalled connection from hanging the whole loop.
    response = requests.get(
        SS_SEARCH_URL,
        params={'query': paper['title'], 'limit': 1, 'fields': SS_SEARCH_FIELDS},
        timeout=30,
    )
    data = response.json().get('data')
    if not data:
        # No hit (or an error payload without 'data'): leave the row as is.
        continue

    match = data[0]
    # Write to total_df itself: the `paper` row yielded by iterrows() is a
    # copy, so assigning to it would be lost.
    # Keep '' when the API has no abstract so the "missing" marker survives.
    total_df.at[idx, 'abstract'] = match.get('abstract') or ''
    for column in ('citationCount', 'referenceCount', 'influentialCitationCount'):
        value = match.get(column)
        if value is not None:
            total_df.at[idx, column] = value
        

In [None]:
# Show the last row visited by the loop above (`paper` is its leftover loop variable).
paper

In [None]:
# Pick the first 100 papers of each area for the label test set.
# The per-area position is stored in the 'rank' column of rate_df; no
# 'new_rank' column is ever created, so filtering on it raised a KeyError.
rate_test_df = rate_df[rate_df['rank'] <= 100].copy()

In [None]:
# Sanity check: per-area count of non-null values in each column of the test subset.
rate_test_df.groupby('area').count()

In [None]:
# Save the test subset to CSV, omitting the dataframe index.
rate_test_df.to_csv('./data/rate_test_analysis.csv', index=False)