# Purpose
The purpose of this script is to use the previously extracted faculty info and retrieve all related publication info from dblp.

In [1]:
import pickle
import re
from urllib.parse import quote

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
pd.set_option("display.max_rows", None, "display.max_columns", None)

In [2]:
# Step 1: Load the pickled faculty lists produced by the earlier extraction step.
# Layout: [[retrieve_mit_list()], [retrieve_stanford_list()], [retrieve_cmu_list()], [retrieve_nus_list()], [retrieve_ucb_list()]]
with open('top_5_list.pkl', 'rb') as pickle_file:
    top_5_list = pickle.load(pickle_file)

# Step 2: Create DF from previously made lists
Assume that the university school/sub-department doesn't matter for now.

In [3]:
# Flatten each university's per-department faculty lists into a single list.
# Sub-faculty/department membership is deliberately ignored for now.
mit_groups = top_5_list[0][0]
stanford_groups = top_5_list[1][0]
nus_groups = top_5_list[3][0]
ucb_groups = top_5_list[4][0]

mit_list = mit_groups[0] + mit_groups[1] + mit_groups[2]
stanford_list = stanford_groups[0] + stanford_groups[1]
cmu_list = top_5_list[2][0]  # CMU is already a single flat list
nus_list = nus_groups[0] + nus_groups[1] + nus_groups[2] + nus_groups[3]
ucb_list = ucb_groups[0] + ucb_groups[1]

# One university label per faculty entry, so the two columns line up row by row.
mit_append_list = ['MIT' for _ in mit_list]
stanford_append_list = ['Stanford' for _ in stanford_list]
cmu_append_list = ['CMU' for _ in cmu_list]
nus_append_list = ['NUS' for _ in nus_list]
ucb_append_list = ['UCB' for _ in ucb_list]

# Accumulate both columns in the same university order: MIT, Stanford, CMU, NUS, UCB.
university_column = []
faculty_column = []
for labels, names in (
    (mit_append_list, mit_list),
    (stanford_append_list, stanford_list),
    (cmu_append_list, cmu_list),
    (nus_append_list, nus_list),
    (ucb_append_list, ucb_list),
):
    university_column.extend(labels)
    faculty_column.extend(names)

qs5_df = pd.DataFrame()
qs5_df['University'] = university_column
qs5_df['Faculty'] = faculty_column

# [Skip to Step 7] for now

# Step 3: Search query list pre-processing

In [None]:
# Build one URL-safe dblp search term per row of qs5_df['Faculty']
# (983 queries at the time of writing, per the original note).
#
# quote() percent-encodes spaces as %20, exactly as the previous
# str.replace(" ", "%20") did, and additionally escapes characters such as
# '&', '#', '+', '?' and non-ASCII letters that would otherwise be interpreted
# as URL syntax and corrupt the search query.
# str() is kept because quote() only accepts str/bytes and a cell may hold
# a non-string value.
q_list = [quote(str(name), safe="") for name in qs5_df['Faculty']]

# Step 4: Use processed query list to retrieve dblp raw html and store into list

In [None]:
# Disabled on purpose: the code below is wrapped in a string literal so it does
# not run. When enabled it issues one GET request to https://dblp.org/search?q=<query>
# per entry of q_list, keeps each response body as text, prints a progress
# counter every 10 requests, and pickles the results to qs5_content_list.pkl.
# The following cell loads that pickle instead of re-fetching.
'''
url="https://dblp.org/search?q="

# Declare list to store extracted content
qs5_content_list = []

i = 0
# Iterate using q_list to make a GET request to fetch raw HTML content
for each in q_list:
    html_content = requests.get(url+each).text
    qs5_content_list.append(html_content)
    i+=1
    if (i % 10 == 0):
        print(i)
    
# Store content_list with pickle
with open('qs5_content_list.pkl', 'wb') as f:
    pickle.dump(qs5_content_list, f)
'''

In [None]:
# Load the cached raw dblp search-result HTML (one entry per search query).
with open('qs5_content_list.pkl', 'rb') as content_file:
    qs5_content_list = pickle.load(content_file)

# Step 5: Process each item in the qs5_content list w/ BS4 (983 times lmao)

In [None]:
# Parse every cached HTML page with BeautifulSoup and keep the prettified
# markup as a string.
qs5_soup_list = []

for parsed_count, raw_html in enumerate(qs5_content_list, start=1):
    pretty_html = BeautifulSoup(raw_html, "lxml").prettify()
    qs5_soup_list.append(pretty_html)
    # Progress indicator: print the running count every 100 pages.
    if parsed_count % 100 == 0:
        print(parsed_count)

# Persist the prettified pages so this step does not need to be repeated.
with open('qs5_soup_list.pkl', 'wb') as soup_file:
    pickle.dump(qs5_soup_list, soup_file)

# Step 6: PID Extraction Phase

In [None]:
# Disabled on purpose: the code below is wrapped in a string literal so it does
# not run; Step 7 loads the resulting qs5_pid_list.pkl instead. Remove the
# surrounding triple quotes to regenerate the pickle.
# Fixed relative to the earlier version: results are appended to qs5_pid_list
# (the list that is declared and pickled) rather than an undefined `pid_list`,
# and only the two expected lookup failures are mapped to 'ERROR'.
"""
# Declare empty pid list for storing pids
qs5_pid_list = []

i = 0
# Iterate over pretty_soup_list to extract pid
for each in qs5_soup_list:
    converted_each = BeautifulSoup(each, "html.parser") # soups were stored as prettified strings, so re-parse
    pid = converted_each.find("a", itemprop="url")
    try:
        qs5_pid_list.append(pid['href']) # select href attribute to get pid url
    except (TypeError, KeyError):
        # find() returned None (no matching tag) or the tag has no href
        qs5_pid_list.append('ERROR')
    i+=1
    if (i % 100 == 0):
        print(i)
        
# Store qs5_pid_list with pickle
with open('qs5_pid_list.pkl', 'wb') as f:
    pickle.dump(qs5_pid_list, f)
"""

# Step 7: Append PID values into DF + duplicate removal + manual re-search for inaccurate names + PID source (Automated/Manual) + removal of individuals w/out publications

In [4]:
# Load the cached list of dblp PID URLs ('ERROR' marks a failed lookup).
with open('qs5_pid_list.pkl', 'rb') as pid_file:
    qs5_pid_list = pickle.load(pid_file)

In [5]:
# Attach the dblp PID URLs as a new 'PID' column.
# NOTE(review): assumes qs5_pid_list has one entry per row of qs5_df, in the
# same order as the 'Faculty' column (duplicates are only removed afterwards)
# - confirm if the faculty lists are ever regenerated.
qs5_df['PID'] = qs5_pid_list

In [6]:
# Duplicate records were discovered later, so remove them here.
# Every column is cast to str first, rows that are identical across all
# columns are dropped, and the index is renumbered from 0.
# (Original note: 983 rows before pruning, 666 after.)
qs5_df = (
    qs5_df
    .astype(str)
    .drop_duplicates()
    .reset_index(drop=True)
)

In [7]:
# Rows whose automated lookup failed carry PID == 'ERROR' (70 records per the
# original note). Their names were exported once with the line below:
# qs5_df['Faculty'].loc[qs5_df['PID'] == 'ERROR'].to_excel("Error_Manual.xlsx")
# The workbook is read back here; the next step uses its 'Faculty' and 'PID' columns.
# NOTE(review): presumably the 'PID' column was filled in by hand after the export - confirm.
replacement_df = pd.read_excel("Error_Manual.xlsx")

In [8]:
# Create new column to indicate how each PID was obtained
qs5_df['PID Source'] = 'Automated'

# Fill in the manually-obtained PID info.
# The previous version assigned to the rows yielded by DataFrame.iterrows();
# those rows may be copies, so the writes were not guaranteed to reach qs5_df.
# Update the frame itself through .loc instead, and address columns by name
# rather than by position.
# Building the Faculty -> PID lookup once also avoids rescanning replacement_df
# for every row. If a name appears more than once, the last entry wins, which
# matches the old nested loop.
manual_pid_by_faculty = dict(zip(replacement_df['Faculty'], replacement_df['PID']))
has_manual_pid = qs5_df['Faculty'].isin(list(manual_pid_by_faculty))
qs5_df.loc[has_manual_pid, 'PID'] = (
    qs5_df.loc[has_manual_pid, 'Faculty'].map(manual_pid_by_faculty)
)
qs5_df.loc[has_manual_pid, 'PID Source'] = 'Manual'

# Remove rows where PID data is unavailable, usually because the individual
# has no publications
qs5_df = qs5_df[qs5_df['PID'] != 'UNAVAILABLE']

In [18]:
# Persist the Step 7 DataFrame (PIDs attached, de-duplicated, manual fixes applied).
with open('qs5_step7_df.pkl', 'wb') as df_file:
    pickle.dump(qs5_df, df_file)