In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import re
import pickle
# Lift pandas' display truncation: a value of None removes the limit on
# how many rows / columns are rendered.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# Step 1: Loading Provided Faculty Details

In [3]:
def load_faculty():
    """Read ``Faculty.xlsx`` and keep only the columns used downstream.

    Returns:
        pandas.DataFrame: the spreadsheet restricted to the columns
        Faculty, Position, Gender, Management, DBLP and Area (in that order).
    """
    wanted_columns = ['Faculty', 'Position', 'Gender', 'Management', 'DBLP', 'Area']
    workbook_df = pd.read_excel('Faculty.xlsx')
    return workbook_df[wanted_columns]

# Step 2: Scraping DBLP HTML

In [4]:
def scrape_dblp(faculty_df, timeout=30):
    """Download the XML variant of every DBLP page listed in ``faculty_df``.

    Args:
        faculty_df: DataFrame with a 'DBLP' column holding one profile URL
            (string) per row.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot hang the whole scrape.

    Returns:
        list[str]: the response text for each URL, in row order. The same
        list is also pickled to ``content_list.pkl`` in the working directory.
    """
    # Turn each profile link into its XML counterpart. Links without a
    # '.html' suffix get '.xml' appended; previously such a link reused the
    # URL of the preceding row (or raised if it was the first row).
    xml_list = []
    for url in faculty_df['DBLP']:
        if '.html' in url:
            xml_url = url.replace(".html", ".xml")
        elif url.endswith('.xml'):
            xml_url = url
        else:
            xml_url = url + ".xml"
        xml_list.append(xml_url)

    # Fetch every XML page, printing a progress marker every 10 requests.
    content_list = []
    for fetched, xml_url in enumerate(xml_list, start=1):
        content_list.append(requests.get(xml_url, timeout=timeout).text)
        if fetched % 10 == 0:
            print(fetched)

    # Cache the raw responses so later steps can be re-run without scraping.
    with open('content_list.pkl', 'wb') as f:
        pickle.dump(content_list, f)

    return content_list

# Step 3: Parsing DBLP HTML w/ BS4

In [None]:
def parse_dblp(content_list):
    """Parse each downloaded page and return its prettified markup.

    Args:
        content_list: iterable of raw page texts (as returned by scrape_dblp).

    Returns:
        list[str]: ``BeautifulSoup(page, "lxml").prettify()`` for every page,
        in input order. The list is also pickled to ``pretty_soup_list.pkl``.
    """
    # Parse with the lxml backend and keep only the re-serialised text.
    pretty_soup_list = [
        BeautifulSoup(raw_page, "lxml").prettify() for raw_page in content_list
    ]

    # Persist the prettified pages alongside the notebook.
    with open('pretty_soup_list.pkl', 'wb') as pickle_file:
        pickle.dump(pretty_soup_list, pickle_file)

    return pretty_soup_list

In [None]:
def parse_articles_conf(pretty_soup_list):
    """Extract publication tags and the DBLP display name from each page.

    Args:
        pretty_soup_list: iterable of prettified page texts (one per faculty
            member), as produced by parse_dblp.

    Returns:
        tuple: ``(all_article_list, faculty_dblp_name_list)`` where
        ``all_article_list[i]`` holds the ``<article>`` tags followed by the
        ``<inproceedings>`` tags of page ``i`` and ``faculty_dblp_name_list[i]``
        is the name found on that page ('' when none can be determined).
    """
    title_prefix = 'dblp: '
    all_article_list = []
    faculty_dblp_name_list = []

    for pretty_soup in pretty_soup_list:
        # The stored pages are plain strings, so they must be parsed again.
        soup = BeautifulSoup(pretty_soup, "lxml")

        individual_article_list = soup.find_all('article')
        individual_article_list += soup.find_all('inproceedings')
        all_article_list.append(individual_article_list)

        try:
            # Preferred source: the name attribute of the <dblpperson> root.
            faculty_dblp_name = soup.dblpperson['name']
        except (TypeError, KeyError):
            # No <dblpperson> tag (lookup yields None) or no name attribute:
            # fall back to the page title, which looks like "dblp: <name>".
            title_tag = soup.title
            faculty_dblp_name = title_tag.text.strip() if title_tag is not None else ''
            # Remove the literal prefix only. str.strip('dblp: ') treats its
            # argument as a character set and also eats trailing d/b/l/p
            # letters of the name itself.
            if faculty_dblp_name.startswith(title_prefix):
                faculty_dblp_name = faculty_dblp_name[len(title_prefix):].strip()
        faculty_dblp_name_list.append(faculty_dblp_name)

    return all_article_list, faculty_dblp_name_list

# Step 4: Dataframe Creation & Population w/ Parsed DBLP Data

In [None]:
# Column layout of the publications frame that df_population fills in.
COLUMN_NAMES = ['f_index', 'Faculty', 'key', 'Year', 'Full Authors List']

# Start from an empty frame that already carries those columns.
dblp_df = pd.DataFrame(columns=COLUMN_NAMES)

In [None]:
# Shape of the parsed input consumed by df_population:
#   all_article_list[i]    -> list of publication tags for faculty member i
#   all_article_list[i][j] -> one individual publication tag

def df_population(dblp_df, all_article_list=None, faculty_df=None):
    """Append one row per parsed publication to ``dblp_df``.

    Args:
        dblp_df: frame to extend (normally the empty COLUMN_NAMES template).
            It is not modified; a new frame is returned.
        all_article_list: per-faculty lists of parsed publication tags. Each
            tag must support ``tag["key"]``, ``tag.year.text`` and
            ``tag.find_all('author')``. When omitted, the notebook-level
            variable ``all_article_list`` is used, so existing
            ``df_population(dblp_df)`` calls keep working.
        faculty_df: frame with a 'Faculty' column whose row order matches
            ``all_article_list``. When omitted, the notebook-level
            ``faculty_df`` is used.

    Returns:
        pandas.DataFrame: ``dblp_df`` plus the new rows, with 'Faculty' filled
        from ``faculty_df`` via the positional 'f_index'. The result is also
        pickled to ``dblp_12k_df.pkl``.
    """
    # Fall back to the notebook globals the original implementation relied on.
    if all_article_list is None:
        all_article_list = globals()['all_article_list']
    if faculty_df is None:
        faculty_df = globals()['faculty_df']

    # Collect all rows first and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    records = [
        {
            'f_index': faculty_index,
            'Faculty': '',
            'key': article["key"],
            'Year': article.year.text.strip(),
            'Full Authors List': [author.text.strip() for author in article.find_all('author')],
        }
        for faculty_index, articles in enumerate(all_article_list)
        for article in articles
    ]

    if records:
        # dtype=object keeps the new rows compatible with an empty
        # object-dtype template, independent of pandas' empty-frame rules.
        new_rows = pd.DataFrame(records, dtype=object)
        dblp_df = pd.concat([dblp_df, new_rows], ignore_index=True)
    else:
        dblp_df = dblp_df.copy()

    # Map the positional faculty index back to the spreadsheet name.
    faculty_dict_mapping = dict(enumerate(faculty_df['Faculty']))
    dblp_df['Faculty'] = dblp_df['f_index'].map(faculty_dict_mapping)

    # Store dblp_df with pickle
    with open('dblp_12k_df.pkl', 'wb') as f:
        pickle.dump(dblp_df, f)
    return dblp_df

# Run from here onwards

# Step 5: Dataframe Post-Processing (NLP & Metrics)

In [5]:
# Retrieve dblp_df with pickle
# 'dblp_12k_df.pkl' is the file df_population() writes, so loading it here
# restores the populated frame without re-running the earlier steps.
# NOTE(review): pickle.load can execute code embedded in the file -- only
# load a pickle that this notebook produced.
with open('dblp_12k_df.pkl', 'rb') as f:
    dblp_df = pickle.load(f)  

In [6]:
def find_name_form_in_list(fac_name, authors_list):
    """Return the entry of ``authors_list`` that best matches ``fac_name``.

    The faculty name is split into tokens on spaces and hyphens; each author
    string is scored by how many of those tokens occur in it as substrings.

    Args:
        fac_name: faculty name as a string.
        authors_list: sequence of author name strings for one publication.

    Returns:
        The highest-scoring author string. Ties (including the case where
        nothing matches at all) resolve to the earliest entry. ``None`` is
        returned for an empty ``authors_list`` instead of raising IndexError.
    """
    if len(authors_list) == 0:
        return None

    name_tokens = fac_name.replace("-", " ").split(" ")

    def match_count(author):
        # Number of faculty-name tokens contained in this author string.
        return sum(1 for token in name_tokens if token in author)

    # max() keeps the first of several equally good candidates, which is the
    # same tie-breaking as a strict "count > best_count" scan.
    return max(authors_list, key=match_count)

# Spot-check on the first row: positional columns 1 and 4 are passed as
# (fac_name, authors_list).
find_name_form_in_list(*dblp_df.iloc[0, [1, 4]])

'A. S. Madhukumar'

In [7]:
# For every row, record which entry of the author list corresponds to the
# faculty member.
dblp_df["published_name"] = dblp_df.apply(
    lambda record: find_name_form_in_list(
        fac_name=record["Faculty"],
        authors_list=record["Full Authors List"],
    ),
    axis=1,
)

In [9]:
def ci_calculation(dblp_df):
    """Add an 'Author Contribution Index' column to ``dblp_df``.

    For each row the index is the 1-based position of 'published_name' inside
    'Full Authors List' (1 = first author), or '-' when the name is not in
    that list.

    Args:
        dblp_df: frame with 'published_name' and 'Full Authors List' columns.

    Returns:
        The same ``dblp_df`` object, modified in place with the new column.
    """
    def contribution_index(published_name, full_authors):
        # '-' marks rows where the name cannot be located among the authors.
        if published_name not in full_authors:
            return '-'
        return full_authors.index(published_name) + 1

    dblp_df['Author Contribution Index'] = [
        contribution_index(name, authors)
        for name, authors in zip(dblp_df['published_name'], dblp_df['Full Authors List'])
    ]
    return dblp_df

In [None]:
# Preview the last two rows of dblp_df.
dblp_df.tail(2)

In [None]:
# Gather each row's full author list into a plain Python list. Intended as the
# starting point for an 'Other Authors' column from which the faculty member's
# own name is pruned row by row (the DBLP name differs from the Faculty name).
authors_list = list(dblp_df['Full Authors List'])

In [None]:
# Preview the last two rows of dblp_df.
dblp_df.tail(2)

In [15]:
def prune_name_from_other_authors_list(fac_name, authors_list):
    """Return ``authors_list`` without the entry that best matches ``fac_name``.

    Matching works on tokens: the faculty name is split on spaces and hyphens
    and each author string is scored by how many tokens it contains. The
    earliest highest-scoring entry is dropped (the first entry when nothing
    matches).

    The previous version re-applied itself to every row of the global
    ``dblp_df`` from inside its own body, which recursed without bound, and it
    removed the name in place from lists that were still shared with the
    'Full Authors List' column (``Series.copy()`` does not copy the lists
    stored in the cells). This version has no side effects; build the column
    from the caller instead, e.g.::

        dblp_df['Other Authors'] = dblp_df.apply(
            lambda row: prune_name_from_other_authors_list(
                row["Faculty"], row["Full Authors List"]),
            axis=1)

    Args:
        fac_name: faculty name as a string.
        authors_list: sequence of author name strings; left unmodified.

    Returns:
        list: a new list with the best-matching author removed; ``[]`` when
        ``authors_list`` is empty.
    """
    if len(authors_list) == 0:
        return []

    name_tokens = fac_name.replace("-", " ").split(" ")

    best_i = 0
    best_count = 0
    for i, author in enumerate(authors_list):
        # Score = number of faculty-name tokens found in this author string.
        count = sum(1 for token in name_tokens if token in author)
        if count > best_count:
            best_count = count
            best_i = i

    # Copy everything except the matched position so the input stays intact.
    return [author for i, author in enumerate(authors_list) if i != best_i]

# Placeholder: meant to prune each faculty member's own DBLP name from an
# 'Other Authors' column row by row (the DBLP name differs from the Faculty
# name); for now this only emits a blank line.
print()




# Step 6: Dataframe to CSV Output

In [27]:
# Write the processed frame to CSV, leaving out the row index.
dblp_df.to_csv('dblp_df_2.csv', index=False)

In [None]:
# Show the final frame as the cell output.
dblp_df