In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import re
import pickle

In [2]:
# Lift pandas' display truncation so frames are rendered with every row and column
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, None)

In [3]:
# Columns of the faculty spreadsheet that the rest of the notebook relies on
relevant_columns = ['Faculty', 'Position', 'Gender', 'Management', 'DBLP', 'Area']

# Load the faculty spreadsheet and keep only those columns, in the order listed above
faculty_df = pd.read_excel('Faculty.xlsx')[relevant_columns]

# Optional: the next four cells rebuild the pickle caches; skip them if `pretty_soup_list.pkl` already exists

In [None]:
# Build the list of DBLP links pointing at the XML variant of each faculty page:
# every ".html" in a link from faculty_df['DBLP'] is swapped for ".xml".
xml_list = [dblp_link.replace(".html", ".xml") for dblp_link in faculty_df['DBLP']]

In [None]:
# Declare list to store the raw response text of every faculty page
content_list = []

# Upper bound (in seconds) for each request. Without a timeout, requests waits
# indefinitely on a stalled connection and the whole crawl hangs.
REQUEST_TIMEOUT_SECONDS = 30

# Iterate over xml_list and make one GET request per link to fetch the raw page content
for i, each in enumerate(xml_list, start=1):
    html_content = requests.get(each, timeout=REQUEST_TIMEOUT_SECONDS).text
    content_list.append(html_content)
    # Print a progress marker after every 10th page
    if i % 10 == 0:
        print(i)

# Store content_list with pickle so the pages do not have to be downloaded again
with open('content_list.pkl', 'wb') as f:
    pickle.dump(content_list, f)

In [None]:
# Restore the previously downloaded page contents from the local pickle cache.
# NOTE: pickle.load executes arbitrary code from the file, so only load caches
# produced by this notebook.
with open('content_list.pkl', 'rb') as cache_file:
    content_list = pickle.load(cache_file)

In [None]:
# Parse every downloaded page with the lxml parser and keep its prettified string form
pretty_soup_list = [
    BeautifulSoup(raw_content, "lxml").prettify()
    for raw_content in content_list
]

# Persist the prettified pages so later runs can start from the cached pickle
with open('pretty_soup_list.pkl', 'wb') as cache_file:
    pickle.dump(pretty_soup_list, cache_file)

# Run from here onwards

In [4]:
# Restore the prettified pages from the local pickle cache.
# NOTE: pickle.load executes arbitrary code from the file, so only load caches
# produced by this notebook.
with open('pretty_soup_list.pkl', 'rb') as cache_file:
    pretty_soup_list = pickle.load(cache_file)

In [5]:
# all_article_list[i]       -> list of <article> tags found on faculty member i's page
# faculty_dblp_name_list[i] -> the name DBLP uses for faculty member i
all_article_list = []
faculty_dblp_name_list = []

# Literal prefix removed from the page <title> when it is used as the fallback
# name source (the title is expected to look like "dblp: <name>").
DBLP_TITLE_PREFIX = 'dblp: '

# Iterate over pretty_soup_list and extract the DBLP names, because the names
# given in the spreadsheet are not always the same as the ones DBLP uses.
for each in pretty_soup_list:
    # The cached entries are prettified strings, so they have to be parsed again
    converted_each = BeautifulSoup(each, "lxml")
    individual_article_list = converted_each.find_all('article')
    all_article_list.append(individual_article_list)

    # Preferred source: the name attribute of the <dblpperson> element
    person_tag = converted_each.dblpperson
    if person_tag is not None and person_tag.has_attr('name'):
        faculty_dblp_name = person_tag['name']
    else:
        # Fallback: the page <title>. Remove the prefix as a whole string;
        # str.strip('dblp: ') would instead strip any of the characters
        # d, b, l, p, ':' and ' ' from BOTH ends and truncate names such as
        # "... Maskell" or "David ...".
        faculty_dblp_name = converted_each.title.text.strip()
        if faculty_dblp_name.startswith(DBLP_TITLE_PREFIX):
            faculty_dblp_name = faculty_dblp_name[len(DBLP_TITLE_PREFIX):].strip()

    # Exactly one name is appended per page so the list stays index-aligned with pretty_soup_list
    faculty_dblp_name_list.append(faculty_dblp_name)

In [6]:
'''
all_article_list[0] <- Faculty, List Containing Articles
all_article_list[0][0] <- Faculty, Individual Articles
'''


def _child_text(tag, child_name):
    """Return the stripped text of the first ``child_name`` element inside ``tag``.

    Returns None when ``tag`` has no such element, so that a single article
    lacking e.g. a <volume> does not abort the whole extraction with an
    AttributeError.
    """
    child = tag.find(child_name)
    return child.text.strip() if child is not None else None


# Position of the current faculty member in all_article_list; used later to map
# every article row back to its faculty member.
faculty_index = 0

# Parallel lists, one entry per article, that become the DataFrame columns
article_key_list = []
article_mdate_list = []
faculty_index_list = []
title_list = []
year_list = []
volume_list = []
journal_list = []
authors_list = []
# NOTE: <pages> is deliberately not collected (it was found to be missing on some articles).

for faculty_articles in all_article_list:
    for article in faculty_articles:
        # Attributes of the <article> tag itself
        article_key_list.append(article["key"])
        article_mdate_list.append(article["mdate"])
        faculty_index_list.append(faculty_index)
        # Child elements; None is stored when the element is absent
        title_list.append(_child_text(article, 'title'))
        year_list.append(_child_text(article, 'year'))
        volume_list.append(_child_text(article, 'volume'))
        journal_list.append(_child_text(article, 'journal'))
        # All <author> elements, in the order they appear in the article
        authors_list.append([author.text.strip() for author in article.find_all('author')])
    faculty_index += 1

In [7]:
# Declare DF for DBLP
dblp_df = pd.DataFrame()

# Map positional faculty index -> faculty name as written in the spreadsheet
faculty_dict_mapping = dict(zip(range(len(faculty_df['Faculty'])), faculty_df['Faculty']))

# Map the same positional index -> the name extracted from the faculty member's DBLP page
faculty_dblp_name_dict_mapping = dict(zip(range(len(faculty_df['Faculty'])), faculty_dblp_name_list))

# Fill up dblp_df (one row per article)
dblp_df['f_index'] = faculty_index_list
dblp_df['Faculty'] = dblp_df['f_index'].map(faculty_dict_mapping)
dblp_df['DBLP Name'] = dblp_df['f_index'].map(faculty_dblp_name_dict_mapping)
dblp_df['key'] = article_key_list
dblp_df['mdate'] = article_mdate_list
dblp_df['Title'] = title_list
dblp_df['Year'] = year_list
# 'Volume' (volume_list) is intentionally left out of the frame
dblp_df['Journal'] = journal_list

# Build the three author-related columns in a single pass.
#
# 'Other Authors' and 'Full Authors List' must be DISTINCT list objects: if both
# columns hold the same lists, removing the faculty member from 'Other Authors'
# silently removes them from 'Full Authors List' as well. Fresh lists are also
# built instead of mutating authors_list, so re-running this cell gives the
# same result.
contribution_index_list = []
other_authors_list = []
full_authors_list = []
for authors, dblp_name, faculty_name in zip(authors_list, dblp_df['DBLP Name'], dblp_df['Faculty']):
    # Prefer the DBLP spelling of the name and fall back to the spreadsheet spelling
    if dblp_name in authors:
        own_name = dblp_name
    elif faculty_name in authors:
        own_name = faculty_name
    else:
        own_name = None

    full_authors_list.append(list(authors))
    if own_name is None:
        # Faculty member not found among the authors under either name
        contribution_index_list.append('-')
        other_authors_list.append(list(authors))
    else:
        own_position = authors.index(own_name)
        # 1-based position in the author list; 1 = first (main) author
        contribution_index_list.append(own_position + 1)
        # Everyone except the (first occurrence of the) faculty member
        other_authors_list.append(authors[:own_position] + authors[own_position + 1:])

dblp_df['Other Authors'] = other_authors_list
dblp_df['Full Authors List'] = full_authors_list
dblp_df['Author Contribution Index'] = contribution_index_list

In [14]:
# Preview the first row of the assembled frame as a sanity check
dblp_df.head(n=1)

Unnamed: 0,f_index,Faculty,DBLP Name,key,mdate,Title,Year,Journal,Other Authors,Full Authors List,Author Contribution Index
0,0,A S Madhukumar,A. S. Madhukumar,journals/cssp/MathewSVM20,2020-10-20,An Adaptive Energy Detection Scheme with Real-...,2020,Circuits Syst. Signal Process.,"[Libin K. Mathew, Shanker Shreejith, A. Prasad...","[Libin K. Mathew, Shanker Shreejith, A. Prasad...",4


In [10]:
# dblp_df.loc[dblp_df['Author Contribution Index'] == '-']

In [11]:
# Export the assembled frame to CSV without writing the row index as a column
output_csv_path = r'dblp_df.csv'
dblp_df.to_csv(output_csv_path, index=False)