In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import re
import pickle

In [2]:
pd.set_option("display.max_rows", None, "display.max_columns", None)

In [3]:
# Import faculty details into df
faculty_df = pd.read_excel('Faculty.xlsx')

# Select relevant columns
faculty_df = faculty_df[['Faculty', 'Position', 'Gender', 'Management', 'DBLP', 'Area']]

# Don't need to run this code for now

In [4]:
# Create empty list for storing modified DBLP link to access XML variant w/ their API
xml_list = []

# Iterate over faculty_df and replace .html w/ .xml - updated to append .xml for missing .html cases
for each in faculty_df['DBLP']:
    if '.html' in each:
        replaced_each = each.replace(".html", ".xml")
    xml_list.append(replaced_each)

In [5]:
# Declare list to store extracted content
content_list = []

i = 0
# Iterate using q_list to make a GET request to fetch raw HTML content
for each in xml_list:
    html_content = requests.get(each).text
    content_list.append(html_content)
    i+=1
    if (i % 10 == 0):
        print(i)
    
# Store content_list with pickle
with open('content_list.pkl', 'wb') as f:
    pickle.dump(content_list, f)    

10
20
30
40
50
60
70
80


In [6]:
# Retrieve content_list with pickle
with open('content_list.pkl', 'rb') as f:
    content_list = pickle.load(f)

In [7]:
# Declare empty list for storing soups
pretty_soup_list = []

for each in content_list:
    soup = BeautifulSoup(each, "lxml")
    pretty_soup_list.append(soup.prettify())

# Store pretty_soup_list with pickle
with open('pretty_soup_list.pkl', 'wb') as f:
    pickle.dump(pretty_soup_list, f)

# Run from here onwards

In [8]:
# Retrieve pretty_soup_list with pickle
with open('pretty_soup_list.pkl', 'rb') as f:
    pretty_soup_list = pickle.load(f)   

In [9]:
# Declare empty all_article_list
all_article_list = []
faculty_dblp_name_list = []

# Iterate over pretty_soup_list + extract names because given names in excel aren't same as DBLP lmao
for each in pretty_soup_list:
    converted_each = BeautifulSoup(each, "lxml") # need to convert lmao
    individual_article_list = converted_each.find_all('article')
    #individual_article_list += converted_each.find_all('inproceedings')
    all_article_list.append(individual_article_list)
    try:
        faculty_dblp_name = converted_each.dblpperson['name']
    except:
        faculty_dblp_name = converted_each.title.text.strip().strip('dblp: ') # omg cancerous code sorry
    finally:
        faculty_dblp_name_list.append(faculty_dblp_name)

In [10]:
'''
all_article_list[0] <- Faculty, List Containing Articles
all_article_list[0][0] <- Faculty, Individual Articles
'''

# Index for doing dict mapping later
faculty_index = 0

# declare empty lists for DF
article_key_list = []
article_mdate_list = []
faculty_index_list = []
title_list = []
#pages_list = []
year_list = []
#volume_list = []
#journal_list = []
authors_list = []

for each in all_article_list:
    for article in each:
        # Article Tag Extraction w/ Array Indexing
        article_key = article["key"]
        article_mdate = article["mdate"]
        # Strip processing 
        stripped_title = article.title.text.strip()
        #stripped_pages = article.pages.text.strip() <- apparently we have null pages somewhere?
        stripped_year = article.year.text.strip()
        #stripped_volume = article.volume.text.strip()
        #stripped_journal = article.journal.text.strip() 
        stripped_authors = [each.text.strip() for each in article.find_all('author')] # list comprehension; bad space and time complexity  
        # List appendage
        article_key_list.append(article_key)
        article_mdate_list.append(article_mdate)
        faculty_index_list.append(faculty_index)
        title_list.append(stripped_title)
        #pages_list.append(stripped_pages)
        year_list.append(stripped_year)
        #volume_list.append(stripped_volume)
        #journal_list.append(stripped_journal)
        authors_list.append(stripped_authors)
    faculty_index+=1

In [11]:
# Declare DF for DBLP
dblp_df = pd.DataFrame()

# Create dict mapping for Faculty, len is used as f_index.
faculty_dict_mapping = dict(zip(range(len(faculty_df['Faculty'])), faculty_df['Faculty'],))

# Create another dict mapping for actual DBLP names used for faculty members, len is used f_index
faculty_dblp_name_dict_mapping = dict(zip(range(len(faculty_df['Faculty'])), faculty_dblp_name_list,))

# Fill up dblp_DF
dblp_df['f_index'] = faculty_index_list
dblp_df['Faculty'] = dblp_df['f_index'].map(faculty_dict_mapping)
#dblp_df['DBLP Name'] = dblp_df['f_index'].map(faculty_dblp_name_dict_mapping)
dblp_df['key'] = article_key_list
dblp_df['mdate'] = article_mdate_list
dblp_df['Title'] = title_list
dblp_df['Year'] = year_list
#dblp_df['Volume'] = volume_list
#dblp_df['Journal'] = journal_list
#dblp_df['Other Authors'] = authors_list
dblp_df['Full Authors List'] = authors_list

'''
# Code for inserting contribution into DF
contribution_index_list = []
for i, row in dblp_df.iterrows():
    if (row['DBLP Name'] in row['Full Authors List']): # check if DBLP name exists in Full Authors List
        ci = row['Full Authors List'].index(row['DBLP Name'])+1 # if so, retrieve index, +1 (to acccount for 0), then append to contribution_index_list
    elif (row['Faculty'] in row['Full Authors List']): # check if Faculty name exists in Full Authors List
        ci = row['Full Authors List'].index(row['Faculty'])+1
    else:
        ci = '-'
    contribution_index_list.append(ci)
dblp_df['Author Contribution Index'] = contribution_index_list # assigns a value based on how much contribution the author has made for a publication. 1 = Highest (Main)

# Use DBLP Name Column to prune duplicate author info from 'Other Authors'colmumn row-by-row (because DBLP name is not same as Faculty name lmao)
for i, row in dblp_df.iterrows():
    if (row['DBLP Name'] in row['Other Authors']): # check if DBLP faculty name exists in Other Authors.
        row['Other Authors'].remove(row['DBLP Name']) # if so, remove from Other Authors
    elif (row['Faculty'] in row['Other Authors']): # then check if faculty name exists in Other Authors
        row['Other Authors'].remove(row['Faculty']) # if so, remove from Other Authors
'''

"\n# Code for inserting contribution into DF\ncontribution_index_list = []\nfor i, row in dblp_df.iterrows():\n    if (row['DBLP Name'] in row['Full Authors List']): # check if DBLP name exists in Full Authors List\n        ci = row['Full Authors List'].index(row['DBLP Name'])+1 # if so, retrieve index, +1 (to acccount for 0), then append to contribution_index_list\n    elif (row['Faculty'] in row['Full Authors List']): # check if Faculty name exists in Full Authors List\n        ci = row['Full Authors List'].index(row['Faculty'])+1\n    else:\n        ci = '-'\n    contribution_index_list.append(ci)\ndblp_df['Author Contribution Index'] = contribution_index_list # assigns a value based on how much contribution the author has made for a publication. 1 = Highest (Main)\n\n# Use DBLP Name Column to prune duplicate author info from 'Other Authors'colmumn row-by-row (because DBLP name is not same as Faculty name lmao)\nfor i, row in dblp_df.iterrows():\n    if (row['DBLP Name'] in row['Oth

In [12]:
dblp_df.loc[dblp_df['Author Contribution Index'] == '-']

KeyError: 'Author Contribution Index'

In [13]:
dblp_df.head()

Unnamed: 0,f_index,Faculty,key,mdate,Title,Year,Full Authors List
0,0,A S Madhukumar,journals/cssp/MathewSVM20,2020-10-20,An Adaptive Energy Detection Scheme with Real-...,2020,"[Libin K. Mathew, Shanker Shreejith, A. Prasad..."
1,0,A S Madhukumar,journals/ijscn/SiriginaMB20,2020-08-12,Analysis of heterogeneous satellite networks w...,2020,"[Rajendra Prasad Sirigina, A. S. Madhukumar, M..."
2,0,A S Madhukumar,journals/ijscn/SiriginaMB20a,2020-08-12,Terrestrial Relay-Aided Cooperative High Throu...,2020,"[Rajendra Prasad Sirigina, A. S. Madhukumar, M..."
3,0,A S Madhukumar,journals/taes/RamabadranMW20,2020-05-04,Blind Estimation of Code Parameters for Produc...,2020,"[Swaminathan Ramabadran, A. S. Madhukumar, Guo..."
4,0,A S Madhukumar,journals/tgcn/RaoMS20,2020-06-18,Wireless Energy Harvesting-Based Relaying: A F...,2020,"[Yepuri Sudhakara Rao, A. S. Madhukumar, Rajen..."


In [14]:
def find_name_form_in_list(fac_name, authors_list):
    mod_fac_name = fac_name.replace("-", " ")
    mod_fac_name_list = mod_fac_name.split(" ")
    #print(mod_fac_name_list)
    best_i = 0
    best_count = 0
    
    for i in range(len(authors_list)):
        #check matchability
        author = authors_list[i]
        count = 0
        for w in mod_fac_name_list:
            if w in author:
                count += 1
        if count > best_count:
            best_count = count 
            best_i = i
    return authors_list[best_i]

find_name_form_in_list(dblp_df.iloc[0, 1], dblp_df.iloc[0,6])

['A', 'S', 'Madhukumar']


'A. S. Madhukumar'

In [16]:
dblp_df["published_name"] = dblp_df.apply(lambda row: find_name_form_in_list(row["Faculty"], row["Full Authors List"]), axis = 1)

['A', 'S', 'Madhukumar']
['A', 'S', 'Madhukumar']
['A', 'S', 'Madhukumar']
['A', 'S', 'Madhukumar']
['A', 'S', 'Madhukumar']
['A', 'S', 'Madhukumar']
['A', 'S', 'Madhukumar']
['A', 'S', 'Madhukumar']
['A', 'S', 'Madhukumar']
['A', 'S', 'Madhukumar']
['A', 'S', 'Madhukumar']
['A', 'S', 'Madhukumar']
['A', 'S', 'Madhukumar']
['A', 'S', 'Madhukumar']
['A', 'S', 'Madhukumar']
['A', 'S', 'Madhukumar']
['A', 'S', 'Madhukumar']
['A', 'S', 'Madhukumar']
['A', 'S', 'Madhukumar']
['A', 'S', 'Madhukumar']
['A', 'S', 'Madhukumar']
['A', 'S', 'Madhukumar']
['A', 'S', 'Madhukumar']
['A', 'S', 'Madhukumar']
['A', 'S', 'Madhukumar']
['A', 'S', 'Madhukumar']
['A', 'S', 'Madhukumar']
['A', 'S', 'Madhukumar']
['A', 'S', 'Madhukumar']
['A', 'S', 'Madhukumar']
['A', 'S', 'Madhukumar']
['A', 'S', 'Madhukumar']
['A', 'S', 'Madhukumar']
['A', 'S', 'Madhukumar']
['A', 'S', 'Madhukumar']
['A', 'S', 'Madhukumar']
['A', 'S', 'Madhukumar']
['A', 'S', 'Madhukumar']
['A', 'S', 'Madhukumar']
['A', 'S', 'Madhukumar']


['Dusit', 'Tao', 'Niyato']
['Dusit', 'Tao', 'Niyato']
['Dusit', 'Tao', 'Niyato']
['Dusit', 'Tao', 'Niyato']
['Dusit', 'Tao', 'Niyato']
['Dusit', 'Tao', 'Niyato']
['Dusit', 'Tao', 'Niyato']
['Dusit', 'Tao', 'Niyato']
['Dusit', 'Tao', 'Niyato']
['Dusit', 'Tao', 'Niyato']
['Dusit', 'Tao', 'Niyato']
['Dusit', 'Tao', 'Niyato']
['Dusit', 'Tao', 'Niyato']
['Dusit', 'Tao', 'Niyato']
['Dusit', 'Tao', 'Niyato']
['Dusit', 'Tao', 'Niyato']
['Dusit', 'Tao', 'Niyato']
['Dusit', 'Tao', 'Niyato']
['Dusit', 'Tao', 'Niyato']
['Dusit', 'Tao', 'Niyato']
['Dusit', 'Tao', 'Niyato']
['Dusit', 'Tao', 'Niyato']
['Dusit', 'Tao', 'Niyato']
['Dusit', 'Tao', 'Niyato']
['Dusit', 'Tao', 'Niyato']
['Dusit', 'Tao', 'Niyato']
['Dusit', 'Tao', 'Niyato']
['Dusit', 'Tao', 'Niyato']
['Dusit', 'Tao', 'Niyato']
['Dusit', 'Tao', 'Niyato']
['Dusit', 'Tao', 'Niyato']
['Dusit', 'Tao', 'Niyato']
['Dusit', 'Tao', 'Niyato']
['Dusit', 'Tao', 'Niyato']
['Dusit', 'Tao', 'Niyato']
['Dusit', 'Tao', 'Niyato']
['Dusit', 'Tao', 'Niyato']
[

['Siew', 'Kei', 'Lam']
['Siew', 'Kei', 'Lam']
['Siew', 'Kei', 'Lam']
['Siew', 'Kei', 'Lam']
['Siew', 'Kei', 'Lam']
['Siew', 'Kei', 'Lam']
['Siew', 'Kei', 'Lam']
['Siew', 'Kei', 'Lam']
['Siew', 'Kei', 'Lam']
['Siew', 'Kei', 'Lam']
['Siew', 'Kei', 'Lam']
['Siew', 'Kei', 'Lam']
['Siew', 'Kei', 'Lam']
['Siew', 'Kei', 'Lam']
['Siew', 'Kei', 'Lam']
['Siew', 'Kei', 'Lam']
['Siew', 'Kei', 'Lam']
['Siew', 'Kei', 'Lam']
['Siew', 'Kei', 'Lam']
['Siew', 'Kei', 'Lam']
['Siew', 'Kei', 'Lam']
['Siew', 'Kei', 'Lam']
['Siew', 'Kei', 'Lam']
['Siew', 'Kei', 'Lam']
['Siew', 'Kei', 'Lam']
['Siew', 'Kei', 'Lam']
['Siew', 'Kei', 'Lam']
['Siew', 'Kei', 'Lam']
['Siew', 'Kei', 'Lam']
['Siew', 'Kei', 'Lam']
['Siew', 'Kei', 'Lam']
['Siew', 'Kei', 'Lam']
['Siew', 'Kei', 'Lam']
['Siew', 'Kei', 'Lam']
['Siew', 'Kei', 'Lam']
['Siew', 'Kei', 'Lam']
['Siew', 'Kei', 'Lam']
['Siew', 'Kei', 'Lam']
['Siew', 'Kei', 'Lam']
['Siew', 'Kei', 'Lam']
['Siew', 'Kei', 'Lam']
['Siew', 'Kei', 'Lam']
['Siew', 'Kei', 'Lam']
['Siew', 'K

['Hock', 'Soon', 'Seah']
['Hock', 'Soon', 'Seah']
['Hock', 'Soon', 'Seah']
['Hock', 'Soon', 'Seah']
['Hock', 'Soon', 'Seah']
['Hock', 'Soon', 'Seah']
['Hock', 'Soon', 'Seah']
['Hock', 'Soon', 'Seah']
['Hock', 'Soon', 'Seah']
['Hock', 'Soon', 'Seah']
['Hock', 'Soon', 'Seah']
['Hock', 'Soon', 'Seah']
['Hock', 'Soon', 'Seah']
['Hock', 'Soon', 'Seah']
['Hock', 'Soon', 'Seah']
['Hock', 'Soon', 'Seah']
['Hock', 'Soon', 'Seah']
['Hock', 'Soon', 'Seah']
['Hock', 'Soon', 'Seah']
['Hock', 'Soon', 'Seah']
['Hock', 'Soon', 'Seah']
['Hock', 'Soon', 'Seah']
['Hock', 'Soon', 'Seah']
['Hock', 'Soon', 'Seah']
['Hock', 'Soon', 'Seah']
['Hock', 'Soon', 'Seah']
['Hock', 'Soon', 'Seah']
['Hock', 'Soon', 'Seah']
['Hock', 'Soon', 'Seah']
['Hock', 'Soon', 'Seah']
['Hock', 'Soon', 'Seah']
['Hock', 'Soon', 'Seah']
['Hock', 'Soon', 'Seah']
['Hock', 'Soon', 'Seah']
['Hock', 'Soon', 'Seah']
['Hock', 'Soon', 'Seah']
['Hock', 'Soon', 'Seah']
['Hock', 'Soon', 'Seah']
['Hock', 'Soon', 'Seah']
['Hock', 'Soon', 'Seah']


['Zheng', 'Jianmin']
['Zheng', 'Jianmin']
['Zheng', 'Jianmin']
['Zheng', 'Jianmin']
['Zheng', 'Jianmin']
['Zheng', 'Jianmin']
['Zheng', 'Jianmin']
['Zheng', 'Jianmin']
['Zheng', 'Jianmin']
['Zheng', 'Jianmin']
['Zheng', 'Jianmin']
['Zheng', 'Jianmin']
['Zheng', 'Jianmin']
['Zheng', 'Jianmin']
['Zheng', 'Jianmin']
['Zheng', 'Jianmin']
['Zheng', 'Jianmin']
['Zheng', 'Jianmin']
['Zheng', 'Jianmin']
['Zheng', 'Jianmin']
['Zheng', 'Jianmin']
['Zheng', 'Jianmin']
['Zheng', 'Jianmin']
['Zheng', 'Jianmin']
['Zheng', 'Jianmin']
['Zheng', 'Jianmin']
['Zheng', 'Jianmin']
['Zheng', 'Jianmin']
['Zheng', 'Jianmin']
['Zheng', 'Jianmin']
['Zheng', 'Jianmin']
['Zheng', 'Jianmin']
['Zheng', 'Jianmin']
['Zheng', 'Jianmin']
['Zheng', 'Jianmin']
['Zheng', 'Jianmin']
['Zheng', 'Jianmin']
['Zheng', 'Jianmin']
['Zheng', 'Jianmin']
['Zheng', 'Jianmin']
['Zheng', 'Jianmin']
['Zheng', 'Jianmin']
['Zheng', 'Jianmin']
['Zheng', 'Jianmin']
['Zheng', 'Jianmin']
['Zheng', 'Jianmin']
['Zheng', 'Jianmin']
['Zheng', 'Ji

In [None]:
dblp_df.to_csv(r'dblp_df.csv', index = False)

In [23]:
dblp_df["published_name"]

0                      A. S. Madhukumar
1                      A. S. Madhukumar
2                      A. S. Madhukumar
3                      A. S. Madhukumar
4                      A. S. Madhukumar
5                      A. S. Madhukumar
6                      A. S. Madhukumar
7                      A. S. Madhukumar
8                      A. S. Madhukumar
9                      A. S. Madhukumar
10                     A. S. Madhukumar
11                     A. S. Madhukumar
12                     A. S. Madhukumar
13                     A. S. Madhukumar
14                     A. S. Madhukumar
15                     A. S. Madhukumar
16                     A. S. Madhukumar
17                     A. S. Madhukumar
18                     A. S. Madhukumar
19                     A. S. Madhukumar
20                     A. S. Madhukumar
21                     A. S. Madhukumar
22                     A. S. Madhukumar
23                     A. S. Madhukumar
24                     A. S. Madhukumar


In [34]:
dblp_df

Unnamed: 0,f_index,Faculty,key,mdate,Title,Year,Full Authors List,published_name
0,0,A S Madhukumar,journals/cssp/MathewSVM20,2020-10-20,An Adaptive Energy Detection Scheme with Real-...,2020,"[Libin K. Mathew, Shanker Shreejith, A. Prasad...",A. S. Madhukumar
1,0,A S Madhukumar,journals/ijscn/SiriginaMB20,2020-08-12,Analysis of heterogeneous satellite networks w...,2020,"[Rajendra Prasad Sirigina, A. S. Madhukumar, M...",A. S. Madhukumar
2,0,A S Madhukumar,journals/ijscn/SiriginaMB20a,2020-08-12,Terrestrial Relay-Aided Cooperative High Throu...,2020,"[Rajendra Prasad Sirigina, A. S. Madhukumar, M...",A. S. Madhukumar
3,0,A S Madhukumar,journals/taes/RamabadranMW20,2020-05-04,Blind Estimation of Code Parameters for Produc...,2020,"[Swaminathan Ramabadran, A. S. Madhukumar, Guo...",A. S. Madhukumar
4,0,A S Madhukumar,journals/tgcn/RaoMS20,2020-06-18,Wireless Energy Harvesting-Based Relaying: A F...,2020,"[Yepuri Sudhakara Rao, A. S. Madhukumar, Rajen...",A. S. Madhukumar
5,0,A S Madhukumar,journals/tsp/TanMSK20,2020-08-06,NOMA-Aided UAV Communications over Correlated ...,2020,"[Zheng Hui Ernest Tan, A. S. Madhukumar, Rajen...",A. S. Madhukumar
6,0,A S Madhukumar,journals/tvlsi/MathewSVM20,2020-09-19,A Power-Efficient Spectrum-Sensing Scheme Usin...,2020,"[Libin K. Mathew, Shanker Shreejith, A. Prasad...",A. S. Madhukumar
7,0,A S Madhukumar,journals/vcomm/TanMSK20,2020-08-06,Addressing spectrum efficiency through hybrid-...,2020,"[Zheng Hui Ernest Tan, A. S. Madhukumar, Rajen...",A. S. Madhukumar
8,0,A S Madhukumar,journals/access/TanMSK19,2020-03-27,A Power Series Approach for Hybrid-Duplex UAV ...,2019,"[Zheng Hui Ernest Tan, A. S. Madhukumar, Rajen...",A. S. Madhukumar
9,0,A S Madhukumar,journals/access/SharmaMR19,2020-06-15,Effect of Pointing Errors on the Performance o...,2019,"[Shubha Sharma, A. S. Madhukumar, Swaminathan ...",A. S. Madhukumar


In [30]:
dblp_df.iloc[3055,:]

f_index                                                             41
Faculty                                                       Liu Yang
key                                       journals/corr/abs-1809-07689
mdate                                                       2018-10-05
Title                Response Time Bounds for Typed DAG Parallel Ta...
Year                                                              2018
Full Authors List    [Meiling Han, Nan Guan, Jinghao Sun, Qingqiang...
published_name                                             Weichen Liu
Name: 3055, dtype: object