In [57]:
import pandas as pd
import lucem_illud
from tqdm import tqdm
# register tqdm with pandas; the tokenization cells below call `progress_apply`,
# which is only available after this call
tqdm.pandas()

In [42]:
# load the raw publication records from csv into a dataframe;
# index_col=0 uses the first csv column as the row index
file_path = 'database/pub_info_2016_2022.csv'
df = pd.read_csv(file_path, index_col=0)

In [43]:
# rename the raw csv headers to the snake_case names used by the rest of the notebook;
# `df` is rebound (no inplace=True) to match how every other cell updates the frame
df = df.rename(columns={
    'Title': 'title',
    'Year': 'award_year',
    'Cited by': 'total_citation',
    'Paper URL': 'url',
    'Authors': 'authors',
    'Publication Date': 'publication_date',
    'Journal': 'journal',
    'Abstract': 'abstract',
    'Citations': 'yearly_citation',
})

In [44]:
# reorder columns: person fields first, then publication fields, then citation fields
df = df.loc[:, [
    'first_name', 'middle_name', 'last_name', 'email',
    'title', 'journal', 'publication_date', 'authors', 'abstract',
    'total_citation', 'yearly_citation', 'award_year', 'url',
]]

In [45]:
# clean 'journal' column

# drop rows where 'journal' is 'journal not found'
#df = df[df['journal'] != 'journal not found']

In [47]:
# clean 'publication_date' column

# option 1: drop rows where 'publication_date' is 'date not found'
#df = df[df['publication_date'] != 'date not found']

# option 2: replace 'date not found' in 'publication_date' with 'year/1/1'
def replace_date(row):
    """Return the row's publication date, filling in a placeholder when it is missing.

    Parameters
    ----------
    row : mapping-like (e.g. one DataFrame row passed by ``df.apply(..., axis=1)``)
        Must provide the keys 'publication_date' and 'award_year'.

    Returns
    -------
    The unchanged 'publication_date' value, or the string '<award_year>/1/1'
    when the value is the sentinel 'date not found'.
    """
    if row['publication_date'] == 'date not found':
        # the csv's 'Year' column was renamed to 'award_year' earlier in the
        # notebook, so looking up 'year' here would raise KeyError
        return f"{row['award_year']}/1/1"
    return row['publication_date']

# run replace_date once per row and overwrite the column with the results
df['publication_date'] = df.apply(replace_date, axis='columns')

In [None]:
# ensure the values in the 'publication_date' column are strings
#df['publication_date'] = df['publication_date'].astype(str)

# option 1: convert 'year' format to 'year/1/1' format
#df['publication_date'] = df['publication_date'].apply(lambda x: f"{x}/1/1" if x.isdigit() else x)

# option 2: drop rows where 'publication_date' only contains year
#df = df[~df['publication_date'].str.isnumeric()]

In [48]:
# clean 'authors' column

# turn the comma-separated author string of every row into a list of names
df['authors'] = df['authors'].str.split(pat=', ')

In [49]:
# clean 'total_citation' column

# option 1: replace missing values in 'total_citation' with 0
#df['total_citation'] = df['total_citation'].fillna(0) 

# option 2: drop rows where 'total_citation' is missing
#df = df.dropna(subset=['total_citation'])

In [51]:
# clean 'yearly_citation' column

# option 1: drop rows where 'yearly_citation' column contains an empty dictionary
#df = df[df['yearly_citation'].apply(lambda x: x != {})]

# option 2: update rows with an empty dictionary in 'yearly_citation' using 'award_year' as key and 'total_citation' as value
#df['yearly_citation'] = df.apply(lambda row: {row['award_year']: row['total_citation']} if row['yearly_citation'] == {} else row['yearly_citation'], axis=1)

In [53]:
# clean 'abstract' column

# keep only rows whose abstract is not the 'abstract not found' placeholder
# and has at least 20 whitespace-separated words
df = df[
    (df['abstract'] != 'abstract not found')
    & (df['abstract'].astype(str).str.split().map(len) >= 20)
]

In [54]:
# convert each author name to lowercase;
# values that are not lists (e.g. a NaN left behind by the earlier .str.split on a
# missing value) are passed through unchanged instead of raising TypeError
df['authors'] = df['authors'].apply(
    lambda names: [name.lower() for name in names] if isinstance(names, list) else names
)

# convert each journal name to lowercase (.str.lower leaves missing values as they are)
df['journal'] = df['journal'].str.lower()

In [None]:
# tokenize 'title' column: run sent_tokenize on each title, then word_tokenize
# on every item it returns (progress_apply reports progress through tqdm)
df['tokenized_title'] = df['title'].progress_apply(
    lambda text: [
        lucem_illud.word_tokenize(sentence)
        for sentence in lucem_illud.sent_tokenize(text)
    ]
)

In [60]:
# normalize 'tokenized_title' column: apply normalizeTokens to each inner token list
df['normalized_title'] = df['tokenized_title'].apply(
    lambda sentences: [
        lucem_illud.normalizeTokens(tokens)
        for tokens in sentences
    ]
)

In [None]:
# tokenize 'abstract' column: run sent_tokenize on each abstract, then word_tokenize
# on every item it returns (progress_apply reports progress through tqdm)
df['tokenized_abstract'] = df['abstract'].progress_apply(
    lambda text: [
        lucem_illud.word_tokenize(sentence)
        for sentence in lucem_illud.sent_tokenize(text)
    ]
)

In [63]:
# normalize 'tokenized_abstract' column: apply normalizeTokens to each inner token list
df['normalized_abstract'] = df['tokenized_abstract'].apply(
    lambda sentences: [
        lucem_illud.normalizeTokens(tokens)
        for tokens in sentences
    ]
)

In [64]:
# assign data types
# NOTE(review): the int64 cast raises if 'total_citation' still holds missing values;
# both clean-up options for that column are commented out above — confirm
df = df.astype({
    'first_name': 'object',
    'middle_name': 'object',
    'last_name': 'object',
    'email': 'object',
    'title': 'object',
    'journal': 'object',
    'authors': 'object',
    'abstract': 'object',
    'total_citation': 'int64',
    'yearly_citation': 'object',
    'award_year': 'category',
    'url': 'object',
})
df['publication_date'] = pd.to_datetime(df['publication_date'])

In [65]:
# reset index after the row filters above; drop=True discards the old index
# instead of keeping it as a new column
df = df.reset_index(drop=True)

In [None]:
# print a summary of the cleaned frame to check the result before saving
df.info()

In [None]:
# save the dataframe to a csv file
# NOTE(review): 'authors' and the tokenized/normalized columns hold Python lists at this
# point, so they end up in the csv as text — confirm downstream readers parse them back
file_path = 'database/cleaned_pub_info_2016_2022.csv'
df.to_csv(file_path)