# Data Cleaning

In [75]:
# Ignore deprecated warning
import warnings
warnings.filterwarnings("ignore")

# Data manipulation
%matplotlib inline
import pandas as pd
import numpy as np
import string
import nltk
import ast

# Pyspark modules
from pyspark.sql import SparkSession
from pyspark import sql, SparkContext, SparkConf
from pyspark.sql.functions import *

In [76]:
# Create spark session
spark = SparkSession.builder.appName("project").getOrCreate()

# Load raw data

In [77]:
base_path = '/Users/sowerre/Documents/Python/ML-projects/arXiv_papers'
df = pd.read_csv(base_path + "/citation_prediction/data/data_processed.csv", sep = '\t')

df = df[list(df.columns)].astype(str) # convert to string

print("Data size:", df.shape)
cols = ['arXiv_id', 'year','authors', 'title']
df[cols].sample(frac = 1).head()

Data size: (295174, 20)


Unnamed: 0,arXiv_id,year,authors,title
202110,2007.12345,2020,"['KumarParveen', 'SnizhkoKyrylo']","Comment on ""How to observe and quantify quantu..."
271858,cond-mat/9604036,1996,"['FavandJulien', 'MilaFrédéric']",Theory of the optical conductivity of (TMTSF)$...
271328,cond-mat/9601031,1996,"['DotsenkoA. V.', 'SushkovO. P.']",Temperature dependence of the electron chemica...
144754,1705.10025,2017,"['TangGaomin', 'ChenXiaobin', 'RenJie', 'WangJ...",Rectifying full-counting statistics in a spin ...
220627,cond-mat/0111331,2001,"['LefèvreA.', 'DeanD. S.']",Phase transitions in the steady state behavior...


# Preprocessing of raw data
The authors column is a string representation of a list, which needs to be converted to a list of authors

In [78]:
def string_to_list(x):
    """
    Convert string representation of list to a list
    
    Parameter
    ---------
    x: string representation of list. E.g. x = '[ "a","b","c" ]'
    
    Returns
    -------
    list of strings. E.g. x = ["a","b","c"]
    
    """
    x = ast.literal_eval(x)
    return x

In [79]:
df.authors = df.authors.apply(string_to_list)

# Edgelist dataframe
Paper-author edgelist dataframe to be used in network analytics

In [80]:
# list of papers & authors
paper_author_list =[] 
for i in np.arange(len(df)):
    content =(df.arXiv_id.iloc[i], df.authors.iloc[i])
    paper_author_list.append(content)

In [84]:
paper_author_list[10:12]

[('704.007', ['YanzhangHe', 'ChengguangBao']),
 ('704.0082', ['KurosakiTetsuo', 'WadatiMiki'])]

In [85]:
# create edge list dataframe
pdf_net = pd.concat(
    [pd.DataFrame({'arXiv_id':paper_author_list[i][0], 
                   'author':paper_author_list[i][1], 
                   'rsp': 1}) 
     for i, val in enumerate(paper_author_list)
    ],
    ignore_index = True
)

In [86]:
# add year posted to the edgelist
pdf_net = pdf_net.merge(df[['arXiv_id', 'year']], how = 'left', on = ['arXiv_id'])

In [87]:
# sort by length of author's name
pdf_net['length_auth'] = pdf_net['author'].apply(len)
pdf_net.sort_values(by = 'length_auth', ascending = True, inplace = True)
pdf_net.reset_index(drop=True, inplace = True)
print("Data size:", pdf_net.shape)
pdf_net.head(10)

Data size: (1148170, 5)


Unnamed: 0,arXiv_id,author,rsp,year,length_auth
0,1311.0064,2,1,2013,1
1,cond-mat/0602531,.,1,2006,1
2,cond-mat/0307090,.,1,2003,1
3,1204.6071,1,1,2012,1
4,1006.1851,.,1,2010,1
5,1002.3276,.,1,2010,1
6,cond-mat/0512080,.,1,2005,1
7,710.0054,;,1,2007,1
8,1611.04355,a,1,2016,1
9,1909.11692,Q,1,2019,1


In [88]:
pdf_net.tail(10)

Unnamed: 0,arXiv_id,author,rsp,year,length_auth
1148160,1409.0152,"CorredorL. T.Departamento de Física, Universid...",1,2014,315
1148161,1509.00973,TanakaIsaoDepartment of Materials Science and ...,1,2015,316
1148162,1712.01738,SchubertMathiasTerahertz Materials Analysis Ce...,1,2017,319
1148163,2001.08494,"DekkerRiande IInstitut für Physik, Johannes Gu...",1,2020,320
1148164,2006.15161,PalaciosPabloDepartamento de Física Aplicada a...,1,2020,330
1148165,2006.14689,RobinsonJoshua A.Department of Materials Scien...,1,2020,333
1148166,2009.01111,CrespiVincent H.Department of Materials Scienc...,1,2020,359
1148167,2001.06664,PeiYujuanNingbo Institute of Materials Technol...,1,2020,365
1148168,1801.0196,PanNanHefei National Laboratory for Physical S...,1,2018,398
1148169,1801.0196,WangXiaopingHefei National Laboratory for Phys...,1,2018,404


# Preprocessing of network data

The dataframe above suggests further cleaning as some author's names appear with the corresponding affiliation institutions, and others names are just numbers, special characters, and punctuations. The former will be normalized and the latter will be removed.  The following preprocessing steps will be performed on the edgelist dataframe:

1. Split author's names at white space followed by period puntuation mark.
2. Identify other words to split on, such as Department, University, etc.
3. Remove special characters and punctuations from author's name
4. Remove author's name with length < 3
5. Identify the same author with two or more different name formats, and normalize the names

In [89]:
df_pro = pdf_net.copy()

In [90]:
def tokenizer(text):
    """
    tokenizer removes special characters and punctuations

    Parameters
    ----------
    text: a string of texts or sentences

    Returns
    ----------
    text without special characters and punctuations 
    """
    symbols = string.punctuation + '0123456789\n'
    nospe_char = [char for char in text if char not in symbols]
    nospe_char = ''.join(nospe_char)
    return nospe_char

In [91]:
def split_extract(text, split_on = None):
    """
    Split text and extract the first element.
    
    Parameters
    ----------
    text: a string of texts or sentences
    split_on: string to split on

    Returns
    ----------
    first element in text
    """
    text = text.split(split_on)
    return text[0]

In [165]:
df_pro.author = df_pro.author.apply(split_extract, args = (" ",))

In [168]:
df_pro['length_auth'] = df_pro['author'].apply(len)
df_pro.sort_values(by = 'length_auth', ascending = True, inplace = True)
df_pro.reset_index(drop=True, inplace = True)
df_pro.tail(10)

Unnamed: 0,arXiv_id,author,rsp,year,length_auth
1118791,1208.4232,ManjavacasAlejandro,1,2012,19
1118792,1202.4394,SuzukiNorikazuNihon,1,2012,19
1118793,1902.09249,LiermannHanns-Peter,1,2019,19
1118794,cond-mat/0506088,BhattacharyaSomnath,1,2005,19
1118795,1207.2788,Gomez-GardeñesJesus,1,2012,19
1118796,2003.05713,Caroca-CanalesNubia,1,2020,19
1118797,802.4419,MunarrizJavierBIFI-,1,2008,19
1118798,1211.7265,WeidemüllerMatthias,1,2012,19
1118799,2006.12181,RadenovicAleksandra,1,2020,19
1118800,904.0659,KulakowskiKrzysztof,1,2009,19


In [169]:
# select authors with 2 < length < 20
df_pro = df_pro[(df_pro.length_auth>2)&(df_pro.length_auth<20)]
print("Data size:", pdf_net.shape)
df_pro.tail()

Data size: (1148170, 5)


Unnamed: 0,arXiv_id,author,rsp,year,length_auth
1118796,2003.05713,Caroca-CanalesNubia,1,2020,19
1118797,802.4419,MunarrizJavierBIFI-,1,2008,19
1118798,1211.7265,WeidemüllerMatthias,1,2012,19
1118799,2006.12181,RadenovicAleksandra,1,2020,19
1118800,904.0659,KulakowskiKrzysztof,1,2009,19


In [170]:
# save as csv
df_pro.to_csv('../data/in_progress.csv', sep = '\t', index = False)

In [171]:
# df_pro.author = df_pro.author.apply(tokenizer)

In [172]:
# df_pro[df_pro.author == 'SankaranarayananS. K. R. S.']
# df_pro.author.replace({'SankaranarayananS. K. R. S.':'SankaranarayananSubramanian K. R. S.'},
# inplace = True)

In [173]:
# create Spark dataframe
sdf = spark.createDataFrame(df_pro)
sdf.createOrReplaceTempView('table')

In [174]:
spark.sql(" select distinct author from table where author like 'Loss%' ").show(50)

+--------------+
|        author|
+--------------+
|    LossDaniel|
|LossDanielUniv|
|   LossDanielU|
|         LossM|
| LossDanielIBM|
|   LossMichael|
+--------------+



In [175]:
# Author's name normalization
auth_dict = {'WatanabeK':'WatanabeKenji', 'TaniguchiT': 'TaniguchiTakashi', 
             'LossD':'LossDaniel', 'LossDanielBasel':'LossDaniel',
             'LossDanielBasel,':'LossDaniel', 'LossDanielUniv,':'LossDaniel',
             'LossDanielU,':'LossDaniel', 'LossDanielIBM,':'LossDaniel', 
             'OwerreSolomon A': 'OwerreS A', 
             'OwerreSolomon Akaraka': 'OwerreS A', 'BiWenli': 'BiW',
             'DasSarmaS': 'SarmaS Das'
            }
df_pro.author.replace(auth_dict, inplace = True)

In [176]:
# save as csv
df_pro.to_csv('../data/network_data.csv', sep = '\t', index = False)