# Edge List and Data Cleaning

In [1]:
%matplotlib inline

# ignore deprecated warning
import warnings

warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from pyspark import sql, SparkContext, SparkConf
from pyspark.sql.functions import *

In [2]:
# create spark session
spark = SparkSession.builder.appName("project").getOrCreate()

In [3]:
# import custom class
%run '../src/helper/text_preprocessing.py'

In [4]:
# instantiate the class
text_tf = TextPreprocessing() 

## Load raw data

In [7]:
df = pd.read_csv("../data/raw_data.csv", sep = '\t')
df = df[list(df.columns)].astype(str) # convert to string

cols = [
    'arXiv_id', 
    'authors', 
    'title',
]
df = df[cols].sample(frac=1) # shuffle the data
print("Data size:", df.shape)
df.head()

Data size: (55021, 3)


Unnamed: 0,arXiv_id,authors,title
1951,1506.0263,"['KitanineN.', 'MailletJ. M.', 'NiccoliG.', 'T...",On determinant representations of scalar produ...
17098,1706.04636,"['AsmarMahmoud M.', 'UlloaSergio E.']",Minimal Geometry for Valley Filtering in Graphene
12842,1703.05622,"['ReechtGaël', 'HeinrichBenjamin', 'BulouHervé...",Imaging isodensity contours of molecular state...
43022,1812.088,"['KrammerMarkus', 'BorchertJames W.', 'Petritz...",Critical Evaluation of Organic Thin-Film Trans...
40552,1810.11296,"['DrescherMoritz', 'SalmhoferManfred', 'EnssTi...",Real-space dynamics of attractive and repulsiv...


The authors column is a string representation of a list, which needs to be converted to a list of strings (authors).

In [8]:
df.authors = df.authors.apply(text_tf.string_to_list)

## Edge list dataframe for network analytics
Paper-author edge list dataframe to be used in network analytics

In [9]:
# list of papers and authors
paper_author =[(df.arXiv_id.iloc[i], df.authors.iloc[i]) for i in np.arange(df.shape[0])] 

In [10]:
paper_author[10:12]

[('1711.01497',
  ['HuDing',
   'ZhangWenliang',
   'WeiYuan',
   'RoessliBertrand',
   'SkoulatosMarkos',
   'RegnaultLouis Pierre',
   'ChenGenfu',
   'SongYu',
   'LuoHuiqian',
   'LiShiliang',
   'DaiPengcheng']),
 ('1801.09957', ['PawłowskiJ.', 'ŻebrowskiD.', 'BednarekS.'])]

In [11]:
# create edge list dataframe
pdf_prenet = pd.concat(
    [
        pd.DataFrame(
        {
            'arXiv_id': paper_id, 
            'author': author_nm, 
        }
    ) 
     for paper_id, author_nm in paper_author
    ],
    ignore_index = True
)

In [12]:
pdf_prenet.head()

Unnamed: 0,arXiv_id,author
0,1506.0263,KitanineN.
1,1506.0263,MailletJ. M.
2,1506.0263,NiccoliG.
3,1506.0263,TerrasV.
4,1706.04636,AsmarMahmoud M.


In [14]:
# sort by length of author's name
pdf_prenet['length_auth'] = pdf_prenet['author'].apply(len)
pdf_prenet.sort_values(by = 'length_auth', ascending=True, inplace=True)
pdf_prenet.reset_index(drop=True, inplace=True)

In [15]:
print("Data size:", pdf_prenet.shape)
pdf_prenet.head(10)

Data size: (249297, 3)


Unnamed: 0,arXiv_id,author,length_auth
0,1901.06061,y,1
1,1703.08451,.,1
2,1901.11322,.,1
3,1909.11692,Q,1
4,1909.06555,W.,2
5,1704.0827,Xu,2
6,1901.10105,Xia,3
7,1601.04103,Xia,3
8,1802.08152,Luo,3
9,1711.00869,Lee,3


In [17]:
pdf_prenet.tail(10)

Unnamed: 0,arXiv_id,author,length_auth
249287,1705.08216,"StattAntoniaInstitut für Physik, Johannes Gute...",275
249288,1804.08427,"WehlingT. O.Institut für Theoretische Physik, ...",280
249289,cond-mat/0605674,"LazaridesN.Department of Physics, University o...",281
249290,1807.07206,"RabaniEranDepartment of Chemistry, University ...",285
249291,1901.00739,DyachkovS. A.Dukhov Research Institute of Auto...,291
249292,1802.08440,JohanssonBörjeDepartment of Materials Science ...,294
249293,cond-mat/0605674,"EleftheriouM.Department of Physics, University...",302
249294,1712.01738,SchubertMathiasTerahertz Materials Analysis Ce...,319
249295,1801.01960,PanNanHefei National Laboratory for Physical S...,398
249296,1801.01960,WangXiaopingHefei National Laboratory for Phys...,404


## Text preprocessing

The edge list dataframe suggests further cleaning as some authors' names appear with the corresponding affiliation institutions, while others names are just numbers, special characters, and punctuations. The following preprocessing steps will be performed on the edge list dataframe:

- Remove author's name with length = 1.
- Split authors' names at white space followed by period puntuation mark.
- Identify other words to split on, such as Department, University, etc.
- Remove special characters and punctuations from author's name.
- Identify the same author with two or more different names and normalize the names.

In [22]:
df_pre = pdf_prenet.copy()

In [23]:
# filter out author's name with length > 1
df_pre = df_pre.copy()[df_pre.length_auth > 1]
df_pre.head()

Unnamed: 0,arXiv_id,author,length_auth
4,1909.06555,W.,2
5,1704.0827,Xu,2
6,1901.10105,Xia,3
7,1601.04103,Xia,3
8,1802.08152,Luo,3


In [75]:
df_pre['length_auth'] = df_pre['author'].apply(len)
df_pre.sort_values(by = 'length_auth', ascending=True, inplace=True)
df_pre.reset_index(drop=True, inplace=True)
df_pre.tail(10)

Unnamed: 0,arXiv_id,author,length_auth
249283,1705.06117,BergerHelmuthEcole Polytechnique Fédérale de L...,82
249284,1909.12290,GallmeyerThomas G.Alliance for the Development...,82
249285,1906.07641,FerraraEnzoDivisione di Metrologia dei Materia...,82
249286,1807.04273,"KolenderskiPiotrFaculty of Physics, Astronomy ...",83
249287,1906.07641,CoïssonMarcoDivisione di Metrologia dei Materi...,83
249288,1612.06301,LecouturierF.Laboratoire National des Champs M...,83
249289,1705.06165,GhiringhelliG.Dip. di Fisica - Politecnico di ...,83
249290,1705.06165,De LucaG. M.Dip. di Fisica - U. di Napoli Fede...,84
249291,cond-mat/0608084,"BertrandDamienCP3, UCL, Louvain-la-Neuve, Belg...",84
249292,1909.12290,MoorthySenthamilaruviAlliance for the Developm...,85


In [76]:
# splitting is done by manually selecting different strings to split Thailand 
df_pre.author = df_pre.author.apply(text_tf.split_extract, args=("1909.12290",))

In [169]:
# select authors with 2 < length < 20
df_pre = df_pre[(df_pre.length_auth>2)&(df_pre.length_auth<20)]
print("Data size:", pdf_prenet.shape)
df_pre.tail()

Data size: (1148170, 5)


Unnamed: 0,arXiv_id,author,rsp,year,length_auth
1118796,2003.05713,Caroca-CanalesNubia,1,2020,19
1118797,802.4419,MunarrizJavierBIFI-,1,2008,19
1118798,1211.7265,WeidemüllerMatthias,1,2012,19
1118799,2006.12181,RadenovicAleksandra,1,2020,19
1118800,904.0659,KulakowskiKrzysztof,1,2009,19


In [170]:
# save as csv
df_pre.to_csv('../data/in_progress.csv', sep = '\t', index = False)

In [171]:
# df_pre.author = df_prepro.author.apply(text_tf.tokenizer)

In [172]:
# df_pre[df_prepro.author == 'SankaranarayananS. K. R. S.']
# df_pre.author.replace({
# 'SankaranarayananS. K. R. S.':'SankaranarayananSubramanian K. R. S.'
# },
# inplace = True
# )

In [173]:
# create Spark dataframe
sdf = spark.createDataFrame(df_pre)
sdf.createOrReplaceTempView('edge_list')

In [174]:
spark.sql(
    """select distinct author 
    from edge_list 
    where author like 'Loss%' 
    """
    ).show(50)

+--------------+
|        author|
+--------------+
|    LossDaniel|
|LossDanielUniv|
|   LossDanielU|
|         LossM|
| LossDanielIBM|
|   LossMichael|
+--------------+



In [175]:
# Author's name normalization
auth_dict = {
    'WatanabeK':'WatanabeKenji', 
    'TaniguchiT': 'TaniguchiTakashi', 
    'LossD':'LossDaniel', 
    'LossDanielBasel':'LossDaniel',
    'LossDanielBasel,':'LossDaniel', 
    'LossDanielUniv,':'LossDaniel',
    'LossDanielU,':'LossDaniel', 
    'LossDanielIBM,':'LossDaniel', 
    'OwerreSolomon A': 'OwerreS A', 
    'OwerreSolomon Akaraka': 'OwerreS A',
     'BiWenli': 'BiW',
    'DasSarmaS': 'SarmaS Das',
    }
df_pre.author.replace(auth_dict, inplace=True)

In [176]:
# save as csv
df_pre.to_csv('../data/net_data.csv', sep='\t', index=False)