# Edge List and Data Cleaning

In [1]:
%matplotlib inline

# ignore deprecated warning
import warnings

warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from pyspark import sql, SparkContext, SparkConf
from pyspark.sql.functions import *

In [2]:
# create spark session
spark = SparkSession.builder.appName("project").getOrCreate()

In [4]:
# import custom class
%run '../src/text_preprocessing.py'

In [5]:
# instantiate the class
text_tf = TextPreprocessing() 

## Load raw data

In [7]:
base_path = ' '
df = pd.read_csv(base_path + "/nlp-text-analytics/citation-count-prediction/data/data_processed.csv", 
sep = '\t'
)
df = df[list(df.columns)].astype(str) # convert to string

cols = [
    'arXiv_id', 
    'authors', 
    'title',
]
df = df[cols].sample(frac=1) # shuffle the data
print("Data size:", df.shape)
df.head()

Data size: (295174, 3)


Unnamed: 0,arXiv_id,authors,title
149884,1709.0458,"['JeongY. H.', 'YangS. -R. Eric']",Topological end states and Zak phase of rectan...
82141,1308.4277,"['AcharyyaMuktishPresidency University, India']",Dynamic-Symmetry-Breaking Breathing and Spread...
141516,1703.07292,"['WangLin-Lin', 'JoNa Hyun', 'WuYun', 'WuQuanS...",Phonon-Induced Topological Transition to a Typ...
144602,1705.08898,"['MilletariMirco', 'OffidaniManuel', 'Ferreira...",Covariant conservation laws and the spin Hall ...
20792,902.4176,"['FischerA. M.', 'DzyubenkoA. B.', 'RoemerR. A.']",Localized Collective Excitations in Doped Grap...


The authors column is a string representation of a list, which needs to be converted to a list of strings (authors).

In [8]:
df.authors = df.authors.apply(text_tf.string_to_list)

## Edge list dataframe for network analytics
Paper-author edge list dataframe to be used in network analytics

In [9]:
# list of papers and authors
paper_author =[(df.arXiv_id.iloc[i], df.authors.iloc[i]) for i in np.arange(df.shape[0])] 

In [10]:
paper_author[10:12]

[('806.0532',
  ['JiaYing',
   'ChengPeng',
   'FangLei',
   'LuoHuiqian',
   'YangHuan',
   'RenCong',
   'ShanLei',
   'GuChangzhi',
   'WenHai-Hu']),
 ('1907.135',
  ['SasamaYosuke',
   'KomatsuKatsuyoshi',
   'MoriyamaSatoshi',
   'ImuraMasataka',
   'SugiuraShiori',
   'TerashimaTaichi',
   'UjiShinya',
   'WatanabeKenji',
   'TaniguchiTakashi',
   'UchihashiTakashi',
   'TakahideYamaguchi'])]

In [11]:
# create edge list dataframe
pdf_prenet = pd.concat(
    [
        pd.DataFrame(
        {
            'arXiv_id': paper_id, 
            'author': author_nm, 
        }
    ) 
     for paper_id, author_nm in paper_author
    ],
    ignore_index = True
)

In [12]:
pdf_prenet.head()

Unnamed: 0,arXiv_id,author
0,1709.0458,JeongY. H.
1,1709.0458,YangS. -R. Eric
2,1308.4277,"AcharyyaMuktishPresidency University, India"
3,1703.07292,WangLin-Lin
4,1703.07292,JoNa Hyun


In [13]:
# sort by length of author's name
pdf_prenet['length_auth'] = pdf_prenet['author'].apply(len)
pdf_prenet.sort_values(by = 'length_auth', ascending=True, inplace=True)
pdf_prenet.reset_index(drop=True, inplace=True)

In [14]:
print("Data size:", pdf_prenet.shape)
pdf_prenet.head(10)

Data size: (1148170, 3)


Unnamed: 0,arXiv_id,author,length_auth
0,cond-mat/0602237,.,1
1,808.2421,.,1
2,cond-mat/0307090,.,1
3,cond-mat/0310186,;,1
4,cond-mat/0106319,.,1
5,cond-mat/9912306,.,1
6,cond-mat/9607120,.,1
7,1011.4227,3,1
8,cond-mat/9705197,.,1
9,cond-mat/0211501,.,1


In [15]:
pdf_prenet.tail(10)

Unnamed: 0,arXiv_id,author,length_auth
1148160,2009.03177,CaiSonghuaNational Laboratory of Solid State M...,315
1148161,1509.00973,TanakaIsaoDepartment of Materials Science and ...,316
1148162,1712.01738,SchubertMathiasTerahertz Materials Analysis Ce...,319
1148163,2001.08494,"DekkerRiande IInstitut für Physik, Johannes Gu...",320
1148164,2006.15161,PalaciosPabloDepartamento de Física Aplicada a...,330
1148165,2006.14689,RobinsonJoshua A.Department of Materials Scien...,333
1148166,2009.01111,CrespiVincent H.Department of Materials Scienc...,359
1148167,2001.06664,PeiYujuanNingbo Institute of Materials Technol...,365
1148168,1801.0196,PanNanHefei National Laboratory for Physical S...,398
1148169,1801.0196,WangXiaopingHefei National Laboratory for Phys...,404


## Text preprocessing

The edge list dataframe suggests further cleaning as some authors' names appear with the corresponding affiliation institutions, while others names are just numbers, special characters, and punctuations. The following preprocessing steps will be performed on the edge list dataframe:

- Remove author's name with length < 3.
- Split authors' names at white space followed by period puntuation mark.
- Identify other words to split on, such as Department, University, etc.
- Remove special characters and punctuations from author's name.
- Identify the same author with two or more different names and normalize the names.

In [19]:
df_pre = pdf_prenet.copy()

In [21]:
# filter out author's name with length < 3
df_pre = df_pre.copy()[df_pre.length_auth > 2]
df_pre.head(10)

Unnamed: 0,arXiv_id,author,length_auth
69,1601.04103,Xia,3
70,1304.6025,XuX,3
71,cond-mat/9412076,IPN,3
72,1302.4791,WuW,3
73,1711.00869,Lee,3
74,1908.00124,Xia,3
75,2004.02433,WuX,3
76,903.0674,NiN,3
77,1805.05471,Fan,3
78,1901.01682,Xia,3


In [75]:
df_pre['length_auth'] = df_pre['author'].apply(len)
df_pre.sort_values(by = 'length_auth', ascending=True, inplace=True)
df_pre.reset_index(drop=True, inplace=True)
df_pre.tail(10)

Unnamed: 0,arXiv_id,author,length_auth
249283,1705.06117,BergerHelmuthEcole Polytechnique Fédérale de L...,82
249284,1909.12290,GallmeyerThomas G.Alliance for the Development...,82
249285,1906.07641,FerraraEnzoDivisione di Metrologia dei Materia...,82
249286,1807.04273,"KolenderskiPiotrFaculty of Physics, Astronomy ...",83
249287,1906.07641,CoïssonMarcoDivisione di Metrologia dei Materi...,83
249288,1612.06301,LecouturierF.Laboratoire National des Champs M...,83
249289,1705.06165,GhiringhelliG.Dip. di Fisica - Politecnico di ...,83
249290,1705.06165,De LucaG. M.Dip. di Fisica - U. di Napoli Fede...,84
249291,cond-mat/0608084,"BertrandDamienCP3, UCL, Louvain-la-Neuve, Belg...",84
249292,1909.12290,MoorthySenthamilaruviAlliance for the Developm...,85


In [76]:
# splitting is done by manually selecting different strings to split Thailand 
df_pre.author = df_pre.author.apply(text_tf.split_extract, args=("1909.12290",))

In [169]:
# select authors with 2 < length < 20
df_pre = df_pre[(df_pre.length_auth>2)&(df_pre.length_auth<20)]
print("Data size:", pdf_prenet.shape)
df_pre.tail()

Data size: (1148170, 5)


Unnamed: 0,arXiv_id,author,rsp,year,length_auth
1118796,2003.05713,Caroca-CanalesNubia,1,2020,19
1118797,802.4419,MunarrizJavierBIFI-,1,2008,19
1118798,1211.7265,WeidemüllerMatthias,1,2012,19
1118799,2006.12181,RadenovicAleksandra,1,2020,19
1118800,904.0659,KulakowskiKrzysztof,1,2009,19


In [170]:
# save as csv
df_pre.to_csv('../data/in_progress.csv', sep = '\t', index = False)

In [171]:
# df_pre.author = df_prepro.author.apply(text_tf.tokenizer)

In [172]:
# df_pre[df_prepro.author == 'SankaranarayananS. K. R. S.']
# df_pre.author.replace({
# 'SankaranarayananS. K. R. S.':'SankaranarayananSubramanian K. R. S.'
# },
# inplace = True
# )

In [173]:
# create Spark dataframe
sdf = spark.createDataFrame(df_pre)
sdf.createOrReplaceTempView('edge_list')

In [174]:
spark.sql(
    """select distinct author 
    from edge_list 
    where author like 'Loss%' 
    """
    ).show(50)

+--------------+
|        author|
+--------------+
|    LossDaniel|
|LossDanielUniv|
|   LossDanielU|
|         LossM|
| LossDanielIBM|
|   LossMichael|
+--------------+



In [175]:
# Author's name normalization
auth_dict = {
    'WatanabeK':'WatanabeKenji', 
    'TaniguchiT': 'TaniguchiTakashi', 
    'LossD':'LossDaniel', 
    'LossDanielBasel':'LossDaniel',
    'LossDanielBasel,':'LossDaniel', 
    'LossDanielUniv,':'LossDaniel',
    'LossDanielU,':'LossDaniel', 
    'LossDanielIBM,':'LossDaniel', 
    'OwerreSolomon A': 'OwerreS A', 
    'OwerreSolomon Akaraka': 'OwerreS A',
     'BiWenli': 'BiW',
    'DasSarmaS': 'SarmaS Das',
    }
df_pre.author.replace(auth_dict, inplace=True)

In [176]:
# save as csv
df_pre.to_csv('../data/net_data.csv', sep='\t', index=False)