# Data Cleaning

In [1]:
# Ignore deprecated warning
import warnings
warnings.filterwarnings("ignore")

# Data manipulation
%matplotlib inline
import pandas as pd
import numpy as np
import ast
import string

# Pyspark modules
from pyspark.sql import SparkSession
from pyspark import sql, SparkContext, SparkConf
from pyspark.sql.functions import *

In [2]:
# Create spark session
# getOrCreate() reuses an already running session if there is one
spark = (
    SparkSession.builder
    .appName("project")
    .getOrCreate()
)

# Load raw data

In [4]:
base_path = ''
# Read every column as text at parse time. Inferring dtypes first and casting
# to str afterwards can mangle identifiers that look numeric: a float parse
# drops the leading zero of '0907.2325' and the trailing zero of '1801.01960'.
# NOTE(review): ids such as '907.2325' and '1801.0196' show up downstream --
# confirm whether the CSV itself still holds the complete ids.
df = pd.read_csv(base_path + "/arXiv_papers/data/data_processed.csv", sep = '\t', dtype = str)

# Missing cells are still NaN after a text read; cast so every value is a string
df = df.astype(str)

print("Data size:", df.shape)
cols = ['arXiv_id', 'year','authors', 'title']
df[cols].sample(frac = 1).head()

Data size: (295174, 20)


Unnamed: 0,arXiv_id,year,authors,title
285558,cond-mat/9909037,1999,['de MelloE. V. L.'],Crossover behavior for complex order parameter...
92698,1404.7371,2014,"['SchulzFabian', 'DrostRobert', 'HämäläinenSam...",Epitaxial hexagonal boron nitride on Ir(111): ...
277004,cond-mat/9711270,1997,['MirandaJose A.Department of Physics-Carnegie...,Weakly nonlinear investigation of the Saffman-...
118575,1511.08186,2015,"['PanY.', 'NikitinA. M.', 'WuD.', 'HuangY. K.'...",Quantum oscillations of the topological surfac...
87601,1312.7058,2013,"['KumarPradeep', 'GharaS.', 'RajeswaranB.', 'M...","Temperature dependent magnetic, dielectric and..."


# Preprocessing of raw data
The `authors` column holds the string representation of a list; it needs to be converted into an actual list of authors.

In [5]:
def string_to_list(x):
    """
    Parse the text form of a Python list back into a list object.

    The text is evaluated with ``ast.literal_eval``, which accepts Python
    literals only.

    Parameter
    ---------
    x: text holding a list literal. E.g. x = '[ "a","b","c" ]'

    Returns
    -------
    the evaluated literal. E.g. ["a","b","c"] for the example above

    Raises
    ------
    ValueError or SyntaxError (from ``ast.literal_eval``) when x is not a
    valid Python literal.
    """
    return ast.literal_eval(x)

In [6]:
# Turn each stringified author list into a real Python list
df['authors'] = df['authors'].apply(string_to_list)

# Edgelist dataframe
A paper-author edge-list dataframe to be used in network analysis.

In [8]:
# list of papers & authors
# Pair every paper id with its author list, in the row order of df
paper_author_list = list(zip(df['arXiv_id'], df['authors']))

In [9]:
# create edge list dataframe: one row per (paper, author) pair
# Flatten all pairs first and build a single frame, instead of creating one
# small DataFrame per paper and concatenating all of them.
edge_rows = [
    (paper_id, author)
    for paper_id, authors in paper_author_list
    # a bare string counts as one author; iterating it would yield characters
    for author in ([authors] if isinstance(authors, str) else authors)
]
pdf_net = pd.DataFrame(edge_rows, columns = ['arXiv_id', 'author'])
pdf_net['rsp'] = 1

In [10]:
# Attach each paper's year to its edges; the left join keeps every edge row
pdf_net = pd.merge(
    pdf_net,
    df[['arXiv_id', 'year']],
    on = 'arXiv_id',
    how = 'left',
)

In [28]:
# Add the character length of every author string and order the edges by it,
# so the shortest entries come first and the longest last
pdf_net = (
    pdf_net
    .assign(length_auth = pdf_net['author'].apply(len))
    .sort_values(by = 'length_auth', ascending = True)
    .reset_index(drop = True)
)
print("Data size:", pdf_net.shape)
pdf_net.head(10)

Data size: (1148170, 5)


Unnamed: 0,arXiv_id,author,rsp,year,length_auth
0,1311.0064,2,1,2013,1
1,907.2325,.,1,2009,1
2,cond-mat/0306301,;,1,2003,1
3,1011.4227,3,1,2010,1
4,1710.05813,§,1,2017,1
5,cond-mat/9803184,.,1,1998,1
6,1703.08451,.,1,2017,1
7,cond-mat/0612307,;,1,2006,1
8,1004.1079,.,1,2010,1
9,1309.1017,-,1,2013,1


In [2878]:
# Longest author strings (the frame is sorted by length_auth)
pdf_net.tail(n = 10)

Unnamed: 0,arXiv_id,author,rsp,year,length_auth
1148160,1409.0152,"CorredorL. T.Departamento de Física, Universid...",1,2014,315
1148161,1509.00973,TanakaIsaoDepartment of Materials Science and ...,1,2015,316
1148162,1712.01738,SchubertMathiasTerahertz Materials Analysis Ce...,1,2017,319
1148163,2001.08494,"DekkerRiande IInstitut für Physik, Johannes Gu...",1,2020,320
1148164,2006.15161,PalaciosPabloDepartamento de Física Aplicada a...,1,2020,330
1148165,2006.14689,RobinsonJoshua A.Department of Materials Scien...,1,2020,333
1148166,2009.01111,CrespiVincent H.Department of Materials Scienc...,1,2020,359
1148167,2001.06664,PeiYujuanNingbo Institute of Materials Technol...,1,2020,365
1148168,1801.0196,PanNanHefei National Laboratory for Physical S...,1,2018,398
1148169,1801.0196,WangXiaopingHefei National Laboratory for Phys...,1,2018,404


# Preprocessing of network data

From the dataframe above, some authors appear together with their affiliated institutes, whereas other names are just numbers, special characters, or punctuation marks. The former will be normalized and the latter will be removed. The following preprocessing steps will be performed on the edgelist dataframe:

1. Split authors' names at the period punctuation mark.
2. Identify other words to split at, such as Department, University, etc.
3. Remove special characters and punctuation from authors' names.
4. Remove authors' names with length < 3.
5. Identify authors who appear under two or more different name formats, and normalize their names.

In [2890]:
# Work on an independent copy so pdf_net itself is left unchanged
df_pro = pdf_net.copy(deep = True)

In [2909]:
def split_extract(text, split_on = None):
    """
    Split text and extract the first element.

    Parameters
    ----------
    text: a string of texts or sentences
    split_on: string to split on. With the default None the text is split on
        runs of whitespace (the behaviour of ``str.split``)

    Returns
    ----------
    first element in text, i.e. everything before the first occurrence of
    split_on. An empty string is returned when there is nothing to extract
    (empty or whitespace-only text with split_on = None)

    Raises
    ----------
    ValueError (from ``str.split``) if split_on is an empty string
    """
    # Only the first piece is needed, so stop after the first separator
    pieces = text.split(split_on, 1)
    # str.split(None) yields [] for empty/blank text; indexing it would fail
    return pieces[0] if pieces else ''

def tokenizer(text):
    """
    Strip punctuation, digits and newlines from a piece of text.

    The characters dropped are those in ``string.punctuation`` (ASCII
    punctuation), the digits 0-9 and the newline character. Any other
    character, including spaces and non-ASCII symbols, is kept as is.

    Parameters
    ----------
    text: a string of texts or sentences

    Returns
    ----------
    text with the characters listed above removed
    """
    unwanted = string.punctuation + '0123456789\n'
    return ''.join(ch for ch in text if ch not in unwanted)

In [3212]:
# Keep only the part of each author string that precedes the marker "IBM"
# NOTE(review): the high execution count suggests this cell was re-run with
# other markers that are no longer recorded here -- confirm and list them.
df_pro['author'] = df_pro['author'].apply(split_extract, split_on = "IBM")

In [3213]:
# df_pro.author = df_pro.author.apply(tokenizer)

In [3214]:
# df_prepr[df_prepr.author == 'SankaranarayananS. K. R. S.']
# df_prepr.author.replace({'SankaranarayananS. K. R. S.':'SankaranarayananSubramanian K. R. S.'}, inplace = True)

In [3215]:
# save as csv
# Checkpoint of the partially cleaned edge list: tab-separated, index omitted
df_pro.to_csv(
    '../data/in_progress.csv',
    sep = '\t',
    index = False,
)

In [3216]:
# Recompute the author-string lengths after the split above, then re-sort
# the edges from shortest to longest author string
df_pro = (
    df_pro
    .assign(length_auth = df_pro['author'].apply(len))
    .sort_values(by = 'length_auth', ascending = True)
    .reset_index(drop = True)
)
# df_pro[df_pro.length_auth == 21]
# df_pro[(df_pro.length_auth > 2)&(df_pro.length_auth <= 20)]
# df_pro_10 = df_pro[df_pro.year >= '2010'] # from 2010
# df_pro_10[df_pro_10.length_auth == 35]

In [3217]:
# Keep author strings of 3 to 20 characters. The explicit copy makes df_net an
# independent frame, so later edits to it do not act on a slice of df_pro
# (which triggers SettingWithCopyWarning and may not take effect).
df_net = df_pro[(df_pro.length_auth > 2) & (df_pro.length_auth <= 20)].copy()
# # save as csv
# df_net.to_csv('../data/network_data.csv', sep = '\t', index = False)

In [3218]:
# Spark dataframe to facilitate sql queries
# The pandas frame is handed to Spark and registered under the SQL name 'table'
sdf = spark.createDataFrame(data = df_net)
sdf.createOrReplaceTempView(name = 'table')

In [3219]:
# df_pro[df_pro.author == 'LossDanielIBM']

Unnamed: 0,arXiv_id,author,rsp,year,length_auth


In [3221]:
# Show the distinct author strings starting with 'Taniguchi' (at most 50 rows)
(
    spark
    .sql(" select distinct author from table where author like 'Taniguchi%' ")
    .show(50)
)

+--------------------+
|              author|
+--------------------+
|          TaniguchiS|
|          TaniguchiH|
|     TaniguchiTakumi|
|     TaniguchiTakuya|
|    TaniguchiTakaaki|
|    TaniguchiNatsumi|
|    TaniguchiKeisuke|
|      TaniguchiJunko|
|   TaniguchiTakanori|
|     TaniguchiHiroki|
|    TaniguchiDaisuke|
|     TaniguchiHiromi|
|          TaniguchiT|
|  TaniguchiTakahashi|
|      TaniguchiTooru|
|          TaniguchiK|
|     TaniguchiHaruka|
|   TaniguchiYasutaka|
|    TaniguchiTakeshi|
|    TaniguchiTakashi|
|  TaniguchiToshifumi|
|          TaniguchiN|
|          TaniguchiM|
|   TaniguchiTomohiro|
|   TaniguchiMasateru|
|TaniguchiTakashiNIMS|
|       TaniguchiKoki|
|     TaniguchiMasaki|
|   TaniguchiNobuhiko|
+--------------------+



In [3222]:
# Author's name normalization
# Maps a variant spelling (key) to the spelling it is replaced with (value);
# only author strings that equal a key exactly are changed.
auth_dict = {'WatanabeK':'WatanabeKenji', 'TaniguchiT': 'TaniguchiTakashi', 
             'LossD':'LossDaniel', 'LossDanielBasel':'LossDaniel', 'OwerreSolomon A': 'OwerreS A', 
             'OwerreSolomon Akaraka': 'OwerreS A', 'BiWenli': 'BiW',
             'DasSarmaS': 'SarmaS Das'
            }
# df_net comes from filtering df_pro, so an in-place replace through the
# `author` attribute can act on a temporary object (SettingWithCopyWarning,
# or no effect at all under copy-on-write). Assign the replaced column back.
df_net = df_net.assign(author = df_net['author'].replace(auth_dict))

In [3223]:
# Count the distinct author strings left after normalization
unique_authors = df_net['author'].unique()
print('Number of unique authors:', len(unique_authors))

Number of unique authors: 243638


In [3224]:
# save as csv
# Final edge list: tab-separated, index omitted
df_net.to_csv(
    '../data/network_data.csv',
    sep = '\t',
    index = False,
)