# Data Pre-Processing

In [1]:
%matplotlib inline

import ast

# filter warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

In [2]:
# Import custom class
# `%run -i` executes the script inside this notebook's own namespace, so any
# top-level names the script defines become available to the cells below.
# NOTE(review): presumably this is where `Scraper` comes from -- confirm in scraper.py.
%run -i '../src/helper/scraper.py'

In [3]:
# Instantiate the class
# The instance is used further down, where `scraper.search_comments(...)` is
# applied to the free-text `comments` column.
scraper = Scraper()

In [7]:
# Load the raw scrape (tab-separated file).
# arXiv identifiers such as "0704.0030" look numeric, so the column is pinned to
# text; otherwise type inference may turn ids into floats and drop their
# leading/trailing zeros.
# NOTE(review): the saved preview shows ids like "704.003", which suggests the
# zeros may already be missing in raw_data.csv itself -- confirm upstream.
df = pd.read_csv(
    '../data/raw_data.csv',
    sep='\t',
    dtype={'arXiv_id': str},
    low_memory=False,
)
print('Data size:', df.shape)
df.head()

Data size: (295174, 8)


Unnamed: 0,abstract,arXiv_id,authors,categories,comments,date_created,doi,title
0,We study the two-particle wave function of p...,704.0006,"[PongY. H., LawC. K.]",[cond-mat.mes-hall],"6 pages, 4 figures, accepted by PRA",2007-03-31,10.1103/PhysRevA.75.043613,Bosonic characters of atomic Cooper pairs acro...
1,A general formulation was developed to repre...,704.0008,[SwiftDamian C.],[cond-mat.mtrl-sci],Minor corrections,2007-03-31,10.1063/1.2975338,Numerical solution of shock and ramp compressi...
2,We present recent advances in understanding ...,704.0025,"[MishchenkoA. S.CREST, Japan Science and Techn...","[cond-mat.str-el, cond-mat.stat-mech]","41 pages, 13 figures, in ""Polarons in Advanced...",2007-04-02,10.1007/978-1-4020-6348-0_12,Spectroscopic Properties of Polarons in Strong...
3,We describe a peculiar fine structure acquir...,704.0027,"[GoerbigM. O., FuchsJ. -N., KechedzhiK., Fal'k...",[cond-mat.mes-hall],"4 pages, 2 figures; mistakes due to an erroneo...",2007-04-02,10.1103/PhysRevLett.99.087402,Filling-Factor-Dependent Magnetophonon Resonan...
4,We investigate the effect of tuning the phon...,704.003,"[HagueJ. P., d'AmbrumenilN.]",[cond-mat.str-el],"Reprint to improve access. 13 pages, 6 figures.",2007-03-31,10.1007/s10909-005-6013-6,Tuning correlation effects with electron-phono...


In [8]:
# Column dtypes and non-null counts of the freshly loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 295174 entries, 0 to 295173
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   abstract      295174 non-null  object
 1   arXiv_id      295174 non-null  object
 2   authors       295174 non-null  object
 3   categories    295174 non-null  object
 4   comments      240949 non-null  object
 5   date_created  295174 non-null  object
 6   doi           234381 non-null  object
 7   title         295174 non-null  object
dtypes: object(8)
memory usage: 18.0+ MB


In [10]:
# extract number of authors
def count_authors(value):
    """Return how many authors one cell of the ``authors`` column lists.

    After ``pd.read_csv`` the column holds the *text* of a list, so calling
    ``len`` on a cell would count characters rather than authors. The cell is
    therefore parsed first.

    Parameters
    ----------
    value : str, list or tuple
        A single ``authors`` cell.

    Returns
    -------
    int
        Number of entries in the list; 0 for an empty list or for a value
        that is neither text nor a sequence.
    """
    if isinstance(value, (list, tuple)):
        # Already a real sequence (e.g. the frame was not loaded from CSV).
        return len(value)
    if not isinstance(value, str):
        return 0

    text = value.strip()

    # Case 1: a Python list literal with quoted names, e.g. "['A', 'B']".
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return len(parsed)

    # Case 2: unquoted text such as "[PongY. H., LawC. K.]" -- drop the
    # brackets and count the comma-separated entries.
    # NOTE(review): entries that embed an affiliation containing a comma are
    # over-counted by this fallback -- verify against the raw data format.
    if text.startswith('[') and text.endswith(']'):
        text = text[1:-1]
    return sum(1 for part in text.split(',') if part.strip())


df['num_of_authors'] = df['authors'].apply(count_authors)

In [None]:
# Character count of each abstract
df['length_of_abstr'] = df['abstract'].map(len)

In [12]:
# Parse the creation date into a proper datetime column ...
df['date_created'] = pd.to_datetime(df['date_created'])

# ... and derive the calendar features from it:
# numeric year, month name and weekday name.
df['year'] = df['date_created'].dt.year
df['month'] = df['date_created'].dt.month_name()
df['day_of_week'] = df['date_created'].dt.day_name()

In [None]:
# Paper age in years, measured against a fixed, named reference year instead of
# an unexplained literal. Change REFERENCE_YEAR to re-base the ages.
REFERENCE_YEAR = 2020
df['p_age'] = REFERENCE_YEAR - df['year']  # paper's age

In [13]:
# Re-inspect dtypes and non-null counts after the feature-engineering cells.
# NOTE(review): the saved output lists 12 columns and no `length_of_abstr` or
# `p_age`, so it appears to predate those cells -- re-run top to bottom to refresh.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 295174 entries, 0 to 295173
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   abstract        295174 non-null  object        
 1   arXiv_id        295174 non-null  object        
 2   authors         295174 non-null  object        
 3   categories      295174 non-null  object        
 4   comments        240949 non-null  object        
 5   date_created    295174 non-null  datetime64[ns]
 6   doi             234381 non-null  object        
 7   title           295174 non-null  object        
 8   num_of_authors  295174 non-null  int64         
 9   year            295174 non-null  int64         
 10  month           295174 non-null  object        
 11  day_of_week     295174 non-null  object        
dtypes: datetime64[ns](1), int64(2), object(9)
memory usage: 27.0+ MB


In [None]:
# replace None values with np.nan
# `np` is provided by the import cell at the top of the notebook.
# Only these two columns contain missing entries according to `df.info()`.
sparse_cols = ['comments', 'doi']
df[sparse_cols] = df[sparse_cols].replace({None: np.nan})

In [14]:
# Substitute a single space for every remaining missing value so that the
# text columns hold strings only.
df = df.fillna(value=" ")

In [15]:
# extract number of pages and figures
# Each comment is parsed once and the result reused for both columns, rather
# than calling the scraper twice per row. Iterating over the values also avoids
# label-based lookups, which only work while the index is the default 0..n-1.
# NOTE(review): assumes `search_comments` returns the same result for the same
# input, with pages at position 0 and figures at position 1 -- confirm in scraper.py.
parsed_comments = [scraper.search_comments(comment) for comment in df['comments']]
df['num_of_pages'] = [parsed[0] for parsed in parsed_comments]
df['num_of_figures'] = [parsed[1] for parsed in parsed_comments]

In [None]:
# Initialize the citation count of every paper to 0
df = df.assign(citation_count=0)

In [17]:
# Persist the enriched frame as a tab-separated file, without the row index
df.to_csv(
    path_or_buf='../data/raw_data2.csv',
    sep='\t',
    index=False,
)