# Construction of Daily Time Series Data

In [1]:
# ignore deprecated warning
import warnings
warnings.filterwarnings("ignore")

# data manipulation
import pandas as pd
import numpy as np

# set font scale and style
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 18})

# Load raw data

In [2]:
df = pd.read_csv('../data/raw_data.csv', sep ='\t') 
print("Data size:", df.shape)
df.head()

Data size: (139361, 8)


Unnamed: 0,abstract,arXiv_id,authors,categories,comments,date_created,doi,title
0,A nonperturbative renormalization of the phi...,704.0142,['KaupuzsJ.'],cond-mat,"11 pages, no figures. This version is consiste...",2007-04-02,10.1142/S0217979209054697,Some aspects of the nonperturbative renormaliz...
1,We present a theory of transport through int...,704.0204,"['PalaMarco G.', 'GovernaleMichele', 'KönigJür...",cond-mat,"11 pages, 4 figures",2007-04-02,10.1088/1367-2630/9/8/278,Non-Equilibrium Josephson and Andreev Current ...
2,We review the algebraic construction of the ...,704.04,['BeisertNiklas'],cond-mat,"12 pages, contributed to the Solvay workshop ""...",2007-04-03,,The S-Matrix of AdS/CFT and Yangian Symmetry
3,According to extensive experimental findings...,704.0544,"['PatsahanO. V.', 'CaillolJ. -M.', 'MryglodI. ...",cond-mat,"23 pages, 8 figures",2007-04-04,10.1140/epjb/e2007-00247-7,Crossover behavior in fluids with Coulomb inte...
4,It is known that a subset of fractional quan...,704.057,"['HanssonHans', 'ChangChia-Chen', 'JainJainend...",cond-mat,"26 pages, 3 figures",2007-04-04,10.1103/PhysRevB.76.075347,Composite fermion wave functions as conformal ...


In [3]:
ts = df.groupby(['date_created', 'categories']).agg({'arXiv_id':'count'})
ts =  pd.DataFrame(ts).reset_index()

In [4]:
ts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21433 entries, 0 to 21432
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   date_created  21433 non-null  object
 1   categories    21433 non-null  object
 2   arXiv_id      21433 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 502.5+ KB


# Extract cyclic features

In [5]:
# Extract year, month, weekday
ts['date_created'] = pd.to_datetime(ts['date_created'])
ts['year'] = ts['date_created'].dt.year
ts['month'] = ts['date_created'].dt.month_name() 
ts['weekday'] = ts['date_created'].dt.day_name()

In [6]:
col = {'arXiv_id':'count'}
ts.rename(columns=col, inplace=True)
ts.index = ts.date_created
ts.drop(['date_created'], axis=1, inplace=True)

In [7]:
# change weekday to integer
wday_dict = {'Monday':1, 'Tuesday':2, 'Wednesday':3, 'Thursday':4, 
             'Friday':5,  'Saturday':6,   'Sunday':0
            }

# change weekday to weekend indicator
weekend_dict = {1:'false', 2:'false', 3:'false', 4:'false', 5:'false', 
                6:'true', 0:'true'}

# change month to integer
month_dict = {'January':1, 'February':2, 'March':3, 
              'April':4,  'May':5, 'June':6,  'July':7,  
              'August':8, 'September':9,'October':10, 
              'November':11, 'December':12
             }

# change month to season
sea_dict = {1:'winter', 2:'winter', 3:'spring', 
              4:'spring',  5:'spring', 6:'summer', 
            7:'summer',  8:'summer', 9:'fall',
            10:'fall', 11:'fall', 12:'winter'
             }

ts['weekday'].replace(wday_dict, inplace = True)
ts['month'].replace(month_dict, inplace = True)
ts['season'] = ts['month'].replace(sea_dict)
ts['weekend'] = ts['weekday'].replace(weekend_dict)

In [8]:
start_date, end_date = '2010-01-01', '2014-12-31'
cols = ['categories', 'year', 'month', 'weekday',
        'season', 'weekend', 'count']
ts = ts[cols][start_date:end_date]
ts

Unnamed: 0_level_0,categories,year,month,weekday,season,weekend,count
date_created,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-01-01,astro-ph,2010,1,5,winter,false,1
2010-01-01,cond-mat,2010,1,5,winter,false,4
2010-01-01,gr-qc,2010,1,5,winter,false,1
2010-01-01,hep-th,2010,1,5,winter,false,5
2010-01-02,astro-ph,2010,1,6,winter,true,4
...,...,...,...,...,...,...,...
2014-12-29,hep-th,2014,12,1,winter,false,6
2014-12-30,astro-ph,2014,12,2,winter,false,8
2014-12-30,cond-mat,2014,12,2,winter,false,5
2014-12-30,gr-qc,2014,12,2,winter,false,2


In [9]:
# save data as csv
ts.to_csv('../data/daily_ts_data.csv', index = True)