In [1]:
import h5py
import pandas as pd
import numpy as np

In [2]:
# Open the page-view dataset (HDF5 format) in read-only mode and list its top-level keys.
# The dataset was downloaded from 'https://cloudstor.aarnet.edu.au/plus/s/wQbOswE7qi50mci'
# The handle `f` is left open on purpose: a later cell reads f['views'] from it.
f = h5py.File('wiki.views-only.hdf5', mode="r")
list(f)

['views']

##### Description of the above dataset
The data collection starts with the raw dump of the English Wikipedia, containing the full revision history of 17 million articles.
From this, the authors of the paper "Radflow: A Recurrent, Aggregated, and Decomposable Model for Networks of Time Series" collected daily view counts from 1 July 2015 to 30 June 2020 (1827 days).
They then removed articles with fewer than 100 average daily views in the final 140 days. This leaves 366,802 pages.
Therefore, the DataFrame `df` has 1827 columns (one per day) and 366,802 rows (one per page); each cell holds the number of views of that page on that day.

In [3]:
from datetime import datetime as dt, timedelta as td

# One pandas Timestamp per calendar day from 1 July 2015 through 30 June 2020
# (both endpoints included). This list is used as the column labels of the
# views DataFrame built in the next cell.
date = list(pd.date_range("2015-07-01", "2020-06-30"))

In [4]:
# Fetch the 'views' dataset from the open HDF5 file, read all of it into an
# in-memory NumPy array, and wrap it in a DataFrame whose columns are the dates.
dset = f['views']
views_matrix = dset[()]
df = pd.DataFrame(data=views_matrix, columns=date)
df

Unnamed: 0,2015-07-01,2015-07-02,2015-07-03,2015-07-04,2015-07-05,2015-07-06,2015-07-07,2015-07-08,2015-07-09,2015-07-10,...,2020-06-21,2020-06-22,2020-06-23,2020-06-24,2020-06-25,2020-06-26,2020-06-27,2020-06-28,2020-06-29,2020-06-30
0,1764,2943,1940,1598,1740,1867,1792,1891,1813,1619,...,4073,4213,4914,3851,3859,3732,3672,3750,3484,3399
1,6504,6896,7623,7016,7427,7475,7281,8286,7624,6579,...,4042,4137,4047,3773,3884,4090,3891,4217,3938,3943
2,807,798,704,546,608,836,880,842,843,929,...,661,777,795,644,704,620,556,604,748,731
3,6188,6047,5927,5765,5477,5638,5565,5788,6052,6179,...,2286,2369,2440,2187,2364,2298,2133,2346,2458,2427
4,4410,3222,2928,2972,3802,3636,3580,3309,3439,3792,...,4235,5201,4750,4654,4527,4448,4472,5198,5763,4373
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
366797,52,40,65,89,52,64,73,90,59,106,...,84,68,105,60,51,63,59,59,83,99
366798,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,477,498,486,448,431,378,403,464,426,387
366799,763,713,624,537,582,675,660,953,913,716,...,292,234,263,254,190,256,182,198,245,208
366800,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,1623,1443,1319,1285,1247,1324,1765,1623,1440,1261


In [5]:
# Note: a value of -1 denotes that the page did not exist on that day, while 0
# denotes that the page existed but had no views.
# Every -1 is replaced with 0 so that non-existent pages contribute nothing to
# the totals computed later.
# The replacement is a single vectorised call over the whole frame rather than
# a Python loop over each of the date columns; the resulting values are the same.
# NOTE(review): this builds a new frame before rebinding `df`, so the old and
# new frames briefly coexist in memory -- confirm that is acceptable for this data size.
df = df.replace(-1, 0)
df

Unnamed: 0,2015-07-01,2015-07-02,2015-07-03,2015-07-04,2015-07-05,2015-07-06,2015-07-07,2015-07-08,2015-07-09,2015-07-10,...,2020-06-21,2020-06-22,2020-06-23,2020-06-24,2020-06-25,2020-06-26,2020-06-27,2020-06-28,2020-06-29,2020-06-30
0,1764,2943,1940,1598,1740,1867,1792,1891,1813,1619,...,4073,4213,4914,3851,3859,3732,3672,3750,3484,3399
1,6504,6896,7623,7016,7427,7475,7281,8286,7624,6579,...,4042,4137,4047,3773,3884,4090,3891,4217,3938,3943
2,807,798,704,546,608,836,880,842,843,929,...,661,777,795,644,704,620,556,604,748,731
3,6188,6047,5927,5765,5477,5638,5565,5788,6052,6179,...,2286,2369,2440,2187,2364,2298,2133,2346,2458,2427
4,4410,3222,2928,2972,3802,3636,3580,3309,3439,3792,...,4235,5201,4750,4654,4527,4448,4472,5198,5763,4373
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
366797,52,40,65,89,52,64,73,90,59,106,...,84,68,105,60,51,63,59,59,83,99
366798,0,0,0,0,0,0,0,0,0,0,...,477,498,486,448,431,378,403,464,426,387
366799,763,713,624,537,582,675,660,953,913,716,...,292,234,263,254,190,256,182,198,245,208
366800,0,0,0,0,0,0,0,0,0,0,...,1623,1443,1319,1285,1247,1324,1765,1623,1440,1261


In [6]:
# Inspect the dtype of each date column after the replacement
# (the recorded output reports int32 for the columns it lists).
df.dtypes

2015-07-01    int32
2015-07-02    int32
2015-07-03    int32
2015-07-04    int32
2015-07-05    int32
              ...  
2020-06-26    int32
2020-06-27    int32
2020-06-28    int32
2020-06-29    int32
2020-06-30    int32
Length: 1827, dtype: object

In [7]:
# Total views across all pages for each day: summing along axis 0 collapses the
# page rows and leaves one total per date column.
sum_array = df.sum(axis=0).to_numpy()

In [8]:
# Write the processed dataset to 'wiki.csv': one row per day, holding the date
# and the summed view count for that day. The two single-column frames share
# the same default integer index, so they are placed side by side.
date_df = pd.DataFrame({'date': date})
sum_df = pd.DataFrame({'value': sum_array})
t_df = pd.concat([date_df, sum_df], axis=1)
t_df.to_csv('wiki.csv', index=False)