In [56]:
import json
import requests

import os
import pandas as pd

In [67]:
endpoint_legacy = 'https://wikimedia.org/api/rest_v1/metrics/legacy/pagecounts/aggregate/{project}/{access-site}/{granularity}/{start}/{end}'
endpoint_pageviews = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/{project}/{access}/{agent}/{granularity}/{start}/{end}'

# Common constants for Legacy and Pageviews
project_domain = 'en.wikipedia.org'
granularity = 'monthly'

# Constants for Legacy
access_desktop_legacy, access_mobile_legacy = 'desktop-site', 'mobile-site'
start_time_legacy = 2007120100 # December 2007
end_time_legacy = 2016080100 # Ending July 2016

# Constants for Pageviews
access_desktop_pageviews, access_mobileapp_pageviews, access_mobileweb_pageviews = 'desktop', 'mobile-app', 'mobile-web'
agent_pageviews = 'user'
start_time_pageviews = 2015070100 # July 2015
end_time_pageviews = 2021100100 # Ending September 2021

headers = {
    'User-Agent': 'https://github.com/StaceyWheeler',
    'From': 'swheele@uw.edu'
}

RAW_DATA_PATH = '../data/raw'
PROCESSED_DATA_PATH = '../data/processed'
VISUALIZATIONS_DATA_PATH = '../data/visualizations'

PROCESSED_CSV_FILE_NAME = 'en-wikipedia_traffic_200712-202109.csv'
# GRAPH_FILE_NAME = 'en-wikipedia_traffic_200712-202109.png'

In [58]:
def api_call(endpoint,parameters):
    call = requests.get(endpoint.format(**parameters), headers=headers)
    response = call.json()
    
    return response

In [59]:
def get_data():
    
    for access in [access_desktop_legacy, access_mobile_legacy]:
        
        parameters_legacy = {
                                'project': project_domain,
                                'access-site': access,
                                'granularity': granularity,
                                'start': start_time_legacy,
                                'end': end_time_legacy
                            }
        
        monthly_legacy = api_call(endpoint_legacy, parameters_legacy)
        
        legacy_file_name = f'pagecounts_{access}_{start_time_legacy}-{end_time_legacy}.json'
        file_path = os.path.join(RAW_DATA_PATH, legacy_file_name) 
        with open(file_path, 'w') as f:
            json.dump(monthly_legacy, f)
            
    for access in [access_desktop_pageviews, access_mobileapp_pageviews, access_mobileweb_pageviews]:
            
        parameters_pageviews = {
                                    'project': project_domain,
                                    'access': access,
                                    'agent': agent_pageviews,
                                    'granularity': granularity,
                                    'start': start_time_pageviews,
                                    'end': end_time_pageviews
                                }
            
        monthly_pageviews = api_call(endpoint_pageviews, parameters_pageviews)
        
        pageviews_file_name = f'pageviews_{access}_{start_time_pageviews}-{end_time_pageviews}.json'
        file_path = os.path.join(RAW_DATA_PATH, pageviews_file_name) 
        with open(file_path, 'w') as f:
            json.dump(monthly_pageviews, f)

In [60]:
get_data()

In [61]:
raw_files_paths

['pagecounts_mobile-site_2007120100-2016080100.json',
 'pagecounts_mobile-web_2015070100-2021100100.json',
 'pagecounts_desktop_2015070100-2021100100.json',
 'pagecounts_desktop-site_2007120100-2016080100.json',
 'pagecounts_mobile-app_2015070100-2021100100.json']

In [62]:
raw_files_paths = [x for x in os.listdir(RAW_DATA_PATH) if '.json' in x]

df_list = []
# print(raw_files_paths)
# print("-------")
for raw_files_path in raw_files_paths:
    with open(os.path.join(RAW_DATA_PATH, raw_files_path), 'r') as f:
      data = json.load(f)
#     print(raw_files_path)
    df = pd.json_normalize(data,['items'])
    df['Source'] = '_'.join(raw_files_path.split('_')[:2])
    
    df_list.append(df)

# Concat the data into a single df
df = pd.concat(df_list)

In [63]:
df

Unnamed: 0,project,access-site,granularity,timestamp,count,Source,access,agent,views
0,en.wikipedia,mobile-site,monthly,2014100100,3.091547e+09,pagecounts_mobile-site,,,
1,en.wikipedia,mobile-site,monthly,2014110100,3.027490e+09,pagecounts_mobile-site,,,
2,en.wikipedia,mobile-site,monthly,2014120100,3.278950e+09,pagecounts_mobile-site,,,
3,en.wikipedia,mobile-site,monthly,2015010100,3.485302e+09,pagecounts_mobile-site,,,
4,en.wikipedia,mobile-site,monthly,2015020100,3.091534e+09,pagecounts_mobile-site,,,
...,...,...,...,...,...,...,...,...,...
70,en.wikipedia,,monthly,2021050100,,pageviews_mobile-app,mobile-app,user,166485079.0
71,en.wikipedia,,monthly,2021060100,,pageviews_mobile-app,mobile-app,user,150704624.0
72,en.wikipedia,,monthly,2021070100,,pageviews_mobile-app,mobile-app,user,161461155.0
73,en.wikipedia,,monthly,2021080100,,pageviews_mobile-app,mobile-app,user,161381193.0


In [64]:
df['views'] = df['views'].combine_first(df['count'])
df.drop(columns=['project', 'granularity', 'count', 'access-site', 'access', 'agent'], inplace=True)
df = pd.pivot_table(df, values='views', columns='Source', index=['timestamp']).reset_index()

df

Source,timestamp,pagecounts_desktop,pagecounts_desktop-site,pagecounts_mobile-app,pagecounts_mobile-site,pagecounts_mobile-web,pageviews_desktop,pageviews_mobile-app,pageviews_mobile-web
0,2007120100,,2.998332e+09,,,,,,
1,2008010100,,4.930903e+09,,,,,,
2,2008020100,,4.818394e+09,,,,,,
3,2008030100,,4.955406e+09,,,,,,
4,2008040100,,5.159162e+09,,,,,,
...,...,...,...,...,...,...,...,...,...
161,2021050100,2.824416e+09,,166485079.0,,4.810094e+09,2.824416e+09,166485079.0,4.810094e+09
162,2021060100,2.505971e+09,,150704624.0,,4.433806e+09,2.505971e+09,150704624.0,4.433806e+09
163,2021070100,2.765584e+09,,161461155.0,,4.617448e+09,2.765584e+09,161461155.0,4.617448e+09
164,2021080100,2.763414e+09,,161381193.0,,4.570813e+09,2.763414e+09,161381193.0,4.570813e+09


In [65]:
df['timestamp'] = pd.to_datetime(df['timestamp'], format='%Y%m%d%H')

df['year'] = df['timestamp'].dt.year
df['month'] = df['timestamp'].dt.month

df.fillna({'pagecounts_desktop-site':0, 
           'pagecounts_mobile-site':0,
           'pageviews_desktop':0,
           'pageviews_mobile-app':0,
           'pageviews_mobile-web':0}, inplace=True)

df['pageview_mobile_views'] = df['pageviews_mobile-app'].fillna(0) + df['pageviews_mobile-web'].fillna(0)
df['pageview_all_views'] = df['pageview_mobile_views'].fillna(0) + df['pageviews_desktop'].fillna(0)
df['pagecount_all_views'] = df['pagecounts_desktop-site'].fillna(0) + df['pagecounts_mobile-site'].fillna(0)

df.rename(columns={'pagecounts_desktop-site': 'pagecount_desktop_views', 
                   'pagecounts_mobile-site': 'pagecount_mobile_views',
                    'pageviews_desktop': 'pageview_desktop_views'}, inplace=True)
df = df[['year', 'month', 'pagecount_all_views', 'pagecount_desktop_views', 'pagecount_mobile_views', 'pageview_all_views', 'pageview_desktop_views', 'pageview_mobile_views']]

In [66]:
df

Source,year,month,pagecount_all_views,pagecount_desktop_views,pagecount_mobile_views,pageview_all_views,pageview_desktop_views,pageview_mobile_views
0,2007,12,2.998332e+09,2.998332e+09,0.0,0.000000e+00,0.000000e+00,0.000000e+00
1,2008,1,4.930903e+09,4.930903e+09,0.0,0.000000e+00,0.000000e+00,0.000000e+00
2,2008,2,4.818394e+09,4.818394e+09,0.0,0.000000e+00,0.000000e+00,0.000000e+00
3,2008,3,4.955406e+09,4.955406e+09,0.0,0.000000e+00,0.000000e+00,0.000000e+00
4,2008,4,5.159162e+09,5.159162e+09,0.0,0.000000e+00,0.000000e+00,0.000000e+00
...,...,...,...,...,...,...,...,...
161,2021,5,0.000000e+00,0.000000e+00,0.0,7.800996e+09,2.824416e+09,4.976580e+09
162,2021,6,0.000000e+00,0.000000e+00,0.0,7.090482e+09,2.505971e+09,4.584510e+09
163,2021,7,0.000000e+00,0.000000e+00,0.0,7.544494e+09,2.765584e+09,4.778909e+09
164,2021,8,0.000000e+00,0.000000e+00,0.0,7.495608e+09,2.763414e+09,4.732194e+09


In [68]:
processed_file_path = os.path.join(PROCESSED_DATA_PATH, PROCESSED_CSV_FILE_NAME)
df.to_csv(processed_file_path, index=False)