In [1]:
import json, time, urllib.parse
import requests
import pandas as pd
import concurrent.futures
from collections import defaultdict

# Step 1 : Data Acquisition

#### Define API and Headers

In [2]:
# URL template for the Wikimedia per-article pageviews endpoint. Only
# `access_type` and `article` are substituted per request; the project
# (en.wikipedia.org), agent (user), granularity (monthly) and date range
# (20150701 to 20230930) are fixed in the template.
API = (
    'https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/'
    'en.wikipedia.org/{access_type}/user/{article}/monthly/20150701/20230930'
)

# Headers sent with every request.
HEADERS = {
    'User-Agent': (
        'sbutala@uw.edu, University of Washington, '
        'MSDS DATA 512 - AUTUMN 2023'
    ),
}

#### Parse the subset of articles which need to be processed

In [3]:
# CSV that lists the articles to process; titles are taken from its first column.
csv_file_path = './thank_the_academy.AUG.2023.csv - thank_the_academy.AUG.2023b.csv'

# Read only the first column as raw rows and drop row 0
# (assumes the file's first row is a header line — TODO confirm).
# The result is a list of one-element lists.
df = pd.read_csv(csv_file_path, usecols=[0], header=None).values.tolist()[1:]

# Convert titles to URL form (spaces -> underscores).
# - Blank cells are read by pandas as NaN floats and are skipped, since they
#   have no `.replace` and are not valid titles.
# - Repeated titles are collapsed: a duplicate would be requested twice and
#   its views would be summed twice by the later aggregation keyed on
#   (article, timestamp).
articles = sorted(
    {row[0].replace(' ', '_') for row in df if isinstance(row[0], str)},
    reverse=True,
)

#### Breaking down the logic into small reusable functions
- ```wiki_request``` : Sends the actual request to the Wikipedia API and returns the JSON response
- ```get_json``` : Calls wiki_request iteratively for all the articles
- ```store_json``` : Helper function to store the JSON object in a file
- ```generate_json``` : Takes a list of access types and returns the combined data


In [11]:
def wiki_request(access, article, timeout=30):
    """
    Make a Wikipedia API request for a specific article and access type.

    Parameters:
    access (str): The type of access (e.g., "desktop", "mobile-app", "mobile-web").
    article (str): The article title. It is percent-encoded (including "/")
        before being substituted into the URL.
    timeout (float): Seconds to wait for the server before giving up, so a
        stalled connection cannot hang the whole run.

    Returns:
    dict: JSON response from the API.

    Raises:
    requests.RequestException: On connection errors, timeouts, or an HTTP
        error status (4xx/5xx).
    """
    # safe='' also encodes "/" so titles containing slashes stay a single path segment.
    article_encoded = urllib.parse.quote(article, safe='')
    request_url = API.format(access_type=access, article=article_encoded)
    response = requests.get(request_url, headers=HEADERS, timeout=timeout)
    # Surface HTTP errors here with their status code instead of returning an
    # error payload that later fails with an opaque KeyError on 'items'.
    response.raise_for_status()
    return response.json()

def get_json(access, article_titles=None):
    """
    Retrieve JSON data for a list of articles and a specific access type.

    Parameters:
    access (str): The type of access (e.g., "desktop", "mobile-app", "mobile-web").
    article_titles (list | None): Article titles to fetch. When omitted, the
        module-level ``articles`` list is used.

    Returns:
    list: The ``items`` entries of every successful response, concatenated in
        request order. Articles whose request fails are reported on stdout
        and skipped.
    """
    if article_titles is None:
        # Fall back to the notebook-level list.
        article_titles = articles

    json_res = []
    for index, article in enumerate(article_titles):
        # Throttle: pause two seconds before each batch of 100 requests.
        if index % 100 == 0:
            time.sleep(2)
        try:
            res = wiki_request(access=access, article=article)
            json_res.extend(res['items'])
        except Exception as e:
            # Best effort: keep going, but say which article/access failed.
            print(f"Failed {article} ({access}): {e}")
    return json_res


def store_json(view_json, file_name):
    """
    Write JSON-serializable data to a file, pretty-printed with 4-space indent.

    Parameters:
    view_json (list): Data to serialize.
    file_name (str): Path of the output file; an existing file is overwritten.
    """
    # Serialize before opening the file, so a serialization error cannot
    # leave a truncated file behind.
    serialized = json.dumps(view_json, indent=4)
    with open(file_name, 'w') as handle:
        handle.write(serialized)


def generate_json(access, file_name):
    """
    Generate JSON data for multiple access types and store it in a file.

    Each access type is fetched by ``get_json`` on its own worker thread.
    Results are concatenated in the order the access types were given, so the
    returned list and the saved file are the same from run to run regardless
    of which thread finishes first.

    Parameters:
    access (list): List of access types (e.g., ["mobile-app", "mobile-web"]).
    file_name (str): Name of the output JSON file.

    Returns:
    list: Concatenated records for the given access types. An access type
        whose worker raises is reported on stdout and left out.
    """
    view_json = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Submit every access type up front so the fetches run concurrently.
        pending = [
            (access_type, executor.submit(get_json, access_type))
            for access_type in access
        ]

        # Collect in submission order (not completion order) for deterministic output.
        for access_type, future in pending:
            try:
                view_json.extend(future.result())
            except Exception as exc:
                print(f"Failed to retrieve data for {access_type}: {exc}")

    store_json(view_json, file_name)
    return view_json

Desktop view counts

In [None]:
# Fetch the desktop records for every article and save them to disk.
desktop = generate_json(
    access=['desktop'],
    file_name='academy_monthly_desktop_201507-202309.json',
)

Mobile (app and web) view counts combined

In [None]:
# Fetch mobile-app and mobile-web records. The raw (not yet summed) records are
# saved under the mobile file name rather than the cumulative one, so the
# cumulative file never holds mobile-only data. The pre-processing step later
# overwrites this file with the per-month mobile totals.
mobile = generate_json(
    ['mobile-app', 'mobile-web'],
    'academy_monthly_mobile_201507-202309.json',
)

Combine desktop and mobile results to get cumulative results

In [None]:
# Rebuild the merged list instead of extending `desktop` in place, so that
# re-running this cell does not append the mobile records a second time
# (which would inflate every total computed from this list).
desktop = [record for record in desktop if record["access"] == 'desktop'] + mobile

### Pre-Processing

Restructuring the data to make further analysis easier

In [None]:
def _as_combined_records(view_totals):
    """Turn a {(article, timestamp): views} mapping into a list of record dicts."""
    return [
        {
            "project": "en.wikipedia",
            "article": title,
            "granularity": "monthly",
            "timestamp": month,
            "access": "combined",
            "agent": "user",
            "views": total,
        }
        for (title, month), total in view_totals.items()
    ]


# Sum views per (article, timestamp): once over every record, and once over
# the records whose access type is not 'desktop'.
mobile_combined_views = defaultdict(int)
combined_views = defaultdict(int)
for record in desktop:
    key = (record["article"], record["timestamp"])
    count = record["views"]
    if record["access"] != 'desktop':
        mobile_combined_views[key] += count
    combined_views[key] += count

# Totals across all access types.
combined_json_objects = _as_combined_records(combined_views)
store_json(combined_json_objects, file_name='academy_monthly_cumulative_201507-202309.json')

# Totals across the non-desktop access types only.
mobile_combined_json_objects = _as_combined_records(mobile_combined_views)
store_json(mobile_combined_json_objects, file_name='academy_monthly_mobile_201507-202309.json')