In [9]:
# Data Acquisition for Wikipedia Rare Disease Articles

## 1. Import necessary libraries
import json
import os
import time
import urllib.parse
from datetime import datetime, timedelta

import pandas as pd
import requests

In [10]:
## 2. Define constants
# Base URL of the pageviews REST endpoint and the path template appended to it
# for per-article requests; the {placeholders} are filled from the parameter
# template defined further down.
API_REQUEST_PAGEVIEWS_ENDPOINT = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/'
API_REQUEST_PER_ARTICLE_PARAMS = 'per-article/{project}/{access}/{agent}/{article}/{granularity}/{start}/{end}'

# Pause inserted before each request: a 1/100 second budget per call, reduced
# by the latency we assume each call already costs.
API_LATENCY_ASSUMED = 0.002
API_THROTTLE_WAIT = (1.0/100.0) - API_LATENCY_ASSUMED

# Headers sent with every request (identifies the requester to the API).
REQUEST_HEADERS = dict([
    ('User-Agent', '<pgupta1@uw.edu>, University of Washington, MSDS DATA 512 - AUTUMN 2024'),
])

# Default values for every placeholder in API_REQUEST_PER_ARTICLE_PARAMS.
# "article" and "access" are overwritten per request.
ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE = dict(
    project="en.wikipedia.org",
    access="all-access",      # replaced with the specific access type per call
    agent="user",
    article="",
    granularity="monthly",
    start="2015070100",
    end="2024093000",         # adjusted to match the assignment requirements
)

In [11]:
## 3. Load and process CSV file
def load_rare_disease_articles(csv_filename):
    """Return the article titles listed in the rare-disease CSV.

    The file is read as three columns named ``disease``, ``pageid`` and
    ``url``; only the ``disease`` column is returned.

    Parameters
    ----------
    csv_filename : str or file-like
        Path (or open text buffer) of the CSV to read.

    Returns
    -------
    list
        Values of the ``disease`` column, in file order, with the header row
        (if the file has one) and rows lacking a disease name removed.
    """
    column_names = ['disease', 'pageid', 'url']
    df = pd.read_csv(csv_filename, names=column_names)

    # Because `names=` is supplied, pandas treats every line as data. If the
    # file starts with its own header line, that line would otherwise be
    # returned as an article called "disease", so drop it here.
    if not df.empty:
        first_row = [str(value).strip().lower() for value in df.iloc[0].tolist()]
        if first_row == column_names:
            df = df.iloc[1:]

    # A blank disease cell is read as NaN, which is not a usable title.
    return df['disease'].dropna().tolist()

# Article titles to request pageviews for: the 'disease' column of the CSV
# below. The path is relative, so it resolves against the notebook's current
# working directory.
RARE_DISEASE_ARTICLES = load_rare_disease_articles('wikiURL_data/rare-disease_cleaned.AUG.2024.csv')

In [12]:
## 4. Define helper functions
def request_pageviews_per_article(article_title, access_type, timeout=30.0):
    """Request pageview counts for one article and one access type.

    The request URL is built from ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE with the
    ``article`` and ``access`` fields replaced by the arguments.

    Parameters
    ----------
    article_title : str
        Article title. Spaces are turned into underscores and the result is
        percent-encoded before being placed in the URL path.
    access_type : str
        Value used for the ``access`` path segment.
    timeout : float, optional
        Seconds to wait on the HTTP request before giving up. Without a
        timeout a single stalled connection would block the whole run.

    Returns
    -------
    dict or None
        The decoded JSON response body, or ``None`` if the request or the
        JSON decoding raised (the error is printed, not re-raised).
    """
    params = ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE.copy()
    # safe='' makes quote() encode "/" as %2F. With the default (safe='/'),
    # a title containing a slash would be split into extra path segments and
    # the request would address the wrong route.
    params['article'] = urllib.parse.quote(article_title.replace(' ', '_'), safe='')
    params['access'] = access_type

    request_url = API_REQUEST_PAGEVIEWS_ENDPOINT + API_REQUEST_PER_ARTICLE_PARAMS.format(**params)

    try:
        # Throttle before each call so consecutive requests are spaced out.
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(request_url, headers=REQUEST_HEADERS, timeout=timeout)
        return response.json()
    except Exception as e:
        # Best-effort: report the failure and let the caller carry on with
        # the remaining articles.
        print(f"Error fetching data for {article_title}: {e}")
        return None

def process_pageviews_data(data, access_type):
    """Reduce a raw pageviews response to a ``{YYYYMM: views}`` mapping.

    Parameters
    ----------
    data : dict or None
        Response body; only its ``'items'`` list is used.
    access_type : str
        Accepted for symmetry with the request helper; not used here.

    Returns
    -------
    dict
        First six characters of each item's ``timestamp`` mapped to that
        item's ``views``. Empty when ``data`` is falsy or has no ``'items'``.
    """
    views_by_month = {}
    if data and 'items' in data:
        for record in data['items']:
            month_key = record['timestamp'][:6]
            views_by_month[month_key] = record['views']
    return views_by_month

In [13]:
## 5. Data Acquisition
def fetch_pageviews_data(article_list, access_types):
    """Collect processed pageviews for every article and access type.

    Parameters
    ----------
    article_list : iterable of str
        Article titles to request.
    access_types : iterable of str
        Access types to request for each article.

    Returns
    -------
    dict
        ``{article: {access_type: processed_data}}`` where ``processed_data``
        is whatever ``process_pageviews_data`` returns for that request.
    """
    return {
        title: {
            access: process_pageviews_data(
                request_pageviews_per_article(title, access), access
            )
            for access in access_types
        }
        for title in article_list
    }

In [14]:
## 6. Create JSON files
def _merge_monthly_views(first, second):
    """Add two ``{month: views}`` mappings; a month missing on one side counts as 0."""
    # sorted() gives chronological (YYYYMM) and run-to-run stable key order;
    # iterating the raw set union would order the months arbitrarily.
    return {month: first.get(month, 0) + second.get(month, 0)
            for month in sorted(set(first) | set(second))}


def _write_json_file(payload, output_dir, filename):
    """Serialize ``payload`` as indented JSON to ``output_dir/filename``."""
    with open(os.path.join(output_dir, filename), 'w') as f:
        json.dump(payload, f, indent=2)


def create_json_files(data, output_dir='Json_data'):
    """Write the mobile, desktop and cumulative monthly pageview JSON files.

    Parameters
    ----------
    data : dict
        ``{article: {access_type: {month: views}}}``. The access types read
        are ``'mobile-web'``, ``'mobile-app'`` and ``'desktop'``; one that is
        absent for an article is treated as having no months.
    output_dir : str, optional
        Directory the three files are written to. Created if it does not
        exist. Defaults to ``'Json_data'``.

    Returns
    -------
    None
        The results are the three files written to ``output_dir``:
        mobile (mobile-web + mobile-app), desktop, and cumulative
        (mobile + desktop). Months within each article are in ascending order.
    """
    os.makedirs(output_dir, exist_ok=True)

    mobile_data = {}
    desktop_data = {}
    cumulative_data = {}
    for article, views_by_access in data.items():
        # Mobile data (sum of mobile-web and mobile-app)
        mobile_data[article] = _merge_monthly_views(
            views_by_access.get('mobile-web', {}),
            views_by_access.get('mobile-app', {}),
        )
        # Desktop data, keyed in the same ascending month order as the others
        desktop_data[article] = dict(sorted(views_by_access.get('desktop', {}).items()))
        # Cumulative data (mobile + desktop)
        cumulative_data[article] = _merge_monthly_views(
            mobile_data[article], desktop_data[article]
        )

    _write_json_file(mobile_data, output_dir, 'rare-disease_monthly_mobile_201507-202409.json')
    _write_json_file(desktop_data, output_dir, 'rare-disease_monthly_desktop_201507-202409.json')
    _write_json_file(cumulative_data, output_dir, 'rare-disease_monthly_cumulative_201507-202409.json')

# Fetch the data here so this cell works on a fresh kernel: no other visible
# cell assigns `pageviews_data`. The three access types requested are exactly
# the keys create_json_files reads.
pageviews_data = fetch_pageviews_data(
    RARE_DISEASE_ARTICLES, ['desktop', 'mobile-web', 'mobile-app']
)

create_json_files(pageviews_data)

print("Data acquisition complete. JSON files have been created in the output directory.")

Data acquisition complete. JSON files have been created in the output directory.
