# Setup and Imports

In [2]:
import datetime
import json
import pprint as pp
from operator import itemgetter

import boto3
import requests

In [3]:
# User name associated with this notebook.
# NOTE(review): no visible cell references this constant; the S3 bucket name
# further down hardcodes the same prefix — confirm whether it should be derived from it.
USERNAME = "palbence"

# Extracting the data from the Wikipedia API

In [4]:
# Setting the date parameter (the day whose top page views are requested)
DATE_PARAM = "2025-11-30"

# Upper bound, in seconds, on how long to wait for the API. Without a timeout
# `requests` waits indefinitely, so a stalled connection would hang the kernel.
REQUEST_TIMEOUT_SECONDS = 30

# Parsing up front also rejects a malformed DATE_PARAM before any request is made
date = datetime.datetime.strptime(DATE_PARAM, "%Y-%m-%d")

# Construct the API URL (the date is passed as a YYYY/MM/DD path suffix)
url = f"https://wikimedia.org/api/rest_v1/metrics/pageviews/top/en.wikipedia.org/all-access/{date.strftime('%Y/%m/%d')}"
print(f"Requesting REST API URL: {url}")

# Make the API request
wiki_server_response = requests.get(
    url,
    headers={"User-Agent": "curl/7.68.0"},
    timeout=REQUEST_TIMEOUT_SECONDS,
)
wiki_response_status = wiki_server_response.status_code
wiki_response_body = wiki_server_response.text

# Log a preview of the body and the status before validating, so a failed
# request still shows what the server actually returned.
print(f"Wikipedia REST API Response body: {wiki_response_body[:500]}...")
print(f"Wikipedia REST API Response Code: {wiki_response_status}")

# Validate response: anything other than 200 aborts the cell
if wiki_response_status != 200:
    raise Exception(f"Received non-OK status code from Wiki Server: {wiki_response_status}")
print(f"Successfully retrieved Wikipedia data, content-length: {len(wiki_response_body)}")

Requesting REST API URL: https://wikimedia.org/api/rest_v1/metrics/pageviews/top/en.wikipedia.org/all-access/2025/11/30
Wikipedia REST API Response body: {"items":[{"project":"en.wikipedia","access":"all-access","year":"2025","month":"11","day":"30","articles":[{"article":"Main_Page","views":6285620,"rank":1},{"article":"Special:Search","views":819278,"rank":2},{"article":"Survivor_Series:_WarGames_(2025)","views":420997,"rank":3},{"article":"1989_Tiananmen_Square_protests_and_massacre","views":385238,"rank":4},{"article":"Lane_Kiffin","views":365126,"rank":5},{"article":"Stranger_Things_season_5","views":356813,"rank":6},{"article":"Google_Chrom...
Wikipedia REST API Response Code: 200
Successfully retrieved Wikipedia data, content-length: 56119


# Processing data into JSON format

In [39]:
# Parse the API response and extract the per-article view records
wiki_response_parsed = wiki_server_response.json()
all_views = wiki_response_parsed["items"][0]["articles"]

# Technically there is no need to sort as the API returns already sorted data;
# sorting by view count (descending) anyway makes the top-N selection explicit.
# `itemgetter` is imported in the imports cell at the top of the notebook.
top_views = sorted(all_views, key=itemgetter("views"), reverse=True)

TOP_N = 5
top_pages = top_views[:TOP_N]

# Transform to JSON Lines format
current_time = datetime.datetime.now(datetime.timezone.utc)
# Values shared by every record are computed once, outside the loop.
record_date = date.strftime("%Y-%m-%d")
# The timestamp is taken in UTC and serialized without the offset suffix.
retrieved_at = current_time.replace(tzinfo=None).isoformat()

serialized_records = []
for page in top_pages:
    record = {
        "title": page["article"],
        "views": page["views"],
        "rank": page["rank"],
        "date": record_date,
        "retrieved_at": retrieved_at,
    }
    serialized_records.append(json.dumps(record))

# One JSON document per line, each line (including the last) newline-terminated
json_lines = "".join(f"{line}\n" for line in serialized_records)

print(f"Transformed {len(top_pages)} records to JSON Lines")
print(f"First few lines:\n{json_lines[:500]}...")

Transformed 5 records to JSON Lines
First few lines:
{"title": "Main_Page", "views": 6285620, "rank": 1, "date": "2025-11-30", "retrieved_at": "2025-12-10T18:21:24.675044"}
{"title": "Special:Search", "views": 819278, "rank": 2, "date": "2025-11-30", "retrieved_at": "2025-12-10T18:21:24.675044"}
{"title": "Survivor_Series:_WarGames_(2025)", "views": 420997, "rank": 3, "date": "2025-11-30", "retrieved_at": "2025-12-10T18:21:24.675044"}
{"title": "1989_Tiananmen_Square_protests_and_massacre", "views": 385238, "rank": 4, "date": "2025-11-30", "retrie...


# Uploading JSON to S3

In [40]:
S3_WIKI_BUCKET = "palbence-wikidata"
s3 = boto3.client("s3")
# The object key is derived from the requested date, so re-running the notebook
# for the same date writes to the same key.
s3_key = f"raw-views/raw-views-{date.strftime('%Y-%m-%d')}.json"
try:
    s3.put_object(
        Bucket=S3_WIKI_BUCKET,
        Key=s3_key,
        # Send explicit UTF-8 bytes rather than relying on implicit str handling
        Body=json_lines.encode("utf-8"),
    )
except Exception as e:
    print(f"❌ Error uploading data: {str(e)}")
    # Re-raise so a failed upload stops the run instead of the notebook
    # appearing to finish successfully with nothing written.
    raise
else:
    print(f"✅ Successfully uploaded data to s3://{S3_WIKI_BUCKET}/{s3_key}")


✅ Successfully uploaded data to s3://palbence-wikidata/raw-views/raw-views-2025-11-30.json


In [6]:
# Sanity check: number of article records the API returned for the requested day.
# `all_views` is already parsed in the processing cell above, so it is reused
# here rather than parsing the response a second time.
len(all_views)

1000