In [None]:
# import asyncio
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import json
from uuid import uuid4
import logging

import networkx as nx
import pandas as pd
import httpx
from fake_useragent import UserAgent

In [None]:
# Local working directories, resolved relative to the current working directory.
local_data_directory = Path("../local_data")
packages_metadata_directory = local_data_directory / "metadata"
# parents=True also creates ../local_data itself, so a fresh checkout
# (where the data directory does not exist yet) no longer fails here.
packages_metadata_directory.mkdir(parents=True, exist_ok=True)

In [42]:
# Shared generator instance, created once and reused by every call below.
user_agent_generator = UserAgent()


def generate_user_agent() -> str:
    """Return the `random` User-Agent value of the shared generator."""
    random_user_agent: str = user_agent_generator.random
    return random_user_agent

In [43]:
# Top PyPI packages dataset; references to the dataset source:
# - https://github.com/hugovk/top-pypi-packages
# - https://github.com/Robert-96/top-pypi-packages
top_packages_csv_url = "https://hugovk.github.io/top-pypi-packages/top-pypi-packages.csv"
df = pd.read_csv(top_packages_csv_url)

# Overview: row count, column names, the frame itself, both ends and summary statistics.
display(
    len(df),
    df.columns,
    df,
    df.head(),
    df.tail(),
    df.describe(),
)

15000

Index(['download_count', 'project'], dtype='object')

Unnamed: 0,download_count,project
0,1201718923,boto3
1,926039357,urllib3
2,893777784,botocore
3,834670127,requests
4,825048727,certifi
...,...,...
14995,36317,discord-py-self
14996,36314,odecloud
14997,36310,gate-api
14998,36307,unique-log-filter


Unnamed: 0,download_count,project
0,1201718923,boto3
1,926039357,urllib3
2,893777784,botocore
3,834670127,requests
4,825048727,certifi


Unnamed: 0,download_count,project
14995,36317,discord-py-self
14996,36314,odecloud
14997,36310,gate-api
14998,36307,unique-log-filter
14999,36292,asciidag


Unnamed: 0,download_count
count,15000.0
mean,5274020.0
std,35079360.0
min,36292.0
25%,65945.25
50%,177321.5
75%,673855.2
max,1201719000.0


In [45]:
# Plain Python list of the values in the "project" column.
project_column = df["project"]
pypi_packages = project_column.tolist()

In [48]:
# Root logging configuration: INFO and above goes to a log file.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", # TODO: change format of logging
    handlers=[
        # basicConfig() only applies its own `encoding` argument to the FileHandler
        # it creates for `filename=`; with explicit `handlers=` it is ignored, so the
        # encoding has to be set on the handler itself.
        logging.FileHandler("../local_data/metadata.log", encoding="utf-8") # TODO: change to streamHandler and json logger
    ]
)
logger = logging.getLogger(__name__)

# TODO:
# - def get_list_of_packages(url: str):
# - write to csv file and read in pandas (instead of reading from pandas firsthand)
# - change to async loop
def get_package_metadata(package_name: str) -> None:
    """
    Download the PyPI JSON metadata of one package and save it to disk.

    package_name is unique name within pypi
    - according to https://docs.pypi.org/api/json/

    The metadata is written as JSON text to a new, randomly named (uuid4)
    file inside ``../local_data/packages``; the directory is created when it
    does not exist yet. Every failure is logged and then re-raised.

    Args:
        package_name: name of the project on PyPI.

    Raises:
        Exception: if the request fails or answers with a non-2xx status,
            if the response body cannot be converted to JSON, or if the
            file cannot be written.
    """

    headers = {
        "user-agent": generate_user_agent()
    }
    try:
        with httpx.Client(headers=headers) as client:
            response = client.get(f"https://pypi.org/pypi/{package_name}/json")
    except httpx.HTTPError as e:
        message = f"Cannot access the resource or not available for '{package_name}'"
        logger.error(message)
        raise Exception(message) from e

    # httpx responses have no `.ok` attribute (that is the `requests` API);
    # `is_success` is True for 2xx status codes.
    if not response.is_success:
        message = f"Cannot access the resource or not available for '{package_name}'"
        logger.error(message)
        raise Exception(message)

    try:
        # json.dumps (not json.dump, which needs a file object) returns the text to store
        metadata = json.dumps(response.json())
    except Exception as e:
        message = f"Cannot convert the resource metadata to JSON for '{package_name}'"
        logger.error(message)
        raise Exception(message) from e

    try:
        output_directory = Path("../local_data/packages")
        output_directory.mkdir(parents=True, exist_ok=True)
        with open(output_directory / str(uuid4()), "w", encoding="utf-8") as file:
            # write the serialized text; file.write() does not accept the parsed dict
            file.write(metadata)
    except OSError as e:
        message = f"Cannot save package_metadata to file for '{package_name}'"
        logger.error(message)
        raise Exception(message) from e

    message = f"The resource metadata is saved successfully for '{package_name}'"
    logger.info(message)

# The downloads are network-bound, so a thread pool is sufficient. Unlike
# ProcessPoolExecutor it does not need `get_package_metadata` to be importable
# by worker processes, which is not guaranteed for functions defined in a notebook.
# TODO: retry the packages collected in `failed_packages`
with ThreadPoolExecutor() as executor:
    futures = {
        executor.submit(get_package_metadata, package_name): package_name
        for package_name in pypi_packages
    }

# Every future is finished once the `with` block has exited. Record the outcome
# of each one (None on success, the raised exception otherwise) so that no
# failure is silently dropped.
results = {
    package_name: future.exception()
    for future, package_name in futures.items()
}
failed_packages = [
    package_name for package_name, error in results.items() if error is not None
]

2025-09-11 21:35:09,877 - httpx - INFO - HTTP Request: GET https://pypi.org/pypi/boto3/json "HTTP/1.1 200 OK"
2025-09-11 21:35:09,883 - httpx - INFO - HTTP Request: GET https://pypi.org/pypi/urllib3/json "HTTP/1.1 200 OK"
2025-09-11 21:35:09,883 - httpx - INFO - HTTP Request: GET https://pypi.org/pypi/requests/json "HTTP/1.1 200 OK"
2025-09-11 21:35:09,898 - httpx - INFO - HTTP Request: GET https://pypi.org/pypi/typing-extensions/json "HTTP/1.1 200 OK"
2025-09-11 21:35:09,952 - httpx - INFO - HTTP Request: GET https://pypi.org/pypi/certifi/json "HTTP/1.1 200 OK"
2025-09-11 21:35:09,993 - httpx - INFO - HTTP Request: GET https://pypi.org/pypi/botocore/json "HTTP/1.1 200 OK"
2025-09-11 21:35:10,177 - httpx - INFO - HTTP Request: GET https://pypi.org/pypi/charset-normalizer/json "HTTP/1.1 200 OK"
2025-09-11 21:35:10,180 - httpx - INFO - HTTP Request: GET https://pypi.org/pypi/grpcio-status/json "HTTP/1.1 200 OK"
2025-09-11 21:35:10,269 - httpx - INFO - HTTP Request: GET https://pypi.org/p

KeyboardInterrupt: 

2025-09-11 21:35:42,626 - httpx - INFO - HTTP Request: GET https://pypi.org/pypi/simple-salesforce/json "HTTP/1.1 200 OK"
2025-09-11 21:35:42,627 - httpx - INFO - HTTP Request: GET https://pypi.org/pypi/libcst/json "HTTP/1.1 200 OK"
2025-09-11 21:35:42,628 - httpx - INFO - HTTP Request: GET https://pypi.org/pypi/types-s3transfer/json "HTTP/1.1 200 OK"
2025-09-11 21:35:42,659 - httpx - INFO - HTTP Request: GET https://pypi.org/pypi/boto3-stubs/json "HTTP/1.1 200 OK"
2025-09-11 21:35:42,658 - httpx - INFO - HTTP Request: GET https://pypi.org/pypi/unidecode/json "HTTP/1.1 200 OK"
2025-09-11 21:35:42,672 - httpx - INFO - HTTP Request: GET https://pypi.org/pypi/zope-event/json "HTTP/1.1 200 OK"
2025-09-11 21:35:42,915 - httpx - INFO - HTTP Request: GET https://pypi.org/pypi/pyee/json "HTTP/1.1 200 OK"
2025-09-11 21:35:42,932 - httpx - INFO - HTTP Request: GET https://pypi.org/pypi/great-expectations/json "HTTP/1.1 200 OK"
2025-09-11 21:35:42,984 - httpx - INFO - HTTP Request: GET https://py

In [None]:
# TODO: combine all saved JSON metadata files into a single list,
# then load that list into a pandas DataFrame