# Get license info

Source: https://spdx.org/licenses/

In [1]:
import httpx
from bs4 import BeautifulSoup
import pandas as pd

In [None]:
# Download the SPDX license list page and locate its first <table> element.
url = "https://spdx.org/licenses/"
# httpx does not follow redirects unless asked to.
response = httpx.get(url, follow_redirects=True)
# Fail here on an HTTP error rather than parsing an error page.
response.raise_for_status()
page = BeautifulSoup(response.text, "html.parser")
table = page.find("table")
if table is None:
    # Later cells call methods on `table`; stop with a clear message instead.
    raise RuntimeError(f"No <table> element found at {url}")

In [4]:
# Read the column titles from the table header

header_cells = table.find("thead").find_all("th")
headers = []
for cell in header_cells:
    headers.append(cell.get_text(strip=True))

# Map the displayed titles to snake_case column names;
# a title without an entry is kept unchanged
columns_mapping = {
    "Full name": "full_name",
    "Identifier": "identifier",
    "FSF Free/Libre?": "fsf_free_libre",
    "OSI Approved?": "osi_approved",
}
columns = [
    columns_mapping[title] if title in columns_mapping else title
    for title in headers
]
columns

['full_name', 'identifier', 'fsf_free_libre', 'osi_approved']

In [None]:
# Deal with data rows: one list per table row (cell texts followed by a link)
from urllib.parse import urljoin

rows = []
base_url = "https://spdx.org/licenses/"

for tr in table.select("tbody tr"):
    # Text of every cell in the row, in document order
    row = [td.get_text(strip=True) for td in tr.find_all("td")]
    # First anchor in the row that carries an href, or None
    a_tag = tr.find("a", href=True)

    # urljoin resolves relative hrefs against base_url and returns absolute
    # URLs unchanged, so an absolute href keeps its own target.
    link = urljoin(base_url, a_tag["href"]) if a_tag else None

    row.append(link)
    rows.append(row)


In [None]:
# Build the frame: the scraped columns plus the link appended to each row
df = pd.DataFrame(rows, columns=[*columns, "url"])


def _is_yes(flag):
    """Return True when the cell text is exactly "Y"."""
    return flag == "Y"


# Turn the two "Y"-flag columns into booleans
for flag_column in ("fsf_free_libre", "osi_approved"):
    df[flag_column] = df[flag_column].apply(_is_yes)

# Put the identifier first, then name, link and the two flags
ordered_columns = ["identifier", "full_name", "url", "fsf_free_libre", "osi_approved"]
df = df[ordered_columns]
df.head()

Unnamed: 0,identifier,full_name,url,fsf_free_libre,osi_approved
0,0BSD,BSD Zero Clause License,https://spdx.org/licenses/0BSD.html,False,True
1,3D-Slicer-1.0,3D Slicer License v1.0,https://spdx.org/licenses/3D-Slicer-1.0.html,False,False
2,AAL,Attribution Assurance License,https://spdx.org/licenses/AAL.html,False,True
3,Abstyles,Abstyles License,https://spdx.org/licenses/Abstyles.html,False,False
4,AdaCore-doc,AdaCore Doc License,https://spdx.org/licenses/AdaCore-doc.html,False,False


In [None]:
# Dump to jsonl
from pathlib import Path

data_dir = Path("pelican_data_loader/data")
data_dir.mkdir(parents=True, exist_ok=True)
df.to_json(data_dir / "licenses.jsonl", orient="records", lines=True)
