This notebook builds a unified dataset of global AI policy initiatives for the DSCI-511 final project.  
It starts with the OECD AI Policy Observatory data and then enriches it by adding major policy documents that are missing from the original dataset.

### What this notebook does:
- Loads the baseline `oecd_policies_clean.csv`.
- Scrapes or fetches metadata from official policy sources, including:
  - UNESCO — *Recommendation on the Ethics of Artificial Intelligence* (via UNESDOC API).
  - White House OSTP — *Blueprint for an AI Bill of Rights* (via web scraping).
- Converts all scraped data into the **same schema** used by the OECD dataset.
- Cleans the final table by:
  - Standardizing column names,
  - Dropping temporary or duplicate fields,
  - Ensuring each policy initiative has a unique `policy_initiative_id`.
- Saves the updated, enriched dataset back to:
  `/content/drive/MyDrive/DSCI-511/data/policies/oecd_policies_clean.csv`

This processed dataset will be used later for visualization, pattern analysis, and the final project report.

In [None]:
# Import the necessary libraries
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

In [None]:
# Mount Google Drive at /content/drive; the CSV and PDF paths used by later
# cells all live under /content/drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#### Fetching the data from UNESCO

In [None]:
# Initialize row accumulator
# NOTE(review): `rows` is not referenced by any other cell visible in this
# notebook — confirm whether it is still needed.
rows = []

# Shared HTTP session that retries transient failures
def make_session():
    """
    Build a ``requests.Session`` configured for polite, resilient fetching.

    The session retries idempotent requests (GET/HEAD/OPTIONS) up to five
    times with a 1.2 backoff factor when the server answers 429 or a 5xx
    status, and sends a descriptive User-Agent plus browser-like Accept
    headers on every request.

    Returns:
        requests.Session: the configured session.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=1.2,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET", "HEAD", "OPTIONS"]
    )
    retrying_adapter = HTTPAdapter(
        max_retries=retry_policy,
        pool_connections=10,
        pool_maxsize=10,
    )

    session = requests.Session()

    # The same adapter instance serves both URL schemes.
    for scheme in ("http://", "https://"):
        session.mount(scheme, retrying_adapter)

    default_headers = {
        "User-Agent": "Priti-DSCI-Project/1.0 (class use; contact: student@example.com)",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Connection": "keep-alive",
    }
    session.headers.update(default_headers)
    return session

# Module-wide session reused by the fetch helpers below
SESSION = make_session()

In [None]:
UNESDOC_API = "https://data.unesco.org/api/explore/v2.1/catalog/datasets/doc001/records"

# Title of the catalogue record we actually want.
UNESCO_TARGET_TITLE = "Recommendation on the Ethics of Artificial Intelligence"


def _record_title(rec):
    """Return the record's title as a case-folded string ('' when missing)."""
    raw = rec.get("title")
    if isinstance(raw, (list, tuple)):
        raw = " ".join(str(part) for part in raw)
    return str(raw or "").strip().casefold()


def _pick_unesco_record(results, target_title=UNESCO_TARGET_TITLE):
    """
    Choose the search hit that best matches ``target_title``.

    Preference order: exact title match, then a title that contains the target
    and is not marked as a draft, then (as a last resort) the first hit.
    """
    wanted = target_title.casefold()

    for rec in results:
        if _record_title(rec) == wanted:
            return rec

    for rec in results:
        title = _record_title(rec)
        if wanted in title and "draft" not in title:
            return rec

    # Nothing matched closely; fall back to the top-ranked hit but say so,
    # because full-text ranking may surface a draft or related document.
    print("Warning: no exact title match in UNESDOC results; using the first hit.")
    return results[0]


def get_unesco_ai_recommendation():
    """
    Query UNESCO's official UNESDOC Catalogue API for the
    'Recommendation on the Ethics of Artificial Intelligence' record.

    The API is searched with a full-text query, and the hit whose title best
    matches the target is returned (rather than blindly taking the first hit,
    which can be a draft version of the text).

    Returns:
        dict: one record from the API's ``results`` list, keyed by field name.

    Raises:
        requests.HTTPError: if the API responds with an error status.
        ValueError: if the search returns no records.
    """
    params = {
        "limit": 10,
        # full-text search over all fields
        "where": f"search('{UNESCO_TARGET_TITLE}')"
    }
    r = SESSION.get(UNESDOC_API, params=params, timeout=30)
    r.raise_for_status()
    data = r.json()

    results = data.get("results")
    if not results:
        raise ValueError("No UNESDOC records returned for that search query.")

    # For Opendatasoft v2.1, results is usually a list of dicts where keys are field names
    rec = _pick_unesco_record(results)

    # Inspect what we actually got back
    print("UNESDOC fields:", rec.keys())

    return rec



In [None]:
def _unesco_field(rec, keys, default):
    """
    Return the first usable value found in ``rec`` under any of ``keys``.

    A value is usable when it is not None and not blank. List/tuple values are
    joined with ", ". Falls back to ``default`` when no key yields a value, so
    a field that is present but null does not leak the text "None" into output.
    """
    for key in keys:
        value = rec.get(key)
        if isinstance(value, (list, tuple)):
            value = ", ".join(
                str(item).strip()
                for item in value
                if item is not None and str(item).strip()
            )
        elif value is not None:
            value = str(value).strip()
        if value:
            return value
    return default


def unesco_record_to_row(rec):
    """
    Convert a UNESDOC API record to your OECD-style CSV row format.
    Column names are matched to the existing file so we don't create new ones.

    Args:
        rec (dict): one record from the UNESDOC API (field name -> value).
            Missing or null fields fall back to hard-coded defaults.

    Returns:
        dict: a row keyed by OECD column names. Columns not listed here are
        left for pandas to fill with NaN.
    """
    rec = rec or {}

    # Candidate field names, tried in order. The first name in each tuple is
    # the one originally expected; the alternates are keys seen in the API's
    # printed response.
    # NOTE(review): the alternates ("creator", "year", "rights") are presumed
    # to carry the same meaning as the expected names — confirm against the
    # dataset schema.
    title            = _unesco_field(rec, ("title",),
                                     "Recommendation on the Ethics of Artificial Intelligence")
    corporate_author = _unesco_field(rec, ("corporate_author", "creator"), "UNESCO")
    year             = _unesco_field(rec, ("publication_year", "year"), "2022")
    also_in          = _unesco_field(rec, ("also_available_in",), "")
    licence          = _unesco_field(rec, ("licence_type", "rights"), "")
    doc_type         = _unesco_field(rec, ("document_type",), "programme and meeting document")
    doc_code         = _unesco_field(rec, ("document_code",), "SHS/BIO/PI/2021/1")
    collation        = _unesco_field(rec, ("collation",), "43 pages : illustrations")

    # Build description string from scraped metadata
    description = (
        f"Corporate author: {corporate_author}; "
        f"Document code: {doc_code}; "
        f"Collation: {collation}; "
        f"Also available in: {also_in}; "
        f"Year of publication: {year}; "
        f"Licence type: {licence}; "
        f"Type of document: {doc_type}."
    )

    unesco_url = "https://unesdoc.unesco.org/ark:/48223/pf0000381137"

    return {
        # --- core ID / naming fields  ---
        "policy_initiative_id": "unesco_rec_2021",
        "platform_url": unesco_url,
        "english_name": title,
        "original_name(s)": title,                  # key must match the CSV column name exactly
        "country": "Global",
        "start_date": "2021",                       # file uses year here

        # --- descriptive / classification fields ---
        "description": description,
        "theme_area(s)": "Other AI policy initiatives",  # choose something consistent with your coding
        "theme(s)": "Ethics|Governance",                 # or just "Ethics" if you want to keep it simple

        # --- organisation / URLs ---
        "responsible_organisation(s)": "UNESCO",
        "public_access_url": unesco_url,

        # The remaining columns in your file (background, objective(s), etc.)
        # will just stay NaN / empty for this row unless you choose to fill them.
    }



In [None]:
# Read the current policy table from Drive
csv_path = "/content/drive/MyDrive/DSCI-511/data/policies/oecd_policies_clean.csv"
df = pd.read_csv(csv_path)

# Fetch the UNESCO record through the API and reshape it into an OECD-style row
unesco_raw = get_unesco_ai_recommendation()
unesco_row = unesco_record_to_row(unesco_raw)

# Upsert keyed on the policy ID: overwrite a matching row, otherwise append
id_col = "policy_initiative_id"

if id_col not in df.columns:
    # Without an ID column there is nothing to match on, so just append
    df = pd.concat([df, pd.DataFrame([unesco_row])], ignore_index=True)
    print("Appended UNESCO row (no policy_initiative_id column found).")
else:
    mask = df[id_col] == unesco_row[id_col]

    if not mask.any():
        # ID not present yet: add the row at the end
        df = pd.concat([df, pd.DataFrame([unesco_row])], ignore_index=True)
        print("Added new UNESCO row.")
    else:
        # ID already present: refresh only the columns the file already has
        for col, val in unesco_row.items():
            if col not in df.columns:
                continue
            df.loc[mask, col] = val
        print("Updated existing UNESCO row.")

# Write the table back to the same location
df.to_csv(csv_path, index=False)
print("Saved updated CSV to:", csv_path)

UNESDOC fields: dict_keys(['uuid', 'url', 'ref_code', 'ref_code_0', 'ref_code_1', 'year', 'format', 'language', 'title', 'type', 'relation', 'conference', 'document_code', 'document_type', 'coverage', 'description', 'subject', 'creator', 'rights', 'rights_first', 'cover_url'])
Added new UNESCO row.
Saved updated CSV to: /content/drive/MyDrive/DSCI-511/data/policies/oecd_policies_clean.csv


In [None]:
# PDF copy of the OSTP "Blueprint for an AI Bill of Rights".
# NOTE(review): this host is a blob-storage account, not a whitehouse.gov or
# archives.gov domain — confirm it is an acceptable source for the project.
ostp_pdf_url = "https://marketingstorageragrs.blob.core.windows.net/webfiles/Blueprint-for-an-AI-Bill-of-Rights.pdf"

save_path = "/content/drive/MyDrive/DSCI-511/data/policies/ostp_ai_bill_of_rights.pdf"

# A timeout keeps a stalled connection from hanging the notebook indefinitely.
response = requests.get(ostp_pdf_url, timeout=60)

# PDF files carry a "%PDF" signature near the start; checking it avoids saving
# an HTML error/redirect page under a .pdf name.
looks_like_pdf = b"%PDF" in response.content[:1024]

if response.status_code == 200 and looks_like_pdf:
    with open(save_path, "wb") as f:
        f.write(response.content)
    print("OSTP PDF downloaded successfully!")
elif response.status_code == 200:
    print("Failed to download PDF. Response was not a PDF document.")
else:
    print("Failed to download PDF. Status code:", response.status_code)



OSTP PDF downloaded successfully!


#### Creating a test file for OSTP data

In [None]:
url = "https://bidenwhitehouse.archives.gov/ostp/ai-bill-of-rights/"

# Use the shared retrying session, bound the wait, and stop on HTTP errors so
# an error page is never parsed as if it were the policy page.
response = SESSION.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Extract metadata
title_tag = soup.find("h1")
title = title_tag.text.strip() if title_tag else "N/A"

# Try the standard description tag first, then the Open Graph variant.
# NOTE(review): whether this page exposes og:description is unverified; the
# fallback simply yields "N/A" when neither tag is present.
desc_tag = (
    soup.find("meta", {"name": "description"})
    or soup.find("meta", {"property": "og:description"})
)
description = desc_tag.get("content", "").strip() if desc_tag else ""
description = description or "N/A"

date_tag = soup.find("time")
date = date_tag.text.strip() if date_tag else "2022"   # fallback

# Build into a dataframe with its own name so the main policy table (`df`)
# is not overwritten by this one-row metadata preview.
ostp_meta_df = pd.DataFrame([{
    "source": "OSTP",
    "title": title,
    "publication_date": date,
    "url": url,
    "pdf_path": "data/policies/ostp_ai_bill_of_rights.pdf",
    "description": description
}])

ostp_meta_df.to_csv("/content/drive/MyDrive/DSCI-511/data/policies/metadata_ostp.csv", index=False)

display(ostp_meta_df)


  source                               title publication_date  \
0   OSTP  Blueprint for an AI Bill of Rights             2022   

                                                 url  \
0  https://bidenwhitehouse.archives.gov/ostp/ai-b...   

                                   pdf_path description  
0  data/policies/ostp_ai_bill_of_rights.pdf         N/A  


#### Scraping and adding OSTP data to the metadata

In [None]:
# -------------------------
# 1. Scrape OSTP page
# -------------------------

url = "https://bidenwhitehouse.archives.gov/ostp/ai-bill-of-rights/"
# Shared retrying session plus a timeout so a stalled request cannot hang the run
response = SESSION.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Metadata
title_tag = soup.find("h1")
title = title_tag.text.strip() if title_tag else "AI Bill of Rights"

# Try the standard description tag first, then the Open Graph variant.
# NOTE(review): whether this page exposes og:description is unverified; the
# description simply stays empty when neither tag is present.
desc_tag = (
    soup.find("meta", {"name": "description"})
    or soup.find("meta", {"property": "og:description"})
)
description = desc_tag.get("content", "").strip() if desc_tag else ""

date_tag = soup.find("time")
date_text = date_tag.text.strip() if date_tag else "2022-10-04"  # official release date
# Your OECD file uses a YEAR in start_date. Search for a four-digit year
# anywhere in the text instead of slicing the first four characters, which
# only works for ISO dates (e.g. "October 4, 2022" would yield "Octo").
year_match = re.search(r"\b(?:19|20)\d{2}\b", date_text)
start_year = year_match.group(0) if year_match else "2022"

# -------------------------
# 2. Build row in OECD schema
# -------------------------

# Only append the scraped description when there is one, so the field does
# not end with a dangling "description: " label.
if description:
    row_description = f"AI Bill of Rights; description: {description}"
else:
    row_description = "AI Bill of Rights"

new_row = {
    # ID & name fields – match your header exactly
    "policy_initiative_id": "ostp_ai_bill_of_rights_2022",
    "platform_url": url,
    "english_name": title,
    "original_name(s)": title,
    "country": "United States",
    "start_date": start_year,
    # Description & themes
    "description": row_description,
    "theme_area(s)": "Other AI policy initiatives",
    "theme(s)": "Ethics|Governance",

    # Organisation & URLs
    "responsible_organisation(s)": "White House OSTP",
    "public_access_url": url,

    # All other columns in your file (background, objective(s), etc.)
    # will stay NaN/empty for this row unless you explicitly fill them.
}

# -------------------------
# 3. Load your existing dataset
# -------------------------

csv_path = "/content/drive/MyDrive/DSCI-511/data/policies/oecd_policies_clean.csv"
df = pd.read_csv(csv_path)

# -------------------------
# 4. Update or append row based on policy_initiative_id
# -------------------------

id_col = "policy_initiative_id"

if id_col in df.columns:
    mask = df[id_col] == new_row[id_col]
    if mask.any():
        # Update existing OSTP record (only columns the file already has)
        for col, val in new_row.items():
            if col in df.columns:
                df.loc[mask, col] = val
        print("Updated existing OSTP record.")
    else:
        # Append as new row
        df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
        print("Added new OSTP row.")
else:
    # No policy_initiative_id column → just append
    df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
    print("Appended OSTP row (no policy_initiative_id column found).")

# -------------------------
# 5. Save back to file
# -------------------------

df.to_csv(csv_path, index=False)
print("Saved updated file to:", csv_path)
display(df.tail())


Added new OSTP row.
Saved updated file to: /content/drive/MyDrive/DSCI-511/data/policies/oecd_policies_clean.csv


Unnamed: 0,policy_initiative_id,platform_url,english_name,original_name(s),country,start_date,description,theme_area(s),theme(s),background,...,public_access_url,is_a_structural_reform_?,is_evaluated_?,ai_principle(s),ai_policy_area(s),policy_instrument_id,policy_instrument_type_category,policy_instrument_type,policy_instrument_name,policy_instrument_mini-field(s)
1881,2021/data/policyInitiatives/5133,https://oecd.ai/en/dashboards/policy-initiativ...,ATHENA MAGAZINE,Unknown,Belgium,1984.0,Publication of a free monthly magazine “Athena...,National AI Policies,National AI policies,Unknown,...,https://recherche.wallonie.be/en/home/kiosque/...,False,False,Unknown,Unknown,http://aipo.oecd.org/2021/data/policyInitiativ...,AI enablers and other incentives,Public awareness campaigns and civic participa...,Unknown,Medium: Printed publications
1882,2021/data/policyInitiatives/5133,https://oecd.ai/en/dashboards/policy-initiativ...,ATHENA MAGAZINE,Unknown,Belgium,1984.0,Publication of a free monthly magazine “Athena...,National AI Policies,National AI policies,Unknown,...,https://recherche.wallonie.be/en/home/kiosque/...,False,False,Unknown,Unknown,http://aipo.oecd.org/2021/data/policyInitiativ...,AI enablers and other incentives,Public awareness campaigns and civic participa...,Unknown,Unknown
1883,2021/data/policyInitiatives/5295,https://oecd.ai/en/dashboards/policy-initiativ...,AI R&D FRAMEWORK AND ACTIVITIES OF THE ISRAELI...,מסגרת פעילות של רשות החדשנות בתחומי בינה מלאכותית,Israel,2019.0,The Planned AI R&D Framework & Activities in p...,National AI Policies,National AI policies,"- Around 1,200 active AI companies.<br/>- 90% ...",...,https://innovationisrael.org.il/en/,True,False,Investing in AI R&D,Innovation|Science and technology,http://aipo.oecd.org/2021/data/policyInitiativ...,Financial support,Grants for business R&D and innovation,Unknown,"Type of activity: Basic research, Applied rese..."
1884,unesco_rec_2021,https://unesdoc.unesco.org/ark:/48223/pf000038...,Draft text of the Recommendation on the Ethics...,Draft text of the Recommendation on the Ethics...,Global,2021.0,Corporate author: UNESCO; Document code: None;...,Other AI policy initiatives,Ethics|Governance,,...,https://unesdoc.unesco.org/ark:/48223/pf000038...,,,,,,,,,
1885,ostp_ai_bill_of_rights_2022,https://bidenwhitehouse.archives.gov/ostp/ai-b...,Blueprint for an AI Bill of Rights,Blueprint for an AI Bill of Rights,United States,2022.0,AI Bill of Rights; description:,Other AI policy initiatives,Ethics|Governance,,...,https://bidenwhitehouse.archives.gov/ostp/ai-b...,,,,,,,,,
