In [None]:
import os
import numpy as np 
import pandas as pd
from typing import List

# Docling imports (use the documented converter; chunker import path may vary)
try:
    # recommended documented usage
    from docling.document_converter import DocumentConverter
    # HybridChunker path - docling may expose under docling.chunking
    from docling.chunking import HybridChunker
except Exception as e:
    raise RuntimeError(
        "Failed to import docling components. Make sure docling is installed and your script is not named 'docling.py'."
    ) from e
    
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector

from transformers import AutoTokenizer
from ollama import chat as ollama_chat



  from .autonotebook import tqdm as notebook_tqdm


False

### Read url's df

In [2]:
url_df = pd.read_csv('../data/anzsco_22_links.csv')
url_df.head()

Unnamed: 0,url
0,https://www.abs.gov.au/statistics/classificati...
1,https://www.abs.gov.au/statistics/classificati...
2,https://www.abs.gov.au/statistics/classificati...
3,https://www.abs.gov.au/statistics/classificati...
4,https://www.abs.gov.au/statistics/classificati...


### Chunking with docling
- first method: directly on the url
- second method: download the required info then use docling

#### method 1

In [None]:
# Get first url for testing
url = url_df['url'][0]
url

'https://www.abs.gov.au/statistics/classifications/anzsco-australian-and-new-zealand-standard-classification-occupations/2022/browse-classification/5/59/599/5994'

In [4]:
converter = DocumentConverter()
# converter.convert returns a ConversionResult; .document is the docling document
conv = converter.convert(url)
doc = conv.document

2025-09-21 11:25:23,451 - INFO - detected formats: [<InputFormat.HTML: 'html'>]
2025-09-21 11:25:23,466 - INFO - Going to convert document batch...
2025-09-21 11:25:23,468 - INFO - Initializing pipeline for SimplePipeline with options hash 995a146ad601044538e6a923bea22f4e
2025-09-21 11:25:23,473 - INFO - Loading plugin 'docling_defaults'
2025-09-21 11:25:23,475 - INFO - Registered picture descriptions: ['vlm', 'api']
2025-09-21 11:25:23,475 - INFO - Processing document 5994
2025-09-21 11:25:23,498 - INFO - Finished converting document 5994 in 0.14 sec.


In [6]:
#print(doc.export_to_markdown())

#### method 2

In [None]:
iimport requests
from bs4 import BeautifulSoup

def extract_anzsco_info(url: str) -> dict:
    """
    Extracts Occupation name, Skill level, Tasks, and Subcategories from an ANZSCO occupation page.
    Works for both unit groups and higher-level groups.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")

    # Occupation name
    occupation_name = soup.find("h1")
    if not occupation_name:
        occupation_name = soup.find("title")
    occupation_name = occupation_name.get_text(strip=True) if occupation_name else None

    # Skill level
    skill_level = None
    for p in soup.find_all(["p", "li", "div"]):
        text = p.get_text(strip=True)
        if text.startswith("Skill Level") or text.startswith("Indicative Skill Level"):
            skill_level = text
            break

    # Tasks
    tasks = []
    task_header = soup.find(lambda tag: tag.name in ["p","strong","h3","h4"] 
                            and "Tasks Include" in tag.get_text())
    if task_header:
        ul = task_header.find_next("ul")
        if ul:
            tasks = [li.get_text(strip=True) for li in ul.find_all("li")]
        else:
            # Sometimes tasks are inline paragraphs
            sibs = task_header.find_all_next("p", limit=6)
            for s in sibs:
                if s.get_text(strip=True):
                    tasks.append(s.get_text(strip=True))

    # Subcategories
    subcategories = []
    subcat_header = soup.find(lambda tag: tag.name in ["p","strong","h3","h4"] 
                              and "Subcategories" in tag.get_text())
    if subcat_header:
        # Collect links after the header
        for a in subcat_header.find_all_next("a", href=True):
            text = a.get_text(strip=True)
            if text and text[0].isdigit():  # occupation codes start with digits
                subcategories.append(text)
            else:
                break

    return {
        "occupation_name": occupation_name,
        "skill_level": skill_level,
        "tasks": tasks,
        "subcategories": subcategories
    }

In [None]:
# Example usage
url = "https://www.abs.gov.au/statistics/classifications/anzsco-australian-and-new-zealand-standard-classification-occupations/2022/browse-classification/5/51/511/5111"
info = extract_anzsco_info(url)

print("Occupation Name:", info["occupation_name"])
print("Skill Level:", info["skill_level"])
print("Tasks:")
for t in info["tasks"]:
    print("-", t)

print("\nSubcategories:")
for s in info["subcategories"]:
    print("-", s)


Occupation Name: 5111Contract, Program and Project Administrators
Skill Level: Indicative Skill Level:Most occupations in this unit group have a level of skill commensurate with the qualifications and experience outlined below.In Australia:AQF Associate Degree, Advanced Diploma or Diploma (ANZSCO Skill Level 2)In New Zealand:NZQF Diploma (ANZSCO Skill Level 2)At least three years of relevant experience may substitute for the formal qualifications listed above. In some instances relevant experience and/or on-the-job training may be required in addition to the formal qualification.Tasks Include:developing, reviewing and negotiating variations to contracts, programs, projects and servicesresponding to inquiries and resolving problems concerning contracts, programs, projects, services provided, and persons affectedmanaging paperwork associated with contracts, programs, projects and services providedworking with Project Managers, Architects, Engineering Professionals, owners and others to e

In [29]:
info

{'occupation_name': '5111Contract, Program and Project Administrators',
 'skill_level': "Indicative Skill Level:Most occupations in this unit group have a level of skill commensurate with the qualifications and experience outlined below.In Australia:AQF Associate Degree, Advanced Diploma or Diploma (ANZSCO Skill Level 2)In New Zealand:NZQF Diploma (ANZSCO Skill Level 2)At least three years of relevant experience may substitute for the formal qualifications listed above. In some instances relevant experience and/or on-the-job training may be required in addition to the formal qualification.Tasks Include:developing, reviewing and negotiating variations to contracts, programs, projects and servicesresponding to inquiries and resolving problems concerning contracts, programs, projects, services provided, and persons affectedmanaging paperwork associated with contracts, programs, projects and services providedworking with Project Managers, Architects, Engineering Professionals, owners and o

In [37]:
# -------------------------
# Loop through multiple URLs
# -------------------------
urls = [
    "https://www.abs.gov.au/statistics/classifications/anzsco-australian-and-new-zealand-standard-classification-occupations/2022/browse-classification/5/51/511/5111",
    "https://www.abs.gov.au/statistics/classifications/anzsco-australian-and-new-zealand-standard-classification-occupations/2022/browse-classification/7"
]

results = [extract_anzsco_info(u) for u in urls]

# Example: print results
for r in results:
    print("\n---")
    print("Occupation Name:", r["occupation_name"])
    print("Skill Level:", r["skill_level"])
    print("Tasks:", r["tasks"][:3], "...")  # show first 3 tasks
    print("Subcategories:", r["subcategories"])


---
Occupation Name: 5111Contract, Program and Project Administrators
Skill Level: Indicative Skill Level:Most occupations in this unit group have a level of skill commensurate with the qualifications and experience outlined below.In Australia:AQF Associate Degree, Advanced Diploma or Diploma (ANZSCO Skill Level 2)In New Zealand:NZQF Diploma (ANZSCO Skill Level 2)At least three years of relevant experience may substitute for the formal qualifications listed above. In some instances relevant experience and/or on-the-job training may be required in addition to the formal qualification.Tasks Include:developing, reviewing and negotiating variations to contracts, programs, projects and servicesresponding to inquiries and resolving problems concerning contracts, programs, projects, services provided, and persons affectedmanaging paperwork associated with contracts, programs, projects and services providedworking with Project Managers, Architects, Engineering Professionals, owners and others

In [39]:
import requests
from bs4 import BeautifulSoup

def extract_anzsco_info(url: str) -> dict:
    """
    Extracts Occupation name, Skill level, Tasks, and Subcategories from an ANZSCO occupation page.
    Works for both unit groups and higher-level groups.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")

    # Occupation name
    occupation_name = soup.find("h1")
    if not occupation_name:
        occupation_name = soup.find("title")
    occupation_name = occupation_name.get_text(strip=True) if occupation_name else None

    # Skill level
    skill_level = None
    for p in soup.find_all(["p", "li", "div"]):
        text = p.get_text(strip=True)
        if text.startswith("Skill Level") or text.startswith("Indicative Skill Level"):
            skill_level = text
            break

    # Tasks
    tasks = []
    task_header = soup.find(lambda tag: tag.name in ["p","strong","h3","h4"] 
                            and "Tasks Include" in tag.get_text())
    if task_header:
        ul = task_header.find_next("ul")
        if ul:
            tasks = [li.get_text(strip=True) for li in ul.find_all("li")]
        else:
            sibs = task_header.find_all_next("p", limit=6)
            for s in sibs:
                if s.get_text(strip=True):
                    tasks.append(s.get_text(strip=True))

    # Subcategories (links to child pages)
    subcategories = []
    subcat_header = soup.find(lambda tag: tag.name in ["p","strong","h3","h4"] 
                              and "Subcategories" in tag.get_text())
    if subcat_header:
        for a in subcat_header.find_all_next("a", href=True):
            text = a.get_text(strip=True)
            if text and text[0].isdigit():  # occupation codes start with digits
                subcategories.append({
                    "name": text,
                    "url": "https://www.abs.gov.au" + a["href"]
                })
            else:
                break

    return {
        "occupation_name": occupation_name,
        "skill_level": skill_level,
        "tasks": tasks,
        "subcategories": subcategories
    }

def crawl_anzsco(url: str, depth: int = 0, max_depth: int = 10) -> dict:
    """
    Recursively crawls ANZSCO pages, building a tree of occupations.
    
    Parameters:
        url (str): Starting ABS ANZSCO URL.
        depth (int): Current recursion depth.
        max_depth (int): Safety limit to prevent infinite recursion.
    
    Returns:
        dict: Nested tree of occupation info.
    """
    node = extract_anzsco_info(url)

    if depth < max_depth and node["subcategories"]:
        children = []
        for sub in node["subcategories"]:
            try:
                child = crawl_anzsco(sub["url"], depth + 1, max_depth)
                children.append(child)
            except Exception as e:
                print(f"Failed to crawl {sub['url']}: {e}")
        node["children"] = children
    else:
        node["children"] = []

    return node


In [None]:
# -------------------------
# Example usage save to JSON
# -------------------------
start_urls = [
    "https://www.abs.gov.au/statistics/classifications/anzsco-australian-and-new-zealand-standard-classification-occupations/2022/browse-classification/7",
    "https://www.abs.gov.au/statistics/classifications/anzsco-australian-and-new-zealand-standard-classification-occupations/2022/browse-classification/5/51/511/5111"
]

results = [crawl_anzsco(u, max_depth=5) for u in start_urls]

# Print a summary
import json
print(json.dumps(results, indent=2))


In [None]:
# -------------------------
# dump results as JSON
# -------------------------

# Save to JSON file
with open("anzsco_tree.json", "w", encoding="utf-8") as f:
    json.dump(tree, f, indent=2, ensure_ascii=False)

print("Saved ANZSCO tree to anzsco_tree.json")



In [40]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json

# -------------------------
# Core extraction function
# -------------------------
def extract_anzsco_info(url: str) -> dict:
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")

    # Occupation name
    occupation_name = soup.find("h1")
    if not occupation_name:
        occupation_name = soup.find("title")
    occupation_name = occupation_name.get_text(strip=True) if occupation_name else None

    # Skill level
    skill_level = None
    for p in soup.find_all(["p", "li", "div"]):
        text = p.get_text(strip=True)
        if text.startswith("Skill Level") or text.startswith("Indicative Skill Level"):
            skill_level = text
            break

    # Tasks
    tasks = []
    task_header = soup.find(lambda tag: tag.name in ["p","strong","h3","h4"] 
                            and "Tasks Include" in tag.get_text())
    if task_header:
        ul = task_header.find_next("ul")
        if ul:
            tasks = [li.get_text(strip=True) for li in ul.find_all("li")]
        else:
            sibs = task_header.find_all_next("p", limit=6)
            for s in sibs:
                if s.get_text(strip=True):
                    tasks.append(s.get_text(strip=True))

    # Subcategories
    subcategories = []
    subcat_header = soup.find(lambda tag: tag.name in ["p","strong","h3","h4"] 
                              and "Subcategories" in tag.get_text())
    if subcat_header:
        for a in subcat_header.find_all_next("a", href=True):
            text = a.get_text(strip=True)
            if text and text[0].isdigit():
                subcategories.append({
                    "name": text,
                    "url": "https://www.abs.gov.au" + a["href"]
                })
            else:
                break

    return {
        "occupation_name": occupation_name,
        "skill_level": skill_level,
        "tasks": tasks,
        "subcategories": subcategories
    }

# -------------------------
# Recursive crawler
# -------------------------
def crawl_anzsco(url: str, depth: int = 0, max_depth: int = 10) -> dict:
    node = extract_anzsco_info(url)

    if depth < max_depth and node["subcategories"]:
        children = []
        for sub in node["subcategories"]:
            try:
                child = crawl_anzsco(sub["url"], depth + 1, max_depth)
                children.append(child)
            except Exception as e:
                print(f"Failed to crawl {sub['url']}: {e}")
        node["children"] = children
    else:
        node["children"] = []

    return node

# -------------------------
# Flatten tree into rows
# -------------------------
def flatten_tree(node, parent_path=None):
    if parent_path is None:
        parent_path = []
    
    current_path = parent_path + [node.get("occupation_name")]
    
    row = {
        "occupation_name": node.get("occupation_name"),
        "skill_level": node.get("skill_level"),
        "tasks": "; ".join(node.get("tasks", [])),
        "path": " > ".join([p for p in current_path if p])
    }
    
    rows = [row]
    
    for child in node.get("children", []):
        rows.extend(flatten_tree(child, current_path))
    
    return rows

# -------------------------
# Crawl ALL major groups (1–8)
# -------------------------
base = "https://www.abs.gov.au/statistics/classifications/anzsco-australian-and-new-zealand-standard-classification-occupations/2022/browse-classification/"
major_groups = [str(i) for i in range(1, 9)]
start_urls = [base + g for g in major_groups]

all_rows = []
for url in start_urls:
    print(f"Crawling {url} ...")
    tree = crawl_anzsco(url, max_depth=6)  # adjust depth if needed
    rows = flatten_tree(tree)
    all_rows.extend(rows)

# Convert to DataFrame
df = pd.DataFrame(all_rows)

# Save to CSV (optional)
df.to_csv("anzsco_full_flat.csv", index=False, encoding="utf-8")

print("✅ Completed crawl. DataFrame shape:", df.shape)
print(df.head(10))


Crawling https://www.abs.gov.au/statistics/classifications/anzsco-australian-and-new-zealand-standard-classification-occupations/2022/browse-classification/1 ...
Crawling https://www.abs.gov.au/statistics/classifications/anzsco-australian-and-new-zealand-standard-classification-occupations/2022/browse-classification/2 ...
Crawling https://www.abs.gov.au/statistics/classifications/anzsco-australian-and-new-zealand-standard-classification-occupations/2022/browse-classification/3 ...
Crawling https://www.abs.gov.au/statistics/classifications/anzsco-australian-and-new-zealand-standard-classification-occupations/2022/browse-classification/4 ...
Crawling https://www.abs.gov.au/statistics/classifications/anzsco-australian-and-new-zealand-standard-classification-occupations/2022/browse-classification/5 ...
Crawling https://www.abs.gov.au/statistics/classifications/anzsco-australian-and-new-zealand-standard-classification-occupations/2022/browse-classification/6 ...
Crawling https://www.abs.gov

In [41]:
df.head()

Unnamed: 0,occupation_name,skill_level,tasks,path
0,1Managers,Indicative Skill Level:Most occupations in thi...,setting the overall direction and objectives o...,1Managers
1,"11Chief Executives, General Managers and Legis...",Indicative Skill Level:In Australia and New Ze...,determining and setting the overall direction ...,"1Managers > 11Chief Executives, General Manage..."
2,"111Chief Executives, General Managers and Legi...",Indicative Skill Level:In Australia and New Ze...,determining and setting the overall direction ...,"1Managers > 11Chief Executives, General Manage..."
3,1111Chief Executives and Managing Directors,Indicative Skill Level:In Australia and New Ze...,"determining objectives, strategies, policies a...","1Managers > 11Chief Executives, General Manage..."
4,1112General Managers,Indicative Skill Level:In Australia and New Ze...,"planning policy, and setting standards and obj...","1Managers > 11Chief Executives, General Manage..."
