In [3]:
import os
import pandas as pd
import sys

from utils.crawl_anzsco import extract_anzsco_info, crawl_anzsco, flatten_tree

In [None]:
# Import the helper module directly from the utils directory.
# TODO: add an __init__.py so utils can be imported as a package in the future.
# # ------------------------------------------------------------
# # caution: path[0] is reserved for script path (or '' in REPL)
# sys.path.insert(1, '/Users/samuelshamiri/projects/rag_anzsco/utils')

# from crawl_anzsco import extract_anzsco_info, crawl_anzsco, flatten_tree

In [None]:
# ------------------------------------------------------------
# Crawl every ANZSCO major group (browse pages 1 through 8)
# ------------------------------------------------------------
base = "https://www.abs.gov.au/statistics/classifications/anzsco-australian-and-new-zealand-standard-classification-occupations/2022/browse-classification/"
major_groups = list(map(str, range(1, 9)))
start_urls = [f"{base}{group}" for group in major_groups]

# Walk each major-group page and pool the flattened rows from all of them.
all_rows = []
for url in start_urls:
    print(f"Crawling {url} ...")
    tree = crawl_anzsco(url, max_depth=6)  # raise or lower max_depth as required
    rows = flatten_tree(tree)
    all_rows += rows

# Build a single DataFrame from the pooled rows
df = pd.DataFrame(all_rows)

Crawling https://www.abs.gov.au/statistics/classifications/anzsco-australian-and-new-zealand-standard-classification-occupations/2022/browse-classification/1 ...
Crawling https://www.abs.gov.au/statistics/classifications/anzsco-australian-and-new-zealand-standard-classification-occupations/2022/browse-classification/2 ...
Crawling https://www.abs.gov.au/statistics/classifications/anzsco-australian-and-new-zealand-standard-classification-occupations/2022/browse-classification/3 ...
Crawling https://www.abs.gov.au/statistics/classifications/anzsco-australian-and-new-zealand-standard-classification-occupations/2022/browse-classification/4 ...
Crawling https://www.abs.gov.au/statistics/classifications/anzsco-australian-and-new-zealand-standard-classification-occupations/2022/browse-classification/5 ...
Crawling https://www.abs.gov.au/statistics/classifications/anzsco-australian-and-new-zealand-standard-classification-occupations/2022/browse-classification/6 ...
Crawling https://www.abs.gov

In [None]:
# Load the ANZSCO data
#df = pd.read_csv("../data/anzsco_full.csv")

In [5]:
# Clean the crawled ANZSCO frame: split code/name, tidy `path`, trim `skill_level`.
# The steps are written so that re-running this cell does not damage `df`.

# Split "<code><name>" strings into two columns.
# Regex: ^(\d+) = digits at the start, (.+) = the rest of the string.
code_and_name = df["occupation_name"].str.extract(r"^(\d+)(.+)")

# Apply the split only while at least one name still starts with a code. On a
# second run nothing matches, and assigning the all-NaN result would wipe both
# the codes and the names.
if code_and_name[0].notna().any():
    df = df.drop(columns=["occupation_name"]).assign(
        occupation_code=code_and_name[0],
        # Strip extra spaces in names
        occupation_name=code_and_name[1].str.strip(),
    )


def _dedupe_path_parts(parts):
    """Join hierarchy parts with " > ", trimmed, without blanks or repeats.

    Anything that is not list-like (e.g. NaN produced by a missing path) is
    returned unchanged instead of raising.
    """
    if not pd.api.types.is_list_like(parts):
        return parts
    trimmed = (part.strip() for part in parts)
    # dict.fromkeys keeps first-seen order while removing duplicates
    return " > ".join(dict.fromkeys(part for part in trimmed if part))


# Remove all digits, then clean each level of the hierarchy separately.
# Trimming every part (not just the whole string) matters: once the digits are
# gone, parts can be left with stray spaces, and " Managers" would not be
# recognised as a duplicate of "Managers".
df["path"] = (
    df["path"]
    .str.replace(r"\d+", "", regex=True)
    .str.split(" > ")
    .apply(_dedupe_path_parts)
)

# Remove the constant prefix
prefix = "Indicative Skill Level:In Australia and New Zealand:"
df["skill_level"] = df["skill_level"].str.replace(prefix, "", regex=False).str.strip()


### Add extra data

In [23]:
# Add the shortage status from "OSL 2024.csv" (presumably the Occupation
# Shortage List -- confirm against the data source).
#
# ANZSCO is read as text up front instead of being converted afterwards: the
# codes are compared with the string `occupation_code` extracted from the
# crawled names, and a numeric round-trip can turn a code into "111111.0" or a
# missing value into the literal string "nan".
ols_df = (
    pd.read_csv("../data/OSL 2024.csv", dtype={"ANZSCO": str})
    # rename the key column for merging
    .rename(columns={"ANZSCO": "occupation_code"})
    # select relevant columns
    .loc[:, ["occupation_code", "National Shortage Rating"]]
    # tolerate stray whitespace around the codes
    .assign(occupation_code=lambda frame: frame["occupation_code"].str.strip())
    # rows without a code cannot be joined; dropping them also stops a missing
    # key here from pairing with a missing key on the other side of the merge
    .dropna(subset=["occupation_code"])
)


In [25]:
# Left-join the shortage ratings onto the ANZSCO rows, keyed on occupation_code
anzsco_df = df.merge(ols_df, how="left", on="occupation_code")


In [28]:
#anzsco_df.head()


### Save

In [29]:
# Write the merged table to CSV (UTF-8, without the index column)
anzsco_df.to_csv("../data/anzsco_full.csv", encoding="utf-8", index=False)