In [1]:
import faiss, json, os, numpy as np, pandas as pd
from utils.rag_utils import get_ollama_embedding

In [2]:
# Read the ANZSCO occupation table from its CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer="../data/anzsco_full.csv")

In [3]:
# Give the snake_case source columns human-readable Title Case headers.
# Columns that are not listed in the mapping keep their current names.
df.rename(
    {
        "occupation_name": "Occupation Name",
        "occupation_code": "Occupation Code",
        "skill_level": "Skill Level",
        "tasks": "Tasks",
        "path": "Path",
    },
    axis="columns",
    inplace=True,
)

In [4]:
# Create the text representation of each occupation that will be embedded:
#
#   <Occupation Name>
#   <Skill Level>
#   <Tasks>
#   Path: <Path>
#   In shortage: <National Shortage Rating>
#
# Missing values become empty strings. Each column is then cast to str because
# fillna("") alone leaves numeric values as numbers, and adding a number to a
# string Series raises TypeError. For columns that already hold strings the
# cast is a no-op, so the generated text is unchanged.
#
# NOTE(review): "National Shortage Rating" is read under that exact header and
# is not one of the columns renamed earlier in this notebook, so the CSV
# presumably already provides it with this name -- confirm.
df["doc_text"] = (
    df["Occupation Name"].fillna("").astype(str) + "\n"
    + df["Skill Level"].fillna("").astype(str) + "\n"
    + df["Tasks"].fillna("").astype(str) + "\n"
    + "Path: " + df["Path"].fillna("").astype(str) + "\n"
    + "In shortage: " + df["National Shortage Rating"].fillna("").astype(str)
)


In [5]:
# Optional spot-check of one generated document (uncomment to display it):
# df["doc_text"].iloc[10]

In [6]:
# Build embeddings with Ollama: one get_ollama_embedding call per document,
# in DataFrame row order, packed into a single float32 array.
doc_texts = df["doc_text"].tolist()
embeddings = np.asarray(
    [get_ollama_embedding(doc) for doc in doc_texts],
    dtype="float32",
)

# Fail fast on an unusable result. An empty document list or scalar/empty
# embedding vectors would otherwise be carried forward as a non-2-D or
# zero-width array and only surface later as an opaque error on `shape[1]`.
if embeddings.ndim != 2 or embeddings.shape[1] == 0:
    raise ValueError(
        "Expected a 2-D (documents x dimensions) embedding matrix, got shape "
        f"{embeddings.shape} for {len(doc_texts)} document(s)"
    )


In [7]:
# Create the output folder if it is missing; an existing folder is not an error.
os.makedirs(name="../db", exist_ok=True)

# Write the embedding array to disk in NumPy's .npy format.
np.save(file="../db/anzsco_embeddings.npy", arr=embeddings)

In [8]:
# Build FAISS index
# The index dimensionality is taken from the second axis of `embeddings`
# (the array built from one embedding per document).
dim = embeddings.shape[1]
# IndexFlatL2 is FAISS's flat index that compares vectors by L2 distance.
index = faiss.IndexFlatL2(dim)
# NOTE(review): vectors are added in DataFrame row order; consumers presumably
# rely on index position i matching row i of the saved metadata -- confirm.
index.add(embeddings)

# Save FAISS index
faiss.write_index(index, "../db/anzsco.index")

In [9]:
# Save metadata: one JSON object per DataFrame row, in row order.
#
# Missing cells are NaN in the DataFrame, and json.dump would write them as the
# bare token NaN. That token is not valid JSON, so strict parsers reject the
# whole file. Map missing values to None so they are written as null instead.
# NOTE(review): readers of this file will now see null/None rather than NaN
# for missing fields -- confirm no consumer depends on the old NaN value.
meta_records = [
    {column: (None if pd.isna(value) else value) for column, value in record.items()}
    for record in df.to_dict(orient="records")
]

with open("../db/anzsco_meta.json", "w", encoding="utf-8") as f:
    # ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
    json.dump(meta_records, f, ensure_ascii=False, indent=2)
