In [1]:
import faiss, json, numpy as np, pandas as pd
from utils.rag_utils import get_ollama_embedding

In [2]:
# Read the ANZSCO occupation table into a DataFrame.
# The path is relative, so it resolves against the kernel's working directory.
anzsco_csv = "../data/anzsco_full.csv"
df = pd.read_csv(anzsco_csv)

In [4]:
# Create text representation for each occupation
# Layout of every document (one per row):
#   <occupation_name>\n<skill_level>\n<tasks>\nPath: <path>
def _field_as_text(column):
    """Return df[column] as strings, with missing values rendered as ""."""
    # str() on every present value keeps the "+" concatenation below from
    # raising TypeError when pandas parsed a column as numeric (skill_level is
    # the likely candidate — TODO confirm against the CSV). Columns that are
    # already text come out unchanged. Note a float such as 1.0 renders "1.0".
    return df[column].map(lambda value: "" if pd.isna(value) else str(value))


df["doc_text"] = (
    _field_as_text("occupation_name") + "\n"
    + _field_as_text("skill_level") + "\n"
    + _field_as_text("tasks") + "\n"
    + "Path: " + _field_as_text("path")
)

In [6]:
# Embed every occupation document with Ollama (one helper call per document,
# in row order), then convert the collected vectors to a float32 NumPy array.
doc_vectors = list(map(get_ollama_embedding, df["doc_text"].tolist()))
embeddings = np.array(doc_vectors).astype("float32")


In [None]:
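# NOTE(review): the three lines below appear to be leftover scratch notes
# (paths), not executable code; kept as comments so the cell no longer fails.
#   src/01_data_processing.ipynb
#   /Users/samuelshamiri/projects/rag_anzsco/db
#   db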


In [14]:
import os

# Create the output folder if it is missing; exist_ok=True makes re-running
# this cell harmless when the folder is already there.
db_dir = "../db"
os.makedirs(db_dir, exist_ok=True)

# Persist the embedding array in NumPy's .npy format
np.save("../db/anzsco_embeddings.npy", embeddings)

In [15]:
# Build a flat L2 FAISS index whose dimensionality is the embedding width
# (second axis of the embeddings array), then load all vectors into it.
dim = np.shape(embeddings)[1]
index = faiss.IndexFlatL2(dim)
index.add(embeddings)

# Write the populated index to disk next to the saved embeddings
faiss.write_index(index, "../db/anzsco.index")

In [16]:
# Save metadata
# One JSON object per DataFrame row, written in row order.
# pandas represents missing cells as NaN, and json.dump would emit those as the
# bare token NaN, which is not valid JSON and is rejected by strict parsers.
# Map missing scalars to None so they are serialized as null instead.
meta_records = [
    {
        key: None if pd.api.types.is_scalar(value) and pd.isna(value) else value
        for key, value in row.items()
    }
    for row in df.to_dict(orient="records")
]
with open("../db/anzsco_meta.json", "w", encoding="utf-8") as f:
    json.dump(meta_records, f, ensure_ascii=False, indent=2)
