In [39]:
import pandas as pd
from typing import List

import os
import lancedb
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector

import ollama



In [36]:
# Load the ANZSCO data
df = pd.read_csv("../data/anzsco_full.csv")

In [37]:
# Create text representation for each occupation
df["doc_text"] = (
    df["occupation_name"].fillna("") + "\n"
    + df["skill_level"].fillna("") + "\n"
    + df["tasks"].fillna("") + "\n"
    + "Path: " + df["path"].fillna("")
)

In [38]:
df.head(2)

Unnamed: 0,occupation_name,skill_level,tasks,path,doc_text
0,1Managers,Indicative Skill Level:Most occupations in thi...,setting the overall direction and objectives o...,1Managers,1Managers\nIndicative Skill Level:Most occupat...
1,"11Chief Executives, General Managers and Legis...",Indicative Skill Level:In Australia and New Ze...,determining and setting the overall direction ...,"1Managers > 11Chief Executives, General Manage...","11Chief Executives, General Managers and Legis..."


In [6]:
# --------------------------------------------------------------
# Create a LanceDB database and table
# --------------------------------------------------------------

# Create a LanceDB database
db = lancedb.connect("../db/lancedb")


# Get the ollama embedding function
func = get_registry().get("ollama").create(name="nomic-embed-text")

In [7]:
# Define a simplified metadata schema
class ChunkMetadata(LanceModel):
    """
    You must order the fields in alphabetical order.
    This is a requirement of the Pydantic implementation.
    """

    occupation: str
    level: str
    tasks: str


# Define the main Schema
class Chunks(LanceModel):
    text: str = func.SourceField()
    vector: Vector(func.ndims()) = func.VectorField()  # type: ignore
    metadata: ChunkMetadata

In [8]:
table = db.create_table("docling", schema=Chunks, mode="overwrite")

In [50]:
# # Define the main Schema
# class Chunks(LanceModel):
#     doc_text: str = func.SourceField()
#     vector: Vector(func.ndims()) = func.VectorField()  # type: ignore



# table = db.create_table("docling", schema=Chunks, mode="overwrite")

In [51]:
# # If you want to process the data while converting
# def create_custom_dict(row):
#     return {
#         'occupation': row['occupation_name'],
#         'skill_info': {
#             'level': row['skill_level'],
#             'tasks': row['tasks'],
#             'description': row['doc_text']
#         }
#     }

# # Apply to each row
# custom_dicts = [create_custom_dict(row) for _, row in df.iterrows()]

# print(custom_dicts)

In [52]:
# # If you want to process the data while converting
# def create_custom_dict(row):
#     return {
#         "text": row['doc_text'],
#         'metadata': {
#             'level': row['skill_level'].replace('Indicative Skill Level:', '').strip(),
#             'tasks': row['tasks'],
#             'occupation': row['occupation_name']
#         }
#     }

# # Apply to each row
# custom_dicts = [create_custom_dict(row) for _, row in df.iterrows()]

#print(custom_dicts)

In [9]:
def create_custom_dict(row):
    skill_level_str = str(row['skill_level'])  # Convert to string
    return {
        "text": row['doc_text'],
        'metadata': {
            'level': skill_level_str.replace('Indicative Skill Level:', '').strip(),
            'tasks': row['tasks'],
            'occupation': row['occupation_name']
        }
    }

# Apply to each row
custom_dicts = [create_custom_dict(row) for _, row in df.iterrows()]


In [11]:
custom_dicts[0:2]

[{'text': '1Managers\nIndicative Skill Level:Most occupations in this major group have a level of skill commensurate with the qualifications and experience outlined below.In Australia:Bachelor degree or higher qualification. At least five years of relevant experience may substitute for the formal qualification (ANZSCO Skill Level 1); orAQF Associate Degree, Advanced Diploma or Diploma, or at least three years of relevant experience (ANZSCO Skill Level 2)In New Zealand:Bachelor degree or higher qualification. At least five years of relevant experience may substitute for the formal qualification (ANZSCO Skill Level 1); orNZQF Diploma, or at least three years of relevant experience (ANZSCO Skill Level 2)In some instances relevant experience and/or on-the-job training may be required in addition to the formal qualification.Tasks Include:setting the overall direction and objectives of organisations and departments within organisationsformulating, administering and reviewing policy and legis

In [40]:

# Function to get embeddings using Ollama
def get_ollama_embedding(text, model_name="nomic-embed-text"):
    try:
        # Call the Ollama embeddings API
        response = ollama.embeddings(model=model_name, prompt=text)
        return response['embedding']
    except Exception as e:
        print(f"Error getting embedding for text: '{text}' - {e}")
        return None

In [41]:
df['embedding_vector'] = df['doc_text'].apply(lambda x: get_ollama_embedding(x))

In [43]:
table = db.create_table("docling", data=df, mode="overwrite")

In [42]:
# --------------------------------------------------------------
# Add the chunks to the table (automatically embeds the text)
# --------------------------------------------------------------

#table.add(custom_dicts)

In [44]:
table.search("what tasks for managers").limit(4).to_pandas()

RuntimeError: lance error: Invalid user input: Cannot perform full text search unless an INVERTED index has been created on at least one column, /Users/runner/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/lance-index-0.35.0/src/scalar/inverted/query.rs:685:25