In [63]:
import requests
import pandas as pd
import json
from tqdm import tqdm
import numpy as np
from pathlib import Path
import h5py

from sentence_transformers import SentenceTransformer

# Countries for which search indexes are built further down.
COUNTRIES = ["uk"]

# Directory the metadata file is read from.
folder = Path(".")

# Sentence-embedding model used to encode free-text search queries.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def get_embedding(text):
    """Encode *text* with the module-level sentence-transformers model.

    Returns whatever ``model.encode`` produces for the given input.
    """
    vector = model.encode(text)
    return vector

# Load the metadata table; the code below reads its "name", "country_id"
# and "type" columns.
metadata_df = pd.read_csv(folder / "metadata.csv.gz", compression="gzip")

# NOTE(review): this overwrites the file that was just read, keeping only
# three columns -- confirm that trimming the file in place is intended.
metadata_df[["name", "country_id", "type"]].to_csv(
    folder / "metadata.csv.gz", index=False, compression="gzip"
)

# Load the precomputed embedding matrix; assumes row i of the matrix belongs
# to row i of the metadata table -- TODO confirm against the producer.
with h5py.File("embeddings.h5", "r") as f:
    embeddings = f["embeddings"][:]

metadata_df["embedding"] = embeddings.tolist()

embedding_dimensions = embeddings.shape[1]

# Use FAISS to create an index

import faiss

metadata_types = ("parameter", "variable")

# One index per (country, metadata type) pair, e.g. "uk_parameters".
# Derived from COUNTRIES so that adding a country cannot cause a KeyError
# in the loop below.
index_names = [
    f"{country_id}_{metadata_type}s"
    for country_id in COUNTRIES
    for metadata_type in metadata_types
]

indexes = {
    name: faiss.IndexFlatL2(embedding_dimensions)
    for name in index_names
}

# names[index_name][i] is the metadata name of the i-th vector in that index.
names = {
    name: []
    for name in index_names
}

for metadata_type in metadata_types:
    for country_id in COUNTRIES:
        name = f"{country_id}_{metadata_type}s"
        index = indexes[name]
        # Combine the boolean masks with the logical operator "&".
        mask = (metadata_df["type"] == metadata_type) & (
            metadata_df["country_id"] == country_id
        )
        metadata = metadata_df[mask]
        # Slice the matrix directly instead of round-tripping through Python
        # lists, hand FAISS contiguous float32 data, and avoid rebinding the
        # full `embeddings` matrix inside the loop.
        subset_embeddings = np.ascontiguousarray(
            embeddings[mask.to_numpy()], dtype=np.float32
        )
        index.add(subset_embeddings)
        names[name] = metadata["name"].tolist()

def search(query, index_name, k=10):
    """Return the names of the metadata entries closest to *query*.

    Args:
        query: Free-text search string; it is embedded with ``get_embedding``.
        index_name: Key into ``indexes`` / ``names`` (e.g. ``"uk_parameters"``).
        k: Maximum number of results to return.

    Returns:
        A list of at most ``k`` names, in the order FAISS returns them.

    Raises:
        KeyError: If ``index_name`` is not a known index.
    """
    embedding = get_embedding(query)
    index = indexes[index_name]
    # FAISS expects a 2-D float32 array with one row per query vector.
    query_vectors = np.asarray([embedding], dtype=np.float32)
    _distances, indices = index.search(query_vectors, k)
    candidates = names[index_name]
    # FAISS pads the result with label -1 when the index holds fewer than k
    # vectors; skip those so a negative index does not return the last name.
    return [candidates[i] for i in indices[0] if i >= 0]

In [64]:
search("basic rate", "uk_parameters")

['gov.hmrc.income_tax.rates.uk[0].rate',
 'gov.hmrc.income_tax.rates.dividends[0].rate',
 'gov.hmrc.stamp_duty.residential.purchase.main.subsequent[0].rate',
 'gov.hmrc.stamp_duty.residential.purchase.main.first.rate[0].rate',
 'gov.wra.land_transaction_tax.residential.primary[0].rate',
 'gov.revenue_scotland.lbtt.non_residential[0].rate',
 'gov.hmrc.stamp_duty.non_residential.purchase[0].rate',
 'gov.wra.land_transaction_tax.non_residential[0].rate',
 'gov.revenue_scotland.lbtt.residential.first_time_buyer_rate[0].rate',
 'gov.hmrc.stamp_duty.residential.purchase.main.first.rate[1].rate']