In [25]:
uk_parameter_embeddings = pd.read_csv(
    "parameter_embeddings.csv.gz", compression="gzip"
)
us_parameter_embeddings = pd.read_csv(
    "us_parameter_embeddings.csv.gz", compression="gzip"
)

In [26]:
uk_parameter_embeddings["country_id"] = "uk"
us_parameter_embeddings["country_id"] = "us"
parameter_embeddings = pd.concat(
    [uk_parameter_embeddings, us_parameter_embeddings]
)

In [31]:
parameter_embeddings

Unnamed: 0,name,json,parameter_embedding,country_id
0,gov.abolitions.AFCS,"{""description"": ""Set all values of Armed Force...","[-0.015623585321009159, 0.01033217553049326, -...",uk
1,gov.abolitions.BRMA_LHA_rate,"{""description"": ""Set all values of LHA rate to...","[-0.01876787841320038, 0.020921112969517708, -...",uk
2,gov.abolitions.BSP,"{""description"": ""Set all values of Bereavement...","[-0.018847180530428886, 0.019526733085513115, ...",uk
3,gov.abolitions.CB_HITC,"{""description"": ""Set all values of Child Benef...","[-0.024208562448620796, -0.0014698549639433622...",uk
4,gov.abolitions.CTC_child_element,"{""description"": ""Set all values of Child Tax C...","[-0.017787203192710876, 0.01393031980842352, -...",uk
...,...,...,...,...
12536,gov.usda.wic.value.POSTPARTUM,"{""description"": null, ""economy"": true, ""househ...","[0.004392489790916443, 0.0014703974593430758, ...",us
12537,gov.usda.wic.value.PREGNANT,"{""description"": null, ""economy"": true, ""househ...","[0.001542273093946278, -0.002406371058896184, ...",us
12538,gov.usda.wic.value.V,"{""description"": null, ""economy"": true, ""househ...","[-0.0031873411498963833, -0.012630472891032696...",us
12539,gov.usda.wic.value.VI,"{""description"": null, ""economy"": true, ""househ...","[-0.003992681857198477, -0.01098596677184105, ...",us


In [30]:
parameter_embeddings.to_csv(
    "param_embeddings.csv.gz", compression="gzip", index=False
)

In [22]:
import requests

metadata = requests.get(
    "https://api.policyengine.org/uk/economy/8985/over/1?time_period=2023&region=uk"
).json()

In [24]:
metadata["result"].keys()

dict_keys(['budget', 'decile', 'inequality', 'intra_decile', 'intra_wealth_decile', 'poverty', 'poverty_by_gender', 'wealth_decile'])

In [3]:
parameters = list(metadata["result"]["parameters"].values())
parameters = [p for p in parameters if p["type"] == "parameter"]
len(parameters)

12541

In [4]:
from openai.embeddings_utils import get_embedding, cosine_similarity
import pandas as pd
import json
from tqdm import tqdm

parameters = list(metadata["result"]["parameters"].values())
parameters = [p for p in parameters if p["type"] == "parameter"]

df = pd.DataFrame(
    {
        "name": [p["parameter"] for p in parameters],
        "json": [json.dumps(p) for p in parameters],
    }
)

_embed = []

for i, row in tqdm(df.iterrows()):
    _embed.append(get_embedding(row["json"], engine="text-embedding-ada-002"))

df["parameter_embedding"] = _embed

12541it [48:41,  4.29it/s]


In [5]:
df.to_csv("us_parameter_embeddings.csv.gz", compression="gzip")

In [6]:
df

Unnamed: 0,name,json,parameter_embedding
0,gov.abolitions.above_the_line_deductions,"{""description"": ""Set all values of Above-the-l...","[-0.02203972451388836, 0.008916514925658703, -..."
1,gov.abolitions.acp,"{""description"": ""Set all values of Affordable ...","[-0.019803417846560478, 0.010326764546334743, ..."
2,gov.abolitions.additional_medicare_tax,"{""description"": ""Set all values of Additional ...","[-0.025863468647003174, 0.0024864787701517344,..."
3,gov.abolitions.additional_standard_deduction,"{""description"": ""Set all values of Additional ...","[-0.02105577662587166, 0.01149937603622675, 0...."
4,gov.abolitions.adjusted_gross_income,"{""description"": ""Set all values of Adjusted gr...","[-0.0256198663264513, 0.0009419068228453398, -..."
...,...,...,...
12536,gov.usda.wic.value.POSTPARTUM,"{""description"": null, ""economy"": true, ""househ...","[0.004392489790916443, 0.0014703974593430758, ..."
12537,gov.usda.wic.value.PREGNANT,"{""description"": null, ""economy"": true, ""househ...","[0.001542273093946278, -0.002406371058896184, ..."
12538,gov.usda.wic.value.V,"{""description"": null, ""economy"": true, ""househ...","[-0.0031873411498963833, -0.012630472891032696..."
12539,gov.usda.wic.value.VI,"{""description"": null, ""economy"": true, ""househ...","[-0.003992681857198477, -0.01098596677184105, ..."


In [7]:
def search_functions(df, param_query, n=3, pprint=True, n_lines=7):
    embedding = get_embedding(param_query, engine="text-embedding-ada-002")
    df["similarities"] = df.parameter_embedding.apply(
        lambda x: cosine_similarity(x, embedding)
    )

    res = df.sort_values("similarities", ascending=False).head(n)
    return res

In [8]:
def get_parameter(description):
    res = search_functions(df, description, n=1)
    return res.iloc[0].json

In [9]:
get_parameter("make the ctc refundable")

'{"description": "The IRS makes the Child Tax Credit fully refundable if this is true.", "economy": true, "household": true, "label": "Fully refundable CTC", "parameter": "gov.irs.credits.ctc.refundable.fully_refundable", "period": null, "type": "parameter", "unit": "bool", "values": {"2013-01-01": false, "2021-01-01": true, "2022-01-01": false}}'