In [3]:
import json
import os

dev_file = 'S:/Varun_Projects/sqlcopilot/data/sub_sampled_bird_dev_set.json'

with open(dev_file, 'r') as file:
    dev_set = json.load(file)

In [4]:
from src.info_retrieval.utils import extract_keyword
from src.model.inference_endpoints import LLM

from src.model.inference_endpoints import LLM
from openai import OpenAI

import httpx
from dotenv import load_dotenv
load_dotenv()

model = 'tgi'

client = OpenAI(
    base_url=os.environ['BASE_URL_MIXTRAL'],
    api_key=os.environ['API_KEY_MIXTRAL']
)

llm = LLM(
    client = client,
    model = model, 
    gen_params = {
        'STREAM': False,
        'TEMPERATURE': 0, 
    }
) # Need to change this function

question = "In which city can you find the school in the state of California with the lowest latitude coordinates and what is its lowest grade? Indicate the school name."
hint = "State of California refers to state = 'CA'"
print(f"question: {question}")

kw = extract_keyword(
    question = question,
    hint = hint,
    few_shot_examples = """Question: Whats the fastest lap time ever in a race for Lewis Hamilton?
    ["FASTEST LAP TIME", "LEWIS HAMILTON"]""",
    model=llm
)

KeyError: 'BASE_URL'

In [3]:
kw

['city',
 'lowest latitude coordinates',
 'school',
 'state of California',
 'CA',
 'lowest grade']

In [1]:
from src.info_retrieval.lsh import *

database =  "california_schools"

lsh, minhashes = load_db_lsh(
    db_directory_path = f"{os.environ['DATABASE_ROOT_PATH']}/{database}"
)

similar_values = []
for keyword in kw:
    similar_values.append(
        query_lsh(
            lsh=lsh,
            minhashes=minhashes,
            keyword=keyword
        )
    )

all_values = []
for similar_value in similar_values:
    for k, values in similar_value.items():
        all_values.extend(values.values())

all_values = [item for sublist in all_values for item in sublist]
all_values = list(set(all_values))

from src.info_retrieval.utils import semantic_rerank
from src.model.embedding import Embedding

embed_obj = Embedding() # Need to change this function

semantic_values = []
for keyword in kw: 
    semantic_values.append(
        semantic_rerank(
            embed_obj = embed_obj,
            strings = all_values,
            keyword = keyword
        )
    )

NameError: name 'os' is not defined

In [5]:
def filter_dict_by_values(data, values_to_find):
    temp = data
    values_to_find_set = set(values_to_find)
    for table_name, columns in temp.items():
        for column_name, column_values in columns.items():
            columns[column_name] = [value for value in column_values if value in values_to_find_set]

    return temp

filtered_similar_values = []
for similars, semantics in zip(similar_values, semantic_values):
    filtered_similar_values.append(
        filter_dict_by_values(
            data=similars,
            values_to_find=semantics
        )
    )

In [6]:
semantic_values

[['The City',
  'California City',
  '21',
  'Plate',
  'King City',
  'Schooler',
  '11',
  'Yuba City',
  '16',
  '8'],
 ['Rio de Plata Continuation High',
  '47',
  'California City',
  'Northwest Day',
  'Rio de Plata High',
  'West Point',
  'Le',
  'Platis',
  'Plate',
  'Gerrity'],
 ['Schooler',
  '16',
  '13',
  '21',
  '11',
  'Ungraded',
  'Preschool',
  '8',
  '7',
  'Plate'],
 ['California City',
  'Sixth and California',
  '5554 California St.',
  '1434 California St.',
  'Yuba City',
  '1707 California Drive',
  'Le',
  'The City',
  'Northwest Day',
  'Plate'],
 ['California City',
  'Sixth and California',
  '5554 California St.',
  '1434 California St.',
  'Yuba City',
  'The City',
  '1707 California Drive',
  'Le',
  'Plate',
  '11'],
 ['Ungraded', 'Schooler', '16', '11', '8', '13', '7', '21', 'Plate', 'Le']]

In [7]:
import nltk

for keyword, fsv in zip(kw, filtered_similar_values):
    for table_name, v in fsv.items():
        for col_name, value_list in v.items():
            edit_dists = []
            for value in value_list:
                edit_dists.append(nltk.edit_distance(value, keyword))
            if len(edit_dists) > 0:
                v[col_name] = [value_list[edit_dists.index(sorted(edit_dists)[0])]]


In [8]:
filtered_similar_values

[{'schools': {'AdmFName1': [],
   'AdmLName1': [],
   'City': ['Yuba City'],
   'County': [],
   'School': ['The City'],
   'MailCity': ['Yuba City']},
  'frpm': {'County Name': []},
  'satscores': {'cname': []}},
 {'schools': {'MailCity': [],
   'City': [],
   'AdmLName1': ['Platis'],
   'School': ['Rio de Plata High'],
   'Street': [],
   'StreetAbr': []}},
 {'frpm': {'School Type': ['Preschool']},
  'schools': {'SOCType': ['Preschool'],
   'School': ['Preschool'],
   'EILName': ['Preschool'],
   'AdmLName1': ['Schooler']}},
 {'schools': {'City': ['California City'],
   'MailCity': ['California City'],
   'MailStreet': ['Sixth and California'],
   'Street': ['Sixth and California'],
   'StreetAbr': ['1434 California St.'],
   'MailStrAbr': ['Sixth and California']}},
 {'schools': {'GSoffered': [],
   'AdmLName1': ['Le'],
   'AdmFName1': [],
   'Ext': ['11']},
  'frpm': {'County Code': [], 'High Grade': []}},
 {'schools': {'EILName': ['Ungraded'],
   'School': [],
   'MailStreet': [],