# Default notebook

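This notebook is executed by the Lakeflow job defined in `resources/sample_job.job.yml`. It builds a retriever on top of a Databricks Vector Search index of ingested text and reranks the retrieved passages with FlashRank.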

In [0]:
%pip install databricks-vectorsearch flashrank databricks_langchain
# Restart the Python process so the packages installed above can be imported by later cells.
# NOTE(review): the packages are installed without version pins; consider pinning them for reproducible runs.
dbutils.library.restartPython()

[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
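# NOTE: superseded inline prototype of the retrieval flow, kept for reference only. The live code is in
# get_retriever / get_ranked_query below; the endpoint-creation step (vsc.create_endpoint) exists only here.
# from databricks.vector_search.client import VectorSearchClient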
# from databricks.sdk import WorkspaceClient
# from databricks_langchain import DatabricksVectorSearch, DatabricksEmbeddings
# import databricks.sdk.service.catalog as c
# from mlflow.deployments import get_deploy_client
# from flashrank import Ranker, RerankRequest


# vsc = VectorSearchClient(disable_notice=True)
# ranker = Ranker(
#     model_name="rank-T5-flan", cache_dir="/Volumes/dev_appian_poc/00_bronze/artifacts"
# )


# embedding_model = DatabricksEmbeddings(endpoint="databricks-bge-large-en")

# if not vsc.endpoint_exists("appian_vsc_poc"):
#     vsc.create_endpoint(name="appian_vsc_poc")


# if not vsc.index_exists(
#     endpoint_name="appian_vsc_poc",
#     index_name="dev_appian_poc.02_gold.ingested_text_index",
# ):
#     index = vsc.create_delta_sync_index(
#         endpoint_name="appian_vsc_poc",
#         source_table_name="dev_appian_poc.02_gold.ingestion_text_embeddings",
#         index_name="dev_appian_poc.02_gold.ingested_text_index",
#         pipeline_type="TRIGGERED",
#         primary_key="id",
#         embedding_vector_column="embedding",
#         embedding_dimension=1024,
#         embedding_model_endpoint_name="databricks-bge-large-en",
#     )
# else:
#     index = vsc.get_index(
#         endpoint_name="appian_vsc_poc",
#         index_name="dev_appian_poc.02_gold.ingested_text_index",
#     )


# vector_store = DatabricksVectorSearch(
#     index_name="dev_appian_poc.02_gold.ingested_text_index",
#     text_column="content",
#     embedding=embedding_model,
# )

# sim = vector_store.as_retriever(search_kwargs={"k": 2}).invoke("who is robert peng")

# print(sim)

In [0]:
from databricks.vector_search.client import VectorSearchClient
from databricks.sdk import WorkspaceClient
from databricks_langchain import DatabricksVectorSearch, DatabricksEmbeddings
import databricks.sdk.service.catalog as c
from mlflow.deployments import get_deploy_client
from flashrank import Ranker, RerankRequest


def get_retriever(persist_dir: str = "/Volumes/dev_appian_poc/00_bronze/artifacts", k: int = 2):
    """Build a LangChain retriever over the ingested-text Vector Search index.

    If the delta-sync index does not exist yet on the ``appian_vsc_poc``
    endpoint, it is created from the gold embeddings table and this call
    blocks until the new index reports ready, so the returned retriever can
    be queried straight away.

    Args:
        persist_dir: Currently unused by this function; kept so existing
            callers that pass it keep working.
        k: Number of documents the retriever returns per query (forwarded
            as ``search_kwargs={"k": k}``).

    Returns:
        The retriever produced by ``DatabricksVectorSearch.as_retriever``.
    """
    # Single source of truth for names that were previously repeated inline.
    endpoint_name = "appian_vsc_poc"
    index_name = "dev_appian_poc.02_gold.ingested_text_index"
    embedding_endpoint = "databricks-bge-large-en"

    vsc = VectorSearchClient(disable_notice=True)
    embedding_model = DatabricksEmbeddings(endpoint=embedding_endpoint)

    if not vsc.index_exists(endpoint_name=endpoint_name, index_name=index_name):
        index = vsc.create_delta_sync_index(
            endpoint_name=endpoint_name,
            source_table_name="dev_appian_poc.02_gold.ingestion_text_embeddings",
            index_name=index_name,
            pipeline_type="TRIGGERED",
            primary_key="id",
            embedding_vector_column="embedding",
            embedding_dimension=1024,
            embedding_model_endpoint_name=embedding_endpoint,
        )
        # A freshly created index is still provisioning / running its first
        # sync; wait so the caller does not query an index that is not ready.
        index.wait_until_ready()
    # No handle is fetched for an already-existing index: the vector store
    # below is constructed from the index name alone.

    vector_store = DatabricksVectorSearch(
        index_name=index_name,
        text_column="content",
        embedding=embedding_model,
    )
    return vector_store.as_retriever(search_kwargs={"k": k})
    

In [0]:
def get_ranked_query(
    query: str,
    k: int = 2,
    model_name: str = "rank-T5-flan",
    cache_dir: str = "/Volumes/dev_appian_poc/00_bronze/artifacts",
):
    """Retrieve the top-``k`` passages for ``query`` and rerank them with FlashRank.

    Args:
        query: Natural-language question used both for retrieval and reranking.
        k: Number of passages to retrieve before reranking.
        model_name: FlashRank model used for reranking.
        cache_dir: Directory where FlashRank stores / loads the model files.

    Returns:
        The result of ``Ranker.rerank`` for the retrieved passages (a list of
        dicts carrying at least ``"text"`` and ``"score"``), or an empty list
        when retrieval returned no documents.
    """
    retriever = get_retriever(k=k)
    passages = [{"text": doc.page_content} for doc in retriever.invoke(query)]

    # Nothing to rerank: skip loading the rerank model altogether.
    if not passages:
        return []

    ranker = Ranker(model_name=model_name, cache_dir=cache_dir)
    return ranker.rerank(RerankRequest(query=query, passages=passages))

In [0]:
# Smoke test: retrieve the top-2 passages for a sample question and rerank them.
rerankedresponse = get_ranked_query("what school did robert peng go to", k=2)

# NOTE(review): the printed passages include personal data (name, phone number, e-mail);
# consider clearing this cell's output before sharing or committing the notebook.
print(rerankedresponse)



[NOTICE] Using a notebook authentication token. Recommended for development only. For improved performance, please use Service Principal based authentication. To disable this message, pass disable_notice=True.
[{'text': '- Served as engagement manager, communicated project progress and risks to stakeholders, managed project timeline, budget, billing and SOW. EDUCATION BARNARD COLLEGE, COLUMBIA UNIVERSITY NEW YORK, NY Bachelor of Arts in Political Science May 2013 SKILLS LANGUAGES: Chinese Mandarin & Cantonese (Proficient), Korean (Conversational)\n', 'score': np.float32(0.48015377)}, {'text': 'Robert Peng Denver, CO, linkedin.com/in/robert-o-59652128 (765) 543-7382 | robertjustinianpeng@gmail.com EDUCATION Indiana University Bachelor of Computer Science- 2014 Bloomington WORK EXPERIENCE Deloitte Denver, Colorado Senior Manager - Consulting Aug 2021 Served as the project manager for multiple federal engagements managing and coordinating with cross functional teams to implement on-premis