2025-03-18: new LangChain (LC) features
running on FOR and LC_INTEGRATION

In [1]:
# connect to HANA; the connection is looked up through a named user key
from hana_ml.dataframe import ConnectionContext

hana_userkey = 'FOR_DEMO'
cc = ConnectionContext(userkey=hana_userkey)

In [16]:
# clean up: recreate the demo schema so the notebook starts from an empty state
schema_name = "LC3_DEMO"

# Open the cursor before the try block: if this fails there is nothing to close,
# and the error should surface instead of being masked by the clean-up code.
cur = cc.connection.cursor()
try:
    try:
        cur.execute(f''' DROP SCHEMA {schema_name} CASCADE ''')
    except Exception as drop_error:
        # Best effort only: the DROP fails when the schema does not exist yet
        # (e.g. on the very first run). Report it rather than hiding it, and
        # let KeyboardInterrupt/SystemExit propagate.
        print(f"DROP SCHEMA {schema_name} skipped: {drop_error}")

    cur.execute(f''' CREATE SCHEMA {schema_name} ''')
    # make the fresh schema the default for all following statements
    cur.execute(f''' SET SCHEMA {schema_name} ''')
finally:
    # always release the cursor, even when CREATE/SET SCHEMA raises
    cur.close()

In [17]:
# create a custom table that can be used as LangChain Vector Store
table_name = "CUSTOM_TABLE"

# column definitions, concatenated verbatim into the CREATE TABLE statement
column_clauses = [
    ''' "MY_TEXT" NVARCHAR(5000), ''',
    ''' "MY_METADATA" NVARCHAR(5000), ''',
    ''' "MY_VECTOR" REAL_VECTOR(768), ''',
    ''' "title" NVARCHAR(5000), ''',
    ''' "year" INTEGER,''',
    ''' "tagline" NCLOB)''',
]
create_table_sql = f'''CREATE TABLE {table_name} (''' + "".join(column_clauses)

cc.connection.cursor().execute(create_table_sql)

True

In [18]:
import langchain
import langchain_community
# the HanaDB vector store used to be in the langchain_community package: langchain_community.vectorstores.hanavector
# and is now moved to a separate package: langchain_hana
# https://github.com/SAP/langchain-integration-for-sap-hana-cloud
import langchain_hana

In [19]:
# report the installed version of each LangChain package used in this notebook
for package in (langchain, langchain_community, langchain_hana):
    print(f'{package.__name__} version:', package.__version__)


langchain version: 0.3.19
langchain_community version: 0.3.18
langchain_hana version: 0.1.0


---
SAP HANA Cloud QRC4 2024: in-database embeddings<br>
these are also supported in LangChain

In [20]:
# Rather than an external embedding model (e.g. OpenAIEmbeddings), the LangChain Vector Store
# uses the embedding model that runs inside SAP HANA Cloud.
# see https://help.sap.com/docs/hana-cloud-database/sap-hana-cloud-sap-hana-database-vector-engine-guide/creating-text-embeddings-in-sap-hana-cloud
from langchain_hana import HanaInternalEmbeddings

# identifier of the in-database embedding model
embedding_model_id = "SAP_NEB.20240715"
internal_embeddings = HanaInternalEmbeddings(internal_embedding_model_id=embedding_model_id)

In [21]:
# The HANA table already exists; here a LangChain Vector Store is layered on top of it.
# Because the table uses custom column names, each column has to be named explicitly.
from langchain_hana import HanaDB

custom_columns = {
    "content_column": "MY_TEXT",
    "metadata_column": "MY_METADATA",
    "vector_column": "MY_VECTOR",
    "specific_metadata_columns": ["title", "year", "tagline"],
}

lc_vs = HanaDB(
    embedding=internal_embeddings,
    connection=cc.connection,
    table_name=table_name,
    **custom_columns,
)

In [22]:
# build two sample movie documents and insert them into the Vector Store
from langchain.docstore.document import Document

# plot summaries used as the documents' page content
barbie_plot = '''Barbie ("Stereotypical Barbie") and fellow dolls reside in Barbieland, a matriarchal society populated by different versions of Barbies, Kens, and a group of discontinued models who are treated like outcasts due to their unconventional traits. While the Kens spend their days playing at the beach, considering it their profession, the Barbies hold prestigious jobs in law, science, politics, and so on. Ken ("Beach Ken") is only happy when he is with Barbie, and seeks a closer relationship with her, but she rebuffs him in favor of other activities and female friendships.'''
grand_theft_auto_plot = '''In Los Angeles, Paula Powers (Nancy Morgan) wealthy parents, Bigby (Barry Cahill) and Priscilla Powers (Elizabeth Rogers), want her to marry Collins Hedgeworth (Paul Linke), whom they refer to as her fiance, and also hails from a wealthy family. Paula is really in love with classmate Sam Freeman (Ron Howard), an environmental research major, however, her father dismisses him as a "fortune hunter," which they dispute. Bigby yells at Sam to leave, and while he tries to defend himself, Paula tells him to go wait for her outside. Bigby tells Paula that he is running for governor and wants her to cooperate. However, Paula dismisses Collins as a "flake," and tells her parents that she won't marry him and will elope with Sam to Las Vegas. Bigby threatens to disinherit her and take away her sports car (that she had bought with her own money) if she disobeys him. Paula goes to her room and escapes through the window, stealing her parents, Rolls-Royce Silver Cloud and hitting the road with Sam, beginning a wild explosive car chase and race towards Las Vegas. Bigby calls his associate Ned Slinker (played by Ron Howard's real-life father Rance), asking him to bring back Paula and the Rolls-Royce, and to have Sam incarcerated, without involvement of police and news media. Priscilla gets a call from Collins, who is currently in a stable playing polo, and tells him Paula ran off. Enraged, Collins smashes the phone and takes off in his car. After crashing it, he steals another car from a nearby dealership to continue the chase. His mother Vivian (Marion Ross, Ron Howard's co-star on Happy Days), after being informed of it by the police, decides to go after him herself before the police could arrest him.'''

# the metadata keys match the table's "title", "year" and "tagline" columns
docs = [
    Document(
        page_content=barbie_plot,
        metadata={"title": "Barbie", "year": 2023, "tagline": "Barbie and Ken in the real world"},
    ),
    Document(
        page_content=grand_theft_auto_plot,
        metadata={"title": "Grand Theft Auto", "year": 1977, "tagline": "Cars and automobiles all over"},
    ),
]

lc_vs.add_documents(docs)

In [23]:
# Inspect what was stored. The values in column MY_VECTOR were produced by the
# in-database embedding function.
preview_sql = f''' SELECT * FROM {table_name} '''
hdf = cc.sql(preview_sql)

# fetch at most five rows into a local pandas DataFrame and display it
df = hdf.head(5).collect()
df

Unnamed: 0,MY_TEXT,MY_METADATA,MY_VECTOR,title,year,tagline
0,"Barbie (""Stereotypical Barbie"") and fellow dol...","{""title"": ""Barbie"", ""year"": 2023, ""tagline"": ""...","[-0.02645867131650448, -0.0883205458521843, 0....",Barbie,2023,Barbie and Ken in the real world
1,"In Los Angeles, Paula Powers (Nancy Morgan) we...","{""title"": ""Grand Theft Auto"", ""year"": 1977, ""t...","[-0.08580755442380905, -0.05913298949599266, 0...",Grand Theft Auto,1977,Cars and automobiles all over


---
SAP HANA supports full-text search<br>
this is also supported in LangChain

In [25]:
# natural-language text used for the semantic/similarity search
query = "a film about dolls and society"

# Additional filter expressions are translated to a SQL WHERE clause.
# The variable is called `search_filter` so that Python's built-in `filter`
# is not shadowed for the rest of the kernel session.
# Alternatives to try (enable exactly one assignment):
# search_filter = {}  # no filter at all
# search_filter = {"title": "Barbie", "year": {"$gt": 2022}}
# search_filter = {"MY_TEXT": {"$contains": 'family wealthy rolls royce'}}  # full-text search via contains
search_filter = {"tagline": {"$contains": 'ken and barbie'}}  # full-text search via contains

res = lc_vs.similarity_search(query=query, k=4, filter=search_filter)

# Show the WHERE clause generated for the filter. _create_where_by_filter is a
# private helper of the vector store and is called here for illustration only.
print(lc_vs._create_where_by_filter(search_filter))

for doc in res:
    print("-" * 80)
    print(doc.metadata)
    print(doc.page_content)
    

(' WHERE SCORE(? IN ("tagline" EXACT SEARCH MODE \'text\')) > 0', ['ken and barbie'])
--------------------------------------------------------------------------------
{'title': 'Barbie', 'year': 2023, 'tagline': 'Barbie and Ken in the real world'}
Barbie ("Stereotypical Barbie") and fellow dolls reside in Barbieland, a matriarchal society populated by different versions of Barbies, Kens, and a group of discontinued models who are treated like outcasts due to their unconventional traits. While the Kens spend their days playing at the beach, considering it their profession, the Barbies hold prestigious jobs in law, science, politics, and so on. Ken ("Beach Ken") is only happy when he is with Barbie, and seeks a closer relationship with her, but she rebuffs him in favor of other activities and female friendships.


In [26]:
# to speed up full-text search, add a fuzzy search index on the "MY_TEXT" column
fuzzy_index_name = f'''FUZZYSEARCH_{schema_name}_{table_name}_TEXT_IDX'''
create_index_sql = f'''CREATE FUZZY SEARCH INDEX {fuzzy_index_name} ON {schema_name}.{table_name}("MY_TEXT") SEARCH MODE TEXT'''

cc.connection.cursor().execute(create_index_sql)

True

---
SAP HANA Cloud QRC3 2024: HNSW vector index<br>
The HNSW index speeds up (approximate) nearest neighbor searches<br>
Add an index if you search over more than 1 million vectors and require a query runtime of less than 1 second<br>
Also supported in LangChain


In [55]:
# build parameters for the HNSW vector index
hnsw_settings = {
    "m": 64,  # Max number of neighbors per graph node (valid range: 4 to 1000)
    "ef_construction": 128,  # Max number of candidates during graph construction (valid range: 1 to 100000)
    "ef_search": 200,  # Min number of candidates during the search (valid range: 1 to 100000)
}

lc_vs.create_hnsw_index(**hnsw_settings)

---
LangChain "self query retriever"

In [27]:
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_hana.query_constructors import HanaTranslator

from gen_ai_hub.proxy.langchain.openai import ChatOpenAI

# chat model used by the self-query retriever; temperature 0 is passed through unchanged
llm_settings = {"proxy_model_name": 'gpt-4o', "temperature": 0}
llm = ChatOpenAI(**llm_settings)

In [28]:
# Describe the vector store table and its metadata columns as
# (name, description, type) triples.
attribute_specs = [
    ("title", "The title of the movie", "string"),
    ("year", "The year the movie was released", "integer"),
    ("tagline", "The tagline of the movie", "string"),
]

metadata_field_info = [
    AttributeInfo(name=attr_name, description=attr_description, type=attr_type)
    for attr_name, attr_description, attr_type in attribute_specs
]

document_content_description = "A collection of movies"

# translator handed to the retriever as its structured_query_translator
hana_translator = HanaTranslator()

retriever = SelfQueryRetriever.from_llm(
    llm,
    lc_vs,
    document_content_description,
    metadata_field_info,
    structured_query_translator=hana_translator,
)

In [29]:
# let's run a query which combines vector search and a standard attribute filter
query_prompt = "Which movies about cars were released before 2010?"
# query_prompt = "movie title contains 'auto theft'"

# retrieve the matching documents for the natural-language prompt
docs = retriever.invoke(input=query_prompt)

# print only the metadata of each hit, separated by a ruler line
for doc in docs:
    print("-" * 80)
    print(doc.metadata)

--------------------------------------------------------------------------------
{'title': 'Grand Theft Auto', 'year': 1977, 'tagline': 'Cars and automobiles all over'}


In [30]:
from langchain.chains.query_constructor.base import (
    StructuredQueryOutputParser,
    get_query_constructor_prompt,
)

# Rebuild the query-construction chain by hand to inspect the structured query
# the LLM produces and how it is translated for the HANA vector store.
prompt = get_query_constructor_prompt(document_content_description, metadata_field_info)
output_parser = StructuredQueryOutputParser.from_components()

# chain: prompt -> LLM -> structured-query parser
query_constructor = prompt | llm | output_parser
sq = query_constructor.invoke(input=query_prompt)
print("Structured query: ", sq)

translated_query = hana_translator.visit_structured_query(sq)
print("\nTranslated for HANA vector store: ", translated_query)

Structured query:  query='cars' filter=Comparison(comparator=<Comparator.LT: 'lt'>, attribute='year', value=2010) limit=None

Translated for HANA vector store:  ('cars', {'filter': {'year': {'$lt': 2010}}})
