In [None]:
# Execute the shared setup notebook inside this kernel.
# Presumably it creates the `myconn` connection object used by the cells below — TODO confirm.
%run "../01-check_setup.ipynb"

In [None]:
# Name of the HANA table that holds the raw FAQ entries.
hana_table_name = 'FAQ'

In [None]:
# `myconn` is not created in this notebook; presumably it is a hana_ml
# ConnectionContext provided by the setup notebook — TODO confirm.

# Build a dataframe handle for the HANA table named by `hana_table_name`.
hdf_faq_bronze = myconn.table(hana_table_name)

In [None]:
import pandas as pd

# Limit displayed cell text to 256 characters so long FAQ entries stay readable.
# The fully qualified option name is used on purpose: pandas resolves
# abbreviated names by pattern matching, which can become ambiguous if
# similarly named options are added in a later release.
pd.set_option('display.max_colwidth', 256)

# Fetch the first rows and transpose them, so that every table column is
# shown as its own row.
hdf_faq_bronze.head().collect().T

## Text splitting, preparing FAQs for vectorization

In [None]:
# Preview the table with the additional column attached by add_id(), transposed for readability.
hdf_faq_bronze.add_id().head().collect().T

In [None]:
# Split the FAQ HTML into chunks with the hana-ml TextSplitter
# (recursive splitting is available as of hana-ml 2.23).
from hana_ml.text.text_splitter import TextSplitter

splitter = TextSplitter(
    split_type='document',
    doc_type='html',
    keep_separator=True,
    overlap=4,
)

# Hand the HTML heading tags <h1>..<h6> to PAL as the GLOBAL_SEPARATOR parameter.
# NOTE(review): _extend_pal_parameter is a private hana-ml method — confirm it
# is still available when upgrading hana-ml.
splitter._extend_pal_parameter(
    {'GLOBAL_SEPARATOR': '[<h1>,<h2>,<h3>,<h4>,<h5>,<h6>]'}
)

# Only the generated ID and the HTML body are passed to the splitter.
hdf_faq_silver = splitter.split_text(
    hdf_faq_bronze.add_id().select('ID', 'content_html'),
    order_status=True,
)

In [None]:
import pandas as pd

# Show complete cell contents (no truncation) so whole chunks can be inspected.
# The fully qualified option name avoids relying on abbreviated pattern matching.
pd.set_option('display.max_colwidth', None)

# Dimensions of the chunked dataframe, followed by the splitter's statistics.
print(hdf_faq_silver.shape)
display(splitter.statistics_.collect())

# First ten chunks, each with the LENGTH of its CONTENT as an extra column.
display(hdf_faq_silver.select("*", ('LENGTH("CONTENT")', "CHUNK_SIZE")).head(10).collect())

## Generating Text Embeddings in SAP HANA Cloud

In [None]:
# Column holding the text that is embedded and later returned by the search.
content_column = "CONTENT"

In [None]:
# Report how many chunks are passed on to the embedding step.
print(f"Number of records selected for further processing: {hdf_faq_silver.count()}")

In [None]:
# Inspect the table structure of the chunked dataframe, as reported by hana-ml.
hdf_faq_silver.get_table_structure()

In [None]:
# Create the text embeddings inside SAP HANA Cloud with the PAL embedding
# function (available as of hana-ml 2.23).
from hana_ml.text.pal_embeddings import PALEmbeddings

pe = PALEmbeddings(model_version='SAP_GXY.20250407')

# The original author left `max_token_num=512` as a commented-out extra argument.
hdf_faq_gold = pe.fit_transform(
    hdf_faq_silver,
    key="SUB_ID",
    target=[content_column],
    thread_number=10,
    batch_size=10,
)

print(f"{hdf_faq_gold.count()} records processed in {round(pe.runtime, 3)} sec")

In [None]:
# Inspect the table structure of the embedding result, as reported by hana-ml.
hdf_faq_gold.get_table_structure()

In [None]:
# Peek at two rows of the embedding result.
hdf_faq_gold.head(2).collect()

In [None]:
# Show the SQL SELECT statement that currently backs hdf_faq_gold.
hdf_faq_gold.select_statement

In [None]:
# Save the embeddings under the name "#FAQ_EMBEDDINGS" and rebind hdf_faq_gold
# to whatever save() returns, so that later cells query the saved data.
# Presumably the `#` prefix makes this a session-local temporary table and
# force=True replaces an existing table of that name — verify against the
# hana-ml / SAP HANA documentation.
# NOTE(review): because the name is rebound to the saved table, re-running
# only this cell likely tries to replace the table with a SELECT on itself;
# re-run the embedding cell first — confirm.
hdf_faq_gold=hdf_faq_gold.save(where="#FAQ_EMBEDDINGS", force=True)

## Semantic search in FAQ

In [None]:
# Natural-language question used for the semantic search below.
prompt = 'What are origins of the Python name?'

In [None]:
# Rank the stored chunks by the cosine similarity between their embedding
# column and the embedding of the prompt (computed in the database by
# VECTOR_EMBEDDING), and fetch the five most similar ones.
#
# The prompt is spliced into a SQL string literal, so every single quote is
# doubled first. Without this, a prompt such as "What's new?" would terminate
# the literal early and break (or alter) the statement.
escaped_prompt = prompt.replace("'", "''")

df_result = myconn.sql(
    f"""SELECT TOP 5
    COSINE_SIMILARITY(VECTOR_EMBEDDING('{escaped_prompt}', 'QUERY', 'SAP_GXY.20250407'), "VECTOR_COL_{content_column}") AS "SIMILARITY",
    "ID", "{content_column}"
    FROM ({hdf_faq_gold.select_statement})
    ORDER BY 1 DESC;
    """
).collect()

In [None]:
# Show the best matches (the query returns at most five rows).
df_result.head(5)

In [None]:
# Print the raw text of the best match, i.e. the first row of the result,
# which is ordered by descending similarity.
# The column is addressed through `content_column` so this cell stays in sync
# with the column selected by the query; .iloc[0] picks the first row by position.
print(df_result[content_column].iloc[0])

In [None]:
from IPython.display import HTML

# Render the best match (first row of the result) as HTML in the notebook.
# The column is addressed through `content_column` so this cell stays in sync
# with the column selected by the query.
# NOTE(review): the database content is rendered unescaped — confirm the FAQ
# source is trusted before sharing rendered outputs.
display(HTML(df_result[content_column].iloc[0]))