In [None]:
# Run the shared setup notebook before anything else.
# NOTE(review): later cells use `myconn`, which is not defined in this notebook's visible cells — presumably the setup notebook creates it; confirm.
%run "../01-check_setup.ipynb"

In [None]:
import requests, os

# Ask httpbin which User-Agent header `requests` sent and export that value as the
# USER_AGENT environment variable.
# A timeout keeps the cell from hanging indefinitely if httpbin.org is unreachable,
# and raise_for_status() reports an HTTP error directly instead of failing later
# while decoding the response body.
user_agent_response = requests.get("https://httpbin.org/user-agent", timeout=10)
user_agent_response.raise_for_status()
os.environ["USER_AGENT"] = user_agent_response.json()["user-agent"]
os.environ["USER_AGENT"]

In [None]:
from loader_sapblog import CustomSAPBlogLoader

# Print-view URLs of the SAP Community blog posts to load (title noted above each link)
urls = [
    # "New Machine Learning and NLP features in SAP HANA Cloud 2024 Q4"
    "https://community.sap.com/t5/blogs/blogarticleprintpage/blog-id/technology-blog-sap/article-id/177423",
    # "Text Chunking – An Exciting New NLP Function in SAP HANA Cloud"
    "https://community.sap.com/t5/blogs/blogarticleprintpage/blog-id/technology-blog-sap/article-id/177557",
    # "New Text Analysis in SAP HANA Cloud Predictive Analysis Library (PAL)"
    "https://community.sap.com/t5/blogs/blogarticleprintpage/blog-id/erp-blog-sap/article-id/58847",
    # "Text Embedding Service in SAP HANA Cloud Predictive Analysis Library (PAL)"
    "https://community.sap.com/t5/blogs/blogarticleprintpage/blog-id/erp-blog-sap/article-id/58846",
    # "New Information Retrieval Techniques in SAP HANA Cloud using BM25 and ANNS for Advanced Text Mining"
    "https://community.sap.com/t5/blogs/blogarticleprintpage/blog-id/technology-blog-sap/article-id/177553",
    # "Dimensionality Reduction of Text Embeddings for Hybrid Prediction Data"
    "https://community.sap.com/t5/blogs/blogarticleprintpage/blog-id/erp-blog-sap/article-id/58845",
    # "Document Clustering using KMeans and Text Embeddings"
    "https://community.sap.com/t5/blogs/blogarticleprintpage/blog-id/erp-blog-sap/article-id/58419",
    # "Hybrid Prediction with Tabular and Text Inputs using Hybrid Gradient Boosting Trees"
    "https://community.sap.com/t5/blogs/blogarticleprintpage/blog-id/technology-blog-sap/article-id/176747",
    # "Exploring ML Explainability in SAP HANA PAL - AutoML"
    "https://community.sap.com/t5/blogs/blogarticleprintpage/blog-id/technology-blog-sap/article-id/177560",
]

# Load all articles; the trailing expression displays how many documents came back
loader = CustomSAPBlogLoader(urls)
documents = loader.load()
len(documents)


In [None]:

# Report each article's source together with the character count of its content
for doc in documents:
    content_length = len(doc.page_content)
    print(doc.metadata["source"], content_length)


In [None]:
# Preview the first 300 characters of the first loaded article
first_article_text = documents[0].page_content
print(first_article_text[:300])

In [None]:
# Keep a handle on the `connection` attribute of `myconn`; it is passed to the vector store below.
# NOTE(review): `myconn` is not defined in this notebook's visible cells — presumably it comes from the setup notebook run at the top; confirm.
connection_to_hana = myconn.connection

In [None]:
from langchain_community.vectorstores.hanavector import HanaDB
from embedder_saphana import HanaEmbeddings

# Build the vector store; the same connection serves both the embedding helper
# and the store itself.
hana_embeddings = HanaEmbeddings(connection_to_hana)
db = HanaDB(
    embedding=hana_embeddings,
    connection=connection_to_hana,
    table_name="MY_LANGCHAIN_EMBEDDINGS",
)

print(f"Table to be used is {db.table_name}")

In [None]:
# Call delete() with an empty filter before the documents are (re-)added below.
# NOTE(review): presumably an empty filter matches every row, i.e. this empties the table — confirm against the HanaDB documentation.
db.delete(filter={})

In [None]:
%%time
# Add the loaded articles to the vector store; the %%time cell magic reports how long the cell takes.
db.add_documents(documents=documents)

## Text splitting, preparing articles' text for vectorization

In [None]:
# Open the vector store's table through `myconn.table(...)` and call add_id() on the result.
# NOTE(review): presumably add_id() supplies the "ID" column that the text-splitting cell selects later — confirm.
hdf_langchain_tab = myconn.table(db.table_name).add_id()

In [None]:
# Display whatever get_table_structure() reports for the vector-store table
hdf_langchain_tab.get_table_structure()

In [None]:
import pandas as pd
# Use the fully qualified option key: pandas resolves abbreviated keys such as
# 'max_colwidth' by pattern matching, which can break if a similarly named
# option is introduced in a later release.
pd.set_option("display.max_colwidth", 128)

# Show the first row transposed so each column is printed on its own line
hdf_langchain_tab.head(1).collect().T

In [None]:
# Applying the Text Splitter with recursive-splitting, available with hana-ml 2.23
from hana_ml.text.text_splitter import TextSplitter

# Recursive splitter configured with chunk_size=1024 and overlap=64
splitter = TextSplitter(
    split_type="recursive",
    chunk_size=1024,
    overlap=64,
)
# Optional tweak, left disabled: custom separator handling via PAL parameters
# splitter._extend_pal_parameter({'GLOBAL_SEPARATOR':'[. ]', 'KEEP_SEPARATOR':1})

# Split only the ID and VEC_TEXT columns of the vector-store table
text_to_split = hdf_langchain_tab.select("ID", "VEC_TEXT")
splitted_text = splitter.split_text(text_to_split, order_status=True)

In [None]:
import pandas as pd
# Do not truncate cell contents so complete chunks are visible.
# The fully qualified key avoids pandas' pattern matching of abbreviated option names.
pd.set_option("display.max_colwidth", None)

print(splitted_text.shape)
display(splitter.statistics_.collect())

# First ten chunks plus the length of each chunk's CONTENT.
# The extra column is only a display alias; its spelling was corrected from "CHUNCK_SIZE".
chunk_preview = splitted_text.select("*", ('LENGTH("CONTENT")', "CHUNK_SIZE"))
display(chunk_preview.head(10).collect())

## Generating Text Embeddings in SAP HANA Cloud

In [None]:
# Row count of the chunked text that goes into the embedding step
n_chunks = splitted_text.count()
print(f"Number of records selected for further processing: {n_chunks}")

In [None]:
# Generate text embeddings in SAP HANA Cloud with the new PAL function (available with hana-ml 2.23).
from hana_ml.text.pal_embeddings import PALEmbeddings
# Embed the CONTENT column of every chunk, keyed by SUB_ID
pe = PALEmbeddings(model_version="SAP_GXY.20250407")
textembeddings = pe.fit_transform(
    splitted_text,
    key="SUB_ID",
    target=["CONTENT"],
    thread_number=10,
    batch_size=10,
    # max_token_num=512,  # optional argument, left disabled
)
n_embedded = textembeddings.count()
print(f"{n_embedded} records processed in {round(pe.runtime, 3)} sec")

In [None]:
# Display whatever get_table_structure() reports for the embeddings result
textembeddings.get_table_structure()

In [None]:
# Peek at the first rows of the embeddings result (head() with its default row count)
textembeddings.head().collect()

In [None]:
# Question that the embedded chunks are ranked against in the next cell
prompt = "How do I do text chunking in SAP HANA?"

In [None]:
# Rank the embedded chunks against the prompt, most similar first.
# The prompt is embedded in-database with VECTOR_EMBEDDING and compared with each
# chunk vector using both cosine similarity and L2 distance.
#
# Fix: the L2DISTANCE branch previously embedded 'What' + prompt, so DISTANCE was
# computed for a different text than SIMILARITY. Both now use the same prompt.
#
# Single quotes are doubled so a prompt such as "What's new?" cannot terminate
# the SQL string literal early.
prompt_literal = prompt.replace("'", "''")

myconn.sql(
    f"""SELECT 
    COSINE_SIMILARITY(VECTOR_EMBEDDING('{prompt_literal}', 'QUERY', 'SAP_GXY.20250407'), "VECTOR_COL_CONTENT") AS "SIMILARITY", 
    L2DISTANCE(VECTOR_EMBEDDING('{prompt_literal}', 'QUERY', 'SAP_GXY.20250407'), "VECTOR_COL_CONTENT") AS "DISTANCE", 
    "ID", "SUB_ID", "CONTENT" 
    FROM ({textembeddings.select_statement})
    ORDER BY 1 DESC;
    """
).collect()