In [None]:
# Execute the shared setup notebook before anything else in this notebook.
# NOTE(review): presumably this is what defines `myconn`, the connection object
# used by later cells — confirm against 01-check_setup.ipynb.
%run "../01-check_setup.ipynb"

In [None]:
import requests, os

# Ask httpbin.org to echo back the User-Agent header that `requests` sends and
# export that value as the USER_AGENT environment variable.
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() surfaces an HTTP error directly instead of letting it show
# up later as a confusing JSON-decoding error or KeyError.
user_agent_response = requests.get('https://httpbin.org/user-agent', timeout=30)
user_agent_response.raise_for_status()
os.environ["USER_AGENT"] = user_agent_response.json()['user-agent']

# Echo the value as the cell output
os.environ["USER_AGENT"]

In [None]:
import zipfile
import requests
from io import BytesIO

# URL of the zipped file (plain-text build of the Python 3.13 documentation)
zip_url = "https://docs.python.org/3/archives/python-3.13-docs-text.zip"

# Download the zipped file into memory.
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops here on an HTTP error so that an error page is never
# handed to zipfile (which would otherwise fail with a misleading BadZipFile).
response = requests.get(zip_url, timeout=60)
response.raise_for_status()
zip_file = BytesIO(response.content)

# Load one document per regular file in the archive. Each document has the shape
# {'metadata': {'file_name': <path inside the archive>}, 'content': <decoded text>}.
documents_from_zip = []
with zipfile.ZipFile(zip_file, 'r') as z:
    for file_name in z.namelist():
        # Directory entries carry no content of their own
        if z.getinfo(file_name).is_dir():
            continue
        with z.open(file_name) as f:
            documents_from_zip.append({
                'metadata': {'file_name': file_name},
                'content': f.read().decode('utf-8'),
            })

# Check the number of documents loaded
print(f"Number of documents loaded: {len(documents_from_zip)}")

In [None]:
# Keep only the documents whose path inside the archive contains '/faq/'
faq_documents = list(
    filter(
        lambda doc: '/faq/' in doc['metadata']['file_name'],
        documents_from_zip,
    )
)
print(f"Number of FAQ documents: {len(faq_documents)}")

In [None]:
import pandas as pd

# Build the pandas DataFrame `df_faq` from the list of FAQ documents:
# one row per document, one column per dictionary key.
df_faq = pd.DataFrame(data=faq_documents)


In [None]:
# Replace the `metadata` column (one dict per row) by its string representation.
# `assign` returns a new frame, so re-running this cell gives the same result.
df_faq = df_faq.assign(metadata=df_faq['metadata'].astype(str))

In [None]:
# Show the column dtypes, let pandas convert each column to the best possible
# dtype, then show the dtypes again to compare before and after.
display(df_faq.dtypes)
df_faq = df_faq.convert_dtypes()
display(df_faq.dtypes)

In [None]:
# Name of the SAP HANA table that will receive the FAQ documents
hana_table_name = "FAQ"

In [None]:
# Upload `df_faq` to SAP HANA as the table named by `hana_table_name` and keep a
# hana_ml DataFrame (`hdf_faq_bronze`) pointing at it.
# `table_structure` pins the SQL column types explicitly: NVARCHAR(5000) for the
# stringified metadata and NCLOB for the document text.
from hana_ml.dataframe import create_dataframe_from_pandas

hdf_faq_bronze = create_dataframe_from_pandas(
    connection_context=myconn,
    pandas_df=df_faq,
    table_name=hana_table_name,
    force=True,
    object_type_as_bin=True,
    table_structure={'metadata': 'NVARCHAR(5000)', 'content': 'NCLOB'},
)

In [None]:
# Fetch the start of the newly created table to the client as a quick check
# that the upload worked.
hdf_faq_bronze.head().collect()

## Generating Text Embeddings in SAP HANA Cloud

In [None]:
# Name of the column holding the document text; embeddings are generated for it
# and the semantic search below returns it.
content_column = "content"

In [None]:
# Report how many rows of the bronze table go into the embedding step
print(f"Number of records selected for further processing: {hdf_faq_bronze.count()}")

In [None]:
# Generate text embeddings inside SAP HANA Cloud with the PAL embeddings
# function (PALEmbeddings, available with hana-ml 2.23).
from hana_ml.text.pal_embeddings import PALEmbeddings

pe = PALEmbeddings()

# add_id() is applied first so that the input has the key column referenced as
# "ID" below; the embeddings are computed for the `content_column` text.
hdf_faq_silver = pe.fit_transform(
    hdf_faq_bronze.add_id(),
    key="ID",
    target=[content_column],
    thread_number=10,
    batch_size=10,
)  # optional, currently unused: max_token_num=512
print(f"{hdf_faq_silver.count()} records processed in {round(pe.runtime, 3)} sec")

In [None]:
# Inspect the structure of the embeddings result; the search query further down
# expects it to contain a "VECTOR_COL_<content_column>" column.
hdf_faq_silver.get_table_structure()

In [None]:
# Fetch a single row of the embeddings result to the client for a quick look
hdf_faq_silver.head(1).collect()

In [None]:
# Show the SQL SELECT statement behind `hdf_faq_silver`; the semantic search
# below embeds this statement as a subquery.
hdf_faq_silver.select_statement

In [None]:
# Persist the embeddings result as table "#FAQ_EMBEDDINGS" (force=True) and
# rebind `hdf_faq_silver` to the DataFrame returned by save().
# NOTE(review): the leading '#' presumably makes this a local temporary table
# that only lives for the current session — confirm.
hdf_faq_silver = hdf_faq_silver.save(where="#FAQ_EMBEDDINGS", force=True)

## Semantic search in FAQ

In [None]:
# Natural-language question used for the semantic search below
prompt = "How is Monty Python relates to Python programming language?"

In [None]:
# Semantic search, executed in SAP HANA: embed `prompt` with VECTOR_EMBEDDING,
# compare it via COSINE_SIMILARITY with the stored embedding of every FAQ
# document, and fetch the 5 most similar documents (best match first).
#
# `prompt` is spliced into the statement as a SQL string literal, so every single
# quote in it is doubled first. Without this, a prompt such as "What's new?"
# would terminate the literal early and produce broken (or injectable) SQL.
prompt_sql_literal = prompt.replace("'", "''")

df_result = myconn.sql(
    f"""SELECT TOP 5
    COSINE_SIMILARITY(VECTOR_EMBEDDING('{prompt_sql_literal}', 'DOCUMENT', 'SAP_NEB.20240715'), "VECTOR_COL_{content_column}") AS "SIMILARITY",
    "ID", "{content_column}"
    FROM ({hdf_faq_silver.select_statement})
    ORDER BY 1 DESC;
    """
).collect()

In [None]:
# Show the three best matches (the query orders by similarity, descending)
df_result.head(3)

In [None]:
# Print the text of the best match only (first row of the result, which is
# ordered by descending similarity).
# The column is addressed through `content_column`, the same name the query
# selects, and the row by position (`iloc[0]`) rather than by index label.
print(df_result[content_column].iloc[0])

In [None]:
from IPython.display import Markdown

# Render the text of the best match (first row of the result) as Markdown.
# The column is addressed through `content_column`, the same name the query
# selects, and the row by position (`iloc[0]`) rather than by index label.
display(Markdown(df_result[content_column].iloc[0]))