The following code snippet is authored by:<br>
- Markus Fath https://github.com/fath-markus

# Vectorize table content
* load some data from csv into a hana table
* read some text values from that table
* generate embeddings
* add a vector column to the table
* update the table with the vectors

In [1]:
import pandas as pd
import hana_ml
# Show the installed pandas and hana_ml versions.
print(pd.__version__)
print(hana_ml.__version__)

ModuleNotFoundError: No module named 'shapely'


1.4.4
2.18.23110300


In [None]:
# Load the semicolon-separated, double-quote-quoted CSV export into a
# pandas DataFrame.
df = pd.read_csv(
    './data/INIS_NEWS_APPLICATION.csv',
    sep=';',
    quotechar='"',
    low_memory=False,
)
# Preview the first three rows.
df.head(3)

In [4]:
from hana_ml import ConnectionContext

# Open the HANA connection used by all following cells.
# Alternative that connects through a stored user key instead of inline
# credentials:
# cc = ConnectionContext(userkey='VDB_BETA', encrypt=True)
cc = ConnectionContext(
    address='[somehost].hanacloud.ondemand.com',
    port='443',
    user='[your user]',
    password='[your password]',
    encrypt=True,
)

# Sanity check: report the server version and the schema in use.
print(cc.hana_version())
print(cc.get_current_schema())

4.50.000.00.0000000000 (xxxFASTDEVMAKExxx)
VDB


In [5]:
# hana-ml doesn't deal with CLOBs, so create the table by hand
# DDL
sql_command = '''CREATE TABLE "NEWS_APPL" (
    "Key" NVARCHAR(5000),
    "Date" NVARCHAR(5000),
    "No" BIGINT,
    "TopicNo" BIGINT,
    "TopicID" NVARCHAR(5000),
    "TopicName" NVARCHAR(5000),
    "Domain" NVARCHAR(5000),
    "arXivID" DOUBLE,
    "Base" NVARCHAR(5000),
    "Link" NVARCHAR(5000),
    "SenderHTML" NVARCHAR(5000),
    "SenderName" NVARCHAR(5000),
    "Title" NVARCHAR(5000),
    "Abstract" NCLOB MEMORY THRESHOLD 0)'''
cursor = cc.connection.cursor()
try:
    cursor.execute(sql_command)
finally:
    # Always release the cursor, even when the DDL fails
    # (e.g. the cell is re-run and the table already exists).
    cursor.close()

In [6]:
# Upload the pandas DataFrame into the HANA table "NEWS_APPL".
from hana_ml.dataframe import create_dataframe_from_pandas

# append=True adds the rows to the table that was created by hand above.
# NOTE(review): force=False presumably means an existing table is not
# dropped and recreated -- confirm against the hana_ml documentation.
v_hdf = create_dataframe_from_pandas(
    connection_context=cc,
    pandas_df=df,
    table_name="NEWS_APPL",
    append=True,
    force=False,
    allow_bigint=True,
)

100%|██████████| 1/1 [00:00<00:00,  1.31it/s]


In [7]:
# Fetch the key and abstract of ten rows from the freshly loaded table.
# Several text columns could be concatenated in the SQL if desired.
abstract_query = 'SELECT TOP 10 "Key", "Abstract" FROM NEWS_APPL'
hdf = cc.sql(abstract_query)
# Materialize the result on the client as a pandas DataFrame.
df_abstract = hdf.collect()

In [8]:
# get embedding
from llm_commons.proxy.openai import Embedding
def get_embedding(input, model="text-embedding-ada-002-v2") -> list:
    """Return the embedding vector for ``input``.

    Args:
        input: Text to embed; forwarded unchanged to ``Embedding.create``.
            (The name shadows the builtin but is kept because callers
            pass it by keyword.)
        model: Deployment id, passed on as ``deployment_id``.

    Returns:
        The ``embedding`` attribute of the first entry in the response
        data. This is not a string: callers convert it with ``str()``
        themselves. Presumably a list of floats -- TODO confirm against
        the llm_commons API.
    """
    response = Embedding.create(
        deployment_id=model,
        input=input,
    )
    # Only the first result of the response is returned.
    return response.data[0].embedding

In [13]:
# generate embeddings from the text
# Each entry is (vector_as_string, key); this matches the parameter order of
# the bulk UPDATE statement: TO_REAL_VECTOR(?) first, "Key" = ? second.
rows = []
for _, row in df_abstract.iterrows():
    key = row['Key']
    text = row['Abstract']
    try:
        text_vector = get_embedding(input=text)
    except Exception as e:
        # Best effort: report which record failed and continue with the rest.
        print(f"Embedding failed for Key={key!r}: {e}")
    else:
        # str() yields the textual form that is later bound to TO_REAL_VECTOR(?).
        rows.append((str(text_vector), key))

In [10]:
# add a vector column to your table
# The dimension (1536) is fixed in the column type REAL_VECTOR(1536).
sql_command = '''ALTER TABLE NEWS_APPL ADD (VECTOR REAL_VECTOR(1536))'''
cursor = cc.connection.cursor()
try:
    cursor.execute(sql_command)
finally:
    # Always release the cursor, even when the DDL fails
    # (e.g. the cell is re-run and the column already exists).
    cursor.close()

In [27]:
# bulk update
# Write all vectors in a single transaction: commit only if every row was
# accepted, otherwise roll back. The cursor is always closed and autocommit
# is always switched back on, whatever happens in between.
sql = 'UPDATE NEWS_APPL SET VECTOR = TO_REAL_VECTOR(?) WHERE "Key" = ?'
cc.connection.setautocommit(False)
try:
    cursor = cc.connection.cursor()
    try:
        cursor.executemany(sql, rows)
        cc.connection.commit()
    except Exception as e:
        # Undo any partial work; report the problem instead of raising so the
        # notebook keeps running (same best-effort behavior as before).
        cc.connection.rollback()
        print("An error occurred:", e)
    finally:
        cursor.close()
finally:
    cc.connection.setautocommit(True)