# Import packages

In [None]:
import json

import pandas as pd
from teradataml import *

from constants import TEST_DATA_PATH, CLASS_EMBEDDINGS_PATH, PRODUCT_TEST_EMBEDDINGS_PATH
from modules.db import TeradataDatabase
from modules.models import SentenceEmbeddingModel, SentenceEmbeddingConfig
from utils import clean_text, load_embedding_model

## Connect to database

In [None]:
# Create the project's database wrapper and open the connection that the
# remaining cells use through `td_db`.
td_db = TeradataDatabase()
td_db.connect()

## Load data to database

In [None]:
# Read the test dataset from TEST_DATA_PATH and preview its first rows.
df = pd.read_csv(TEST_DATA_PATH)
df.head()

In [None]:
# Keep only the item-name and brand columns and give them the names used
# by the rest of the notebook.
column_names = {"Item_Name": "product_name", "Brand": "brand_name"}
df = df[list(column_names)].rename(columns=column_names)

In [None]:
# Keep one row per product name, then discard rows whose product name is
# missing; the last line displays the cleaned frame.
df = (
    df.drop_duplicates(subset=["product_name"])
    .dropna(subset=["product_name"])
)
df

In [None]:
# Cast brand_name to str.
# NOTE(review): astype(str) turns missing brands (NaN) into the literal string
# "nan"; only product_name is null-filtered in this notebook — confirm this is intended.
df["brand_name"] = df["brand_name"].astype(str)

In [None]:
# Build `tdf` from the pandas frame.
# NOTE(review): `DataFrame` presumably comes from `from teradataml import *` — confirm.
tdf = DataFrame(df)

In [None]:
# Copy the `index_label` column into `product_id`, then drop `index_label`.
# NOTE(review): `index_label` is presumably added by DataFrame(df) from the
# pandas index — confirm.
tdf = (
    tdf.assign(product_id=tdf.index_label)
    .drop(columns=["index_label"])
)
tdf

In [None]:
# Put the columns in the order product_id, product_name, brand_name and
# display the result.
tdf = tdf.select(["product_id", "product_name", "brand_name"])
tdf

In [None]:
# Write the frame to amurd.products, replacing the table if it already exists.
copy_to_sql(
    df=tdf,
    table_name="products",
    schema_name="amurd",
    if_exists="replace",
)

## Create embedding tables and load embeddings

In [None]:
# Preview the generated column definitions: dim_0 .. dim_1023, each FLOAT.
",\n ".join([f"dim_{i} FLOAT" for i in range(1024)])

In [None]:
# DDL for the product-embeddings table: an INTEGER key column followed by
# 1024 FLOAT columns named dim_0 .. dim_1023.
# The column list is joined outside the f-string because a backslash inside
# an f-string replacement field is a SyntaxError before Python 3.12.
dim_columns = ",\n ".join(f"dim_{i} FLOAT" for i in range(1024))
query = f"""
CREATE MULTISET TABLE amurd.p_embeddings (
    product_id INTEGER NOT NULL,
    {dim_columns}
) PRIMARY INDEX (product_id);
"""

In [None]:
# Run the CREATE TABLE statement. Note that this rebinds `tdf` to whatever
# execute_query returns, replacing the products frame built earlier.
tdf = td_db.execute_query(query)

In [None]:
# Load the product embeddings CSV; `df` now refers to this frame instead of
# the product data loaded earlier.
df = pd.read_csv(PRODUCT_TEST_EMBEDDINGS_PATH)

In [None]:
# Display the raw 'embeddings' value stored at row position 5.
df['embeddings'].iloc[5]

In [None]:
# Decode the JSON text in the first row, second column, and show its
# elements as strings.
l = list(map(str, json.loads(df.iloc[0, 1])))
l

In [None]:
# Build the INSERT statements for amurd.p_embeddings from the embeddings CSV.
# Column 1 of `df` is decoded with json.loads into the embedding vector;
# column 0 is presumably the product id (verify against the CSV).
#
# The table has one key column plus one FLOAT column per dimension, so each
# source row becomes a single INSERT carrying the id followed by every
# dimension value, rather than one two-value INSERT per dimension.
#
# Only the first `rows_to_load` rows are converted, matching the original
# exploratory cell; raise it (e.g. to len(df)) to load every row.
rows_to_load = 1
statements = []
for row_pos in range(rows_to_load):
    product_id = df.iloc[row_pos, 0]
    vector = json.loads(df.iloc[row_pos, 1])
    values_sql = ", ".join(str(float(value)) for value in vector)
    statements.append(
        f"INSERT INTO amurd.p_embeddings VALUES ({product_id}, {values_sql});"
    )
# NOTE(review): the statements are joined into one string because later cells
# pass `queries` to execute_query as a whole — confirm it accepts multi-statement text.
queries = "\n".join(statements)

In [None]:
# Show the generated SQL text.
print(queries)

In [None]:
# Execute the generated INSERT SQL. Pass the string itself: print() returns
# None, so wrapping it in print() would hand None to execute_query.
tdf = td_db.execute_query(queries)

In [None]:
# DDL for the class-embeddings table: an INTEGER key column followed by
# 1024 FLOAT columns named dim_0 .. dim_1023.
# The column list is joined outside the f-string because a backslash inside
# an f-string replacement field is a SyntaxError before Python 3.12.
dim_columns = ",\n ".join(f"dim_{i} FLOAT" for i in range(1024))
query = f"""
CREATE MULTISET TABLE amurd.c_embeddings (
    class_id INTEGER NOT NULL,
    {dim_columns}
) PRIMARY INDEX (class_id);
"""

In [None]:
# Show the DDL, then execute it. The string is passed directly because
# print() returns None, which would otherwise be what execute_query receives.
print(query)
tdf = td_db.execute_query(query)

In [None]:
# Load the class embeddings CSV; `df` now refers to this frame.
df = pd.read_csv(CLASS_EMBEDDINGS_PATH)

In [None]:
# Build the INSERT statements for amurd.c_embeddings from the class
# embeddings CSV. Column 1 of `df` is decoded with json.loads into the
# embedding vector; column 0 is presumably the class id (verify against the CSV).
#
# The target is the class table (c_embeddings), not the product table, and
# each source row becomes a single INSERT carrying the id followed by every
# dimension value, matching the table's one-column-per-dimension layout.
#
# Only the first `rows_to_load` rows are converted, matching the original
# exploratory cell; raise it (e.g. to len(df)) to load every row.
rows_to_load = 1
statements = []
for row_pos in range(rows_to_load):
    class_id = df.iloc[row_pos, 0]
    vector = json.loads(df.iloc[row_pos, 1])
    values_sql = ", ".join(str(float(value)) for value in vector)
    statements.append(
        f"INSERT INTO amurd.c_embeddings VALUES ({class_id}, {values_sql});"
    )
# NOTE(review): the statements are joined into one string because the next cell
# passes `queries` to execute_query as a whole — confirm it accepts multi-statement text.
queries = "\n".join(statements)

In [None]:
# Show the generated SQL, then execute it. The string is passed directly
# because print() returns None, which would otherwise be what execute_query receives.
print(queries)
tdf = td_db.execute_query(queries)

## Disconnect

In [None]:
# Close the database connection opened at the start of the notebook.
td_db.disconnect()