# Import packages

In [1]:
import ast

import numpy as np
import pandas as pd
import teradatasql
from teradataml import *
from teradataml.dataframe.copy_to import copy_to_sql

from modules.db import TeradataDatabase
from utils import clean_text, load_embedding_model
from constants import CLEANED_TEST_DATA_PATH, CLASS_EMBEDDINGS_PATH, PRODUCT_TEST_EMBEDDINGS_PATH, CLEANED_GPC_PATH, CLEANED_TEST_DATA_PATH, TEST_DATA_PATH
from modules.models import SentenceEmbeddingModel, SentenceEmbeddingConfig

Logger initialized. All logs will be saved to: c:\Users\ss255385\OneDrive - Teradata Corporation\Desktop\AMuRD-Iteration-7\src\logs\borai_20250818_133627.log


## Connect to database

In [2]:
# Create the project's TeradataDatabase wrapper and connect it; later cells
# run their SQL through td_db.execute_query().
td_db = TeradataDatabase()
td_db.connect()

### Insert Product Names in DB

In [3]:
# Load the cleaned test data from CSV and preview its first rows.
df = pd.read_csv(CLEANED_TEST_DATA_PATH)
df.head()

Unnamed: 0,id,Item_Name,translated_text,cleaned_text
0,0,Americana Okra zero 400 gm,Americana Okra zero 400 gm,americana okra zero gm
1,1,ليمون اداليا 500 جم,Lemon Adalia 500 gm,lemon adalia gm
2,2,صلصه هاينز برطمان خصم عرض,Heinz Bartman Sauce Discount Offer,heinz bartman sauce discount offer
3,3,Dasani water 330ML,Dasani water 330ML,dasani water ml
4,4,بودرة عصير أناناس من سورس، 900 جم,"Soros Pineapple Juice Powder, 900g",soros pineapple juice powder g


In [4]:
# Rename the item-name column to the name selected for upload below.
df = df.rename(columns={"Item_Name": "product_name"})

In [5]:
# Keep one row per distinct, non-null product name and only the two columns
# that are uploaded.
df = (
    df.drop_duplicates(subset=["product_name"])
    .dropna(subset=["product_name"])
    [["id", "product_name"]]
)

In [None]:
# Upload the product names as demo_user.products, replacing the table if it
# already exists.
copy_to_sql(df, "products", "demo_user", if_exists="replace")

### Insert Class Names in DB

In [10]:
# Load the raw test data (it carries the "class" labels) and preview it.
df = pd.read_csv(TEST_DATA_PATH)
df.head()

Unnamed: 0,Item_Name,class,Brand,Weight,Number of units,Size of units,Price,T.Price,Pack,Unit
0,Americana Okra zero 400 gm,Vegetables & Fruits,Americana,400gm,1,,,,كيس,gm
1,ليمون اداليا 500 جم,"Tins, Jars & Packets",,500جم,1,,,,علبة,جم
2,صلصه هاينز برطمان خصم عرض,"Tins, Jars & Packets",هاينز,,1,,,,علبة,
3,Dasani water 330ML,Water,Dasani,330مل,1,,,,زجاجة,مل
4,بودرة عصير أناناس من سورس، 900 جم,Soft Drinks & Juices,سورس,900جم,1,,,,عبوة,جم


In [17]:
# Collect every distinct, non-null class label once and wrap the labels in a
# single-column frame named "class".
df_class = df["class"].dropna().unique()
df = pd.DataFrame(df_class, columns=["class"])

In [18]:
# Use the frame's own index as the class id and put the id column first.
df = df.assign(id=df.index)[["id", "class"]]

In [19]:
# Upload the class labels as demo_user.classes, replacing the table if it
# already exists.
copy_to_sql(df, "classes", "demo_user", if_exists="replace")

### Insert Product Embeddings in DB

In [None]:
# Load the product embeddings CSV; the "embeddings" column is parsed with
# ast.literal_eval in the next cell.
df = pd.read_csv(PRODUCT_TEST_EMBEDDINGS_PATH)

Unnamed: 0,id,embeddings
0,0,"[0.0131988525390625, 0.0203704833984375, -0.00..."
1,1,"[0.0016574859619140625, 0.041656494140625, 0.0..."
2,2,"[0.0162811279296875, 0.018951416015625, -0.025..."
3,3,"[0.0190277099609375, 0.0229949951171875, -0.01..."
4,4,"[0.0226593017578125, 0.0291900634765625, -0.02..."


In [None]:
# Turn each stored embedding string back into a Python list.
df['embeddings'] = df['embeddings'].map(ast.literal_eval)

# One column per embedding dimension, named embed_0 .. embed_{n-1}; the width
# is taken from the row labelled 0.
n_dims = len(df['embeddings'][0])
embed_names = [f'embed_{i}' for i in range(n_dims)]
emb_cols = pd.DataFrame(df['embeddings'].tolist(), columns=embed_names)

# Put the id column next to the expanded dimensions.
df_expanded = pd.concat([df[['id']], emb_cols], axis=1)

In [35]:
# Upload the expanded embeddings as demo_user.p_embeddings, replacing the
# table if it already exists.
copy_to_sql(df_expanded, "p_embeddings", "demo_user", if_exists="replace")

In [39]:
# Read the uploaded products back through the project's database wrapper.
df_products = td_db.execute_query("Select * from demo_user.products")
# NOTE(review): DataFrame is whatever `from teradataml import *` binds; the
# meaning of the second positional argument (False) should be confirmed
# against that constructor.
df_products = DataFrame(df_products, False)

In [42]:
# Display the products read back from the database.
df_products



id,product_name
2,صلصه هاينز برطمان خصم عرض
4,بودرة عصير أناناس من سورس، 900 جم
5,بسكو مصر لوكس 6 قطعه علبه 12
6,Anise - 100g
8,مسحوق برسيل جيل باللافندر - 3.9 كجم
9,شكولاته الشمعدان بيور بندق
7,لوبيا بلدى - 500 جم
3,Dasani water 330ML
1,ليمون اداليا 500 جم
0,Americana Okra zero 400 gm


### Insert Class Embeddings in DB

In [None]:
# Reload the raw test data (it carries the "class" labels) and preview it.
df = pd.read_csv(TEST_DATA_PATH)
df.head()

In [None]:
# Collect every distinct, non-null class label once and wrap the labels in a
# single-column frame named "class".
df_class = df["class"].dropna().unique()
df = pd.DataFrame(df_class, columns=["class"])

In [18]:
# DDL for amurd.p_embeddings: one product_id plus one FLOAT column per
# embedding dimension (dim_0 .. dim_1023).
#
# The column list is joined outside the f-string because a backslash inside
# an f-string replacement field is a SyntaxError before Python 3.12.
embedding_dim = 1024
column_defs = ",\n ".join(f"dim_{i} FLOAT" for i in range(embedding_dim))

query = f"""
CREATE MULTISET TABLE amurd.p_embeddings (
    product_id INTEGER NOT NULL,
    {column_defs}
) PRIMARY INDEX (product_id);
"""

In [19]:
# Execute the CREATE TABLE statement currently held in `query`.
td_db.execute_query(query)

[]

In [20]:
TABLE = "amurd.p_embeddings"
CSV_FILE = "full_embeddings.csv"  # temporary CSV consumed by the bulk insert

# Load the stored product embeddings.
df = pd.read_csv(PRODUCT_TEST_EMBEDDINGS_PATH, encoding='utf-8')

# Drop rows without an embedding BEFORE parsing: ast.literal_eval raises on a
# missing (NaN) value, so a dropna placed after the parse can never remove
# anything. Resetting the index keeps the rows aligned with emb_df in the
# index-based concat below.
df = df.dropna(subset=['embeddings']).reset_index(drop=True)
if df.empty:
    raise ValueError("no product embeddings to load")

df['embedding'] = df['embeddings'].apply(lambda s: np.array(ast.literal_eval(s), dtype=np.float32))

# NOTE(review): product_id is the row position in this file, not a column read
# from the CSV - confirm it matches the ids used for the products table.
df['product_id'] = range(len(df))

# Detect the embedding width from the first row.
dim = len(df['embedding'].iloc[0])

# Expand embeddings to one column per dimension (v1 .. v{dim}).
emb_df = pd.DataFrame(np.vstack(df['embedding']), columns=[f"v{i}" for i in range(1, dim + 1)])
parquet_df = pd.concat([df[['product_id']], emb_df], axis=1)

# Save the expanded frame to the temporary CSV (despite the variable name, the
# output format is CSV).
parquet_df.to_csv(CSV_FILE, index=False)

In [None]:
# Target columns of the INSERT: the id followed by dim_0 .. dim_{dim-1}.
cols = ["product_id", *(f"dim_{i}" for i in range(dim))]
placeholders = ", ".join("?" for _ in cols)
column_list = ", ".join(cols)

# The teradata_read_csv escape prefix makes the INSERT take its bind values
# from CSV_FILE.
q = f"{{fn teradata_read_csv({CSV_FILE})}}INSERT INTO {TABLE} ({column_list}) VALUES ({placeholders})"
tdf = td_db.execute_query(q)

In [None]:
# Read amurd.p_embeddings back, wrap the result in a DataFrame and display it.
tdf = td_db.execute_query("SELECT * FROM amurd.p_embeddings")
tdf = DataFrame(tdf, False)
tdf

In [None]:
# DDL for amurd.c_embeddings: one class_id plus one FLOAT column per
# embedding dimension (dim_0 .. dim_1023).
#
# The column list is joined outside the f-string because a backslash inside
# an f-string replacement field is a SyntaxError before Python 3.12.
embedding_dim = 1024
column_defs = ",\n ".join(f"dim_{i} FLOAT" for i in range(embedding_dim))

query = f"""
CREATE MULTISET TABLE amurd.c_embeddings (
    class_id INTEGER NOT NULL,
    {column_defs}
) PRIMARY INDEX (class_id);
"""

In [None]:
# Execute the CREATE TABLE statement currently held in `query`.
td_db.execute_query(query)

In [None]:
TABLE = "amurd.c_embeddings"
CSV_FILE = "full_embeddings.csv"  # temporary CSV consumed by the bulk insert

# Load the stored class embeddings.
df = pd.read_csv(CLASS_EMBEDDINGS_PATH, encoding='utf-8')

# Drop rows without an embedding BEFORE parsing: ast.literal_eval raises on a
# missing (NaN) value, so a dropna placed after the parse can never remove
# anything. Resetting the index keeps the rows aligned with emb_df in the
# index-based concat below.
df = df.dropna(subset=['embeddings']).reset_index(drop=True)
if df.empty:
    raise ValueError("no class embeddings to load")

df['embedding'] = df['embeddings'].apply(lambda s: np.array(ast.literal_eval(s), dtype=np.float32))

# NOTE(review): row_id is 1-based here, while the classes frame built earlier
# uses its 0-based index as id - confirm which numbering downstream joins
# expect.
df['row_id'] = range(1, len(df) + 1)

# Detect the embedding width from the first row.
dim = len(df['embedding'].iloc[0])

# Expand embeddings to one column per dimension (v1 .. v{dim}).
emb_df = pd.DataFrame(np.vstack(df['embedding']), columns=[f"v{i}" for i in range(1, dim + 1)])
parquet_df = pd.concat([df[['row_id']], emb_df], axis=1)

# Save the expanded frame to the temporary CSV (despite the variable name, the
# output format is CSV).
parquet_df.to_csv(CSV_FILE, index=False)

In [None]:
# Target columns must be the ones amurd.c_embeddings was created with:
# class_id followed by dim_0 .. dim_{dim-1}. (The previous row_id / v1..v{dim}
# names do not exist in that table.) This mirrors the p_embeddings insert.
cols = ["class_id"] + [f"dim_{i}" for i in range(dim)]
placeholders = ", ".join(["?"] * len(cols))

# The teradata_read_csv escape prefix makes the INSERT take its bind values
# from CSV_FILE.
# NOTE(review): values are presumably bound by position, so the CSV header
# names need not match the table columns - confirm against the driver docs.
q = f"{{fn teradata_read_csv({CSV_FILE})}}INSERT INTO {TABLE} ({', '.join(cols)}) VALUES ({placeholders})"
tdf = td_db.execute_query(q)

In [None]:
# Count the rows now in amurd.c_embeddings, wrap the result in a DataFrame
# and display it.
tdf = td_db.execute_query("SELECT COUNT(*) FROM amurd.c_embeddings")
tdf = DataFrame(tdf, False)
tdf

## In-DB Similarity

In [None]:
# Qualified name of the table that receives the similarity results. The
# spelling is kept as-is because a later query references this exact name.
RESULT_TABLE = "amurd.similiratiy_score"

In [None]:
# Build (but do not yet run) the DDL for the result table: one row per item
# holding the id of its closest category and the cosine distance to it.
q = f"""
    CREATE TABLE {RESULT_TABLE} (
        item_id BIGINT,
        closest_category_id BIGINT,
        cosine_distance FLOAT
    );
    """

In [None]:
# Execute the statement currently held in `q`.
td_db.execute_query(q)

In [None]:
# Generate the string of vector columns for 1024 dimensions
vector_cols = ", ".join([f"v{i}" for i in range(1, 1024 + 1)])

# Generate the quoted string for feature columns
vector_cols_quoted = ", ".join([f"'v{i}'" for i in range(1, 1024 + 1)])

In [None]:
# Embedding columns exactly as the two embedding tables were created
# (dim_0 .. dim_1023). They are built here so the statement does not depend on
# column lists defined in other cells.
feature_cols = [f"dim_{i}" for i in range(1024)]
feature_select = ", ".join(feature_cols)
feature_quoted = ", ".join(f"'{name}'" for name in feature_cols)

# For every product, compute the cosine distance to every class embedding
# in-database and keep only the closest class (rn = 1).
#
# Fixes relative to the previous version:
#   * the sources are the database tables amurd.p_embeddings and
#     amurd.c_embeddings, not the CSV file-path constants;
#   * the id columns are product_id / class_id, matching those tables;
#   * the statement that is executed is classification_sql, not `q` (which
#     holds the CREATE TABLE statement).
#
# NOTE(review): TOP 5 restricts the run to five products - presumably a smoke
# test; remove it to score the whole table.
classification_sql = f"""
INSERT INTO {RESULT_TABLE}
WITH RankedDistances AS (
    SELECT
        o.Target_ID AS item_id,
        o.Reference_ID AS category_id,
        o.Distance,
        ROW_NUMBER() OVER (PARTITION BY o.Target_ID ORDER BY o.Distance ASC) as rn
    FROM TD_VectorDistance (
        ON (SELECT TOP 5 product_id, {feature_select} FROM amurd.p_embeddings) AS TargetTable
        ON (SELECT class_id, {feature_select} FROM amurd.c_embeddings) AS ReferenceTable DIMENSION
        USING
            TargetIDColumn('product_id')
            RefIDColumn('class_id')
            TargetFeatureColumns({feature_quoted})
            RefFeatureColumns({feature_quoted})
            DistanceMeasure('cosine')
    ) AS o
)
SELECT
    item_id,
    category_id,
    Distance
FROM RankedDistances
WHERE rn = 1;
"""
td_db.execute_query(classification_sql)

In [None]:
# Count the rows written to the similarity result table, wrap the result in a
# DataFrame and display it.
tdf = td_db.execute_query("SELECT COUNT(*) FROM amurd.similiratiy_score")
tdf = DataFrame(tdf, False)
tdf

## Disconnect

In [None]:
# Close the database connection opened at the top of the notebook.
td_db.disconnect()