# Import packages

In [1]:
import pandas as pd
from teradataml import *
import ast
import json
import torch
from teradataml.dataframe.copy_to import copy_to_sql
from transformers.generation import GenerationMixin

from modules.db import TeradataDatabase
from modules.models import( 
    OpusTranslationModelConfig, 
                        OpusTranslationModel, 
                        SentenceEmbeddingConfig, 
                        SentenceEmbeddingModel, 
                        Falcon3EmbeddingModel
)
from utils import clean_text, load_embedding_model, unicode_clean, load_translation_model, load_falcon3_embedding_model
from constants import (
    CLEANED_TEST_DATA_PATH, TRAIN_VAL_DATA_PATH, CLASS_EMBEDDINGS_PATH, PRODUCT_TEST_EMBEDDINGS_PATH, 
    CLEANED_GPC_PATH, CLEANED_TEST_DATA_PATH, TEST_DATA_PATH, E5_LARGE_INSTRUCT_CONFIG_PATH, 
    OPUS_TRANSLATION_CONFIG_PATH, DATA_PATH, PRODUCT_TRAIN_EMBEDDINGS_PATH, FALCON3_7B_CONFIG_PATH

)


Logger initialized. All logs will be saved to: c:\Users\ss255385\OneDrive - Teradata Corporation\Desktop\AMuRD-Iteration-7\src\logs\borai_20250820_001235.log


## Connect to database

In [2]:
# Create the project's TeradataDatabase helper and call connect(); every later
# cell sends its SQL through this `td_db` object.
td_db = TeradataDatabase()
td_db.connect()

### Insert Product Names in DB

In [112]:
# Load the training/validation CSV and preview its first rows.
df = pd.read_csv(TRAIN_VAL_DATA_PATH)
df.head()

Unnamed: 0,Item_Name,class,Brand,Weight,Number of units,Size of units,Price,T.Price,Pack,Unit
0,مونتانا ذره 400 ج,Vegetables & Fruits,مونتانا,400جم,1,,,,كيس,جم
1,Ahmad Tea Fruit And Herb Selection Herbal Teab...,"Tea, Coffee & Hot Drinks",Ahmad Tea,,20,,,,علبة,
2,Lulu Brown Samoon 1pkt,Bakery,Lulu,,1,,,,عبوة,
3,فلفل رومى بلدى حشو وزن,Vegetables & Fruits,,,1,,,,كيس,
4,كانز,Soft Drinks & Juices,,,1,,,,كانز,


In [113]:
# Rename the raw CSV column to the name used for the `products` table.
df = df.rename(columns={"Item_Name": "product_name"})

In [114]:
# Keep one row per distinct, non-null product name. The frame's previous index
# (the row position in the CSV) is turned into the `id` column, and only
# `id` and `product_name` are kept.
df = (
    df.drop_duplicates(subset=["product_name"])
    .dropna(subset=["product_name"])
    .reset_index()
    .rename(columns={"index": "id"})
)
df = df[["id", "product_name"]]

In [None]:
# Run the project's unicode_clean helper over every product name.
df = df.assign(product_name=df["product_name"].apply(unicode_clean))


In [116]:
# Write the (id, product_name) frame to demo_user.products, replacing any
# existing table of that name.
copy_to_sql(
    df=df,
    table_name="products",
    schema_name="demo_user",
    if_exists="replace",
)

### Cleaning the Products

In [117]:
# SQL that normalises demo_user.products.product_name inside the database:
#   1. remove every run of digits,
#   2. replace characters matching [-_/\|] with a space (the doubled backslash
#      in this non-raw Python literal reaches the database as a single one),
#   3. replace every [[:punct:]] character with a space,
#   4. TRIM the result and convert it to lower case.
# NOTE(review): step 1 also strips quantities such as "400" from the names -
# confirm that this is intended.
cleaning_query = """
UPDATE demo_user.products
SET product_name = LOWER(
                  TRIM(
                    REGEXP_REPLACE(
                      REGEXP_REPLACE(
                        REGEXP_REPLACE(product_name, '[[:digit:]]+', ''), 
                        '[-_/\\|]', ' '),                              
                      '[[:punct:]]', ' '                              
                    )
                  )
                );
"""

In [118]:
# Run the UPDATE held in `cleaning_query` against the database.
tdf = td_db.execute_query(cleaning_query)

### Translating Products 

In [3]:
# Read the cleaned product names back from the database into a pandas frame
# and display it.
tdf = td_db.execute_query("Select * from demo_user.products")
df = pd.DataFrame(tdf)
df

Unnamed: 0,id,product_name
0,40837,ريب اى استيك
1,18749,بسمة ملوخية جم
2,0,مونتانا ذره ج
3,40838,americana beans with ghee g
4,18750,فرزه بطاطس مجمده
...,...,...
34238,18744,الامير ارز بسمتي سيلا ذهبي
34239,40834,تونه قطعه جم روز
34240,18745,مكرونة مرمريه جم
34241,40835,rehana ginger powder g


In [4]:
# Build the translation model from the config at OPUS_TRANSLATION_CONFIG_PATH.
model = load_translation_model(OPUS_TRANSLATION_CONFIG_PATH)

In [5]:
# Translate every product name, walking the list in slices of `batch_size`.
# NOTE(review): model.translate is still called once per product name, so the
# slicing does not reduce the number of model calls.
products = df["product_name"].tolist()
batch_size = 16
translations = []
for start in range(0, len(products), batch_size):
    chunk = products[start:start + batch_size]
    translations.extend([model.translate(name) for name in chunk])

In [None]:
# Attach the translations (built in the same order as the rows of `df`) and
# keep only the columns that are written back to the database.
df = df.assign(translated_name=translations)[["id", "translated_name"]]

In [8]:
# Replace demo_user.products with the (id, translated_name) frame; after this
# the table no longer has a product_name column.
copy_to_sql(
    df=df,
    table_name="products",
    schema_name="demo_user",
    if_exists="replace",
)

In [15]:
# Keep a local CSV copy of the translated names. The frame's index is written
# as well, because to_csv is called with its default index setting.
df.to_csv(DATA_PATH / "train_cleaned.csv")

### Insert Class Names in DB

In [16]:
# Reload the raw training/validation CSV; `df` previously held the translated
# product names.
df = pd.read_csv(TRAIN_VAL_DATA_PATH)
df.head()

Unnamed: 0,Item_Name,class,Brand,Weight,Number of units,Size of units,Price,T.Price,Pack,Unit
0,مونتانا ذره 400 ج,Vegetables & Fruits,مونتانا,400جم,1,,,,كيس,جم
1,Ahmad Tea Fruit And Herb Selection Herbal Teab...,"Tea, Coffee & Hot Drinks",Ahmad Tea,,20,,,,علبة,
2,Lulu Brown Samoon 1pkt,Bakery,Lulu,,1,,,,عبوة,
3,فلفل رومى بلدى حشو وزن,Vegetables & Fruits,,,1,,,,كيس,
4,كانز,Soft Drinks & Juices,,,1,,,,كانز,


In [17]:
# Distinct, non-null class labels, wrapped in a one-column frame.
df_class = df["class"].dropna().unique()
df_classes = pd.DataFrame({"class_name": df_class})

In [None]:
# Use the frame's index as the class id and place it first.
df_classes = df_classes.assign(id=df_classes.index)[["id", "class_name"]]

In [19]:
# Write the (id, class_name) lookup to demo_user.classes, replacing any
# existing table of that name.
copy_to_sql(
    df=df_classes,
    table_name="classes",
    schema_name="demo_user",
    if_exists="replace",
)

In [None]:
# Display the class lookup that was just uploaded.
df_classes

In [23]:
# Ground-truth class per product. `product_id` is the index of the frame read
# from TRAIN_VAL_DATA_PATH, which is the same value the products table stores
# as `id`, so the two can be joined later.
# The frame is built from non-mutating calls: the previous version assigned a
# column onto a slice of `df`, which raises pandas' SettingWithCopyWarning.
df_actual_class = (
    df[["class"]]
    .rename(columns={"class": "class_name"})
    .assign(product_id=lambda frame: frame.index)
    .loc[:, ["product_id", "class_name"]]
)

In [24]:
# Write the (product_id, class_name) ground truth to demo_user.actual_classes,
# replacing any existing table of that name.
copy_to_sql(
    df=df_actual_class,
    table_name="actual_classes",
    schema_name="demo_user",
    if_exists="replace",
)

### Cleaning the Class Names

In [25]:
# SQL that normalises demo_user.classes.class_name inside the database:
# replace every character matching [^\w\s] with a space, then every character
# matching [-_/\|] with a space, then TRIM the result and lower-case it.
# All backslashes are doubled in this non-raw Python literal so the database
# receives single backslashes. The earlier spelling relied on Python passing
# the unknown escapes "\w" and "\s" through unchanged, which is deprecated
# and warns on current interpreters.
cleaning_query = """
UPDATE demo_user.classes
SET class_name = LOWER(
                  TRIM(
                      REGEXP_REPLACE(
                        REGEXP_REPLACE(class_name, '[^\\w\\s]', ' '),
                      '[-_/\\|]', ' ')
                  )
                );

"""

In [26]:
# Run the UPDATE held in `cleaning_query` against demo_user.classes.
td_db.execute_query(cleaning_query)

[]

In [27]:
# Read the cleaned class names back into a pandas frame and display them.
tdf = td_db.execute_query("Select * from demo_user.classes")
df = pd.DataFrame(tdf)
df

Unnamed: 0,id,class_name
0,0,vegetables fruits
1,1,tea coffee hot drinks
2,2,bakery
3,3,soft drinks juices
4,4,tins jars packets
5,5,biscuits cakes
6,6,beef processed meat
7,7,chocolates sweets desserts
8,8,poultry
9,9,sauces dressings condiments


In [28]:
# SQL that normalises demo_user.actual_classes.class_name with the same two
# replacements used for demo_user.classes, so predicted and ground-truth names
# can be compared as strings: characters matching [^\w\s], then characters
# matching [-_/\|], are replaced by spaces before TRIM and LOWER.
# All backslashes are doubled in this non-raw Python literal so the database
# receives single backslashes. The earlier spelling relied on Python passing
# the unknown escapes "\w" and "\s" through unchanged, which is deprecated
# and warns on current interpreters.
cleaning_query = """
UPDATE demo_user.actual_classes
SET class_name = LOWER(
                  TRIM(
                      REGEXP_REPLACE(
                        REGEXP_REPLACE(class_name, '[^\\w\\s]', ' '),
                      '[-_/\\|]', ' ')
                  )
                );

"""

In [29]:
# Run the UPDATE held in `cleaning_query` against demo_user.actual_classes.
td_db.execute_query(cleaning_query)

[]

In [30]:
# Read the cleaned ground-truth classes back into a pandas frame and display
# them.
tdf = td_db.execute_query("Select * from demo_user.actual_classes")
df = pd.DataFrame(tdf)
df

Unnamed: 0,product_id,class_name
0,0,vegetables fruits
1,1,tea coffee hot drinks
2,2,bakery
3,3,vegetables fruits
4,4,soft drinks juices
...,...,...
42944,42944,soft drinks juices
42945,42945,beef processed meat
42946,42946,sauces dressings condiments
42947,42947,bakery


### Create Product Embeddings

In [3]:
# Fetch the products table (which now holds id and translated_name) into a
# pandas frame and display it.
tdf = td_db.execute_query("Select * from demo_user.products")
df = pd.DataFrame(tdf)
df

Unnamed: 0,id,translated_name
0,28486,Rich Pastrama
1,40837,Reb i Estek
2,8019,Americana bean with tahina gm
3,18749,Basma Molokhiya
4,28487,Regina Pasta Spaghetti
...,...,...
34238,18744,Prince Basmati's rice is golden.
34239,40834,Tuna Jam Rose
34240,18745,Marmaris Jam Pasta
34241,40835,Rehana Ginger Powder


In [4]:
# Build the embedding model from the config at FALCON3_7B_CONFIG_PATH. This
# rebinds `model`, which earlier held the translation model.
model = load_falcon3_embedding_model(FALCON3_7B_CONFIG_PATH)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Embed every translated product name with a single get_embeddings call.
# (A manual variant that embedded 24 names at a time and converted each result
# with .tolist() was tried here and is no longer used.)
products = df["translated_name"].tolist()
embeddings = model.get_embeddings(products)

In [6]:
# Sanity check: number of embeddings returned.
len(embeddings)

34243

In [7]:
# Store one embedding per row, then show the length of the embedding in the
# row labelled 0.
df["embeddings"] = embeddings
len(df["embeddings"][0])

1024

In [8]:
# Keep only id + embedding and save them to PRODUCT_TRAIN_EMBEDDINGS_PATH.
# Each embedding is written as its text representation, and a later cell
# parses that text back with ast.literal_eval.
# NOTE(review): that round trip presumably requires each embedding to be a
# plain Python list at this point - confirm.
df = df[["id", "embeddings"]]
df.to_csv(PRODUCT_TRAIN_EMBEDDINGS_PATH)

### Insert Product Embeddings in DB (Directly from CSV)

In [3]:
# Load the saved product embeddings and turn each stored embedding string back
# into a Python object with ast.literal_eval.
df = pd.read_csv(PRODUCT_TRAIN_EMBEDDINGS_PATH)
df = df.assign(embeddings=df["embeddings"].map(ast.literal_eval))

In [6]:
# Spread each embedding over its own columns: embed_0, embed_1, ...
# The column count is taken from the first embedding.
vectors = df["embeddings"].to_list()
column_names = [f"embed_{dim}" for dim in range(len(vectors[0]))]
emb_cols = pd.DataFrame(vectors, columns=column_names)

In [7]:
# Put the product id next to its embedding columns and display the result.
id_column = df[["id"]]
df_expanded = pd.concat([id_column, emb_cols], axis="columns")
df_expanded

Unnamed: 0,id,embed_0,embed_1,embed_2,embed_3,embed_4,embed_5,embed_6,embed_7,embed_8,...,embed_1014,embed_1015,embed_1016,embed_1017,embed_1018,embed_1019,embed_1020,embed_1021,embed_1022,embed_1023
0,28486,-0.053040,-0.132202,-0.546600,-0.071391,0.240397,0.567464,-0.529419,0.006307,0.296672,...,0.008301,-0.243713,-0.080861,0.046697,0.644694,0.240397,-0.742839,-0.181641,-0.526489,-0.165690
1,40837,0.314960,-0.313672,-0.429980,0.322302,0.014648,0.145654,0.038904,-0.072412,-0.376093,...,-0.278992,-0.265942,-0.265717,-0.017139,0.372803,0.046375,-0.997900,0.779590,0.382916,-0.658105
2,8019,-0.610509,0.149785,-1.286412,0.140034,-0.028373,0.385945,-0.341570,0.367379,-0.032806,...,-0.136544,-0.412519,-0.307626,0.481463,0.155788,0.058036,-0.664551,0.367222,0.329080,-0.489223
3,18749,-0.372854,0.337285,-0.839183,-0.433854,0.174530,-0.265279,-0.180115,-0.096375,-0.149292,...,-0.539795,-0.118022,-0.370951,0.169769,0.365662,0.140876,0.004018,0.943339,-0.207314,-0.676741
4,28487,-0.300743,-0.211040,-0.422302,-0.019791,-0.047844,0.274445,-0.146683,-0.401855,0.315201,...,-0.553650,-0.715851,0.387726,0.267700,0.255501,-0.458557,-0.453208,0.195370,-0.282562,-0.467407
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34238,18744,-0.247701,0.248315,-1.584452,0.120605,0.131185,0.837212,0.308244,0.238973,0.080504,...,-0.094910,-0.473728,-0.019653,-0.411872,-0.003798,0.050388,-0.230252,-0.002987,-0.571214,-0.730760
34239,40834,-0.183273,-0.094780,-0.553833,0.208431,0.206909,0.296570,-0.070648,-0.304901,0.109421,...,0.343018,-0.294449,-0.224854,0.533112,0.357544,0.767090,-0.502319,-0.529236,-0.507324,-0.379807
34240,18745,-0.047754,-1.160211,-1.533820,-0.272998,0.022885,0.088623,0.081934,0.154150,0.200488,...,-0.616455,0.165356,0.056958,0.852167,0.060156,0.315503,-0.900391,0.636816,-0.000488,0.029370
34241,40835,0.005699,0.026611,-0.754150,0.081802,0.187469,0.046196,0.049423,0.140182,0.128983,...,-0.649109,-0.100024,-0.343079,0.075012,0.626526,-0.877930,0.207153,-0.032074,-0.715683,-0.662292


In [8]:
# Write the product embeddings to demo_user.p_embeddings, replacing any
# existing table of that name.
copy_to_sql(
    df=df_expanded,
    table_name="p_embeddings",
    schema_name="demo_user",
    if_exists="replace",
)

### Create Class Embeddings and Insert in DB

In [9]:
# Reload the raw training/validation CSV to obtain the class labels.
df = pd.read_csv(TRAIN_VAL_DATA_PATH)
df.head()

Unnamed: 0,Item_Name,class,Brand,Weight,Number of units,Size of units,Price,T.Price,Pack,Unit
0,مونتانا ذره 400 ج,Vegetables & Fruits,مونتانا,400جم,1,,,,كيس,جم
1,Ahmad Tea Fruit And Herb Selection Herbal Teab...,"Tea, Coffee & Hot Drinks",Ahmad Tea,,20,,,,علبة,
2,Lulu Brown Samoon 1pkt,Bakery,Lulu,,1,,,,عبوة,
3,فلفل رومى بلدى حشو وزن,Vegetables & Fruits,,,1,,,,كيس,
4,كانز,Soft Drinks & Juices,,,1,,,,كانز,


In [10]:
# Distinct, non-null class labels. This is the same expression that produced
# the ids of the `classes` table, so the index of this frame lines up with
# classes.id as long as the CSV is unchanged.
df_class = df["class"].dropna().unique()
df = pd.DataFrame({"class": df_class})

In [12]:
# Build the embedding model from the config at FALCON3_7B_CONFIG_PATH.
# NOTE(review): the same loader call appears in the product-embedding section;
# in a single top-to-bottom run this presumably loads the model a second time.
model = load_falcon3_embedding_model(FALCON3_7B_CONFIG_PATH)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [15]:
# Embed every class label with a single get_embeddings call. The labels come
# straight from the CSV, not from the cleaned `classes` table.
# (A manual variant that embedded 16 labels at a time was tried here and is no
# longer used.)
classes = df["class"].tolist()
embeddings = model.get_embeddings(classes)

In [16]:
# Convert the embeddings to nested lists and spread them over the columns
# embed_0, embed_1, ...; the column count is taken from the first embedding.
embeddings = embeddings.tolist()
embedding_dim = len(embeddings[0])
column_names = [f"embed_{dim}" for dim in range(embedding_dim)]
emb_cols = pd.DataFrame(embeddings, columns=column_names)

In [17]:
# Turn the frame's index into the class `id` column and put it next to the
# embedding columns.
df = df.reset_index().rename(columns={"index": "id"})
df_expanded = pd.concat([df[["id"]], emb_cols], axis="columns")

In [18]:
# Write the class embeddings to demo_user.c_embeddings, replacing any existing
# table of that name.
copy_to_sql(
    df=df_expanded,
    table_name="c_embeddings",
    schema_name="demo_user",
    if_exists="replace",
)

## In-DB Similarity

In [3]:
# Fully qualified name of the table that receives the nearest class per
# product. The spelling "similiratiy" is the real table name and is also
# hard-coded in the cell that reads the table back, so change both together.
RESULT_TABLE = "demo_user.similiratiy_score"

In [4]:
# DDL for the result table: one row per product holding the id of its closest
# class and the cosine distance to it.
# NOTE(review): this is a plain CREATE TABLE, so running it again while the
# table exists presumably fails - confirm, and drop the table before a re-run.
q = f"""
    CREATE TABLE {RESULT_TABLE} (
        item_id BIGINT,
        closest_category_id BIGINT,
        cosine_distance FLOAT
    );
    """

In [5]:
# Run the CREATE TABLE statement held in `q`.
td_db.execute_query(q)

[]

In [6]:
# Names of the 1024 embedding columns shared by both embedding tables.
embedding_columns = [f"embed_{dim}" for dim in range(1024)]

# Comma-separated list for the SELECT clauses.
vector_cols = ", ".join(embedding_columns)

# Same list with every name single-quoted, for the *FeatureColumns arguments.
vector_cols_quoted = ", ".join(f"'{column}'" for column in embedding_columns)

In [7]:
# Nearest-class assignment, done entirely inside the database.
# TD_VectorDistance computes the cosine distance between every product vector
# (target) and every class vector (reference). ROW_NUMBER ranks the classes of
# each product by ascending distance and only the closest one (rn = 1) is
# inserted into RESULT_TABLE.
# The INSERT has no column list, so product_id / class_id / Distance are
# stored positionally in the table's three columns.
# NOTE(review): p_embeddings and c_embeddings are referenced without the
# demo_user qualifier used when they were written - this presumably relies on
# the session's default database; confirm.
# NOTE(review): running this cell again appends a second copy of the rows.
classification_sql = f"""
INSERT INTO {RESULT_TABLE}
WITH RankedDistances AS (
    SELECT
        o.Target_ID AS product_id,
        o.Reference_ID AS class_id,
        o.Distance,
        ROW_NUMBER() OVER (PARTITION BY o.Target_ID ORDER BY o.Distance ASC) as rn
    FROM TD_VectorDistance (
        ON (SELECT id, {vector_cols} FROM p_embeddings) AS TargetTable
        ON (SELECT id, {vector_cols} FROM c_embeddings) AS ReferenceTable DIMENSION
        USING
            TargetIDColumn('id')
            RefIDColumn('id')
            TargetFeatureColumns({vector_cols_quoted})
            RefFeatureColumns({vector_cols_quoted})
            DistanceMeasure('cosine')
    ) AS o
)
SELECT
    product_id,
    class_id,
    Distance
FROM RankedDistances
WHERE rn = 1;
"""
td_db.execute_query(classification_sql)

[]

In [8]:
# Read the nearest-class rows back and display them. The table name is taken
# from RESULT_TABLE rather than typed out again, so this query cannot drift
# from the CREATE and INSERT statements that use the same constant.
tdf = td_db.execute_query(f"SELECT * FROM {RESULT_TABLE}")
# NOTE(review): the second positional argument of teradataml's DataFrame is
# presumably `index` - confirm against the installed version.
tdf = DataFrame(tdf, False)
tdf



item_id,closest_category_id,cosine_distance
40975,40,0.0676982350737747
40591,10,0.0984173398117638
40908,4,0.0892871241484905
40978,5,0.1393760431930724
40599,7,0.1152528486815445
40913,7,0.0925126953199664
40898,29,0.0929673268408762
40892,29,0.1203216993062795
40885,29,0.1740787123742769
40586,4,0.0914496400648782


In [9]:
# Join every prediction with the product text, the predicted class name and
# the ground-truth class name.
# NOTE(review): the column aliased `similarity_score` holds r.cosine_distance,
# a distance for which smaller means closer, not a similarity.
# NOTE(review): products, classes and actual_classes are referenced without
# the demo_user qualifier - this presumably relies on the session's default
# database; confirm.
results_query = f"""
SELECT
    p.translated_name AS product_name,
    c.class_name AS predicted_class,
    a.class_name AS actual_class,
    r.cosine_distance AS similarity_score
FROM {RESULT_TABLE} r
JOIN products p
    ON r.item_id = p.id
JOIN classes c
    ON r.closest_category_id = c.id
JOIN actual_classes a
    ON a.product_id = p.id;
"""

In [10]:
# Run the join and load the predictions into a pandas frame.
tdf = td_db.execute_query(results_query)
df = pd.DataFrame(tdf)

In [11]:
# Display the predictions next to the ground truth.
df

Unnamed: 0,product_name,predicted_class,actual_class,similarity_score
0,Miro Jum Milk Powder,jams spreads syrups,dairy eggs,0.105301
1,The miller's going through a bottle of nuts.,party supplies and gifts,nuts dates dried fruits,0.149492
2,Rich. Smoked chicken breasts.,nuts dates dried fruits,poultry,0.090157
3,Montana Corn C,beef processed meat,vegetables fruits,0.155347
4,Large box.,cleaning supplies,biscuits cakes,0.089915
...,...,...,...,...
34238,Corn Popcorn Bright Star Salted C,nuts dates dried fruits,chips crackers,0.111816
34239,Pepsi Cola Plastic Liter,soft drinks juices,soft drinks juices,0.114064
34240,Bresiden Cheese Cheddar Slices 1,jams spreads syrups,dairy eggs,0.079105
34241,Chloryl Gm,jams spreads syrups,cleaning supplies,0.149459


In [12]:
# Drop every row that has a missing value in any column before scoring.
df = df.dropna()

In [13]:
from sklearn.metrics import f1_score

# Weighted F1 of the nearest-class prediction against the ground-truth class,
# computed on the client side.
y_true = df["actual_class"].tolist()
y_pred = df["predicted_class"].tolist()

f1_score(y_true=y_true, y_pred=y_pred, average="weighted")

0.10198909705571077

In [15]:
# Write the prediction frame to demo_user.results, replacing any existing
# table of that name, so the metrics can be computed inside the database.
copy_to_sql(
    df=df,
    table_name="results",
    schema_name="demo_user",
    if_exists="replace",
)

In [16]:
# Compute classification metrics inside the database: TD_ClassificationEvaluator
# reads demo_user.results, compares actual_class with predicted_class, and
# writes its output to the permanent table classification_metrics.
# NOTE(review): Labels is a hand-maintained list and must match the stored
# class names exactly. The cleaning SQL replaces each punctuation character
# with a space without collapsing runs, so confirm that the stored names
# really contain single spaces as written here.
# NOTE(review): OUT PERMANENT TABLE presumably fails when
# classification_metrics already exists - confirm before re-running.
query = """
SELECT * FROM TD_ClassificationEvaluator (
   ON demo_user.results AS InputTable
   OUT PERMANENT TABLE OutputTable(classification_metrics)
   USING
       ObservationColumn('actual_class')
       PredictionColumn('predicted_class')
       Labels('vegetables fruits', 'tea coffee hot drinks', 'bakery', 'soft drinks juices', 'tins jars packets', 'biscuits cakes', 'beef processed meat', 'chocolates sweets desserts', 'poultry', 'sauces dressings condiments', 'nuts dates dried fruits', 'cleaning supplies', 'dairy eggs', 'chips crackers', 'water', 'rice pasta pulses', 'personal care skin body care', 'furniture', 'cooking ingredients', 'vegetables herbs', 'condiments dressings marinades', 'sweets desserts', 'laundry detergents', 'tea and coffee', 'disposables napkins', 'sugar home baking', 'perfumes deodorants', 'fruits', 'stationary', 'jams spreads syrups', 'baby care', 'home appliances', 'fish', 'breakfast cereals bars', 'hair shower bath soap', 'party supplies and gifts', 'wear', 'footwear', 'candles air fresheners', 'beef lamb meat', 'pets care', 'mobile tablets', 'dental care')
) AS dt;
"""
tdf = td_db.execute_query(query)


In [17]:
# Display the metrics table written by the evaluator.
DataFrame(td_db.execute_query("select * from demo_user.classification_metrics"))



SeqNum,Metric,MetricValue,index_label
3,Micro-Recall,0.1334392374900714,2
5,Macro-Precision,0.0509171719935884,4
6,Macro-Recall,0.0347441596520388,5
7,Macro-F1,0.0221234880297309,6
9,Weighted-Recall,0.1334392374900714,8
10,Weighted-F1,0.1549303786833389,9
8,Weighted-Precision,0.4599321938560574,7
4,Micro-F1,0.1334392374900714,3
2,Micro-Precision,0.1334392374900714,1
1,Accuracy,0.1334392374900714,0


## Disconnect

In [None]:
# Close the database connection opened at the top of the notebook.
td_db.disconnect()