# Import packages

In [1]:
import pandas as pd
from teradataml import *
import ast
import json
import torch
from teradataml.dataframe.copy_to import copy_to_sql
from transformers.generation import GenerationMixin

from modules.db import TeradataDatabase
from modules.models import( 
    OpusTranslationModelConfig, 
                        OpusTranslationModel, 
                        SentenceEmbeddingConfig, 
                        SentenceEmbeddingModel, 
)
from utils import clean_text, load_embedding_model, unicode_clean, load_translation_model
from constants import (
    CLEANED_TEST_DATA_PATH, TRAIN_VAL_DATA_PATH, CLASS_EMBEDDINGS_PATH, PRODUCT_TEST_EMBEDDINGS_PATH, 
    CLEANED_GPC_PATH, CLEANED_TEST_DATA_PATH, TEST_DATA_PATH, E5_LARGE_INSTRUCT_CONFIG_PATH, 
    OPUS_TRANSLATION_CONFIG_PATH, DATA_PATH, PRODUCT_TRAIN_EMBEDDINGS_PATH, QWEN3_8B_CONFIG_PATH, VALIDATION_DATA_PATH,
    FULL_DATA_SET_DATA_PATH, PRODUCT_FULL_DATASET_EMBEDDINGS_PATH,  CLEANED_FULL_DATASET_DATA_PATH,
    CLASS_EMBEDDINGS_PATH_QWEN, PRODUCT_FULL_DATASET_EMBEDDINGS__QWEN_PATH
)


Logger initialized. All logs will be saved to: c:\Users\ss255385\OneDrive - Teradata Corporation\Desktop\AMuRD-Iteration-7\src\logs\borai_20250824_130654.log


## Connect to database

In [2]:
# Open the database session that every later cell issues queries through;
# it is closed again in the final "Disconnect" cell.
td_db = TeradataDatabase()
td_db.connect()

### Combine Dataset into 1 file

In [4]:
# Load the three dataset splits in the order train/val, test, validation.
df_train, df_test, df_valid = (
    pd.read_csv(split_path)
    for split_path in (TRAIN_VAL_DATA_PATH, TEST_DATA_PATH, VALIDATION_DATA_PATH)
)

In [5]:
# Stack the splits into a single frame with a fresh 0..n-1 index and persist it
# without writing that index as a column.
split_frames = [df_train, df_test, df_valid]
full_df = pd.concat(split_frames, ignore_index=True)
full_df.to_csv(FULL_DATA_SET_DATA_PATH, index=False)

### Insert Product Names in DB

In [3]:
# Load the raw product rows that will be pushed to the `products` table.
# NOTE(review): this reads TEST_DATA_PATH, while the embedding sections further
# down read PRODUCT_FULL_DATASET_EMBEDDINGS_PATH / FULL_DATA_SET_DATA_PATH --
# confirm which dataset this section is meant to load.
df = pd.read_csv(TEST_DATA_PATH)
df.head()

Unnamed: 0,Item_Name,class,Brand,Weight,Number of units,Size of units,Price,T.Price,Pack,Unit
0,Americana Okra zero 400 gm,Vegetables & Fruits,Americana,400gm,1,,,,كيس,gm
1,ليمون اداليا 500 جم,"Tins, Jars & Packets",,500جم,1,,,,علبة,جم
2,صلصه هاينز برطمان خصم عرض,"Tins, Jars & Packets",هاينز,,1,,,,علبة,
3,Dasani water 330ML,Water,Dasani,330مل,1,,,,زجاجة,مل
4,بودرة عصير أناناس من سورس، 900 جم,Soft Drinks & Juices,سورس,900جم,1,,,,عبوة,جم


In [4]:
# Use the column name the database tables are built around.
df = df.rename(columns={"Item_Name": "product_name"})

In [5]:
# Keep one row per distinct, non-null product name and expose the original row
# position as `id`.
# Built as a single chain ending in .copy(): the original left `df` as a
# column-subset slice, so the assignment in the next cell raised
# SettingWithCopyWarning.
df = (
    df.drop_duplicates(subset=["product_name"])
    .dropna(subset=["product_name"])
    .reset_index()                      # old index becomes the "index" column
    .rename(columns={"index": "id"})
    [["id", "product_name"]]
    .copy()
)

In [6]:
# Run the project's unicode_clean helper over every product name.
df["product_name"] = df["product_name"].map(unicode_clean)

In [7]:
# (Re)create demo_user.products from the cleaned frame.
copy_to_sql(
    df=df,
    table_name="products",
    schema_name="demo_user",
    if_exists="replace",
)

### Cleaning the Products

In [8]:
# In-database normalisation of demo_user.products.product_name, applied inside out:
#   1. strip every run of digits,
#   2. replace '-', '_', '/', '\' and '|' with a space,
#   3. replace remaining POSIX punctuation with a space,
#   4. TRIM and lower-case the result.
# The doubled backslash is a Python escape; the database receives a single '\'.
# NOTE(review): the output displayed further down still contains the Arabic
# comma ('،'), so step 3 does not appear to cover it -- confirm whether that
# is acceptable.
cleaning_query = """
UPDATE demo_user.products
SET product_name = LOWER(
                  TRIM(
                    REGEXP_REPLACE(
                      REGEXP_REPLACE(
                        REGEXP_REPLACE(product_name, '[[:digit:]]+', ''), 
                        '[-_/\\|]', ' '),                              
                      '[[:punct:]]', ' '                              
                    )
                  )
                );
"""

In [9]:
# Run the UPDATE defined in the previous cell against the database.
tdf = td_db.execute_query(cleaning_query)

### Translating Products 

In [10]:
# Pull the cleaned product rows back into pandas for translation.
tdf = td_db.execute_query("Select * from demo_user.products")
df = pd.DataFrame(tdf)
df

Unnamed: 0,id,product_name
0,0,americana okra zero gm
1,1,ليمون اداليا جم
2,2,صلصه هاينز برطمان خصم عرض
3,3,dasani water ml
4,4,بودرة عصير أناناس من سورس، جم
...,...,...
4568,4767,كادبورى شوكولاته كريسبللو ج
4569,4768,جهينه كوكتيل ل
4570,4769,دبل ديرى مشروب بنكهه ليمون مل
4571,4770,galaxy smooth milk chocolate x g


In [11]:
# Build the translation model from the OPUS config file.
# Note: `model` is rebound to an embedding model in later sections.
model = load_translation_model(OPUS_TRANSLATION_CONFIG_PATH)

In [12]:
# Translate every product name, walking the list in chunks of `batch_size`.
# Each name is still passed to model.translate() individually; the chunking
# only groups the calls.
products = df["product_name"].tolist()
batch_size = 32
translations = []
for start in range(0, len(products), batch_size):
    chunk = products[start:start + batch_size]
    translations += [model.translate(name) for name in chunk]

KeyboardInterrupt: 

In [None]:
# Attach the translations row by row and keep only the id/translation pair.
df = df.assign(translated_name=translations)[["id", "translated_name"]]

In [None]:
# (Re)create demo_user.translated_products_2 from the id/translation frame.
copy_to_sql(
    df=df,
    table_name="translated_products_2",
    schema_name="demo_user",
    if_exists="replace",
)

In [None]:
# Keep a CSV copy of the translated products next to the other data files.
cleaned_csv_path = DATA_PATH / "cleaned_full_dataset.csv"
df.to_csv(cleaned_csv_path)

### Insert Class Names in DB

In [None]:
# Load the labelled rows the class tables are derived from.
# NOTE(review): class ids are assigned from the order of unique labels in THIS
# file, while the class-embedding section assigns ids from
# FULL_DATA_SET_DATA_PATH. The two id spaces only agree if both files list the
# classes in the same first-appearance order -- confirm the intended source.
df = pd.read_csv(TEST_DATA_PATH)
df.head()

Unnamed: 0,Item_Name,class,Brand,Weight,Number of units,Size of units,Price,T.Price,Pack,Unit
0,مونتانا ذره 400 ج,Vegetables & Fruits,مونتانا,400جم,1,,,,كيس,جم
1,Ahmad Tea Fruit And Herb Selection Herbal Teab...,"Tea, Coffee & Hot Drinks",Ahmad Tea,,20,,,,علبة,
2,Lulu Brown Samoon 1pkt,Bakery,Lulu,,1,,,,عبوة,
3,فلفل رومى بلدى حشو وزن,Vegetables & Fruits,,,1,,,,كيس,
4,كانز,Soft Drinks & Juices,,,1,,,,كانز,


In [None]:
# Distinct non-null class labels, in order of first appearance.
df_class = pd.unique(df["class"].dropna())
df_classes = pd.DataFrame(data={"class_name": df_class})

In [None]:
# Use the row position as the class id and put it first.
df_classes = df_classes.assign(id=df_classes.index)[["id", "class_name"]]

In [None]:
# (Re)create demo_user.classes from the id/class_name frame.
copy_to_sql(
    df=df_classes,
    table_name="classes",
    schema_name="demo_user",
    if_exists="replace",
)

In [None]:
# Display the id -> class_name mapping that was just uploaded.
df_classes

Unnamed: 0,id,class_name
0,0,Vegetables & Fruits
1,1,"Tea, Coffee & Hot Drinks"
2,2,Bakery
3,3,Soft Drinks & Juices
4,4,"Tins, Jars & Packets"
5,5,Biscuits & Cakes
6,6,Beef & Processed Meat
7,7,"Chocolates, Sweets & Desserts"
8,8,Poultry
9,9,"Sauces, Dressings & Condiments"


In [None]:
# Ground-truth label per product row, keyed by the row's position in `df`.
# Built as one chain: the original renamed and assigned columns in place on a
# slice of `df`, which raised SettingWithCopyWarning.
df_actual_class = (
    df[["class"]]
    .rename(columns={"class": "class_name"})
    .assign(product_id=lambda frame: frame.index)
    [["product_id", "class_name"]]
)

In [None]:
# (Re)create demo_user.actual_classes from the ground-truth frame.
copy_to_sql(
    df=df_actual_class,
    table_name="actual_classes",
    schema_name="demo_user",
    if_exists="replace",
)

### Cleaning the Class Names

In [None]:
# In-database normalisation of demo_user.classes.class_name, applied inside out:
#   1. replace every character that is neither a word character nor whitespace
#      with a space,
#   2. replace '-', '_', '/', '\' and '|' with a space,
#   3. TRIM and lower-case the result.
# Backslashes are doubled because this is a normal (non-raw) Python string: a
# bare "\w" or "\s" is an invalid escape sequence and raises a SyntaxWarning on
# current Python versions. The regex text sent to the database is unchanged.
cleaning_query = """
UPDATE demo_user.classes
SET class_name = LOWER(
                  TRIM(
                      REGEXP_REPLACE(
                        REGEXP_REPLACE(class_name, '[^\\w\\s]', ' '),
                      '[-_/\\|]', ' ')
                  )
                );

"""

In [None]:
# Apply the class-name normalisation UPDATE defined in the previous cell.
td_db.execute_query(cleaning_query)

[]

In [44]:
# Read the classes table back to check the normalised labels.
tdf = td_db.execute_query("Select * from demo_user.classes")
df = pd.DataFrame(tdf)
df

Unnamed: 0,id,class_name
0,0,vegetables fruits
1,1,tea coffee hot drinks
2,2,bakery
3,3,soft drinks juices
4,4,tins jars packets
5,5,biscuits cakes
6,6,beef processed meat
7,7,chocolates sweets desserts
8,8,poultry
9,9,sauces dressings condiments


In [None]:
# Same normalisation as for demo_user.classes, applied to the ground-truth
# labels so predicted and actual class names can be compared as plain strings.
# Backslashes are doubled because this is a normal (non-raw) Python string: a
# bare "\w" or "\s" is an invalid escape sequence and raises a SyntaxWarning on
# current Python versions. The regex text sent to the database is unchanged.
cleaning_query = """
UPDATE demo_user.actual_classes
SET class_name = LOWER(
                  TRIM(
                      REGEXP_REPLACE(
                        REGEXP_REPLACE(class_name, '[^\\w\\s]', ' '),
                      '[-_/\\|]', ' ')
                  )
                );

"""

In [None]:
# Apply the ground-truth label normalisation UPDATE defined in the previous cell.
td_db.execute_query(cleaning_query)

[]

In [None]:
# Read the ground-truth table back to check the normalised labels.
tdf = td_db.execute_query("Select * from demo_user.actual_classes")
df = pd.DataFrame(tdf)
df

Unnamed: 0,product_id,class_name
0,0,vegetables fruits
1,49149,rice pasta pulses
2,32766,rice pasta pulses
3,1,tea coffee hot drinks
4,49150,poultry
...,...,...
52489,32761,sauces dressings condiments
52490,32762,biscuits cakes
52491,32763,soft drinks juices
52492,32764,sauces dressings condiments


### Create Product Embeddings

In [48]:
# Load the rows whose names will be embedded.
# NOTE(review): the embedding cell below reads df["translated_name"], but the
# `products` table as created earlier in this notebook only holds
# id/product_name; translations are written to `translated_products_2`.
# Confirm which table this query should read on a fresh run.
tdf = td_db.execute_query("Select * from demo_user.products")
df = pd.DataFrame(tdf)
df

Unnamed: 0,id,translated_name
0,0,Montana Corn C
1,40837,Reb i Estek
2,18749,Basma Molokhiya
3,40838,Americana Beans with ghee g
4,1,AHMAD TEA FRUIT AND HERB SELECTION HERBAL TEAB...
...,...,...
37579,40833,Fragile disc.
37580,18745,Marmaris Jam Pasta
37581,40834,Tuna Jam Rose
37582,18747,Tiger kebab potato chips gm


In [49]:
# Load the embedding model described by the E5-large-instruct config.
# This rebinds `model`, replacing the translation model loaded earlier.
model = load_embedding_model(E5_LARGE_INSTRUCT_CONFIG_PATH)

In [None]:
# Embed every translated product name in one call; the result is converted
# with .tolist() a few cells below.
products = df["translated_name"].tolist()
embeddings = model.get_embeddings(products)

Batches:   0%|          | 0/1175 [00:00<?, ?it/s]

In [51]:
# Sanity check: number of embeddings returned for the products.
len(embeddings)

37584

In [None]:
# Store each embedding as a plain Python list in its own cell, then show the
# embedding width as a quick check.
embedding_rows = embeddings.tolist() #remove the .tolist in case of QWEN
df["embeddings"] = embedding_rows
len(df["embeddings"][0])

1024

In [53]:
# Persist only the id/embedding pair; the section below reloads it from this CSV.
df = df.loc[:, ["id", "embeddings"]]
df.to_csv(PRODUCT_FULL_DATASET_EMBEDDINGS_PATH)

### Insert Product Embeddings in DB (Directly from CSV)

In [9]:
# Reload the saved embeddings. The CSV stores each vector as its textual list
# representation, so parse it back into a real Python list.
df = pd.read_csv(PRODUCT_FULL_DATASET_EMBEDDINGS_PATH)
df["embeddings"] = df["embeddings"].map(ast.literal_eval)

In [11]:
# Spread each embedding list across one column per dimension (embed_0, embed_1, ...).
embedding_dim = len(df["embeddings"][0])
emb_cols = pd.DataFrame(
    df["embeddings"].to_list(),
    columns=[f"embed_{i}" for i in range(embedding_dim)],
)

In [12]:
# Put the product id next to its embedding columns (rows are aligned by index).
id_frame = df[["id"]]
df_expanded = pd.concat([id_frame, emb_cols], axis="columns")
df_expanded

Unnamed: 0,id,embed_0,embed_1,embed_2,embed_3,embed_4,embed_5,embed_6,embed_7,embed_8,...,embed_1014,embed_1015,embed_1016,embed_1017,embed_1018,embed_1019,embed_1020,embed_1021,embed_1022,embed_1023
0,0,0.015328,0.039254,-0.003427,-0.037174,0.006399,-0.043421,-0.010520,0.065841,0.015048,...,-0.029317,-0.012436,-0.017701,-0.027084,0.021784,0.057254,0.006207,-0.015104,-0.029992,0.046426
1,40837,0.029369,0.050631,0.001408,-0.055559,-0.008562,-0.009442,0.005418,0.018761,0.006816,...,-0.004117,-0.020857,0.000761,-0.010358,0.029784,0.029281,-0.000801,-0.029878,-0.024836,0.045829
2,18749,0.030012,0.014037,0.013602,-0.053966,-0.001614,-0.008398,0.015482,0.014098,0.028291,...,-0.030414,-0.007876,0.017925,-0.008461,0.018188,0.047249,0.018842,-0.011374,-0.035489,0.009465
3,40838,0.036557,0.044264,0.001074,-0.026326,0.020214,-0.029555,-0.007647,0.053449,0.013849,...,-0.033112,-0.020942,0.017518,-0.046515,0.026689,0.037485,-0.020751,-0.029199,-0.041687,0.034662
4,1,0.042021,0.029750,-0.029577,-0.036600,0.030576,-0.050543,-0.027985,0.035200,0.034652,...,-0.052281,-0.079466,0.016898,-0.028278,0.013814,0.037772,0.025222,-0.044246,-0.055822,0.016314
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37579,40833,0.025366,0.027520,-0.007039,-0.026696,0.037357,-0.009309,0.000706,0.076981,0.042368,...,-0.024959,-0.027232,0.030310,-0.012734,0.033812,0.036596,0.043399,-0.030106,-0.046717,0.015050
37580,18745,0.031433,0.040032,0.011296,-0.035078,0.006681,0.009141,-0.001431,0.023426,0.014119,...,-0.020247,-0.037832,0.021014,-0.007632,0.030130,0.032414,-0.010519,-0.021812,-0.041907,0.022395
37581,40834,0.029359,0.036676,-0.016532,-0.037347,0.001596,-0.029049,-0.011044,0.032673,0.041653,...,-0.035220,-0.044813,-0.003143,0.009704,0.031135,0.037403,-0.001130,-0.032121,-0.046928,0.012263
37582,18747,0.019306,0.034287,-0.002833,-0.042215,0.011933,-0.015229,-0.016382,0.012368,0.037919,...,-0.046862,-0.024158,0.020916,-0.040049,0.002285,0.004963,0.017478,-0.029646,-0.049897,0.004193


In [13]:
# (Re)create demo_user.p_embeddings: one row per product, one column per dimension.
copy_to_sql(
    df=df_expanded,
    table_name="p_embeddings",
    schema_name="demo_user",
    if_exists="replace",
)

### Insert Class Embeddings in DB (Directly from CSV)

In [3]:
# Load the combined dataset; only its `class` column is used in this section.
df = pd.read_csv(FULL_DATA_SET_DATA_PATH)
df.head()

Unnamed: 0,Item_Name,class,Brand,Weight,Number of units,Size of units,Price,T.Price,Pack,Unit
0,مونتانا ذره 400 ج,Vegetables & Fruits,مونتانا,400جم,1,,,,كيس,جم
1,Ahmad Tea Fruit And Herb Selection Herbal Teab...,"Tea, Coffee & Hot Drinks",Ahmad Tea,,20,,,,علبة,
2,Lulu Brown Samoon 1pkt,Bakery,Lulu,,1,,,,عبوة,
3,فلفل رومى بلدى حشو وزن,Vegetables & Fruits,,,1,,,,كيس,
4,كانز,Soft Drinks & Juices,,,1,,,,كانز,


In [4]:
# Distinct non-null class labels, in order of first appearance; `df` is rebound
# to a single-column frame holding them.
df_class = pd.unique(df["class"].dropna())
df = pd.DataFrame(data={"class": df_class})

In [5]:
# Load the embedding model described by the Qwen3-8B config.
# NOTE(review): the product-embedding section loads
# E5_LARGE_INSTRUCT_CONFIG_PATH instead, and the distance query selects
# embed_0..embed_1023 from both tables. Confirm that product and class
# embeddings are produced by the same model before comparing them.
model = load_embedding_model(QWEN3_8B_CONFIG_PATH)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Embed every class label in one call.
classes = df["class"].tolist()
embeddings = model.get_embeddings(classes)

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
# Convert the embeddings to nested lists and spread them over one column per
# dimension (embed_0, embed_1, ...).
embeddings = embeddings.tolist()
embedding_dim = len(embeddings[0])
column_names = [f"embed_{i}" for i in range(embedding_dim)]
emb_cols = pd.DataFrame(embeddings, columns=column_names)

In [8]:
# Give each class a positional id and pair it with its embedding columns.
# Written without the in-place reset_index/rename so the cell can be re-run:
# the original inserted one more "id" column on every extra execution, which
# then leaked duplicate id columns into df_expanded.
df = df.reset_index(drop=True)
df["id"] = df.index
df_expanded = pd.concat([df[["id"]], emb_cols], axis=1)
In [9]:
# (Re)create demo_user.c_embeddings: one row per class, one column per dimension.
copy_to_sql(
    df=df_expanded,
    table_name="c_embeddings",
    schema_name="demo_user",
    if_exists="replace",
)

### In-DB Similarity

In [14]:
# Fully qualified table that receives the nearest-class result per product.
# NOTE(review): "similiratiy" is a misspelling of "similarity"; renaming it
# requires updating every query that references this table.
RESULT_TABLE = "demo_user.similiratiy_score"

In [15]:
# DDL for the result table: one row per product with the id of its closest
# class and the cosine distance to it.
# NOTE(review): plain CREATE TABLE -- presumably fails if the table is left
# over from a previous run; drop it first when re-running.
q = f"""
    CREATE TABLE {RESULT_TABLE} (
        item_id BIGINT,
        closest_category_id BIGINT,
        cosine_distance FLOAT
    );
    """

In [16]:
# Execute the CREATE TABLE statement built in the previous cell.
td_db.execute_query(q)

[]

In [17]:
# Names of the 1024 embedding columns, built once and rendered two ways:
#   vector_cols        -> bare identifiers for a SELECT list
#   vector_cols_quoted -> single-quoted names for the *FeatureColumns arguments
embed_names = [f"embed_{i}" for i in range(1024)]
vector_cols = ", ".join(embed_names)
vector_cols_quoted = ", ".join(f"'{name}'" for name in embed_names)

In [18]:
# Nearest-class assignment, done entirely in the database:
#   * TD_VectorDistance computes the cosine distance between product rows
#     (TargetTable, p_embeddings) and class rows (ReferenceTable, c_embeddings).
#   * ROW_NUMBER ranks the classes per product by ascending distance.
#   * Only the closest class (rn = 1) is inserted into RESULT_TABLE.
# NOTE(review): p_embeddings / c_embeddings are referenced without the
# demo_user schema prefix used elsewhere -- presumably relies on the session's
# default database; confirm.
classification_sql = f"""
INSERT INTO {RESULT_TABLE}
WITH RankedDistances AS (
    SELECT
        o.Target_ID AS product_id,
        o.Reference_ID AS class_id,
        o.Distance,
        ROW_NUMBER() OVER (PARTITION BY o.Target_ID ORDER BY o.Distance ASC) as rn
    FROM TD_VectorDistance (
        ON (SELECT id, {vector_cols} FROM p_embeddings) AS TargetTable
        ON (SELECT id, {vector_cols} FROM c_embeddings) AS ReferenceTable DIMENSION
        USING
            TargetIDColumn('id')
            RefIDColumn('id')
            TargetFeatureColumns({vector_cols_quoted})
            RefFeatureColumns({vector_cols_quoted})
            DistanceMeasure('cosine')
    ) AS o
)
SELECT
    product_id,
    class_id,
    Distance
FROM RankedDistances
WHERE rn = 1;
"""
# Executing the INSERT appends to RESULT_TABLE; re-running this cell without
# recreating the table adds the same rows again.
td_db.execute_query(classification_sql)

[]

In [19]:
# Read the scores back through RESULT_TABLE rather than a hard-coded copy of
# its name, so this query cannot drift from the table the CREATE/INSERT
# statements wrote to.
tdf = td_db.execute_query(f"SELECT * FROM {RESULT_TABLE}")
tdf = DataFrame(tdf, False)
tdf



item_id,closest_category_id,cosine_distance
40975,29,0.1205179493569461
40591,27,0.1703934885290955
40908,4,0.1340998811806526
40978,29,0.1807162745071704
40599,8,0.1060485209092646
40913,23,0.1059207557462144
2,29,0.156726484369557
8,29,0.1187015151153825
0,34,0.1367709430101142
3,29,0.1244321793402392


In [20]:
# Join each product's nearest-class result with its name, the predicted class
# label and the ground-truth label.
# Note: the `similarity_score` column actually carries the cosine *distance*
# (smaller means closer).
# NOTE(review): this selects p.translated_name from `products`, while the
# insert section of this notebook creates `products` with id/product_name
# only -- confirm the table schema on a fresh run.
results_query = f"""
SELECT
    p.translated_name AS product_name,
    c.class_name AS predicted_class,
    a.class_name AS actual_class,
    r.cosine_distance AS similarity_score
FROM {RESULT_TABLE} r
JOIN products p
    ON r.item_id = p.id
JOIN classes c
    ON r.closest_category_id = c.id
JOIN actual_classes a
    ON a.product_id = p.id;
"""

In [21]:
# Run the join and bring predicted vs. actual labels into pandas.
tdf = td_db.execute_query(results_query)
df = pd.DataFrame(tdf)

In [22]:
# Display the joined prediction frame.
df

Unnamed: 0,product_name,predicted_class,actual_class,similarity_score
0,Miro Jum Milk Powder,dairy eggs,dairy eggs,0.115456
1,K K K K K K K K K K K K K K K K K K K K K K,jams spreads syrups,rice pasta pulses,0.188884
2,Rich. Smoked chicken breasts.,poultry,poultry,0.157644
3,Montana Corn C,breakfast cereals bars,vegetables fruits,0.136771
4,Large box.,tins jars packets,biscuits cakes,0.151940
...,...,...,...,...
37579,american garden natural popcorn microwave g,chips crackers,chips crackers,0.149057
37580,Schweppes tonic water ml,soft drinks juices,soft drinks juices,0.109199
37581,Rice K,rice pasta pulses,rice pasta pulses,0.090696
37582,Almarai Yogurt Drink Strawberry,jams spreads syrups,dairy eggs,0.119715


In [23]:
# Discard rows with any missing value before scoring.
df = df.dropna()

In [24]:
from sklearn.metrics import f1_score

# Ground truth and nearest-class prediction, row-aligned from the result frame.
y_true, y_pred = (
    df[label_col].tolist() for label_col in ("actual_class", "predicted_class")
)

# Support-weighted F1 across all classes.
f1_score(y_true=y_true, y_pred=y_pred, average="weighted")

0.40987826992596044

In [25]:
# (Re)create demo_user.results so the in-database evaluator can read it.
copy_to_sql(
    df=df,
    table_name="results",
    schema_name="demo_user",
    if_exists="replace",
)

In [8]:
# In-database evaluation of demo_user.results with TD_ClassificationEvaluator;
# the metrics are written to the permanent table `classification_metrics`.
# The Labels list is hard-coded and uses the normalised (lower-case,
# punctuation-free) class names.
# NOTE(review): presumably fails on re-run if classification_metrics already
# exists, and the Labels list must be kept in sync with the classes table by
# hand -- confirm both.
query = """
SELECT * FROM TD_ClassificationEvaluator (
   ON demo_user.results AS InputTable
   OUT PERMANENT TABLE OutputTable(classification_metrics)
   USING
       ObservationColumn('actual_class')
       PredictionColumn('predicted_class')
       Labels('vegetables fruits', 'tea coffee hot drinks', 'bakery', 'soft drinks juices', 'tins jars packets', 'biscuits cakes', 'beef processed meat', 'chocolates sweets desserts', 'poultry', 'sauces dressings condiments', 'nuts dates dried fruits', 'cleaning supplies', 'dairy eggs', 'chips crackers', 'water', 'rice pasta pulses', 'personal care skin body care', 'furniture', 'cooking ingredients', 'vegetables herbs', 'condiments dressings marinades', 'sweets desserts', 'laundry detergents', 'tea and coffee', 'disposables napkins', 'sugar home baking', 'perfumes deodorants', 'fruits', 'stationary', 'jams spreads syrups', 'baby care', 'home appliances', 'fish', 'breakfast cereals bars', 'hair shower bath soap', 'party supplies and gifts', 'wear', 'footwear', 'candles air fresheners', 'beef lamb meat', 'pets care', 'mobile tablets', 'dental care')
) AS dt;
"""
tdf = td_db.execute_query(query)

In [27]:
# Fetch the metrics table written by the evaluator and display it.
metrics_rows = td_db.execute_query("select * from demo_user.classification_metrics")
DataFrame(metrics_rows)



SeqNum,Metric,MetricValue,index_label
3,Micro-Recall,0.9144640234948604,2
5,Macro-Precision,0.1405900984380863,4
6,Macro-Recall,0.1936424228423527,5
7,Macro-F1,0.1443059543468214,6
9,Weighted-Recall,0.9144640234948604,8
10,Weighted-F1,0.9229879935355104,9
8,Weighted-Precision,0.9391799719595768,7
4,Micro-F1,0.9144640234948604,3
2,Micro-Precision,0.9144640234948604,1
1,Accuracy,0.9144640234948604,0


## Disconnect

In [None]:
# Close the database session opened at the top of the notebook.
td_db.disconnect()