# Import packages

In [1]:
import pandas as pd
from teradataml import *
import ast
import json
import torch
from teradataml.dataframe.copy_to import copy_to_sql
from transformers.generation import GenerationMixin

from modules.db import TeradataDatabase
from modules.models import( 
    OpusTranslationModelConfig, 
                        OpusTranslationModel, 
                        SentenceEmbeddingConfig, 
                        SentenceEmbeddingModel, 
)
from utils import clean_text, load_embedding_model, unicode_clean, load_translation_model
from constants import (
    CLEANED_TEST_DATA_PATH, TRAIN_VAL_DATA_PATH, CLASS_EMBEDDINGS_PATH, PRODUCT_TEST_EMBEDDINGS_PATH, 
    CLEANED_GPC_PATH, CLEANED_TEST_DATA_PATH, TEST_DATA_PATH, E5_LARGE_INSTRUCT_CONFIG_PATH, 
    OPUS_TRANSLATION_CONFIG_PATH, DATA_PATH, PRODUCT_TRAIN_EMBEDDINGS_PATH, QWEN3_8B_CONFIG_PATH, VALIDATION_DATA_PATH,
    FULL_DATA_SET_DATA_PATH, PRODUCT_FULL_DATASET_EMBEDDINGS_PATH,  CLEANED_FULL_DATASET_DATA_PATH,
    CLASS_EMBEDDINGS_PATH_QWEN, PRODUCT_FULL_DATASET_EMBEDDINGS__QWEN_PATH
)


Logger initialized. All logs will be saved to: c:\Users\ss255385\OneDrive - Teradata Corporation\Desktop\AMuRD-Iteration-7\src\logs\borai_20250821_073600.log


## Connect to database

In [2]:
td_db = TeradataDatabase()
td_db.connect()

### Combine Dataset into 1 file

In [4]:
df_train = pd.read_csv(TRAIN_VAL_DATA_PATH)
df_test = pd.read_csv(TEST_DATA_PATH)
df_valid = pd.read_csv(VALIDATION_DATA_PATH)

In [5]:
full_df = pd.concat([df_train, df_test, df_valid], ignore_index=True)
full_df.to_csv(FULL_DATA_SET_DATA_PATH, index=False)

### Insert Product Names in DB

In [21]:
df = pd.read_csv(FULL_DATA_SET_DATA_PATH)
df.head()

Unnamed: 0,Item_Name,class,Brand,Weight,Number of units,Size of units,Price,T.Price,Pack,Unit
0,مونتانا ذره 400 ج,Vegetables & Fruits,مونتانا,400جم,1,,,,كيس,جم
1,Ahmad Tea Fruit And Herb Selection Herbal Teab...,"Tea, Coffee & Hot Drinks",Ahmad Tea,,20,,,,علبة,
2,Lulu Brown Samoon 1pkt,Bakery,Lulu,,1,,,,عبوة,
3,فلفل رومى بلدى حشو وزن,Vegetables & Fruits,,,1,,,,كيس,
4,كانز,Soft Drinks & Juices,,,1,,,,كانز,


In [22]:
df.rename(columns={"Item_Name": "product_name"}, inplace=True)

In [23]:
df.drop_duplicates(subset=["product_name"], inplace=True)
df.dropna(subset=["product_name"], inplace=True)
df.reset_index(inplace=True)
df.rename(columns={'index': 'id'}, inplace=True)
df = df[["id" , "product_name"]]

In [24]:
df['product_name'] = df['product_name'].apply(unicode_clean)

In [25]:
copy_to_sql(df, "products", "demo_user", if_exists="replace")

### Cleaning the Products

In [27]:
cleaning_query = """
UPDATE demo_user.products
SET product_name = LOWER(
                  TRIM(
                    REGEXP_REPLACE(
                      REGEXP_REPLACE(
                        REGEXP_REPLACE(product_name, '[[:digit:]]+', ''), 
                        '[-_/\\|]', ' '),                              
                      '[[:punct:]]', ' '                              
                    )
                  )
                );
"""

In [28]:
tdf = td_db.execute_query(cleaning_query)

### Translating Products 

In [29]:
tdf = td_db.execute_query("Select * from demo_user.products")
df = pd.DataFrame(tdf)
df

Unnamed: 0,id,product_name
0,0,مونتانا ذره ج
1,18749,بسمة ملوخية جم
2,1,ahmad tea fruit and herb selection herbal teab...
3,18750,فرزه بطاطس مجمده
4,2,lulu brown samoon pkt
...,...,...
37579,47715,تيفانى بريك ريزو وافر ج
37580,47716,كادبورى شوكولاته كريسبللو ج
37581,47717,جهينه كوكتيل ل
37582,47718,دبل ديرى مشروب بنكهه ليمون مل


In [30]:
model = load_translation_model(OPUS_TRANSLATION_CONFIG_PATH)

In [31]:
products = df["product_name"].tolist()
batch_size = 32
translations = []
for i in range(0, len(products), batch_size):
    batch = products[i:i+batch_size]
    batch_translations = [model.translate(p) for p in batch]
    translations.extend(batch_translations)

In [32]:
df["translated_name"] = translations
df = df[["id", "translated_name"]]

In [33]:
copy_to_sql(df, "products", "demo_user", if_exists="replace")

In [34]:
df.to_csv(DATA_PATH / "cleaned_full_dataset.csv")

### Insert Class Names in DB

In [35]:
df = pd.read_csv(FULL_DATA_SET_DATA_PATH)
df.head()

Unnamed: 0,Item_Name,class,Brand,Weight,Number of units,Size of units,Price,T.Price,Pack,Unit
0,مونتانا ذره 400 ج,Vegetables & Fruits,مونتانا,400جم,1,,,,كيس,جم
1,Ahmad Tea Fruit And Herb Selection Herbal Teab...,"Tea, Coffee & Hot Drinks",Ahmad Tea,,20,,,,علبة,
2,Lulu Brown Samoon 1pkt,Bakery,Lulu,,1,,,,عبوة,
3,فلفل رومى بلدى حشو وزن,Vegetables & Fruits,,,1,,,,كيس,
4,كانز,Soft Drinks & Juices,,,1,,,,كانز,


In [36]:
df_class = df["class"].dropna().unique()
# df = df.reset_index(drop=False) 
# df.rename(columns={'index': 'id'}, inplace=True)
df_classes = pd.DataFrame({"class_name": df_class})

In [37]:
df_classes["id"] = df_classes.index
df_classes = df_classes[["id", "class_name"]]

In [38]:
copy_to_sql(df_classes, "classes", "demo_user", if_exists="replace")

In [None]:
df_classes

In [40]:
df_actual_class = df[["class"]]
df_actual_class.rename(columns={'class': 'class_name'}, inplace=True)
df_actual_class["product_id"] = df_actual_class.index
df_actual_class = df_actual_class[["product_id", "class_name"]]

In [41]:
copy_to_sql(df_actual_class, "actual_classes", "demo_user", if_exists="replace")

### Cleaning the Class Names

In [42]:
cleaning_query = """
UPDATE demo_user.classes
SET class_name = LOWER(
                  TRIM(
                      REGEXP_REPLACE(
                        REGEXP_REPLACE(class_name, '[^\w\s]', ' '),                                
                      '[-_/\\|]', ' ')                           
                  )
                );

"""

In [43]:
td_db.execute_query(cleaning_query)

[]

In [None]:
tdf = td_db.execute_query("Select * from demo_user.classes")
df = pd.DataFrame(tdf)
df

In [45]:
cleaning_query = """
UPDATE demo_user.actual_classes
SET class_name = LOWER(
                  TRIM(
                      REGEXP_REPLACE(
                        REGEXP_REPLACE(class_name, '[^\w\s]', ' '),                                
                      '[-_/\\|]', ' ')                           
                  )
                );

"""

In [46]:
td_db.execute_query(cleaning_query)

[]

In [47]:
tdf = td_db.execute_query("Select * from demo_user.actual_classes")
df = pd.DataFrame(tdf)
df

Unnamed: 0,product_id,class_name
0,0,vegetables fruits
1,49149,rice pasta pulses
2,32766,rice pasta pulses
3,1,tea coffee hot drinks
4,49150,poultry
...,...,...
52489,32761,sauces dressings condiments
52490,32762,biscuits cakes
52491,32763,soft drinks juices
52492,32764,sauces dressings condiments


### Create Product Embeddings

In [4]:
tdf = td_db.execute_query("Select * from demo_user.products")
df = pd.DataFrame(tdf)
df

Unnamed: 0,id,translated_name
0,0,Montana Corn C
1,40837,Reb i Estek
2,18749,Basma Molokhiya
3,40838,Americana Beans with ghee g
4,1,AHMAD TEA FRUIT AND HERB SELECTION HERBAL TEAB...
...,...,...
37579,40833,Fragile disc.
37580,18745,Marmaris Jam Pasta
37581,40834,Tuna Jam Rose
37582,18747,Tiger kebab potato chips gm


In [5]:
# model = load_falcon3_embedding_model(FALCON3_7B_CONFIG_PATH)
model = load_embedding_model(QWEN3_8B_CONFIG_PATH)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
products = df["translated_name"].tolist()
#embeddings = model.get_embeddings(products)
batch_size = 26
embeddings = []

for i in range(0, len(products), batch_size):
    batch = products[i:i+batch_size]
    batch_embeddings = model.get_embeddings(batch)

    # Ensure each batch is a proper list of lists
    if hasattr(batch_embeddings, "tolist"):
        batch_embeddings = batch_embeddings.tolist()

    embeddings.extend(batch_embeddings)

In [9]:
len(embeddings)

37584

In [10]:
df["embeddings"] = embeddings
len(df["embeddings"][0])

1024

In [11]:
df = df[["id", "embeddings"]]
df.to_csv(PRODUCT_FULL_DATASET_EMBEDDINGS__QWEN_PATH)

### Insert Product Embeddings in DB (Directly from CSV)

In [3]:
df = pd.read_csv(PRODUCT_FULL_DATASET_EMBEDDINGS__QWEN_PATH)

df["embeddings"] = df["embeddings"].apply(ast.literal_eval)

In [4]:
emb_cols = pd.DataFrame(df["embeddings"].to_list(), columns=[f"embed_{i}" for i in range(len(df["embeddings"][0]))])

In [5]:
df_expanded = pd.concat([df[['id']], emb_cols], axis=1)
df_expanded

Unnamed: 0,id,embed_0,embed_1,embed_2,embed_3,embed_4,embed_5,embed_6,embed_7,embed_8,...,embed_1014,embed_1015,embed_1016,embed_1017,embed_1018,embed_1019,embed_1020,embed_1021,embed_1022,embed_1023
0,0,0.012344,0.006161,-0.018646,-0.020218,-0.001552,0.011688,-0.007679,-0.007652,-0.022522,...,0.006191,0.009636,-0.008743,0.032715,0.006668,-0.015121,-0.012833,-0.032349,-0.001556,-0.002279
1,40837,0.024628,-0.003304,0.009079,-0.013863,0.003790,-0.035004,-0.014099,0.016968,-0.024094,...,-0.020660,-0.026016,-0.024033,0.022476,-0.020020,0.000424,-0.003355,-0.018936,0.025742,0.018143
2,18749,0.003706,0.002958,-0.005295,-0.016113,0.028046,-0.024719,-0.008781,-0.010818,-0.022888,...,-0.010147,-0.000367,-0.029938,0.035706,-0.006306,-0.014809,-0.027359,-0.020782,0.018539,0.012825
3,40838,0.007210,0.006744,0.001098,-0.018723,0.028763,-0.022308,-0.010925,-0.014313,-0.019958,...,-0.001131,-0.011620,-0.021408,0.009987,-0.001512,0.000864,-0.022202,-0.013329,0.019424,0.011902
4,1,0.037048,0.003113,0.019226,-0.005077,0.017014,-0.018524,0.009758,-0.000047,-0.035675,...,-0.012062,-0.010818,-0.034912,0.006943,-0.006466,0.010063,-0.007858,0.003706,0.004852,0.018448
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37579,40833,0.013557,0.006855,0.029587,-0.008141,0.011787,0.000709,-0.031525,-0.018875,0.010353,...,-0.015854,-0.002268,-0.001523,0.014160,-0.020081,0.001575,-0.002394,-0.019547,0.013763,0.003395
37580,18745,-0.004459,-0.014320,-0.016632,-0.019394,0.040497,-0.016281,0.008377,-0.020935,-0.018234,...,-0.024979,0.010574,-0.033875,0.020172,-0.017624,-0.009651,-0.025864,-0.001453,0.021042,0.005985
37581,40834,0.011436,-0.005024,-0.002378,-0.012390,0.028519,-0.006451,-0.014412,-0.012283,-0.000492,...,-0.028336,0.003662,-0.018463,0.015884,-0.020905,-0.013550,-0.027359,-0.010254,0.016144,0.004478
37582,18747,-0.004284,0.006493,0.010925,-0.001613,0.038544,-0.031372,-0.002254,-0.027328,-0.027100,...,-0.025970,0.005634,-0.035156,0.033173,-0.014275,-0.000684,-0.013115,0.002888,0.014488,-0.000720


In [None]:
df_expanded.to_csv(CLASS_EMBEDDINGS_PATH_QWEN)

In [3]:
df_expanded = pd.read_csv(CLASS_EMBEDDINGS_PATH_QWEN)

In [4]:
df_expanded

Unnamed: 0.1,Unnamed: 0,id,embed_0,embed_1,embed_2,embed_3,embed_4,embed_5,embed_6,embed_7,...,embed_1014,embed_1015,embed_1016,embed_1017,embed_1018,embed_1019,embed_1020,embed_1021,embed_1022,embed_1023
0,0,0,0.012344,0.006161,-0.018646,-0.020218,-0.001552,0.011688,-0.007679,-0.007652,...,0.006191,0.009636,-0.008743,0.032715,0.006668,-0.015121,-0.012833,-0.032349,-0.001556,-0.002279
1,1,40837,0.024628,-0.003304,0.009079,-0.013863,0.003790,-0.035004,-0.014099,0.016968,...,-0.020660,-0.026016,-0.024033,0.022476,-0.020020,0.000424,-0.003355,-0.018936,0.025742,0.018143
2,2,18749,0.003706,0.002958,-0.005295,-0.016113,0.028046,-0.024719,-0.008781,-0.010818,...,-0.010147,-0.000367,-0.029938,0.035706,-0.006306,-0.014809,-0.027359,-0.020782,0.018539,0.012825
3,3,40838,0.007210,0.006744,0.001098,-0.018723,0.028763,-0.022308,-0.010925,-0.014313,...,-0.001131,-0.011620,-0.021408,0.009987,-0.001512,0.000864,-0.022202,-0.013329,0.019424,0.011902
4,4,1,0.037048,0.003113,0.019226,-0.005077,0.017014,-0.018524,0.009758,-0.000047,...,-0.012062,-0.010818,-0.034912,0.006943,-0.006466,0.010063,-0.007858,0.003706,0.004852,0.018448
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37579,37579,40833,0.013557,0.006855,0.029587,-0.008141,0.011787,0.000709,-0.031525,-0.018875,...,-0.015854,-0.002268,-0.001523,0.014160,-0.020081,0.001575,-0.002394,-0.019547,0.013763,0.003395
37580,37580,18745,-0.004459,-0.014320,-0.016632,-0.019394,0.040497,-0.016281,0.008377,-0.020935,...,-0.024979,0.010574,-0.033875,0.020172,-0.017624,-0.009651,-0.025864,-0.001453,0.021042,0.005985
37581,37581,40834,0.011436,-0.005024,-0.002378,-0.012390,0.028519,-0.006451,-0.014412,-0.012283,...,-0.028336,0.003662,-0.018463,0.015884,-0.020905,-0.013550,-0.027359,-0.010254,0.016144,0.004478
37582,37582,18747,-0.004284,0.006493,0.010925,-0.001613,0.038544,-0.031372,-0.002254,-0.027328,...,-0.025970,0.005634,-0.035156,0.033173,-0.014275,-0.000684,-0.013115,0.002888,0.014488,-0.000720


In [None]:
copy_to_sql(df_expanded, "p_embeddings", "demo_user")

### Insert Class Embeddings in DB (Directly from CSV)

In [None]:
df = pd.read_csv(DATA_PATH / "full_dataset.csv")
df.head()

Unnamed: 0,Item_Name,class,Brand,Weight,Number of units,Size of units,Price,T.Price,Pack,Unit
0,مونتانا ذره 400 ج,Vegetables & Fruits,مونتانا,400جم,1,,,,كيس,جم
1,Ahmad Tea Fruit And Herb Selection Herbal Teab...,"Tea, Coffee & Hot Drinks",Ahmad Tea,,20,,,,علبة,
2,Lulu Brown Samoon 1pkt,Bakery,Lulu,,1,,,,عبوة,
3,فلفل رومى بلدى حشو وزن,Vegetables & Fruits,,,1,,,,كيس,
4,كانز,Soft Drinks & Juices,,,1,,,,كانز,


In [4]:
df_class = df["class"].dropna().unique()
# df = df.reset_index(drop=False) 
# df.rename(columns={'index': 'id'}, inplace=True)
df = pd.DataFrame({"class": df_class})

In [5]:
# model = load_falcon3_embedding_model(FALCON3_7B_CONFIG_PATH)
model = load_embedding_model(E5_LARGE_INSTRUCT_CONFIG_PATH)

In [6]:
classes = df["class"].tolist()
embeddings = model.get_embeddings(classes)
# batch_size = 16
# embeddings = []
# for i in range(0, len(classes), batch_size):
#     batch = classes[i:i+batch_size]
#     batch_translations = model.get_embeddings(batch)
#     embeddings.extend(batch_translations)

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
embeddings = embeddings.tolist()
emb_cols = pd.DataFrame(embeddings, columns=[f'embed_{i}' for i in range(len(embeddings[0]))])

In [8]:
df.reset_index(inplace=True)
df.rename(columns={'index': 'id'}, inplace=True)
df_expanded = pd.concat([df[['id']], emb_cols], axis=1)

In [9]:
copy_to_sql(df_expanded, "c_embeddings", "demo_user", if_exists="replace")

### In-DB Similiraity

In [None]:
RESULT_TABLE = "demo_user.similiratiy_score"

In [None]:
# Create the result table
q = f"""
    CREATE TABLE {RESULT_TABLE} (
        item_id BIGINT,
        closest_category_id BIGINT,
        cosine_distance FLOAT
    );
    """

In [None]:
td_db.execute_query(q)

[]

In [None]:
# Generate the string of vector columns for 1024 dimensions
vector_cols = ", ".join([f"embed_{i}" for i in range(1024)])

# Generate the quoted string for feature columns
vector_cols_quoted = ", ".join([f"'embed_{i}'" for i in range(1024)])

In [None]:
classification_sql = f"""
INSERT INTO {RESULT_TABLE}
WITH RankedDistances AS (
    SELECT
        o.Target_ID AS product_id,
        o.Reference_ID AS class_id,
        o.Distance,
        ROW_NUMBER() OVER (PARTITION BY o.Target_ID ORDER BY o.Distance ASC) as rn
    FROM TD_VectorDistance (
        ON (SELECT id, {vector_cols} FROM p_embeddings) AS TargetTable
        ON (SELECT id, {vector_cols} FROM c_embeddings) AS ReferenceTable DIMENSION
        USING
            TargetIDColumn('id')
            RefIDColumn('id')
            TargetFeatureColumns({vector_cols_quoted})
            RefFeatureColumns({vector_cols_quoted})
            DistanceMeasure('cosine')
    ) AS o
)
SELECT
    product_id,
    class_id,
    Distance
FROM RankedDistances
WHERE rn = 1;
"""
td_db.execute_query(classification_sql)

[]

In [None]:
tdf = td_db.execute_query("SELECT * FROM demo_user.similiratiy_score")
tdf = DataFrame(tdf, False)
tdf



item_id,closest_category_id,cosine_distance
0,31,0.139783659452218
11,10,0.1181549479028765
18,8,0.147032502754974
1,31,0.1322058824904213
17,11,0.1179319839223537
23,1,0.1251050758861031
10,1,0.1165338996780606
3,2,0.1082948021519601
8,28,0.1563936197276643
2,12,0.1397556609337787


In [None]:
results_query = f"""
SELECT
    p.translated_name AS product_name,
    c.class_name AS predicted_class,
    a.class_name AS actual_class,
    r.cosine_distance AS similarity_score
FROM {RESULT_TABLE} r
JOIN products p
    ON r.item_id = p.id
JOIN classes c
    ON r.closest_category_id = c.id
JOIN actual_classes a
    ON a.product_id = p.id;
"""

In [None]:
tdf = td_db.execute_query(results_query)
df = pd.DataFrame(tdf)

In [None]:
df

Unnamed: 0,product_name,predicted_class,actual_class,similarity_score
0,Lemon Adalia,jams spreads syrups,tins jars packets,0.132206
1,Candlestick Chocolate Pure Hazelnut,chocolates sweets desserts,biscuits cakes,0.111815
2,Lubya Baladi Jam,jams spreads syrups,rice pasta pulses,0.144473
3,americana okra zero gm,jams spreads syrups,vegetables fruits,0.139784
4,Dasani water ml,water,water,0.108295
...,...,...,...,...
4568,morganic pasta sauce italian bolognese g,rice pasta pulses,tins jars packets,0.144159
4569,Foody fava beans plain k,vegetables fruits,tins jars packets,0.129441
4570,Saw chicken hips.,poultry,poultry,0.160542
4571,Almarai Whole Milk L Plastic,dairy eggs,dairy eggs,0.135945


In [None]:
df.dropna(inplace=True)

In [None]:
from sklearn.metrics import f1_score

y_pred = df["predicted_class"].tolist()
y_true = df["actual_class"].tolist()

f1_score(y_true, y_pred, average="weighted")

0.4282552067718755

In [None]:
copy_to_sql(df, "results", "demo_user", if_exists="replace")

In [None]:
query = """
SELECT * FROM TD_ClassificationEvaluator (
   ON demo_user.results AS InputTable
   OUT PERMANENT TABLE OutputTable(classification_metrics)
   USING
       ObservationColumn('actual_class')
       PredictionColumn('predicted_class')
       Labels(
       'condiments dressings marinades', 'furniture', 'personal care skin body care', 'null', 'tea coffee hot Drinks', 'sweets desserts', 'hair shower bath soap', 'fruits', 'nuts dates dried fruits', 'vegetables fruits', 'home appliances', 'sauces dressings condiments', 'baby care', 'tea coffee', 'disposables napkins', 'tins jars packets', 'chips crackers', 'soft drinks juices', 'Cooking Ingredients', 'dairy eggs', 'bakery', 'vegetables herbs', 'biscuits cakes', 'candles air fresheners', 'water', 'rice pasta pulses', 'poultry', 'beef processed Meat', 'home textile', 'cleaning supplies', 'beef lamb meat', 'chocolates sweets desserts', 'jams spreads syrups')
) AS dt;
"""
tdf = td_db.execute_query(query)

In [None]:
DataFrame(td_db.execute_query("select * from demo_user.classification_metrics"))



SeqNum,Metric,MetricValue,index_label
3,Micro-Recall,0.929245283018868,2
5,Macro-Precision,0.1819870413753316,4
6,Macro-Recall,0.1800189513423839,5
7,Macro-F1,0.1659260044632161,6
9,Weighted-Recall,0.929245283018868,8
10,Weighted-F1,0.9345127519519588,9
8,Weighted-Precision,0.9565417613453052,7
4,Micro-F1,0.929245283018868,3
2,Micro-Precision,0.929245283018868,1
1,Accuracy,0.929245283018868,0


## Disconnect

In [None]:
td_db.disconnect()