# Import packages

In [1]:
import ast
import swifter
import pandas as pd
from tqdm import tqdm
from teradataml import *
from teradataml.dataframe.copy_to import copy_to_sql

from modules.db import TeradataDatabase
from queries import cleaning_query
from utils import load_embedding_model, unicode_clean, load_translation_model
from constants import (
    CLEANED_TEST_DATA_PATH, TRAIN_VAL_DATA_PATH, CLASS_EMBEDDINGS_PATH, PRODUCT_TEST_EMBEDDINGS_PATH, 
    CLEANED_GPC_PATH, CLEANED_TEST_DATA_PATH, TEST_DATA_PATH, E5_LARGE_INSTRUCT_CONFIG_PATH, 
    OPUS_TRANSLATION_CONFIG_PATH, DATA_PATH, PRODUCT_TRAIN_EMBEDDINGS_PATH, QWEN3_8B_CONFIG_PATH, VALIDATION_DATA_PATH,
    FULL_DATA_SET_DATA_PATH, PRODUCT_FULL_DATASET_EMBEDDINGS_PATH,  CLEANED_FULL_DATASET_DATA_PATH,
    CLASS_EMBEDDINGS_PATH_QWEN, PRODUCT_FULL_DATASET_EMBEDDINGS_QWEN_PATH
)


Log directory cleaned: c:\internship\AMuRD-Iteration-7\src\logs
Logger initialized. All logs will be saved to: c:\internship\AMuRD-Iteration-7\src\logs\borai_20250821_140525.log


In [2]:
# Register tqdm with pandas so that `.progress_apply` (used when cleaning
# product names below) is available and shows a progress bar.
tqdm.pandas()

## Connect to database

In [3]:
# Create the project's Teradata wrapper and connect; every
# `td_db.execute_query` call in this notebook goes through this object.
# NOTE(review): presumably this also sets up the teradataml context that
# `copy_to_sql` relies on — confirm in modules.db.
td_db = TeradataDatabase()
td_db.connect()

### Combine Dataset into 1 file

In [5]:
# Load the train/validation split and the test split from CSV.
# The separate validation file (VALIDATION_DATA_PATH) is intentionally not loaded.
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_VAL_DATA_PATH, TEST_DATA_PATH)
)

In [6]:
# Stack the train/validation and test splits into one frame, renumber the rows
# 0..n-1, and persist the result (without the index) for the later sections.
# The validation split is not part of the combined file.
full_df = pd.concat(objs=(df_train, df_test), axis="index", ignore_index=True)
full_df.to_csv(path_or_buf=FULL_DATA_SET_DATA_PATH, index=False)

### Insert Product Names in DB

In [7]:
# Reload the combined dataset from disk and preview the first rows.
df = pd.read_csv(FULL_DATA_SET_DATA_PATH)
df.head()

Unnamed: 0,Item_Name,class,Brand,Weight,Number of units,Size of units,Price,T.Price,Pack,Unit
0,مونتانا ذره 400 ج,Vegetables & Fruits,مونتانا,400جم,1,,,,كيس,جم
1,Ahmad Tea Fruit And Herb Selection Herbal Teab...,"Tea, Coffee & Hot Drinks",Ahmad Tea,,20,,,,علبة,
2,Lulu Brown Samoon 1pkt,Bakery,Lulu,,1,,,,عبوة,
3,فلفل رومى بلدى حشو وزن,Vegetables & Fruits,,,1,,,,كيس,
4,كانز,Soft Drinks & Juices,,,1,,,,كانز,


In [8]:
# Use the column name expected by the `products` table.
df = df.rename(columns={"Item_Name": "product_name"})

In [9]:
# Keep one row per distinct, non-missing product name and expose the ORIGINAL
# row label as `id` (it is not renumbered, so ids stay comparable with the row
# index of the full dataset used for `actual_classes.product_id`).
# The trailing `.copy()` makes `df` an independent frame, so assigning to
# `df['product_name']` in the next cell does not raise SettingWithCopyWarning.
df = (
    df.drop_duplicates(subset=["product_name"])
    .dropna(subset=["product_name"])
    .reset_index()  # old row label becomes a column named "index"
    .rename(columns={"index": "id"})
    .loc[:, ["id", "product_name"]]
    .copy()
)

In [10]:
# Normalise every product name with the project's `unicode_clean` helper,
# showing a tqdm progress bar while it runs.
df = df.assign(product_name=df["product_name"].progress_apply(unicode_clean))

In [11]:
# Upload the product names to demo_user.products, replacing any existing table.
copy_to_sql(df=df, table_name="products", schema_name="demo_user", if_exists="replace")

### Cleaning the Products

In [None]:
# Run the project's cleaning SQL (built by `cleaning_query`) against the
# `product_name` column of the `products` table.
# NOTE(review): judging by the table contents shown in the next cell, the query
# lower-cases text and strips digits/punctuation — confirm in queries.cleaning_query.
td_db.execute_query(cleaning_query("products", "product_name"))

[]

### Translating Products 

In [13]:
# Pull the cleaned products back from Teradata into a pandas DataFrame.
# The query has no ORDER BY, so rows arrive in arbitrary order (see the ids in
# the output): identify products by `id`, never by row position.
tdf = td_db.execute_query("Select * from demo_user.products")
df = pd.DataFrame(tdf)
df

Unnamed: 0,id,product_name
0,40837,ريب اى استيك
1,0,مونتانا ذره ج
2,40838,americana beans with ghee g
3,1,ahmad tea fruit and herb selection herbal teab...
4,40839,بيبسى مشروب غازى زجاجه
...,...,...
37579,40827,علبه مولد مقاس
37580,40829,رانى عصير مانجو كانز مل
37581,40833,قرص فسدقيه
37582,40834,تونه قطعه جم روز


In [14]:
# Load the translation model described by the OPUS config; its `translate`
# method is applied to each product name in the cells below.
model = load_translation_model(OPUS_TRANSLATION_CONFIG_PATH)

In [None]:
# Translate every product name with the loaded translation model.
# Names are walked in slices of `batch_size`; each name is still passed to
# `model.translate` one at a time, so the slicing only groups the work into
# chunks (and gives tqdm a sensible unit of progress).
products = df["product_name"].tolist()
batch_size = 32
translations = []
for i in tqdm(range(0, len(products), batch_size), desc="Translating"):
    batch = products[i:i + batch_size]
    # Fixed: the method is `translate`; the previous spelling `translSate`
    # raised AttributeError on the first item.
    batch_translations = [model.translate(p) for p in batch]
    translations.extend(batch_translations)

In [None]:
# Attach the translations to their source rows; `translations` was built in the
# same order as df["product_name"].
df["translated_name"] = translations
# Keep `product_name` next to the translation: the following cell reads
# df["product_name"], and the `products` table is uploaded with all three
# columns (id, product_name, translated_name). `.copy()` keeps later column
# assignments from raising SettingWithCopyWarning.
df = df[["id", "product_name", "translated_name"]].copy()

In [16]:
# Translate each product name with `model.translate` via swifter's apply.
# NOTE(review): this fills the same `translated_name` column as the batch loop
# in the cells above; running both translates the dataset twice — keep one path.
df["translated_name"] = df["product_name"].swifter.apply(model.translate)

Pandas Apply:   0%|          | 0/37584 [00:00<?, ?it/s]

In [17]:
# Inspect the products together with their translations.
df

Unnamed: 0,id,product_name,translated_name
0,40837,ريب اى استيك,Reb i Estek
1,0,مونتانا ذره ج,Montana Corn C
2,40838,americana beans with ghee g,Americana Beans with ghee g
3,1,ahmad tea fruit and herb selection herbal teab...,AHMAD TEA FRUIT AND HERB SELECTION HERBAL TEAB...
4,40839,بيبسى مشروب غازى زجاجه,Pepsi is a bottled soda.
...,...,...,...
37579,40827,علبه مولد مقاس,Generator Box Size
37580,40829,رانى عصير مانجو كانز مل,Rani Mango Cans Juice
37581,40833,قرص فسدقيه,Fragile disc.
37582,40834,تونه قطعه جم روز,Tuna Jam Rose


In [18]:
# Replace demo_user.products with the frame that now carries the translations.
copy_to_sql(df=df, table_name="products", schema_name="demo_user", if_exists="replace")

In [19]:
# Snapshot the cleaned + translated products to CSV.
# NOTE(review): the DataFrame index is written as an extra unnamed column
# (no index=False). `CLEANED_FULL_DATASET_DATA_PATH` is imported at the top and
# may be the intended target path — confirm.
df.to_csv(DATA_PATH / "cleaned_full_dataset.csv")

### Insert Class Names in DB

In [4]:
# Reload the full combined dataset; its row index is used as the product id
# when building `actual_classes` below.
df = pd.read_csv(FULL_DATA_SET_DATA_PATH)
df

Unnamed: 0,Item_Name,class,Brand,Weight,Number of units,Size of units,Price,T.Price,Pack,Unit
0,مونتانا ذره 400 ج,Vegetables & Fruits,مونتانا,400جم,1,,,,كيس,جم
1,Ahmad Tea Fruit And Herb Selection Herbal Teab...,"Tea, Coffee & Hot Drinks",Ahmad Tea,,20,,,,علبة,
2,Lulu Brown Samoon 1pkt,Bakery,Lulu,,1,,,,عبوة,
3,فلفل رومى بلدى حشو وزن,Vegetables & Fruits,,,1,,,,كيس,
4,كانز,Soft Drinks & Juices,,,1,,,,كانز,
...,...,...,...,...,...,...,...,...,...,...
47717,جهينه كوكتيل 1ل,Soft Drinks & Juices,جهينه,1ل,1,,,,زجاجة,ل
47718,دبل ديرى مشروب بنكهه ليمون 275مل,Soft Drinks & Juices,دبل ديرى,275مل,1,,,,زجاجة,مل
47719,Galaxy Smooth Milk Chocolate 24 x 36g,"Chocolates, Sweets & Desserts",Galaxy,36جم,24,,,,عبوة,جم
47720,Hot Ketchup Squeeze – 320g,"Sauces, Dressings & Condiments",,320g,1,,,,عبوة,g


In [5]:
# Collect each distinct, non-missing class label once, in order of first
# appearance, and wrap the labels in a one-column frame named "class_name".
df_class = pd.unique(df["class"].dropna())
df_classes = pd.DataFrame(data={"class_name": df_class})

In [6]:
# Number the classes by their position (0..n-1) and put `id` first.
df_classes = df_classes.assign(id=df_classes.index).loc[:, ["id", "class_name"]]

In [7]:
# Upload the class lookup table to demo_user.classes, replacing any existing table.
copy_to_sql(df=df_classes, table_name="classes", schema_name="demo_user", if_exists="replace")

In [8]:
# Inspect the class lookup table (id -> class_name).
df_classes

Unnamed: 0,id,class_name
0,0,Vegetables & Fruits
1,1,"Tea, Coffee & Hot Drinks"
2,2,Bakery
3,3,Soft Drinks & Juices
4,4,"Tins, Jars & Packets"
5,5,Biscuits & Cakes
6,6,Beef & Processed Meat
7,7,"Chocolates, Sweets & Desserts"
8,8,Poultry
9,9,"Sauces, Dressings & Condiments"


In [9]:
# Ground-truth label per product: one row per row of the full dataset, keyed by
# that row's index (`product_id`), which matches the `id` kept in `products`.
# Built as a single chain so nothing is mutated on a slice of `df` (the former
# in-place rename / column assignment raised SettingWithCopyWarning).
df_actual_class = (
    df[["class"]]
    .rename(columns={"class": "class_name"})
    .assign(product_id=lambda frame: frame.index)
    .loc[:, ["product_id", "class_name"]]
)

In [10]:
# Upload the ground-truth labels to demo_user.actual_classes, replacing any existing table.
copy_to_sql(
    df=df_actual_class,
    table_name="actual_classes",
    schema_name="demo_user",
    if_exists="replace",
)

In [11]:
# Inspect the ground-truth table (product_id -> class_name).
df_actual_class

Unnamed: 0,product_id,class_name
0,0,Vegetables & Fruits
1,1,"Tea, Coffee & Hot Drinks"
2,2,Bakery
3,3,Vegetables & Fruits
4,4,Soft Drinks & Juices
...,...,...
47717,47717,Soft Drinks & Juices
47718,47718,Soft Drinks & Juices
47719,47719,"Chocolates, Sweets & Desserts"
47720,47720,"Sauces, Dressings & Condiments"


### Cleaning the Class Names

In [12]:
# Run the project's cleaning SQL against classes.class_name so class labels
# are normalised in the database.
td_db.execute_query(cleaning_query("classes", "class_name"))

[]

In [13]:
# Read the cleaned class table back to verify the result of the cleaning query.
# Note: this rebinds `df`, replacing the full dataset loaded earlier in this section.
tdf = td_db.execute_query("Select * from demo_user.classes")
df = pd.DataFrame(tdf)
df

Unnamed: 0,id,class_name
0,0,vegetables fruits
1,1,tea coffee hot drinks
2,2,bakery
3,3,soft drinks juices
4,4,tins jars packets
5,5,biscuits cakes
6,6,beef processed meat
7,7,chocolates sweets desserts
8,8,poultry
9,9,sauces dressings condiments


In [14]:
# Run the project's cleaning SQL against actual_classes.class_name so the
# ground-truth labels are normalised like the labels in `classes`.
td_db.execute_query(cleaning_query("actual_classes", "class_name"))

[]

In [15]:
# Read the cleaned ground-truth table back for inspection (row order is not
# guaranteed; there is no ORDER BY).
tdf = td_db.execute_query("Select * from demo_user.actual_classes")
df = pd.DataFrame(tdf)
df

Unnamed: 0,product_id,class_name
0,0,vegetables fruits
1,16383,cleaning supplies
2,1,tea coffee hot drinks
3,16384,beef processed meat
4,2,bakery
...,...,...
47717,47717,soft drinks juices
47718,47718,soft drinks juices
47719,47719,chocolates sweets desserts
47720,47720,sauces dressings condiments


### Create Product Embeddings

In [4]:
# Load the products (including `translated_name`) that will be embedded.
# Row order is arbitrary (no ORDER BY); `id` is the stable key.
tdf = td_db.execute_query("Select * from demo_user.products")
df = pd.DataFrame(tdf)
df

Unnamed: 0,id,product_name,translated_name
0,40837,ريب اى استيك,Reb i Estek
1,34097,دايم بانيه بارد,Cold-painted dime.
2,0,مونتانا ذره ج,Montana Corn C
3,34098,durra virgin olive oil ml,Durra virgin olive oil ml
4,40838,americana beans with ghee g,Americana Beans with ghee g
...,...,...,...
37579,34092,diet pepsi,Diet Pepsi
37580,34093,زجاجه زيت شعر مل لاستيك,A bottle of hairy oil is lastic.
37581,34094,melted cheese,Melted Cheese
37582,34095,مفروم بقري قليل الدسم,Low-fat cow's minced.


In [5]:
# Load the embedding model from the Qwen3-8B config.
# Note: this rebinds `model`, which earlier in the notebook held the translation model.
model = load_embedding_model(QWEN3_8B_CONFIG_PATH)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [6]:
# Embed the translated product names; `embeddings` is returned in the same
# order as the `products` list.
products = df["translated_name"].to_list()
embeddings = model.get_embeddings(products)

Batches:   0%|          | 0/1175 [00:00<?, ?it/s]

In [7]:
# Sanity check: number of embeddings returned (compare with the product row count).
len(embeddings)

37584

In [8]:
# Store each embedding as a plain Python list next to its product row, then
# show the vector length of the first row as a sanity check.
df["embeddings"] = embeddings.tolist()
len(df["embeddings"].iloc[0])

2047

In [9]:
# Keep only the key and the vector, and persist them to CSV (each vector is
# written as the text of a Python list; the index goes out as an unnamed column).
df = df.loc[:, ["id", "embeddings"]]
df.to_csv(path_or_buf=PRODUCT_FULL_DATASET_EMBEDDINGS_PATH)

### Insert Product Embeddings in DB (Directly from CSV)

In [10]:
# Reload the embeddings CSV. Each embedding was serialised as the text of a
# Python list, so parse it back into a real list with `ast.literal_eval`
# (which only evaluates literals, not arbitrary code).
df = pd.read_csv(PRODUCT_FULL_DATASET_EMBEDDINGS_PATH)

df["embeddings"] = df["embeddings"].swifter.apply(ast.literal_eval)

Pandas Apply:   0%|          | 0/37584 [00:00<?, ?it/s]

In [11]:
# Spread each embedding list over one column per dimension: embed_0 .. embed_{d-1},
# where d is the length of the first row's vector.
emb_cols = pd.DataFrame(
    df["embeddings"].to_list(),
    columns=[f"embed_{dim}" for dim in range(len(df["embeddings"].iloc[0]))],
)

In [12]:
# Put the product id next to its embedding columns (rows are aligned by index).
df_expanded = pd.concat(objs=(df[["id"]], emb_cols), axis="columns")
df_expanded

Unnamed: 0,id,embed_0,embed_1,embed_2,embed_3,embed_4,embed_5,embed_6,embed_7,embed_8,...,embed_2037,embed_2038,embed_2039,embed_2040,embed_2041,embed_2042,embed_2043,embed_2044,embed_2045,embed_2046
0,40837,0.024628,-0.003292,0.009033,-0.013893,0.003750,-0.035126,-0.014145,0.016937,-0.024109,...,-0.018875,-0.012360,-0.002480,0.005974,0.025146,-0.010353,-0.010391,-0.000572,0.003572,-0.014473
1,34097,0.013153,0.019333,0.010612,-0.013878,-0.010658,0.008835,-0.006615,-0.008400,-0.031525,...,0.008110,-0.016159,-0.022858,0.007610,0.024445,0.000197,-0.000741,0.005756,0.003275,-0.015450
2,0,0.012367,0.006184,-0.018692,-0.020218,-0.001524,0.011734,-0.007679,-0.007679,-0.022552,...,0.017899,-0.001819,-0.006920,0.015327,0.012535,-0.010918,-0.009842,0.007504,-0.009186,0.003918
3,34098,0.011986,0.011475,-0.000491,-0.020203,0.035736,-0.037598,-0.007423,-0.016724,-0.033325,...,0.013359,-0.005318,-0.008972,-0.013885,0.015671,-0.015656,-0.003576,0.001235,0.008720,-0.001623
4,40838,0.007233,0.006783,0.001117,-0.018677,0.028748,-0.022263,-0.010918,-0.014313,-0.019958,...,0.006351,-0.006577,-0.008698,0.003632,0.019730,0.000304,-0.006420,0.013268,-0.016449,-0.006222
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37579,34092,-0.008202,-0.000187,-0.000437,-0.005951,0.034149,-0.008850,-0.028763,-0.016235,-0.014610,...,-0.013084,-0.005249,-0.007004,0.024811,0.017181,0.001949,0.013535,0.012146,0.006767,-0.014252
37580,34093,0.002028,-0.007591,0.035339,0.025681,0.013565,-0.022751,-0.008324,-0.004944,-0.003431,...,0.000424,-0.012283,-0.012428,-0.001925,0.018341,-0.004295,-0.021042,-0.006779,0.004730,-0.009789
37581,34094,-0.006393,0.000750,-0.007851,0.008476,0.016434,-0.005798,-0.005123,-0.010757,-0.009506,...,-0.003717,-0.000853,-0.012604,0.003870,0.019592,-0.008682,0.001179,0.008362,0.002306,-0.012886
37582,34095,0.009796,0.009651,0.007153,0.011604,0.028488,0.001333,-0.000119,-0.034912,0.007496,...,-0.010597,-0.007881,0.001193,0.010933,0.022598,0.006023,-0.007172,0.006504,-0.010887,-0.004318


In [13]:
# Upload the wide product-embedding table to demo_user.p_embeddings, replacing any existing table.
copy_to_sql(
    df=df_expanded,
    table_name="p_embeddings",
    schema_name="demo_user",
    if_exists="replace",
)

### Create Class Embeddings and Insert Them in DB

In [16]:
# Reload the full dataset to derive the class list that will be embedded.
df = pd.read_csv(FULL_DATA_SET_DATA_PATH)
df.head()

Unnamed: 0,Item_Name,class,Brand,Weight,Number of units,Size of units,Price,T.Price,Pack,Unit
0,مونتانا ذره 400 ج,Vegetables & Fruits,مونتانا,400جم,1,,,,كيس,جم
1,Ahmad Tea Fruit And Herb Selection Herbal Teab...,"Tea, Coffee & Hot Drinks",Ahmad Tea,,20,,,,علبة,
2,Lulu Brown Samoon 1pkt,Bakery,Lulu,,1,,,,عبوة,
3,فلفل رومى بلدى حشو وزن,Vegetables & Fruits,,,1,,,,كيس,
4,كانز,Soft Drinks & Juices,,,1,,,,كانز,


In [17]:
# Distinct, non-missing class labels in order of first appearance — the same
# derivation used for the `classes` table, so positions line up with its ids.
# Note: `df` is rebound to this small one-column frame.
df_class = pd.unique(df["class"].dropna())
df = pd.DataFrame(data={"class": df_class})

In [18]:
# Load the Qwen3-8B embedding model for the class labels (same config as for
# the products, so both sets of vectors share one embedding space).
# Disabled alternative kept for reference:
# model = load_falcon3_embedding_model(FALCON3_7B_CONFIG_PATH)
model = load_embedding_model(QWEN3_8B_CONFIG_PATH)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [19]:
# Embed the raw class labels; `embeddings` follows the order of `classes`.
classes = df["class"].to_list()
embeddings = model.get_embeddings(classes)

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

In [20]:
# Convert the embeddings to nested Python lists and spread them over one
# column per dimension (embed_0 .. embed_{d-1}).
embeddings = embeddings.tolist()
emb_cols = pd.DataFrame(
    embeddings,
    columns=[f"embed_{dim}" for dim, _ in enumerate(embeddings[0])],
)

In [21]:
# Use each class's position as its `id`, then place the id next to the
# embedding columns (rows are aligned by index).
df = df.reset_index().rename(columns={"index": "id"})
df_expanded = pd.concat(objs=(df[["id"]], emb_cols), axis="columns")

In [22]:
# Upload the wide class-embedding table to demo_user.c_embeddings, replacing any existing table.
copy_to_sql(
    df=df_expanded,
    table_name="c_embeddings",
    schema_name="demo_user",
    if_exists="replace",
)

### In-DB Similarity

In [23]:
# Fully qualified name of the table that receives the nearest class per product.
# NOTE: "similiratiy" is misspelled, but this is the live table name (a later
# cell also queries it by this literal name) — rename only together with the table.
RESULT_TABLE = "demo_user.similiratiy_score"

In [24]:
# Build the DDL for the result table: one row per product holding the id of
# its closest class and the cosine distance between the two embeddings.
# NOTE(review): this is a plain CREATE TABLE, so executing it while the table
# already exists will presumably fail — drop the table first when re-running.
q = f"""
    CREATE TABLE {RESULT_TABLE} (
        item_id BIGINT,
        closest_category_id BIGINT,
        cosine_distance FLOAT
    );
    """

In [25]:
# Execute the CREATE TABLE statement built in the previous cell.
td_db.execute_query(q)

[]

In [26]:
# Column lists handed to TD_VectorDistance: the first 1024 embedding columns,
# once as bare identifiers (for the SELECT lists) and once single-quoted (for
# the Target/Ref FeatureColumns arguments).
# NOTE(review): the p_embeddings preview earlier shows columns up to embed_2046,
# so only part of each stored vector is compared — confirm 1024 is intended.
vector_cols = ", ".join(f"embed_{i}" for i in range(1024))
vector_cols_quoted = ", ".join(f"'{col}'" for col in vector_cols.split(", "))

In [27]:
# Nearest-class assignment done inside the database:
#   1. TD_VectorDistance computes the cosine distance between every product
#      vector (p_embeddings, target) and every class vector (c_embeddings,
#      reference) over the columns listed in `vector_cols`.
#   2. ROW_NUMBER ranks the classes per product by ascending distance.
#   3. Only the closest class (rn = 1) is inserted into RESULT_TABLE as
#      (product id, class id, distance).
# NOTE(review): INSERT appends — re-running this cell without emptying the
# result table first will add a second copy of every row.
classification_sql = f"""
INSERT INTO {RESULT_TABLE}
WITH RankedDistances AS (
    SELECT
        o.Target_ID AS product_id,
        o.Reference_ID AS class_id,
        o.Distance,
        ROW_NUMBER() OVER (PARTITION BY o.Target_ID ORDER BY o.Distance ASC) as rn
    FROM TD_VectorDistance (
        ON (SELECT id, {vector_cols} FROM p_embeddings) AS TargetTable
        ON (SELECT id, {vector_cols} FROM c_embeddings) AS ReferenceTable DIMENSION
        USING
            TargetIDColumn('id')
            RefIDColumn('id')
            TargetFeatureColumns({vector_cols_quoted})
            RefFeatureColumns({vector_cols_quoted})
            DistanceMeasure('cosine')
    ) AS o
)
SELECT
    product_id,
    class_id,
    Distance
FROM RankedDistances
WHERE rn = 1;
"""
td_db.execute_query(classification_sql)

[]

In [28]:
# Read the stored nearest-class results back for a quick look; the table name
# comes from RESULT_TABLE so it cannot drift from the CREATE/INSERT statements.
tdf = td_db.execute_query(f"SELECT * FROM {RESULT_TABLE}")
tdf = DataFrame(tdf, False)
tdf



item_id,closest_category_id,cosine_distance
0,31,0.3941303488018985
11,13,0.3753216841277768
18,15,0.4895851641730774
1,23,0.3403545182404879
17,10,0.3979370549015366
23,31,0.2720071879025786
10,23,0.2652934148967831
3,31,0.3003517338198356
8,31,0.2945302768926364
2,37,0.2573750707782266


In [29]:
# Join every stored nearest-class result with the product's translated name,
# the predicted class name (classes) and the ground-truth class name
# (actual_classes), all keyed by product/class id.
# NOTE(review): the `similarity_score` column carries `cosine_distance`
# (smaller = closer), so the alias is misleading for downstream readers.
results_query = f"""
SELECT
    p.translated_name AS product_name,
    c.class_name AS predicted_class,
    a.class_name AS actual_class,
    r.cosine_distance AS similarity_score
FROM {RESULT_TABLE} r
JOIN products p
    ON r.item_id = p.id
JOIN classes c
    ON r.closest_category_id = c.id
JOIN actual_classes a
    ON a.product_id = p.id;
"""

In [30]:
# Run the join and load predicted vs. actual classes into pandas for scoring.
tdf = td_db.execute_query(results_query)
df = pd.DataFrame(tdf)

In [31]:
# Inspect predicted vs. actual class per product.
df

Unnamed: 0,product_name,predicted_class,actual_class,similarity_score
0,Miro Jum Milk Powder,dairy eggs,dairy eggs,0.285559
1,K K K K K K K K K K K K K K K K K K K K K K,wear,rice pasta pulses,0.403083
2,Rich. Smoked chicken breasts.,chips crackers,poultry,0.320701
3,Montana Corn C,chips crackers,vegetables fruits,0.394130
4,Large box.,wear,biscuits cakes,0.358127
...,...,...,...,...
37579,american garden natural popcorn microwave g,chips crackers,chips crackers,0.279856
37580,Schweppes tonic water ml,soft drinks juices,soft drinks juices,0.294433
37581,Rice K,rice pasta pulses,rice pasta pulses,0.355480
37582,Almarai Yogurt Drink Strawberry,soft drinks juices,dairy eggs,0.370586


In [32]:
# Discard rows with any missing value so the metrics below only see complete
# (prediction, ground truth) pairs.
df = df.dropna()

In [33]:
from sklearn.metrics import f1_score

# Weighted F1 of the nearest-class predictions against the ground-truth labels
# (each class's F1 is weighted by its number of true instances).
y_true = df["actual_class"].to_list()
y_pred = df["predicted_class"].to_list()

f1_score(y_true, y_pred, average="weighted")

0.3070420100304256

In [34]:
# Store the scored predictions in demo_user.results for the in-database evaluator.
copy_to_sql(df=df, table_name="results", schema_name="demo_user", if_exists="replace")

In [35]:
# Compute classification metrics inside Teradata with TD_ClassificationEvaluator,
# comparing `actual_class` with `predicted_class` in demo_user.results and
# writing the metrics to the permanent table `classification_metrics`.
# The Labels(...) list is hard-coded with the cleaned class names and must be
# kept in sync with the contents of the `classes` table.
# NOTE(review): the permanent output table is created by this call; re-running
# while it still exists will presumably fail — drop it first.
query = """
SELECT * FROM TD_ClassificationEvaluator (
   ON demo_user.results AS InputTable
   OUT PERMANENT TABLE OutputTable(classification_metrics)
   USING
       ObservationColumn('actual_class')
       PredictionColumn('predicted_class')
       Labels('vegetables fruits', 'tea coffee hot drinks', 'bakery', 'soft drinks juices', 'tins jars packets', 'biscuits cakes', 'beef processed meat', 'chocolates sweets desserts', 'poultry', 'sauces dressings condiments', 'nuts dates dried fruits', 'cleaning supplies', 'dairy eggs', 'chips crackers', 'water', 'rice pasta pulses', 'personal care skin body care', 'furniture', 'cooking ingredients', 'vegetables herbs', 'condiments dressings marinades', 'sweets desserts', 'laundry detergents', 'tea and coffee', 'disposables napkins', 'sugar home baking', 'perfumes deodorants', 'fruits', 'stationary', 'jams spreads syrups', 'baby care', 'home appliances', 'fish', 'breakfast cereals bars', 'hair shower bath soap', 'party supplies and gifts', 'wear', 'footwear', 'candles air fresheners', 'beef lamb meat', 'pets care', 'mobile tablets', 'dental care')
) AS dt;
"""
tdf = td_db.execute_query(query)

In [36]:
# Display the metrics written by TD_ClassificationEvaluator.
# NOTE(review): `DataFrame` here is presumably the teradataml class pulled in
# by `from teradataml import *`, not pandas — confirm.
DataFrame(td_db.execute_query("select * from demo_user.classification_metrics"))



SeqNum,Metric,MetricValue,index_label
3,Micro-Recall,0.2803583808863527,2
5,Macro-Precision,0.310650116031118,4
6,Macro-Recall,0.2344749440869364,5
7,Macro-F1,0.1426294619249019,6
9,Weighted-Recall,0.2803583808863527,8
10,Weighted-F1,0.3074355071589172,9
8,Weighted-Precision,0.713085427841726,7
4,Micro-F1,0.2803583808863527,3
2,Micro-Precision,0.2803583808863527,1
1,Accuracy,0.2803583808863527,0


## Disconnect

In [None]:
# Close the Teradata connection opened at the top of the notebook.
td_db.disconnect()