# Import packages

In [1]:
import pandas as pd
from teradataml import *
import ast
import json
import torch
from teradataml.dataframe.copy_to import copy_to_sql
from transformers.generation import GenerationMixin

from modules.db import TeradataDatabase
from modules.models import( 
    OpusTranslationModelConfig, 
                        OpusTranslationModel, 
                        SentenceEmbeddingConfig, 
                        SentenceEmbeddingModel, 
)
from utils import clean_text, load_embedding_model, unicode_clean, load_translation_model
from constants import (
    CLEANED_TEST_DATA_PATH, TRAIN_VAL_DATA_PATH, CLASS_EMBEDDINGS_PATH, PRODUCT_TEST_EMBEDDINGS_PATH, 
    CLEANED_GPC_PATH, CLEANED_TEST_DATA_PATH, TEST_DATA_PATH, E5_LARGE_INSTRUCT_CONFIG_PATH, 
    OPUS_TRANSLATION_CONFIG_PATH, DATA_PATH, PRODUCT_TRAIN_EMBEDDINGS_PATH, QWEN3_8B_CONFIG_PATH, VALIDATION_DATA_PATH,
    FULL_DATA_SET_DATA_PATH, PRODUCT_FULL_DATASET_EMBEDDINGS_PATH,  CLEANED_FULL_DATASET_DATA_PATH,
    CLASS_EMBEDDINGS_PATH_QWEN, PRODUCT_FULL_DATASET_EMBEDDINGS__QWEN_PATH, TD_DB

)


## Connect to database

In [2]:
td_db = TeradataDatabase()
td_db.connect()

### Combine Dataset into 1 file

In [14]:
df_train = pd.read_csv(TRAIN_VAL_DATA_PATH)
df_test = pd.read_csv(TEST_DATA_PATH)
df_valid = pd.read_csv(VALIDATION_DATA_PATH)

NameError: name 'VALIDATION_DATA_PATH' is not defined

In [5]:
full_df = pd.concat([df_train, df_test, df_valid], ignore_index=True)
full_df.to_csv(FULL_DATA_SET_DATA_PATH, index=False)

### Insert Product Names in DB

In [104]:
df = pd.read_csv(TEST_DATA_PATH)
df.head()

Unnamed: 0,Item_Name,class,Brand,Weight,Number of units,Size of units,Price,T.Price,Pack,Unit
0,Americana Okra zero 400 gm,Vegetables & Fruits,Americana,400gm,1,,,,كيس,gm
1,ليمون اداليا 500 جم,"Tins, Jars & Packets",,500جم,1,,,,علبة,جم
2,صلصه هاينز برطمان خصم عرض,"Tins, Jars & Packets",هاينز,,1,,,,علبة,
3,Dasani water 330ML,Water,Dasani,330مل,1,,,,زجاجة,مل
4,بودرة عصير أناناس من سورس، 900 جم,Soft Drinks & Juices,سورس,900جم,1,,,,عبوة,جم


In [105]:
df.rename(columns={"Item_Name": "product_name"}, inplace=True)

In [106]:
df.drop_duplicates(subset=["product_name"], inplace=True)
df.dropna(subset=["product_name"], inplace=True)
df.reset_index(inplace=True)
df.rename(columns={'index': 'id'}, inplace=True)
df = df[["id" , "product_name"]]

In [107]:
df['product_name'] = df['product_name'].apply(unicode_clean)

In [108]:
copy_to_sql(df, "products", "demo_user", if_exists="replace")

### Cleaning the Products

In [109]:
tdf = DataFrame.from_table("products", schema_name=TD_DB)
# tdf = td_db.execute_query("Select * from demo_user.products")
# tdf = DataFrame(tdf)

In [110]:
cleaning_query = """
UPDATE demo_user.products
SET product_name =
                  TRIM(
                    REGEXP_REPLACE(
                      REGEXP_REPLACE(
                        REGEXP_REPLACE(product_name, '[[:digit:]]+', ''), 
                        '[-_/\\|]', ''),                              
                      '[[:punct:]]', ''                              
                    )
                  )
                ;
"""

In [111]:
td_db.execute_query(cleaning_query)

[]

In [112]:
tdf = tdf.assign(
    product_name = tdf.product_name.str.lower()
    )

tdf_stripped = tdf.assign(
    product_name = tdf.product_name.str.strip()
    )        

In [113]:
copy_to_sql(tdf, "products", "demo_user", if_exists="replace")

### Translating Products 

In [114]:
tdf = DataFrame.from_table("products", schema_name=TD_DB)
df = tdf.to_pandas()
df

Unnamed: 0,id,product_name
0,0,americana okra zero gm
1,1,ليمون اداليا جم
2,2,صلصه هاينز برطمان خصم عرض
3,3,dasani water ml
4,4,بودرة عصير أناناس من سورس، جم
...,...,...
4568,4767,كادبورى شوكولاته كريسبللو ج
4569,4768,جهينه كوكتيل ل
4570,4769,دبل ديرى مشروب بنكهه ليمون مل
4571,4770,galaxy smooth milk chocolate x g


In [115]:
model = load_translation_model(OPUS_TRANSLATION_CONFIG_PATH)

In [116]:
products = df["product_name"].tolist()
batch_size = 32
translations = []
for i in range(0, len(products), batch_size):
    batch = products[i:i+batch_size]
    batch_translations = [model.translate(p) for p in batch]
    translations.extend(batch_translations)

In [117]:
df["translated_name"] = translations
df = df[["id", "translated_name"]]

In [118]:
copy_to_sql(df, "translated_products", "demo_user", if_exists="replace")

In [119]:
df.to_csv(DATA_PATH / "cleaned_full_dataset.csv")

### Insert Class Names in DB

In [159]:
df = pd.read_csv(TEST_DATA_PATH)
df.head()

Unnamed: 0,Item_Name,class,Brand,Weight,Number of units,Size of units,Price,T.Price,Pack,Unit
0,Americana Okra zero 400 gm,Vegetables & Fruits,Americana,400gm,1,,,,كيس,gm
1,ليمون اداليا 500 جم,"Tins, Jars & Packets",,500جم,1,,,,علبة,جم
2,صلصه هاينز برطمان خصم عرض,"Tins, Jars & Packets",هاينز,,1,,,,علبة,
3,Dasani water 330ML,Water,Dasani,330مل,1,,,,زجاجة,مل
4,بودرة عصير أناناس من سورس، 900 جم,Soft Drinks & Juices,سورس,900جم,1,,,,عبوة,جم


In [157]:
df_class = df["class"].dropna().unique()
df_classes = pd.DataFrame({"class_name": df_class})

In [142]:
df_classes["id"] = df_classes.index
df_classes = df_classes[["id", "class_name"]]

In [143]:
copy_to_sql(df_classes, "classes", "demo_user", if_exists="replace")

In [144]:
df_classes

Unnamed: 0,id,class_name
0,0,Vegetables & Fruits
1,1,"Tins, Jars & Packets"
2,2,Water
3,3,Soft Drinks & Juices
4,4,Biscuits & Cakes
5,5,"Tea, Coffee & Hot Drinks"
6,6,"Rice, Pasta & Pulses"
7,7,Cleaning Supplies
8,8,Bakery
9,9,Cooking Ingredients


In [160]:
df_actual_class = df[["class"]]
df_actual_class.rename(columns={'class': 'class_name'}, inplace=True)
df_actual_class["product_id"] = df_actual_class.index
df_actual_class = df_actual_class[["product_id", "class_name"]]

In [169]:
copy_to_sql(df_actual_class, "actual_classes", "demo_user", if_exists="replace")

### Cleaning the Class Names

In [162]:
tdf = DataFrame.from_table("classes", schema_name=TD_DB)

In [128]:
cleaning_query = """
UPDATE demo_user.classes
SET class_name =
                  TRIM(
                    REGEXP_REPLACE(
                      REGEXP_REPLACE(
                        REGEXP_REPLACE(class_name, '[[:digit:]]+', ''), 
                        '[-_/\\|]', ''),                              
                      '[[:punct:]]', ''                              
                    )
                  )
                ;
"""

In [129]:
td_db.execute_query(cleaning_query)

[]

In [131]:
tdf = tdf.assign(
    class_name = tdf.class_name.str.lower()
    )

tdf_stripped = tdf.assign(
    class_name = tdf.class_name.str.strip()
    )        

In [132]:
copy_to_sql(tdf, "classes", "demo_user", if_exists="replace")

In [163]:
cleaning_query = """
UPDATE demo_user.actual_classes
SET class_name = 
                  TRIM(
                      REGEXP_REPLACE(
                        REGEXP_REPLACE(class_name, '[^\w\s]', ' '),                                
                      '[-_/\\|]', ' ')                           
                  );

"""

In [164]:
td_db.execute_query(cleaning_query)

[]

In [165]:
tdf = tdf.assign(
    class_name = tdf.class_name.str.lower()
    )

tdf_stripped = tdf.assign(
    class_name = tdf.class_name.str.strip()
    )        

In [166]:
copy_to_sql(tdf, "actual_classes", "demo_user", if_exists="replace")

In [170]:
tdf = td_db.execute_query("Select * from demo_user.actual_classes")
df = pd.DataFrame(tdf)
df

Unnamed: 0,product_id,class_name
0,0,Vegetables & Fruits
1,1,"Tins, Jars & Packets"
2,2,"Tins, Jars & Packets"
3,3,Water
4,4,Soft Drinks & Juices
...,...,...
4768,4768,Soft Drinks & Juices
4769,4769,Soft Drinks & Juices
4770,4770,"Chocolates, Sweets & Desserts"
4771,4771,"Sauces, Dressings & Condiments"


### Create Product Embeddings

In [171]:
tdf = DataFrame.from_table("translated_products", schema_name=TD_DB)
df = tdf.to_pandas()
df

Unnamed: 0,id,translated_name
0,0,americana okra zero gm
1,1,Lemon Adalia
2,2,Heinz Bartman Sauce Discount Offer
3,3,Dasani water ml
4,4,"Pineapple Juice Powder from Soros, gm"
...,...,...
4568,4767,Cadbury Chocolate Crisplow
4569,4768,Juhayna Cocktail
4570,4769,Double Dairy Lemonade
4571,4770,galaxy smooth milk chocolate x g


In [172]:
# model = load_falcon3_embedding_model(FALCON3_7B_CONFIG_PATH)
model = load_embedding_model(E5_LARGE_INSTRUCT_CONFIG_PATH)

In [173]:
products = df["translated_name"].tolist()
embeddings = model.get_embeddings(products)
# batch_size = 26
# embeddings = []

# for i in range(0, len(products), batch_size):
#     batch = products[i:i+batch_size]
#     batch_embeddings = model.get_embeddings(batch)

#     # Ensure each batch is a proper list of lists
#     if hasattr(batch_embeddings, "tolist"):
#         batch_embeddings = batch_embeddings.tolist()

#     embeddings.extend(batch_embeddings)

Batches:   0%|          | 0/143 [00:00<?, ?it/s]

In [174]:
df["embeddings"] = embeddings.tolist() #remove the .tolist in case of QWEN
len(df["embeddings"][0])

1024

In [175]:
df = df[["id", "embeddings"]]
df.to_csv(PRODUCT_FULL_DATASET_EMBEDDINGS_PATH)

### Insert Product Embeddings in DB (Directly from CSV)

In [176]:
df = pd.read_csv(PRODUCT_FULL_DATASET_EMBEDDINGS_PATH)

df["embeddings"] = df["embeddings"].apply(ast.literal_eval)

In [177]:
emb_cols = pd.DataFrame(df["embeddings"].to_list(), columns=[f"embed_{i}" for i in range(len(df["embeddings"][0]))])

In [178]:
df_expanded = pd.concat([df[['id']], emb_cols], axis=1)
df_expanded

Unnamed: 0,id,embed_0,embed_1,embed_2,embed_3,embed_4,embed_5,embed_6,embed_7,embed_8,...,embed_1014,embed_1015,embed_1016,embed_1017,embed_1018,embed_1019,embed_1020,embed_1021,embed_1022,embed_1023
0,0,0.013188,0.020378,-0.002300,-0.054269,0.011009,-0.023781,-0.013544,0.037452,0.021889,...,-0.031499,-0.020469,0.013406,-0.034482,0.032201,0.045320,0.001038,-0.015150,-0.043390,0.023568
1,1,0.000913,0.044237,0.008715,-0.064764,0.023550,-0.032385,-0.023913,0.038957,0.024926,...,-0.033956,-0.030539,0.015139,-0.014355,0.030280,0.025164,-0.000427,-0.029246,-0.057859,0.015447
2,2,0.016676,0.018593,-0.022479,-0.039902,0.030382,-0.008170,0.000748,0.039143,0.043819,...,-0.034079,-0.052360,0.025754,-0.028733,0.005298,0.018353,-0.007540,-0.031751,-0.049910,0.016198
3,3,0.016552,0.025449,-0.006817,-0.056296,0.003788,-0.034491,-0.002836,0.022980,0.029146,...,-0.036514,-0.044036,0.025941,-0.035547,0.025945,0.044669,0.009634,-0.024075,-0.030797,0.017691
4,4,0.032512,0.034133,-0.028527,-0.023061,0.008294,-0.031860,-0.009080,0.065644,0.043994,...,-0.033818,-0.056441,0.017332,-0.017292,0.015602,0.035395,-0.007652,-0.011969,-0.059368,-0.000100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4568,4767,0.009798,0.031787,0.011036,-0.077842,0.019153,-0.014723,-0.005221,0.060963,0.022137,...,-0.018525,-0.034274,0.007236,-0.025346,0.027386,0.012861,0.011611,-0.028133,-0.044855,0.024929
4569,4768,0.011698,0.033627,-0.000560,-0.039218,0.005534,0.004735,-0.010160,0.059395,0.029578,...,-0.019567,-0.032597,0.021086,-0.030845,0.007493,0.024988,-0.005461,-0.015058,-0.044411,0.048753
4570,4769,0.006801,0.026302,0.008740,-0.039210,0.006213,-0.012053,-0.023825,0.056800,0.032229,...,-0.034754,-0.041202,0.038180,-0.037435,0.028802,0.026444,0.001772,-0.007644,-0.044737,0.047548
4571,4770,0.011489,0.033557,-0.004621,-0.058418,0.027675,-0.021438,-0.034241,0.036658,0.033401,...,-0.025512,-0.046632,0.015605,-0.033435,0.018552,0.021750,0.006347,-0.013429,-0.033729,0.024975


In [179]:
copy_to_sql(df_expanded, "p_embeddings", "demo_user", if_exists="replace")

### Insert Class Embeddings in DB (Directly from CSV)

In [180]:
df = pd.read_csv(TEST_DATA_PATH)
df.head()

Unnamed: 0,Item_Name,class,Brand,Weight,Number of units,Size of units,Price,T.Price,Pack,Unit
0,Americana Okra zero 400 gm,Vegetables & Fruits,Americana,400gm,1,,,,كيس,gm
1,ليمون اداليا 500 جم,"Tins, Jars & Packets",,500جم,1,,,,علبة,جم
2,صلصه هاينز برطمان خصم عرض,"Tins, Jars & Packets",هاينز,,1,,,,علبة,
3,Dasani water 330ML,Water,Dasani,330مل,1,,,,زجاجة,مل
4,بودرة عصير أناناس من سورس، 900 جم,Soft Drinks & Juices,سورس,900جم,1,,,,عبوة,جم


In [181]:
df_class = df["class"].dropna().unique()
# df = df.reset_index(drop=False) 
# df.rename(columns={'index': 'id'}, inplace=True)
df = pd.DataFrame({"class": df_class})

In [182]:
model = load_embedding_model(E5_LARGE_INSTRUCT_CONFIG_PATH)

In [183]:
classes = df["class"].tolist()
embeddings = model.get_embeddings(classes)
# batch_size = 26
# embeddings = []

# for i in range(0, len(products), batch_size):
#     batch = products[i:i+batch_size]
#     batch_embeddings = model.get_embeddings(batch)

#     # Ensure each batch is a proper list of lists
#     if hasattr(batch_embeddings, "tolist"):
#         batch_embeddings = batch_embeddings.tolist()

#     embeddings.extend(batch_embeddings)

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

In [184]:
embeddings = embeddings.tolist()
emb_cols = pd.DataFrame(embeddings, columns=[f'embed_{i}' for i in range(len(embeddings[0]))])

In [185]:
df.reset_index(inplace=True)
df.rename(columns={'index': 'id'}, inplace=True)
df_expanded = pd.concat([df[['id']], emb_cols], axis=1)

In [186]:
copy_to_sql(df_expanded, "c_embeddings", "demo_user", if_exists="replace")

### In-DB Similiraity

In [187]:
RESULT_TABLE = "demo_user.similiratiy_score"

In [190]:
# Create the result table
q = f"""
    CREATE TABLE {RESULT_TABLE} (
        item_id BIGINT,
        closest_category_id BIGINT,
        cosine_distance FLOAT
    );
    """

In [191]:
td_db.execute_query(q)

[]

In [192]:
# Generate the string of vector columns for 1024 dimensions
vector_cols = ", ".join([f"embed_{i}" for i in range(1024)])

# Generate the quoted string for feature columns
vector_cols_quoted = ", ".join([f"'embed_{i}'" for i in range(1024)])

In [193]:
classification_sql = f"""
INSERT INTO {RESULT_TABLE}
WITH RankedDistances AS (
    SELECT
        o.Target_ID AS product_id,
        o.Reference_ID AS class_id,
        o.Distance,
        ROW_NUMBER() OVER (PARTITION BY o.Target_ID ORDER BY o.Distance ASC) as rn
    FROM TD_VectorDistance (
        ON (SELECT id, {vector_cols} FROM p_embeddings) AS TargetTable
        ON (SELECT id, {vector_cols} FROM c_embeddings) AS ReferenceTable DIMENSION
        USING
            TargetIDColumn('id')
            RefIDColumn('id')
            TargetFeatureColumns({vector_cols_quoted})
            RefFeatureColumns({vector_cols_quoted})
            DistanceMeasure('cosine')
    ) AS o
)
SELECT
    product_id,
    class_id,
    Distance
FROM RankedDistances
WHERE rn = 1;
"""
td_db.execute_query(classification_sql)

[]

In [194]:
tdf = td_db.execute_query("SELECT * FROM demo_user.similiratiy_score")
tdf = DataFrame(tdf, False)
tdf



item_id,closest_category_id,cosine_distance
0,31,0.139783659452218
11,10,0.1181549479028765
18,8,0.147032502754974
1,31,0.1322058824904213
17,11,0.1179319839223537
23,1,0.1251050758861031
10,1,0.1165338996780606
3,2,0.1082948021519601
8,28,0.1563936197276643
2,12,0.1397556609337787


In [195]:
results_query = f"""
SELECT
    p.translated_name AS product_name,
    c.class_name AS predicted_class,
    a.class_name AS actual_class,
    r.cosine_distance AS similarity_score
FROM {RESULT_TABLE} r
JOIN translated_products p
    ON r.item_id = p.id
JOIN classes c
    ON r.closest_category_id = c.id
JOIN actual_classes a
    ON a.product_id = p.id;
"""

In [196]:
tdf = td_db.execute_query(results_query)
df = pd.DataFrame(tdf)

In [197]:
df

Unnamed: 0,product_name,predicted_class,actual_class,similarity_score
0,Lemon Adalia,"Jams, Spreads & Syrups","Tins, Jars & Packets",0.132206
1,Candlestick Chocolate Pure Hazelnut,"Chocolates, Sweets & Desserts",Biscuits & Cakes,0.111815
2,Lubya Baladi Jam,"Jams, Spreads & Syrups","Rice, Pasta & Pulses",0.144473
3,americana okra zero gm,"Jams, Spreads & Syrups",Vegetables & Fruits,0.139784
4,Dasani water ml,Water,Water,0.108295
...,...,...,...,...
4568,morganic pasta sauce italian bolognese g,"Rice, Pasta & Pulses","Tins, Jars & Packets",0.144159
4569,Foody fava beans plain k,Vegetables & Fruits,"Tins, Jars & Packets",0.129441
4570,Saw chicken hips.,Poultry,Poultry,0.160542
4571,Almarai Whole Milk L Plastic,Dairy & Eggs,Dairy & Eggs,0.135945


In [198]:
df.dropna(inplace=True)

In [199]:
from sklearn.metrics import f1_score

y_pred = df["predicted_class"].tolist()
y_true = df["actual_class"].tolist()

f1_score(y_true, y_pred, average="weighted")

0.4227343325695598

## Disconnect

In [3]:
td_db.disconnect()