# Import Libraries

In [1]:
import torch
import pandas as pd

import json

from utils import clean_text, remove_numbers, remove_punctuations, remove_strings, remove_stopwords, remove_special_chars, load_gpc_to_classes, load_embedding_model
from constants import (
    TEST_DATA_PATH,
    CLEANED_TEST_DATA_PATH,
    GPC_PATH,
    E5_LARGE_INSTRUCT_CONFIG_PATH,
    CLEANED_GPC_PATH,
    PRODUCT_EMBEDDINGS_PATH,
    CLASS_EMBEDDINGS_PATH,
    SIMILIRAITY_SCORES_PATH,
    DEVICE
)

# Prepare Cleaned Products File

## Load Data

In [2]:
# Load the raw test set and keep only the product name and brand columns.
# `.copy()` gives `df_test` its own data: later cells add columns to it and
# filter it, and doing that on a column slice of the raw frame triggers
# pandas' SettingWithCopyWarning.
df_test = pd.read_csv(TEST_DATA_PATH)[["Item_Name", "Brand"]].copy()
df_test.head(5)

Unnamed: 0,Item_Name,Brand
0,Americana Okra zero 400 gm,Americana
1,ليمون اداليا 500 جم,
2,صلصه هاينز برطمان خصم عرض,هاينز
3,Dasani water 330ML,Dasani
4,بودرة عصير أناناس من سورس، 900 جم,سورس


## Remove Nulls

In [3]:
# Drop rows that have no product name; a missing Brand is allowed to stay.
# Rebinding the name (instead of mutating in place) keeps the cell re-runnable.
df_test = df_test.dropna(subset=["Item_Name"])
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4773 entries, 0 to 4772
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Item_Name  4773 non-null   object
 1   Brand      3827 non-null   object
dtypes: object(2)
memory usage: 74.7+ KB


## Normalize Data

## Clean Data

In [4]:
# Build the text that will be embedded. `clean_text` is applied with axis=1,
# so it is called once per row and receives the whole row (Item_Name, Brand).
df_test = df_test.assign(cleaned_text=df_test.apply(clean_text, axis=1))
df_test.head()

Unnamed: 0,Item_Name,Brand,cleaned_text
0,Americana Okra zero 400 gm,Americana,okra zero gm
1,ليمون اداليا 500 جم,,ليمون اداليا جم
2,صلصه هاينز برطمان خصم عرض,هاينز,صلصه برطمان خصم عرض
3,Dasani water 330ML,Dasani,water ml
4,بودرة عصير أناناس من سورس، 900 جم,سورس,بو عصير أناناس من جم


In [5]:
# Discard products whose cleaned text ended up as an empty string.
has_text = df_test["cleaned_text"] != ""
df_test = df_test[has_text]

In [6]:
# Turn the current row index into a regular column and call it `id`.
df_test = df_test.reset_index().rename(columns={"index": "id"})

# Save Cleaned Data

In [7]:
# Preview the first rows of the cleaned frame before it is written to disk.
df_test.head(n=5)

Unnamed: 0,id,Item_Name,Brand,cleaned_text
0,0,Americana Okra zero 400 gm,Americana,okra zero gm
1,1,ليمون اداليا 500 جم,,ليمون اداليا جم
2,2,صلصه هاينز برطمان خصم عرض,هاينز,صلصه برطمان خصم عرض
3,3,Dasani water 330ML,Dasani,water ml
4,4,بودرة عصير أناناس من سورس، 900 جم,سورس,بو عصير أناناس من جم


In [8]:
# Also drop rows whose cleaned text is the literal string "nan".
df_test = df_test.loc[df_test["cleaned_text"].ne("nan")]

In [9]:
# Overwrite `id` with the frame's current index, then persist the cleaned data.
# NOTE(review): the index is not reset after the "nan" filter in the previous
# cell, so the ids written here may have gaps.
df_test = df_test.assign(id=df_test.index)
df_test.to_csv(CLEANED_TEST_DATA_PATH, encoding="utf-8-sig", index=False)

# Prepare Cleaned Classes File

In [2]:
# Load the GPC classes and strip special characters from both text columns.
gpc_df = load_gpc_to_classes()
for text_column in ("class_name", "description"):
    gpc_df[text_column] = gpc_df[text_column].apply(remove_special_chars)

# Expose the row index as an explicit `id` column so classes can be joined
# back to their embeddings later.
gpc_df = gpc_df.reset_index().rename(columns={"index": "id"})

gpc_df.to_csv(CLEANED_GPC_PATH, encoding="utf-8-sig", index=False)

# Process Product Embeddings

In [3]:
# Re-read the cleaned products from disk so this section does not depend on
# variables left over from the cleaning section.
product_df = pd.read_csv(CLEANED_TEST_DATA_PATH)

# Plain Python lists: ids for bookkeeping, cleaned text as embedding input.
product_id = product_df["id"].to_list()
product_name = product_df["cleaned_text"].to_list()

In [4]:
# Load the embedding model from the config at E5_LARGE_INSTRUCT_CONFIG_PATH.
model = load_embedding_model(E5_LARGE_INSTRUCT_CONFIG_PATH)
# Embed every cleaned product string.
# NOTE(review): presumably returns one vector per input string — the next cell
# calls `.tolist()` on the result and stores it as a column of `product_df`;
# confirm against the model wrapper in utils.
embeddings = model.get_embeddings(product_name)

In [5]:
# Convert the model output to nested Python lists and store one vector per
# product row.
embeddings = embeddings.tolist()
product_df = product_df.assign(embeddings=embeddings)

In [6]:
# Persist only the id -> embedding mapping; names stay in the cleaned CSV.
product_embedding_df = product_df.loc[:, ["id", "embeddings"]]
product_embedding_df.to_csv(PRODUCT_EMBEDDINGS_PATH, encoding="utf-8-sig", index=False)

# Process Class Embeddings

In [2]:
# Re-read the cleaned GPC classes from disk.
class_df = pd.read_csv(CLEANED_GPC_PATH)

# Ids for bookkeeping; class names are cast to str before being embedded.
class_id = class_df["id"].to_list()
class_name = class_df["class_name"].astype(str).to_list()

In [3]:
# Load the embedding model from the config at E5_LARGE_INSTRUCT_CONFIG_PATH.
model = load_embedding_model(E5_LARGE_INSTRUCT_CONFIG_PATH)
# Embed every class name.
# NOTE(review): presumably returns one vector per input string — the next cell
# calls `.tolist()` on the result and stores it as a column of `class_df`;
# confirm against the model wrapper in utils.
embeddings = model.get_embeddings(class_name)

In [4]:
# Convert the model output to nested Python lists and store one vector per
# class row.
embeddings = embeddings.tolist()
class_df = class_df.assign(embeddings=embeddings)

In [5]:
# Persist only the id -> embedding mapping; names stay in the cleaned GPC CSV.
class_embedding_df = class_df.loc[:, ["id", "embeddings"]]
class_embedding_df.to_csv(CLASS_EMBEDDINGS_PATH, encoding="utf-8-sig", index=False)

# Process Similarity Scores

In [2]:
# Load the embedding model from the config at E5_LARGE_INSTRUCT_CONFIG_PATH;
# this section uses its `calculate_scores` method on the saved embeddings.
model = load_embedding_model(E5_LARGE_INSTRUCT_CONFIG_PATH)

In [3]:
# Read back the four artefacts written by the earlier sections:
# cleaned names and saved embeddings, for products and for classes.
product_df, class_df, product_embedding_df, class_embedding_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        CLEANED_TEST_DATA_PATH,
        CLEANED_GPC_PATH,
        PRODUCT_EMBEDDINGS_PATH,
        CLASS_EMBEDDINGS_PATH,
    )
)

In [4]:
# Join each embedding row with its name/metadata row on the shared `id`
# column (default inner join, embedding frame on the left).
product_full = pd.merge(product_embedding_df, product_df, on="id")
class_full = pd.merge(class_embedding_df, class_df, on="id")

In [None]:
def _embedding_tensor(frame: pd.DataFrame) -> torch.Tensor:
    """Decode the JSON-encoded `embeddings` column of `frame` into a float16 tensor on DEVICE."""
    # The embeddings were read back from CSV, so every cell is a JSON string.
    vectors = frame["embeddings"].map(json.loads).tolist()
    return torch.tensor(vectors, dtype=torch.float16, device=DEVICE)


product_embeddings = _embedding_tensor(product_full)
class_embeddings = _embedding_tensor(class_full)

In [6]:
# Score every product embedding against every class embedding in one call.
# The next cell takes argmax over dim=1 to pick a class per product, so the
# result is expected to have one row per product and one column per class.
# NOTE(review): the similarity metric is defined by the model wrapper — confirm in utils.
scores = model.calculate_scores(product_embeddings, class_embeddings)

In [7]:
# For each product (row), the column position of its highest-scoring class.
cls_idx = torch.argmax(scores, dim=1)
cls_idx

tensor([139449,  54224,  28424,  ..., 102924,  25847, 110683], device='cuda:0')

In [8]:
# The score matrix was computed from tensors built row by row from
# `product_full` and `class_full`, so entry i of `cls_idx` belongs to row i of
# `product_full`, and its value is a row position in `class_full`.
# Look names up by position instead of matching against the `id` column: ids
# are not guaranteed to equal row positions (gaps would misalign the pairs or
# raise IndexError), and filtering the whole frame once per row was quadratic.
product_id = product_full["id"].tolist()
product_name = product_full["Item_Name"].tolist()

class_id = class_full["id"].tolist()
class_name = class_full["class_name"].iloc[cls_idx.cpu().tolist()].tolist()

In [14]:
# Spot-check one row: the predicted class name next to the original product name.
idx = 5
(class_name[idx], product_name[idx])

('lock padlock', 'بسكو مصر لوكس 6 قطعه علبه 12')

In [None]:
model = load_embedding_model(E5_LARGE_INSTRUCT_CONFIG_PATH)

# The embeddings were read back from CSV, so each cell is a JSON string and
# must be decoded before scoring. Decode the class vectors once up front
# instead of once per (product, class) pair.
class_labels = class_full["class_name"].tolist()
class_matrix = torch.tensor(
    [json.loads(vector) for vector in class_full["embeddings"]],
    dtype=torch.float16,
    device=DEVICE,
)

# One record per (product, class) pair, product-major, same keys as before.
# NOTE(review): this is the full cross product and can be very large.
scores = []
for _, p in product_full.iterrows():
    product_vector = torch.tensor(
        [json.loads(p["embeddings"])], dtype=torch.float16, device=DEVICE
    )
    # One batched call per product instead of one call per pair.
    # NOTE(review): assumes calculate_scores returns a (1, n_classes) tensor
    # for a single product row, matching how the batched call earlier in this
    # notebook is reduced with argmax(dim=1) — confirm.
    row_scores = model.calculate_scores(product_vector, class_matrix)[0].tolist()
    scores.extend(
        {"Product_Name": p["cleaned_text"], "Class_Name": label, "Score": score}
        for label, score in zip(class_labels, row_scores)
    )

  return forward_call(*args, **kwargs)


In [None]:
# Write the long-format score table. Use the same "utf-8-sig" encoding as
# every other CSV written by this notebook: the product names contain Arabic
# text, and the BOM lets spreadsheet tools detect UTF-8.
pd.DataFrame(scores).to_csv(SIMILIRAITY_SCORES_PATH, index=False, encoding="utf-8-sig")