# Import Libraries

In [1]:
import pandas as pd
from tqdm import tqdm

from utils import (
    clean_text, 
    remove_special_chars, 
    load_gpc_to_classes, 
    load_embedding_model,
    load_translation_model,
)
from constants import (
    TEST_DATA_PATH,
    TRAIN_VAL_DATA_PATH,
    CLEANED_TEST_DATA_PATH,
    CLEANED_TRAIN_DATA_PATH,
    E5_LARGE_INSTRUCT_CONFIG_PATH,
    CLEANED_GPC_PATH,
    PRODUCT_TEST_EMBEDDINGS_PATH,
    PRODUCT_TRAIN_EMBEDDINGS_PATH,
    CLASS_EMBEDDINGS_PATH,
    OPUS_TRANSLATION_CONFIG_PATH,
)

In [2]:
# Register tqdm's pandas integration so that `.progress_apply(...)` is available
# on Series; the translation and class-cleaning cells below rely on it.
tqdm.pandas()

# Prepare Cleaned Products File

## Load Data

In [3]:
# Only the product name is used by the rest of this notebook, so ask pandas to
# parse just that column instead of loading every column and discarding the rest.
# This also yields fresh, independent frames (not column-slices of a wider frame).
df_test = pd.read_csv(TEST_DATA_PATH, usecols=["Item_Name"])
df_train = pd.read_csv(TRAIN_VAL_DATA_PATH, usecols=["Item_Name"])

# Quick visual sanity check of the raw product names.
df_test.head(5)

Unnamed: 0,Item_Name
0,Americana Okra zero 400 gm
1,ليمون اداليا 500 جم
2,صلصه هاينز برطمان خصم عرض
3,Dasani water 330ML
4,بودرة عصير أناناس من سورس، 900 جم


## Remove Nulls

In [4]:
# Discard training rows that have no product name, then summarise what remains.
# The original (gappy) index is kept on purpose: it is exported later as `id`.
df_train = df_train.dropna(subset=["Item_Name"])
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 42944 entries, 0 to 42948
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Item_Name  42944 non-null  object
dtypes: object(1)
memory usage: 671.0+ KB


In [5]:
# Discard test rows that have no product name, then summarise what remains.
df_test = df_test.dropna(subset=["Item_Name"])
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4773 entries, 0 to 4772
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Item_Name  4773 non-null   object
dtypes: object(1)
memory usage: 37.4+ KB


## Translate Data

In [6]:
# Build the translation model from OPUS_TRANSLATION_CONFIG_PATH (loader lives in
# `utils`). Its `.translate` method is applied to every product name below.
translate_model = load_translation_model(OPUS_TRANSLATION_CONFIG_PATH)

In [7]:
# Translate each training product name one row at a time, with a tqdm progress bar.
# This is the slowest cell in the notebook: the recorded run took ~1h07m for 42,944 rows.
# NOTE(review): if `translate_model` can translate a batch of strings in one call,
# batching (or translating unique names once) would likely cut this a lot - confirm.
df_train["translated_text"] = df_train["Item_Name"].progress_apply(translate_model.translate)

100%|██████████| 42944/42944 [1:07:31<00:00, 10.60it/s]


In [7]:
# Translate each test product name one row at a time, with a tqdm progress bar.
# The recorded run took ~7 minutes for 4,773 rows.
df_test["translated_text"] = df_test["Item_Name"].progress_apply(translate_model.translate)

100%|██████████| 4773/4773 [07:18<00:00, 10.89it/s]


## Clean Data

In [None]:
# Normalise the translated training names with `clean_text` (element-wise).
# This cell must run before the train file is saved: the embedding section
# reads the `cleaned_text` column back from that file.
translated_train = df_train["translated_text"]
df_train["cleaned_text"] = translated_train.map(clean_text)
df_train.head(5)

In [8]:
# Normalise the translated test names with `clean_text` (element-wise).
translated_test = df_test["translated_text"]
df_test["cleaned_text"] = translated_test.map(clean_text)
df_test.head(5)

Unnamed: 0,Item_Name,translated_text,cleaned_text
0,Americana Okra zero 400 gm,Americana Okra zero 400 gm,americana okra zero gm
1,ليمون اداليا 500 جم,Lemon Adalia 500 gm,lemon adalia gm
2,صلصه هاينز برطمان خصم عرض,Heinz Bartman Sauce Discount Offer,heinz bartman sauce discount offer
3,Dasani water 330ML,Dasani water 330ML,dasani water ml
4,بودرة عصير أناناس من سورس، 900 جم,"Soros Pineapple Juice Powder, 900g",soros pineapple juice powder g


In [9]:
# Drop test products whose text became empty after cleaning.
# NOTE(review): the equivalent filter for df_train is disabled, so the saved
# train file can still contain empty `cleaned_text` rows - confirm this is intended.
# df_train = df_train[df_train["cleaned_text"].ne("")]
df_test = df_test[df_test["cleaned_text"].ne("")]

In [10]:
# Preserve each surviving row's pre-reset index as an explicit `id` column and
# renumber the frame's index from 0.
df_test = df_test.reset_index(drop=False).rename(columns={"index": "id"})

# NOTE(review): train is not re-indexed here; its `id` is taken from the index at save time.
# df_train = df_train.reset_index(drop=False).rename(columns={"index": "id"})

## Save Cleaned Data

In [11]:
# Remove test rows whose cleaned text is exactly the literal string "nan".
# NOTE(review): presumably these come from stringified missing values - confirm;
# the same filter is disabled for df_train.
is_nan_literal = df_test["cleaned_text"].eq("nan")
df_test = df_test[~is_nan_literal]
# df_train = df_train[~df_train["cleaned_text"].eq("nan")]

In [30]:
# Fail fast if the train cleaning cell was skipped: the embedding section reads
# `cleaned_text` back from this file, and the cleaning cell for df_train has no
# recorded execution in the saved notebook.
assert "cleaned_text" in df_train.columns, (
    "df_train has no 'cleaned_text' column; run the train translate/clean cells first"
)

# Export the original row index as `id` so each cleaned row can be traced back
# to its row in the raw train/val file (the index still has gaps from dropna).
df_train["id"] = df_train.index
df_train.to_csv(CLEANED_TRAIN_DATA_PATH, index=False, encoding="utf-8-sig")

In [12]:
# `id` was already populated from the pre-reset index when the index was reset
# (reset_index + rename "index" -> "id"), i.e. it identifies the row in the raw
# test file. Re-assigning it from the *current* index here would replace those
# values with post-filter positions, renumbering every row after a dropped one
# and making test ids inconsistent with the train file (whose `id` is the
# original row index). So the column is left untouched and only validated.
assert "id" in df_test.columns, (
    "df_test has no 'id' column; run the reset_index/rename cell before saving"
)
df_test.to_csv(CLEANED_TEST_DATA_PATH, index=False, encoding="utf-8-sig")

# Prepare Cleaned Classes File

In [33]:
# Load the GPC class table and strip special characters from its text columns.
gpc_df = load_gpc_to_classes()
for text_column in ("class_name", "class_name_cleaned", "description"):
    gpc_df[text_column] = gpc_df[text_column].progress_apply(remove_special_chars)

# Turn the existing index into an explicit `id` column so every class row has a
# stable key in the exported file.
gpc_df = gpc_df.reset_index(drop=False).rename(columns={"index": "id"})

gpc_df.to_csv(CLEANED_GPC_PATH, index=False, encoding="utf-8-sig")

100%|██████████| 185440/185440 [00:00<00:00, 584432.74it/s]
100%|██████████| 185440/185440 [00:00<00:00, 670479.48it/s]
100%|██████████| 185440/185440 [00:00<00:00, 278338.64it/s]


# Process Product Embeddings

In [14]:
# Reload the cleaned product files written earlier in this notebook.
#
# keep_default_na=False: the name lists below are fed to the embedding model and
# must contain only strings. With pandas' default NA parsing, an empty
# `cleaned_text` (train rows are never filtered for it) or a name that happens
# to equal a default NA token such as "null" is read back as float NaN.
product_test_df = pd.read_csv(CLEANED_TEST_DATA_PATH, keep_default_na=False)
product_train_df = pd.read_csv(CLEANED_TRAIN_DATA_PATH, keep_default_na=False)

# Parallel id / text lists for each split.
product_test_id = product_test_df["id"].tolist()
product_test_name = product_test_df["cleaned_text"].tolist()

product_train_id = product_train_df["id"].tolist()
product_train_name = product_train_df["cleaned_text"].tolist()

In [15]:
# Build the embedding model from E5_LARGE_INSTRUCT_CONFIG_PATH (loader lives in
# `utils`); its `get_embeddings` method is called on the product names below.
model = load_embedding_model(E5_LARGE_INSTRUCT_CONFIG_PATH)

In [16]:
# Embed the cleaned test product names in a single call.
# NOTE(review): train embeddings are currently disabled here and in the cells
# below, so PRODUCT_TRAIN_EMBEDDINGS_PATH is not written by this notebook - confirm.
# train_embeddings = model.get_embeddings(product_train_name)
test_embeddings = model.get_embeddings(product_test_name)

In [17]:
# Store one embedding (as a plain Python list) per product row.
# The conversion is done inline rather than rebinding `test_embeddings`, so this
# cell can be re-run safely: rebinding the name to a list made a second run fail
# with AttributeError, because a list has no `.tolist()`.
# product_train_df["embeddings"] = train_embeddings.tolist()
product_test_df["embeddings"] = test_embeddings.tolist()

In [18]:
# Export only the key and the vector for each test product.
# NOTE(review): the train export is disabled together with the train embedding cells.
embedding_columns = ["id", "embeddings"]

# product_train_embedding_df = product_train_df.loc[:, embedding_columns]
# product_train_embedding_df.to_csv(PRODUCT_TRAIN_EMBEDDINGS_PATH, index=False, encoding="utf-8-sig")

product_test_embedding_df = product_test_df.loc[:, embedding_columns]
product_test_embedding_df.to_csv(PRODUCT_TEST_EMBEDDINGS_PATH, index=False, encoding="utf-8-sig")

# Process Class Embeddings

In [39]:
# Reload the cleaned GPC class table written by the class-preparation cell.
class_df = pd.read_csv(CLEANED_GPC_PATH)

# Parallel id / text lists for the classes. Names are forced to `str` so the
# embedding model only ever receives strings.
# NOTE(review): a missing class name becomes the literal text "nan" here - confirm that is acceptable.
class_id = class_df["id"].to_list()
class_name = class_df["class_name_cleaned"].astype(str).to_list()

In [40]:
# Load the embedding model and embed every cleaned class name in a single call.
# NOTE(review): the same model is also loaded in the product-embedding section;
# when both sections run in one kernel session this second load is redundant.
model = load_embedding_model(E5_LARGE_INSTRUCT_CONFIG_PATH)
embeddings = model.get_embeddings(class_name)

In [41]:
# Store one embedding (as a plain Python list) per class row.
# The conversion is done inline rather than rebinding `embeddings`, so this cell
# can be re-run safely: rebinding the name to a list made a second run fail with
# AttributeError, because a list has no `.tolist()`.
class_df["embeddings"] = embeddings.tolist()

In [42]:
# Export only the key and the vector for each class.
class_embedding_df = class_df.loc[:, ["id", "embeddings"]]
class_embedding_df.to_csv(
    CLASS_EMBEDDINGS_PATH,
    index=False,
    encoding="utf-8-sig",
)