# Import Libraries

In [3]:
import pandas as pd

from utils import clean_text, remove_numbers, remove_punctuations, remove_strings, remove_stopwords
from evaluation import multi_column_embedding_model_evaluation
from constants import (
    TEST_DATA_PATH,
    CLEANED_TEST_DATA_PATH,
    E5_LARGE_INSTRUCT_CONFIG_PATH,
    TRAIN_VAL_DATA_PATH,
    CLASSES_TRANSLATION,
    CLEANED_TRAIN_DATA_PATH,
)

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\os255022\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Prepare Data

## Load Data

In [4]:
df_train = pd.read_csv(".." / TRAIN_VAL_DATA_PATH)
df_test = pd.read_csv(".." / TEST_DATA_PATH)
df_train.head(5)

Unnamed: 0,Item_Name,class,Brand,Weight,Number of units,Size of units,Price,T.Price,Pack,Unit
0,مونتانا ذره 400 ج,Vegetables & Fruits,مونتانا,400جم,1,,,,كيس,جم
1,Ahmad Tea Fruit And Herb Selection Herbal Teab...,"Tea, Coffee & Hot Drinks",Ahmad Tea,,20,,,,علبة,
2,Lulu Brown Samoon 1pkt,Bakery,Lulu,,1,,,,عبوة,
3,فلفل رومى بلدى حشو وزن,Vegetables & Fruits,,,1,,,,كيس,
4,كانز,Soft Drinks & Juices,,,1,,,,كانز,


In [5]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42949 entries, 0 to 42948
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Item_Name        42944 non-null  object
 1   class            42938 non-null  object
 2   Brand            34485 non-null  object
 3   Weight           27267 non-null  object
 4   Number of units  42929 non-null  object
 5   Size of units    419 non-null    object
 6   Price            2398 non-null   object
 7   T.Price          2391 non-null   object
 8   Pack             41563 non-null  object
 9   Unit             27641 non-null  object
dtypes: object(10)
memory usage: 3.3+ MB


## Remove Nulls

In [6]:
df_train.dropna(subset=["Item_Name", "class"], inplace=True)
df_test.dropna(subset=["Item_Name", "class"], inplace=True)
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 42935 entries, 0 to 42948
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Item_Name        42935 non-null  object
 1   class            42935 non-null  object
 2   Brand            34483 non-null  object
 3   Weight           27260 non-null  object
 4   Number of units  42917 non-null  object
 5   Size of units    419 non-null    object
 6   Price            2397 non-null   object
 7   T.Price          2390 non-null   object
 8   Pack             41550 non-null  object
 9   Unit             27634 non-null  object
dtypes: object(10)
memory usage: 3.6+ MB


## Normalize Data

In [7]:
df_train["Item_Name"] = df_train["Item_Name"].str.lower()
df_train["class"] = df_train["class"].str.lower()
df_train["Unit"] = df_train["Unit"].str.lower()
df_train["Weight"] = df_train["Weight"].str.lower()
df_train["Brand"] = df_train["Brand"].str.lower()
df_test["Item_Name"] = df_test["Item_Name"].str.lower()
df_test["class"] = df_test["class"].str.lower()
df_test["Unit"] = df_test["Unit"].str.lower()
df_test["Weight"] = df_test["Weight"].str.lower()
df_test["Brand"] = df_test["Brand"].str.lower()
df_train.head()

Unnamed: 0,Item_Name,class,Brand,Weight,Number of units,Size of units,Price,T.Price,Pack,Unit
0,مونتانا ذره 400 ج,vegetables & fruits,مونتانا,400جم,1,,,,كيس,جم
1,ahmad tea fruit and herb selection herbal teab...,"tea, coffee & hot drinks",ahmad tea,,20,,,,علبة,
2,lulu brown samoon 1pkt,bakery,lulu,,1,,,,عبوة,
3,فلفل رومى بلدى حشو وزن,vegetables & fruits,,,1,,,,كيس,
4,كانز,soft drinks & juices,,,1,,,,كانز,


## Clean Data

In [8]:
# df_train["removed_punctuations"] = df_train["Item_Name"].apply(remove_punctuations)
# df_train["removed_numbers"] = df_train["Item_Name"].apply(lambda x: remove_numbers(x, remove_string=False))
# df_train["removed_brand"] = df_train.apply(lambda x: remove_strings(x.Item_Name, [x.Brand]), axis=1)
# df_train["removed_pack"] = df_train.apply(lambda x: remove_strings(x.Item_Name, [x.Pack]), axis=1)
# df_train["removed_unit"] = df_train.apply(lambda x: remove_strings(x.Item_Name, [x.Unit]), axis=1)
# df_train["removed_numbers_and_punctuations"] = df_train["Item_Name"].apply(lambda x: remove_numbers(remove_punctuations(x), remove_string=False))
# df_train["removed_stopwords"] = df_train["Item_Name"].apply(remove_stopwords)
df_train["class"] = df_train["class"].apply(remove_punctuations)
df_train["cleaned_text"] = df_train.apply(clean_text, axis=1)
df_test["class"] = df_test["class"].apply(remove_punctuations)
df_test["cleaned_text"] = df_test.apply(clean_text, axis=1)
df_train.head()

Unnamed: 0,Item_Name,class,Brand,Weight,Number of units,Size of units,Price,T.Price,Pack,Unit,cleaned_text
0,مونتانا ذره 400 ج,vegetables fruits,مونتانا,400جم,1,,,,كيس,جم,ذره
1,ahmad tea fruit and herb selection herbal teab...,tea coffee hot drinks,ahmad tea,,20,,,,علبة,,fruit herb selection herbal teabags pieces
2,lulu brown samoon 1pkt,bakery,lulu,,1,,,,عبوة,,brown samoon pkt
3,فلفل رومى بلدى حشو وزن,vegetables fruits,,,1,,,,كيس,,فلفل رومى بلدى حشو وزن
4,كانز,soft drinks juices,,,1,,,,كانز,,كانز


In [10]:
df_train[df_train["cleaned_text"]==""]
df_test[df_test["cleaned_text"]==""]

Unnamed: 0,Item_Name,class,Brand,Weight,Number of units,Size of units,Price,T.Price,Pack,Unit,cleaned_text


In [11]:
df_train = df_train[~(df_train["cleaned_text"]=="")]
df_test = df_test[~(df_test["cleaned_text"]=="")]

## Create Label

In [22]:
from sklearn.preprocessing import OrdinalEncoder

encoder = OrdinalEncoder()

all_classes = pd.concat([df_train['class'], df_test['class']]).to_frame()

encoder.fit(all_classes)

df_train['label'] = encoder.transform(df_train[['class']])
df_test['label'] = encoder.transform(df_test[['class']])

In [10]:
# df_train["ara_class"] = df_train["class"].apply(lambda x: CLASSES_TRANSLATION[x])
# # df_train["label"] = df_train.apply(lambda x: x["class"] + ", " + x["ara_class"], axis=1)
# df_train["label"] = df_train.apply(lambda x: x["class"], axis=1)
# df_train

# Evaluate Model

In [10]:
n_sample = None

In [None]:

multi_column_embedding_model_evaluation(
    df_train,
    ".." / E5_LARGE_INSTRUCT_CONFIG_PATH,
    [
        # "Item_Name",
        # "removed_stopwords",
        "cleaned_text",
    ],
    n_sample
)

You are evaluating: intfloat/multilingual-e5-large-instruct
Average time taken for a single example: 0.04089269289326888 seconds
Number of examples: 4772
The score of using cleaned_text: 0.4031708132364148


In [None]:
multi_column_embedding_model_evaluation(
    df_train,
    ".." / QWEN3_EMBEDDING_CONFIG_PATH,
    [
        # "Item_Name",
        # "removed_punctuations",
        # "removed_numbers",
        # "removed_numbers_and_punctuations",
        # "removed_brand",
        # "removed_pack",
        # "removed_unit",
        # "removed_stopwords",
        "cleaned_text"
    ],
    n_sample
)

Average time taken for a single example: 0.1553166389465332 seconds
Number of examples: 5
The score of using cleaned_text: 0.4


In [None]:

multi_column_embedding_model_evaluation(
    df_train,
    ".." / PARAPHRASER_EMBEDDING_CONFIG_PATH,
    [
        # "Item_Name", 
        # "removed_punctuations", 
        # "removed_numbers", 
        # "removed_numbers_and_punctuations", 
        # "removed_brand", 
        # "removed_pack", 
        # "removed_unit", 
        "cleaned_text"
    ], 
    n_sample
)

Average time taken for a single example: 0.021748514856896477 seconds
Number of examples: 4772
The score of using cleaned_text: 0.3265305324526368


# Save Cleaned Data

In [26]:
df_train.head()

Unnamed: 0,Item_Name,class,Brand,Weight,Number of units,Size of units,Price,T.Price,Pack,Unit,cleaned_text,label
0,مونتانا ذره 400 ج,vegetables fruits,مونتانا,400جم,1,,,,كيس,جم,ذره,40.0
1,ahmad tea fruit and herb selection herbal teab...,tea coffee hot drinks,ahmad tea,,20,,,,علبة,,fruit herb selection herbal teabags pieces,38.0
2,lulu brown samoon 1pkt,bakery,lulu,,1,,,,عبوة,,brown samoon pkt,1.0
3,فلفل رومى بلدى حشو وزن,vegetables fruits,,,1,,,,كيس,,فلفل رومى بلدى حشو وزن,40.0
4,كانز,soft drinks juices,,,1,,,,كانز,,كانز,33.0


In [27]:
df_train.drop_duplicates(subset=["Item_Name"], inplace=True)
df_train.drop(columns=["Item_Name", "class", "Brand", "Weight", "Number of units", "Size of units", "Price", "T.Price", "Pack", "Unit"], inplace=True)
df_train.dropna(subset=["cleaned_text"])

df_test.drop_duplicates(subset=["Item_Name"], inplace=True)
df_test.drop(columns=["Item_Name", "class", "Brand", "Weight", "Number of units", "Size of units", "Price", "T.Price", "Pack", "Unit"], inplace=True)
df_test.dropna(subset=["cleaned_text"])

df_train

Unnamed: 0,cleaned_text,label
0,ذره,40.0
1,fruit herb selection herbal teabags pieces,38.0
2,brown samoon pkt,1.0
3,فلفل رومى بلدى حشو وزن,40.0
4,كانز,33.0
...,...,...
42939,tomato paste g,39.0
42941,فول مدمس معلب مقشور بالخلطة المصرية جم,39.0
42943,كباب عائلى,7.0
42945,لحم وجه فخذ بقر محلي,3.0


In [28]:
df_train = df_train[df_train["cleaned_text"]!="nan"]
df_test = df_test[df_test["cleaned_text"]!="nan"]

In [29]:
df_train.to_csv(".." / CLEANED_TRAIN_DATA_PATH, index=False, encoding="utf-8-sig")
df_test.to_csv(".." / CLEANED_TEST_DATA_PATH, index=False, encoding="utf-8-sig")