## Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import re
import xgboost as xgb
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.preprocessing import StandardScaler
from scipy.sparse import csr_matrix
import scipy
import re
import nltk
from nltk.corpus import stopwords




## Reading the DataSet

In [2]:
df = pd.read_excel("Product Matching Dataset.xlsx" , sheet_name="Dataset")  
df.head()

Unnamed: 0,sku,marketplace_product_name_ar,seller_item_name,price
0,1322,استوهالت 40 مجم 14 كبسول,ESTOHALT 40 MG 14 CAP,56.5
1,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14 ك,56.5
2,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14 ك,56.5
3,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14 ك,56.5
4,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14 ك,56.5


In [3]:
test_df = pd.read_csv('test.csv')
df.head()

Unnamed: 0,sku,marketplace_product_name_ar,seller_item_name,price
0,1322,استوهالت 40 مجم 14 كبسول,ESTOHALT 40 MG 14 CAP,56.5
1,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14 ك,56.5
2,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14 ك,56.5
3,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14 ك,56.5
4,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14 ك,56.5


In [4]:
df.shape

(83562, 4)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83562 entries, 0 to 83561
Data columns (total 4 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   sku                          83562 non-null  int64  
 1   marketplace_product_name_ar  83562 non-null  object 
 2   seller_item_name             83562 non-null  object 
 3   price                        83562 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 2.6+ MB


In [6]:
df.describe()

Unnamed: 0,sku,price
count,83562.0,83562.0
mean,1600.653204,79.055458
std,1469.206223,62.818117
min,4.0,7.0
25%,476.0,38.0
50%,1312.0,61.5
75%,2258.0,100.5
max,9532.0,406.0


In [7]:
df.isna().sum()

sku                            0
marketplace_product_name_ar    0
seller_item_name               0
price                          0
dtype: int64

In [8]:
df = df.sample(frac=1).reset_index(drop=True)

In [9]:
df.columns

Index(['sku', 'marketplace_product_name_ar', 'seller_item_name', 'price'], dtype='object')

In [10]:
train_df, validation_df = train_test_split(df, test_size=0.2, stratify=df['sku'], random_state=42)

train_df = train_df.reset_index(drop=True)
validation_df = validation_df.reset_index(drop=True)

print("Train DataFrame shape:", train_df.shape)
print("Test DataFrame shape:", validation_df.shape)
print("\nTrain DataFrame first rows:\n", train_df.head())


Train DataFrame shape: (66849, 4)
Test DataFrame shape: (16713, 4)

Train DataFrame first rows:
     sku marketplace_product_name_ar              seller_item_name  price
0  1312      موزابرايد 5 مجم 30 قرص            موزابريد5محم 30قرص   91.5
1  4743     ميكروسيرك 16 مجم 20 قرص        ميكروسيرك 16 مجم اقراص   29.0
2  2517  البافيت كالسيوم شراب 60 مل     البافيت 60مل شراب كالسيوم   26.0
3  2374     زيستريل 20 مجم 10 اقراص  زيستريل 20 مجم اقراص س جديد    52.0
4   639           اتور 10 مجم 7 قرص              اتور10 اقراص س ج   33.0


## Import train and test datasets

In [11]:
def arabic_word_preprocessing(text):
    nltk.download('stopwords')
    arabic_stopwords = set(stopwords.words('arabic'))
    X = []
    for i in range(len(text)):
        
        statment = re.sub(r'[إأآ]', 'ا', text[i])  
        statment = re.sub(r'ى', 'ي', statment)  
        statment = re.sub(r'ة', 'ه', statment)  
        statment = re.sub(r'ؤ', 'و', statment)  
        statment = re.sub(r'ئ', 'ي', statment)  
        statment = re.sub(r'ــ', '', statment)
        
        statment = re.sub(r'([\u0600-\u06FF])\1', r'\1', statment)
        
        statment = re.sub(r'(\d+)(?=\D)', r'\1 ', statment)
        statment = re.sub(r'(\D)(\d+)', r'\1 \2', statment)
        
        pattern = '[^\u0621-\u064A0-9a-zA-Z\s]' 
        statment = re.sub(pattern, ' ', statment)
        
        pattern = r"(?<!\d)(.)\1+(?!\d)"
        statment = re.sub(pattern, r"\1", statment)
        
        pattern = r'\b(مل|ملي|مم|جم|مج|مجم|عادي|عاده)\b'
        statment =  re.sub(pattern, '', statment)
        
        pattern = r'\b(اقراص\w*|قرص\w*|شري\w*|كبسول\w*|شرايط|افلام|فيلم|استحلاب|ك|ق)\b'
        statment =  re.sub(pattern, 'قرص', statment)
        
        pattern = r'\bقرص\s+([^0-9\s]+)\s*'
        statment = re.sub(pattern, 'قرص ', statment)        
    
        pattern = r'\bامبول\w*|حقن\w*\b'
        statment = re.sub(pattern, 'امبول', statment)
        
        pattern = r'\bامبول\s+([^0-9\s]+)\s*'
        statment = re.sub(pattern, 'امبول', statment)
        
        pattern = r'جل'
        statment = re.sub(pattern, 'جيل', statment)
        
        pattern = r'جيل.*غسول'
        statment = re.sub(pattern, 'غسول', statment)
        
        pattern = r'دش'
        statment = re.sub(pattern, 'غسول', statment)
        
        pattern100 = r'\bجيل\s+(.+)\b'
        statment = re.sub(pattern100, 'جيل', statment)
        
        pattern100 = r'\bغسول\s+([^0-9\s]+)\s*'
        statment = re.sub(pattern100, 'غسول', statment)
        
        pattern100 = r'\bمحلول\s+([^0-9\s]+)\s*'
        statment = re.sub(pattern100, 'محلول', statment)
    
        pattern = r'(\D)(قطر|قطره)\b'
        statment = re.sub(pattern, r'\1 نقط', statment)
        
        pattern = r'قطر\D+'
        statment = re.sub(pattern, r'نقط', statment)
        
        pattern = r'\b(قطره|قطر|نقط|نقطه|قطرهعين)\b'
        statment =  re.sub(pattern, 'نقط', statment)
        
        pattern = r'\bنقط\s+([^0-9\s]+)\s*'
        statment = re.sub(pattern, 'نقط', statment)
        
        pattern = r'\sفوار|اكياس|اكيااس'
        statment = re.sub(pattern, 'كيس', statment)
        
        pattern = r'\bكيس\s+([^0-9\s]+)\s*'
        statment = re.sub(pattern, 'كيس', statment)
        
        pattern = r'\b(سبراي|بخاخه)\b'
        statment =  re.sub(pattern, 'بخاخ', statment)
        
        pattern = r'\bبخاخ\s+([^0-9\s]+)\s*'
        statment = re.sub(pattern, 'بخاخ', statment)
        
        pattern = r'مرهم|اكريم'
        statment = re.sub(pattern, 'كريم', statment)
        
        pattern = r'\bكريم\s+([^0-9\s]+)\s*'
        statment = re.sub(pattern, 'كريم', statment)
        
        pattern = r'\b\w*استنشاق\w*\b'
        statment = re.sub(pattern, '', statment)
        
        pattern = r'استنشـاق'
        statment = re.sub(pattern, '', statment)
        
        pattern = r'شـراب+|شرب+|شراب+'
        statment = re.sub(pattern, ' شراب', statment)
        
        pattern = r'\bشراب\s+([^0-9\s]+)\s*'
        statment = re.sub(pattern, 'شراب', statment)
        
        pattern = r'\b[قديم]+\b' 
        statment = re.sub(pattern, '', statment)

        pattern = r'\b[جديد]+\b'
        statment = re.sub(pattern,"", statment)
        
        pattern = r'\b[ء-ي]\b'
        statment = re.sub(pattern, '', statment)
        
        pattern = r'\b[سعر]+\b'
        statment = re.sub(pattern, '', statment)
        
        pattern = r'\bسج|سق\b'
        statment = re.sub(pattern, '', statment)
        
        pattern = r'\b\w*سعر\w*\b'
        statment = re.sub(pattern, '', statment)
        
        pattern = r'\b\w*جدي\w*\b'
        statment = re.sub(pattern, '', statment)
        
        words = statment.split()
        filtered_words = [word for word in words if word not in arabic_stopwords]

        X.append(' '.join(filtered_words))
   
    
    return X

  pattern = '[^\u0621-\u064A0-9a-zA-Z\s]'


In [12]:
cv = CountVectorizer()
le = LabelEncoder()

In [13]:
def preparing_train_df(train_df):
    seller_name = train_df['seller_item_name'].apply(str)
    X_train_text = arabic_word_preprocessing(seller_name)
    X_train_text = cv.fit_transform(X_train_text).toarray()

    X_train_price = train_df[['price']].values  
    X_train_price = StandardScaler().fit_transform(X_train_price)  

    X_train_combined = np.hstack([X_train_text, X_train_price])  

    y_train = train_df['sku'].values
    y_train = le.fit_transform(y_train)

    return X_train_combined, y_train


In [14]:
def preparing_validation_df(df):
    seller_name = df['seller_item_name'].apply(str)
    X_test_text = arabic_word_preprocessing(seller_name)
    X_test_text = cv.transform(X_test_text).toarray()  

    X_test_price = df[['price']].values  
    X_test_price = StandardScaler().fit_transform(X_test_price)  

    X_test_combined = np.hstack([X_test_text, X_test_price])  

    y_test = df['sku'].values
    y_test = le.transform(y_test)

    return X_test_combined, y_test


In [15]:
def preparing_test_df(df):
    seller_name = df['seller_item_name'].apply(str)
    X_test_text = arabic_word_preprocessing(seller_name)
    X_test_text = cv.transform(X_test_text).toarray()  

    X_test_price = df[['price']].values  
    X_test_price = StandardScaler().fit_transform(X_test_price)  

    X_test_combined = np.hstack([X_test_text, X_test_price])  


    return X_test_combined


In [16]:
X_train, y_train = preparing_train_df(train_df)

[nltk_data] Downloading package stopwords to /Users/mac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
X_validat, y_validat  = preparing_validation_df(validation_df)

[nltk_data] Downloading package stopwords to /Users/mac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
X_test = preparing_test_df(test_df)

[nltk_data] Downloading package stopwords to /Users/mac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
X_train.shape[0]


66849

In [20]:
import tensorflow as tf
from tensorflow import keras

num_classes = len(set(y_train))  


# Define the model
model = keras.Sequential([
    keras.layers.Dense(512, activation='relu', kernel_regularizer=keras.regularizers.l2(0.0001), input_shape=(X_train.shape[1],)),
    keras.layers.BatchNormalization(),
    keras.layers.Dropout(0.3),

    keras.layers.Dense(256, activation='relu', kernel_regularizer=keras.regularizers.l2(0.0001)),
    keras.layers.BatchNormalization(),
    keras.layers.Dropout(0.25),

    keras.layers.Dense(128, activation='relu', kernel_regularizer=keras.regularizers.l2(0.0001)),
    keras.layers.BatchNormalization(),
    keras.layers.Dropout(0.2),

    keras.layers.Dense(num_classes, activation='softmax')  
])


optimizer = keras.optimizers.AdamW(learning_rate=0.0005, weight_decay=0.01)


model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])


early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=7, restore_best_weights=True)
reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6, verbose=1)


model.fit(X_train, y_train, epochs=100, batch_size=128, validation_data=(X_validat, y_validat), callbacks=[early_stopping, reduce_lr])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m523/523[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 23ms/step - accuracy: 0.5153 - loss: 3.8312 - val_accuracy: 0.9593 - val_loss: 0.9947 - learning_rate: 5.0000e-04
Epoch 2/100
[1m523/523[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 24ms/step - accuracy: 0.9680 - loss: 0.3380 - val_accuracy: 0.9774 - val_loss: 0.1634 - learning_rate: 5.0000e-04
Epoch 3/100
[1m523/523[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 25ms/step - accuracy: 0.9818 - loss: 0.1723 - val_accuracy: 0.9831 - val_loss: 0.1366 - learning_rate: 5.0000e-04
Epoch 4/100
[1m523/523[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 24ms/step - accuracy: 0.9870 - loss: 0.1272 - val_accuracy: 0.9861 - val_loss: 0.1245 - learning_rate: 5.0000e-04
Epoch 5/100
[1m523/523[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 23ms/step - accuracy: 0.9901 - loss: 0.1057 - val_accuracy: 0.9873 - val_loss: 0.1177 - learning_rate: 5.0000e-04
Epoch 6/100
[1m523/523[0m [32m━━

<keras.src.callbacks.history.History at 0x16b685700>

In [21]:
train_loss, train_acc = model.evaluate(X_train, y_train)
print(f"Train Accuracy: {train_acc:.4f}")

validation_loss, validation_acc = model.evaluate(X_validat, y_validat)
print(f"Test Accuracy: {validation_acc:.4f}")

[1m2090/2090[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.9978 - loss: 0.0137
Train Accuracy: 0.9977
[1m523/523[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9916 - loss: 0.0616
Test Accuracy: 0.9908


In [22]:
probabilities = model.predict(X_test)

confidence_scores = np.max(probabilities, axis=1)

predicted_indices = np.argmax(probabilities, axis=1)

predicted_labels = le.inverse_transform(predicted_indices)

confidence_threshold = 0.75

test_results = []

for i in range(len(X_test)):
    confidence = confidence_scores[i]
    predicted_class = predicted_labels[i]

    if confidence < confidence_threshold:
        predicted_class = "Unknown"

    test_results.append({
        'Predicted': predicted_class,
        'Confidence': f"{confidence:.2f}"
    })

temp_df = pd.DataFrame(test_results)

temp_df.head()


[1m2090/2090[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step


Unnamed: 0,Predicted,Confidence
0,1312,1.0
1,4743,1.0
2,2517,1.0
3,2374,1.0
4,639,1.0


In [23]:
predicted_df = test_df

In [24]:
predicted_df['Predicted'] = temp_df['Predicted'].values
predicted_df['Confidence'] = temp_df['Confidence'].values
predicted_df.head()

Unnamed: 0,seller_item_name,price,Predicted,Confidence
0,موزابرايد 5 مجم ق 3 ش/ويسترن,91.5,1312,1.0
1,ميكروسرك 16 اقراص,29.0,4743,1.0
2,البافيت كالسيوم شراب,26.0,2517,1.0
3,زيستريل 20 مجم اقراص,52.0,2374,1.0
4,اتور 10مجم اقراص,33.0,639,1.0


In [25]:
predicted_df.to_csv('predicted.csv', index=False, encoding='utf-8-sig')