## Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import re
import xgboost as xgb
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.preprocessing import StandardScaler
from scipy.sparse import csr_matrix
import scipy
import re
import nltk
from nltk.corpus import stopwords




## Reading the DataSet

#### Input the train DataSet that contains at least : seller_item_name , price and sku columns

In [2]:
# replace the filename with the name of the file you want to train
df = pd.read_excel("Product Matching Dataset.xlsx" , sheet_name="Dataset")  
df.head()

Unnamed: 0,sku,marketplace_product_name_ar,seller_item_name,price
0,1322,استوهالت 40 مجم 14 كبسول,ESTOHALT 40 MG 14 CAP,56.5
1,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14 ك,56.5
2,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14 ك,56.5
3,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14 ك,56.5
4,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14 ك,56.5


#### Input the test DataSet that contains at least : seller_item_name column & price 

In [3]:
# replace the filename with the name of the file you want to test
test_df = pd.read_csv('test.csv')
df.head()

Unnamed: 0,sku,marketplace_product_name_ar,seller_item_name,price
0,1322,استوهالت 40 مجم 14 كبسول,ESTOHALT 40 MG 14 CAP,56.5
1,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14 ك,56.5
2,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14 ك,56.5
3,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14 ك,56.5
4,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14 ك,56.5


In [4]:
df.shape

(83562, 4)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83562 entries, 0 to 83561
Data columns (total 4 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   sku                          83562 non-null  int64  
 1   marketplace_product_name_ar  83562 non-null  object 
 2   seller_item_name             83562 non-null  object 
 3   price                        83562 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 2.6+ MB


In [6]:
df.describe()

Unnamed: 0,sku,price
count,83562.0,83562.0
mean,1600.653204,79.055458
std,1469.206223,62.818117
min,4.0,7.0
25%,476.0,38.0
50%,1312.0,61.5
75%,2258.0,100.5
max,9532.0,406.0


In [7]:
df.isna().sum()

sku                            0
marketplace_product_name_ar    0
seller_item_name               0
price                          0
dtype: int64

#### shuffling the Taining Dataset to reduce overfitting

In [8]:
df = df.sample(frac=1).reset_index(drop=True)

In [9]:
df.columns

Index(['sku', 'marketplace_product_name_ar', 'seller_item_name', 'price'], dtype='object')

In [10]:
train_df, validation_df = train_test_split(df, test_size=0.2, stratify=df['sku'], random_state=42)

train_df = train_df.reset_index(drop=True)
validation_df = validation_df.reset_index(drop=True)

print("Train DataFrame shape:", train_df.shape)
print("Test DataFrame shape:", validation_df.shape)
print("\nTrain DataFrame first rows:\n", train_df.head())


Train DataFrame shape: (66849, 4)
Test DataFrame shape: (16713, 4)

Train DataFrame first rows:
     sku marketplace_product_name_ar        seller_item_name  price
0  1312      موزابرايد 5 مجم 30 قرص  موزابرايد 5 مجم 30 قرص   91.5
1  4743     ميكروسيرك 16 مجم 20 قرص   ميكروسيرك 16مجم اقراص   29.0
2  2517  البافيت كالسيوم شراب 60 مل    البافيت كالسيوم شراب   26.0
3  2374     زيستريل 20 مجم 10 اقراص     زيستريل 20مجم اقراص   52.0
4   639           اتور 10 مجم 7 قرص    اتور 10مجم جدييييييد   33.0


### preprocessing text Data 

In [11]:
def word_preprocessing(text):
    nltk.download('stopwords')
    arabic_stopwords = set(stopwords.words('arabic'))
    english_stopwords = set(stopwords.words('english'))
    X = []
    for i in range(len(text)):
        statement = text[i].lower()
        
        statement = re.sub(r'[إأآ]', 'ا', statement)  
        statement = re.sub(r'ى', 'ي', statement)  
        statement = re.sub(r'ة', 'ه', statement)  
        statement = re.sub(r'ؤ', 'و', statement)  
        statement = re.sub(r'ئ', 'ي', statement)  
        statement = re.sub(r'ــ', '', statement)
        
        statement = re.sub(r'([\u0600-\u06FF])\1', r'\1', statement)
        
        statement = re.sub(r'(\d+)(?=\D)', r'\1 ', statement)
        statement = re.sub(r'(\D)(\d+)', r'\1 \2', statement)
        
        pattern = '[^\u0621-\u064A0-9a-zA-Z\s]' 
        statement = re.sub(pattern, ' ', statement)
        
        pattern = r"(?<!\d)(.)\1+(?!\d)"
        statement = re.sub(pattern, r"\1", statement)
        
        pattern = r'\b(مل|ملي|مم|جم|مج|مجم|عادي|عاده)\b'
        statement =  re.sub(pattern, '', statement)
        
        
        
        
        pattern = r'\b(اقراص\w*|قرص\w*|شري\w*|كبسول\w*|شرايط|افلام|فيلم|استحلاب|ك|ق)\b'
        statement =  re.sub(pattern, 'قرص', statement)
        
        pattern = r'\bقرص\s+([^0-9\s]+)\s*'
        statement = re.sub(pattern, 'قرص ', statement)        
    
        pattern = r'\bامبول\w*|حقن\w*\b'
        statement = re.sub(pattern, 'امبول', statement)
        
        pattern = r'\bامبول\s+([^0-9\s]+)\s*'
        statement = re.sub(pattern, 'امبول', statement)
        
        pattern = r'جل'
        statement = re.sub(pattern, 'جيل', statement)
        
        pattern = r'\bجيل\s+([^0-9\s]+)\s*'
        statement = re.sub(pattern, 'جيل', statement)
        
        pattern = r'جيل.*غسول'
        statement = re.sub(pattern, 'غسول', statement)
        
        pattern = r'\bغسول\s+([^0-9\s]+)\s*'
        statement = re.sub(pattern, 'غسول', statement)
        
        pattern = r'\bمحلول\s+([^0-9\s]+)\s*'
        statement = re.sub(pattern, 'محلول', statement)
        
        pattern = r'\bلبوس\s+([^0-9\s]+)\s*'
        statement = re.sub(pattern, 'لبوس', statement)
        
        
        pattern = r'قطر\D+'
        statement = re.sub(pattern, r'نقط', statement)
        
        pattern = r'\b(قطره|قطر|نقط|نقطه|قطرهعين)\b'
        statement =  re.sub(pattern, 'نقط', statement)
        
        pattern = r'\bنقط\s+([^0-9\s]+)\s*'
        statement = re.sub(pattern, 'نقط', statement)
        
        pattern = r'\sاكياس|اكيااس'
        statement = re.sub(pattern, 'كيس', statement)
        
        pattern = r'\bكيس\s+([^0-9\s]+)\s*'
        statement = re.sub(pattern, 'كيس', statement)
        
        pattern = r'\b(سبراي|بخاخه)\b'
        statement =  re.sub(pattern, 'بخاخ', statement)
        
        pattern = r'\bبخاخ\s+([^0-9\s]+)\s*'
        statement = re.sub(pattern, 'بخاخ', statement)
        
        pattern = r'مرهم|اكريم'
        statement = re.sub(pattern, 'كريم', statement)
        
        pattern = r'\bكريم\s+([^0-9\s]+)\s*'
        statement = re.sub(pattern, 'كريم', statement)
        
        pattern = r'\b\w*استنشاق\w*\b'
        statement = re.sub(pattern, '', statement)
        
        pattern = r'استنشـاق'
        statement = re.sub(pattern, '', statement)
        
        pattern = r'شـراب+|شرب+|شراب+'
        statement = re.sub(pattern, ' شراب', statement)
        
        pattern = r'\bشراب\s+([^0-9\s]+)\s*'
        statement = re.sub(pattern, 'شراب', statement)
        
        
        
        
        
        
        
        pattern = r'\b[قديم]+\b' 
        statement = re.sub(pattern, '', statement)

        pattern = r'\b[جديد]+\b'
        statement = re.sub(pattern,"", statement)
        
        pattern = r'\b[ء-ي]\b'
        statement = re.sub(pattern, '', statement)
        
        pattern = r'\b[سعر]+\b'
        statement = re.sub(pattern, '', statement)
        
        pattern = r'\bسج|سق\b'
        statement = re.sub(pattern, '', statement)
        
        pattern = r'\b\w*سعر\w*\b'
        statement = re.sub(pattern, '', statement)
        
        pattern = r'\b\w*جدي\w*\b'
        statement = re.sub(pattern, '', statement)
        
        statement = re.sub('mg', '', statement)
        
        words = statement.split()
        filtered_words = [word for word in words if word not in arabic_stopwords and word not in english_stopwords]

        X.append(' '.join(filtered_words))
   
    
    return X

  pattern = '[^\u0621-\u064A0-9a-zA-Z\s]'


In [12]:
cv = CountVectorizer()
le = LabelEncoder()

In [13]:
def preparing_train_df(train_df):
    seller_name = train_df['seller_item_name'].apply(str)
    X_train_text = word_preprocessing(seller_name)
    X_train_text = cv.fit_transform(X_train_text).toarray()

    X_train_price = train_df[['price']].values  
    X_train_price = StandardScaler().fit_transform(X_train_price)  

    X_train_combined = np.hstack([X_train_text, X_train_price])  

    y_train = train_df['sku'].values
    y_train = le.fit_transform(y_train)

    return X_train_combined, y_train


In [14]:
def preparing_validation_df(df):
    seller_name = df['seller_item_name'].apply(str)
    X_test_text = word_preprocessing(seller_name)
    X_test_text = cv.transform(X_test_text).toarray()  

    X_test_price = df[['price']].values  
    X_test_price = StandardScaler().fit_transform(X_test_price)  

    X_test_combined = np.hstack([X_test_text, X_test_price])  

    y_test = df['sku'].values
    y_test = le.transform(y_test)

    return X_test_combined, y_test


In [15]:
def preparing_test_df(df):
    seller_name = df['seller_item_name'].apply(str)
    X_test_text = word_preprocessing(seller_name)
    X_test_text = cv.transform(X_test_text).toarray()  

    X_test_price = df[['price']].values  
    X_test_price = StandardScaler().fit_transform(X_test_price)  

    X_test_combined = np.hstack([X_test_text, X_test_price])  


    return X_test_combined


In [16]:
X_train, y_train = preparing_train_df(train_df)

[nltk_data] Downloading package stopwords to /Users/mac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
X_validat, y_validat  = preparing_validation_df(validation_df)

[nltk_data] Downloading package stopwords to /Users/mac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
X_test = preparing_test_df(test_df)

[nltk_data] Downloading package stopwords to /Users/mac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
X_train.shape[0]


66849

In [20]:
import tensorflow as tf
from tensorflow import keras

num_classes = len(set(y_train))  


# Define the model
model = keras.Sequential([
    keras.layers.Dense(512, activation='relu', kernel_regularizer=keras.regularizers.l2(0.0001), input_shape=(X_train.shape[1],)),
    keras.layers.BatchNormalization(),
    keras.layers.Dropout(0.3),

    keras.layers.Dense(256, activation='relu', kernel_regularizer=keras.regularizers.l2(0.0001)),
    keras.layers.BatchNormalization(),
    keras.layers.Dropout(0.25),

    keras.layers.Dense(128, activation='relu', kernel_regularizer=keras.regularizers.l2(0.0001)),
    keras.layers.BatchNormalization(),
    keras.layers.Dropout(0.2),

    keras.layers.Dense(num_classes, activation='softmax')  
])


optimizer = keras.optimizers.AdamW(learning_rate=0.0005, weight_decay=0.01)


model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])


early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=7, restore_best_weights=True)
reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6, verbose=1)


model.fit(X_train, y_train, epochs=100, batch_size=128, validation_data=(X_validat, y_validat), callbacks=[early_stopping, reduce_lr])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m523/523[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 23ms/step - accuracy: 0.5197 - loss: 3.8148 - val_accuracy: 0.9587 - val_loss: 1.0131 - learning_rate: 5.0000e-04
Epoch 2/100
[1m523/523[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 23ms/step - accuracy: 0.9682 - loss: 0.3400 - val_accuracy: 0.9773 - val_loss: 0.1610 - learning_rate: 5.0000e-04
Epoch 3/100
[1m523/523[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 27ms/step - accuracy: 0.9820 - loss: 0.1713 - val_accuracy: 0.9817 - val_loss: 0.1360 - learning_rate: 5.0000e-04
Epoch 4/100
[1m523/523[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 26ms/step - accuracy: 0.9868 - loss: 0.1277 - val_accuracy: 0.9837 - val_loss: 0.1310 - learning_rate: 5.0000e-04
Epoch 5/100
[1m523/523[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 25ms/step - accuracy: 0.9888 - loss: 0.1086 - val_accuracy: 0.9846 - val_loss: 0.1186 - learning_rate: 5.0000e-04
Epoch 6/100
[1m523/523[0m [32m━━

<keras.src.callbacks.history.History at 0x3014d1400>

In [21]:
train_loss, train_acc = model.evaluate(X_train, y_train)
print(f"Train Accuracy: {train_acc:.4f}")

validation_loss, validation_acc = model.evaluate(X_validat, y_validat)
print(f"validate Accuracy: {validation_acc:.4f}")

[1m2090/2090[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.9981 - loss: 0.0117
Train Accuracy: 0.9979
[1m523/523[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.9921 - loss: 0.0515
validate Accuracy: 0.9918


In [22]:
probabilities = model.predict(X_test)

confidence_scores = np.max(probabilities, axis=1)

predicted_indices = np.argmax(probabilities, axis=1)

predicted_labels = le.inverse_transform(predicted_indices)

confidence_threshold = 0.85

test_results = []

for i in range(len(X_test)):
    confidence = confidence_scores[i]
    predicted_class = predicted_labels[i]

    if confidence < confidence_threshold:
        predicted_class = "Unknown"

    test_results.append({
        'Predicted': predicted_class,
        'Confidence': f"{confidence:.2f}"
    })

temp_df = pd.DataFrame(test_results)

temp_df.head()


[1m2090/2090[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step


Unnamed: 0,Predicted,Confidence
0,1312,1.0
1,4743,1.0
2,2517,1.0
3,2374,1.0
4,639,1.0


In [23]:
predicted_df = test_df

In [24]:
predicted_df['Predicted'] = temp_df['Predicted'].values
predicted_df['Confidence'] = temp_df['Confidence'].values
predicted_df.head()

Unnamed: 0,seller_item_name,price,Predicted,Confidence
0,موزابرايد 5 مجم ق 3 ش/ويسترن,91.5,1312,1.0
1,ميكروسرك 16 اقراص,29.0,4743,1.0
2,البافيت كالسيوم شراب,26.0,2517,1.0
3,زيستريل 20 مجم اقراص,52.0,2374,1.0
4,اتور 10مجم اقراص,33.0,639,1.0


In [25]:
predicted_df.to_csv('predicted.csv', index=False, encoding='utf-8-sig')