# **Arabic Classification Model**

## Install & Import Necessary Libraries

In [1]:
!pip install gensim tensorflow numpy pandas scikit-learn matplotlib seaborn

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m44.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.6/38.6 MB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scipy, gensim
  Attempting uninstall: scipy
    Found existing installation: scipy 1.14.1
    Uninstalling scipy-1.14.1:
      Successfully 

In [2]:
import gensim
import numpy as np
import pandas as pd
import tensorflow as tf
import re
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense, Bidirectional
from tensorflow.keras.optimizers import Adam

## Load & Explore Dataset

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
df = pd.read_csv('/content/drive/MyDrive/a1/arabic_reviews.csv')

In [7]:
# Show some rows to ensure dataset is correctly loaded
print(" The Sample Data:")
print(df.head())

 The Sample Data:
      label                                               text
0  Positive  ممتاز نوعا ما . النظافة والموقع والتجهيز والشا...
1  Positive  أحد أسباب نجاح الإمارات أن كل شخص في هذه الدول...
2  Positive  هادفة .. وقوية. تنقلك من صخب شوارع القاهرة الى...
3  Positive  خلصنا .. مبدئيا اللي مستني ابهار زي الفيل الاز...
4  Positive  ياسات جلوريا جزء لا يتجزأ من دبي . فندق متكامل...


In [10]:
# Check class balance, eqaul examples from all classes/labels
print("The Class Distribution:")
print(df['label'].value_counts())

The Class Distribution:
label
Positive    33333
Mixed       33333
Negative    33333
Name: count, dtype: int64


## Arabic Text Preprocessing

1. Remove punctuation & non-Arabic characters
2.   Normalize Arabic letters (e.g., convert "أ" → "ا")
3.   Remove stopwords (common words that don’t add meaning)
4.   Tokenization (split into words)

In [12]:
!pip install arabic-reshaper

Collecting arabic-reshaper
  Downloading arabic_reshaper-3.0.0-py3-none-any.whl.metadata (12 kB)
Downloading arabic_reshaper-3.0.0-py3-none-any.whl (20 kB)
Installing collected packages: arabic-reshaper
Successfully installed arabic-reshaper-3.0.0


In [14]:
!pip install tashaphyne

Collecting tashaphyne
  Downloading Tashaphyne-0.3.6-py3-none-any.whl.metadata (18 kB)
Collecting pyarabic (from tashaphyne)
  Downloading PyArabic-0.6.15-py3-none-any.whl.metadata (10 kB)
Downloading Tashaphyne-0.3.6-py3-none-any.whl (251 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.5/251.5 kB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading PyArabic-0.6.15-py3-none-any.whl (126 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.4/126.4 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyarabic, tashaphyne
Successfully installed pyarabic-0.6.15 tashaphyne-0.3.6


In [15]:
import re
import arabic_reshaper
from tashaphyne.stemming import ArabicLightStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [17]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [27]:
import nltk
nltk.download('punkt_tab') # Download the missing resource

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [28]:
# Arabic stopwords list
stop_words = set(stopwords.words("arabic"))

def clean_text(text):
    # Normalize Arabic letters
    text = re.sub(r"[إأآا]", "ا", text)
    text = re.sub(r"ى", "ي", text)
    text = re.sub(r"ؤ", "و", text)
    text = re.sub(r"ئ", "ي", text)
    text = re.sub(r"ة", "ه", text)

    # Remove punctuation and numbers
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r"\d+", "", text)

    # Tokenize and remove stopwords
    words = word_tokenize(text)
    words = [word for word in words if word not in stop_words]

    return " ".join(words)

# Apply cleaning
df['clean_text'] = df['text'].apply(clean_text)

# Show cleaned samples
print(df[['text', 'clean_text']].head())

                                                text  \
0  ممتاز نوعا ما . النظافة والموقع والتجهيز والشا...   
1  أحد أسباب نجاح الإمارات أن كل شخص في هذه الدول...   
2  هادفة .. وقوية. تنقلك من صخب شوارع القاهرة الى...   
3  خلصنا .. مبدئيا اللي مستني ابهار زي الفيل الاز...   
4  ياسات جلوريا جزء لا يتجزأ من دبي . فندق متكامل...   

                                          clean_text  
0  ممتاز نوعا النظافه والموقع والتجهيز والشاطيء ا...  
1  احد اسباب نجاح الامارات ان شخص الدوله يعشق ترا...  
2  هادفه وقويه تنقلك صخب شوارع القاهره الي هدوء ج...  
3  خلصنا مبدييا اللي مستني ابهار زي الفيل الازرق ...  
4  ياسات جلوريا جزء يتجزا دبي فندق متكامل الخدمات...  


In [36]:
from gensim.models import KeyedVectors

# Update the path to match where you saved AraVec
aravec_path = r"C:\Users\PC\Documents\full_grams_cbow_300_twitter.zip"

# Load the model
aravec = KeyedVectors.load_word2vec_format(aravec_path, binary=True)

print("✅ AraVec model loaded successfully!")

NotImplementedError: Unable to handle scheme 'c', expected one of ('', 'file', 'ftp', 'ftps', 'gs', 'hdfs', 'http', 'https', 'viewfs', 'webhdfs'). Extra dependencies required by 'c' may be missing. See <https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst> for details.

In [29]:
from gensim.models import KeyedVectors

# Load AraVec (Ensure correct path)
aravec_path = "your_aravec_model.bin"  # Update with actual path
aravec = KeyedVectors.load_word2vec_format(aravec_path, binary=True)

# Tokenize words and prepare embedding matrix
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

FileNotFoundError: [Errno 2] No such file or directory: 'your_aravec_model.bin'

In [None]:

# Tokenize dataset
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['clean_text'])
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1  # +1 for padding

# Create embedding matrix
embedding_dim = 300  # AraVec uses 300-dimensional embeddings
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in word_index.items():
    if word in aravec:
        embedding_matrix[i] = aravec[word]

print(f"✅ Loaded AraVec embeddings for {len(embedding_matrix)} words.")

## Train-Test Split (90% Train, 10% Test)

In [None]:
from sklearn.model_selection import train_test_split
import tensorflow as tf

# Encode labels (Negative = 0, Mixed = 1, Positive = 2)
label_mapping = {'Negative': 0, 'Mixed': 1, 'Positive': 2}
df['label_id'] = df['label'].map(label_mapping)

In [None]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_text'], df['label_id'], test_size=0.1, random_state=42, stratify=df['label_id']
)

In [None]:
# Convert text to sequences
max_length = 100  # Adjust as needed
X_train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=max_length, padding='post')
X_test_seq = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=max_length, padding='post')

# Convert labels to categorical (for softmax)
y_train = tf.keras.utils.to_categorical(y_train, num_classes=3)
y_test = tf.keras.utils.to_categorical(y_test, num_classes=3)

print("✅ Data successfully prepared for training.")

## Build & Train the Bidirectional LSTM Model

In [None]:
# Define model
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=False),
    Bidirectional(LSTM(128, return_sequences=True, activation='tanh')),
    Dropout(0.3),
    Bidirectional(LSTM(64)),
    Dense(3, activation='softmax')  # 3 output classes
])

In [None]:
# Compile model
model.compile(optimizer=Adam(learning_rate=0.0001), loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train model
history = model.fit(X_train_seq, y_train, epochs=10, batch_size=64, validation_data=(X_test_seq, y_test))

## Evaluate Model Performance

In [None]:
# Predict on test set
y_pred = np.argmax(model.predict(X_test_seq), axis=1)
y_true = np.argmax(y_test, axis=1)

In [None]:
# Classification Report
print("Classification Report:\n", classification_report(y_true, y_pred, target_names=['Negative', 'Mixed', 'Positive']))

In [None]:
# Confusion Matrix
plt.figure(figsize=(6, 6))
sns.heatmap(confusion_matrix(y_true, y_pred), annot=True, fmt='d', cmap='Blues', xticklabels=['Negative', 'Mixed', 'Positive'], yticklabels=['Negative', 'Mixed', 'Positive'])
plt.title("Confusion Matrix")
plt.show()

## Test Model on New Arabic Sentences

In [None]:
new_sentences = ["الخدمة كانت رائعة جدا", "التجربة كانت سيئة للغاية", "كان الموضوع عادياً تماماً"]
new_sequences = pad_sequences(tokenizer.texts_to_sequences(new_sentences), maxlen=max_length, padding='post')

predictions = np.argmax(model.predict(new_sequences), axis=1)
label_map = {0: "سلبي", 1: "مختلط", 2: "إيجابي"}
print("Predictions:", [label_map[p] for p in predictions])
