### **Data loading and preprcessing**

In [None]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')

In [2]:
df = pd.read_excel('News_train.xlsx')

In [3]:
# Load Arabic stop words
arabic_stopwords = set(stopwords.words('arabic'))

def preprocess_text(text):
    # Remove English characters
    text = re.sub(r'[A-Za-z]', '', text)
    # Remove "ال"
    text = re.sub(r'\bال', '', text)
    # Remove Arabic diacritical marks (الحركات)
    diacritics = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
    text = re.sub(diacritics, '', text)
    # Remove punctuation and replace with space
    text = re.sub(f'[{re.escape(string.punctuation)}]', ' ', text)
    # Tokenize text
    tokens = word_tokenize(text)
    # Convert همزات / تاء مربوطة
    tokens = [re.sub("[إأٱآا]", "ا", token) for token in tokens]
    tokens = [re.sub("ؤ", "ء", token) for token in tokens]
    tokens = [re.sub("ئ", "ء", token) for token in tokens]
    tokens = [re.sub("ة", "ه", token) for token in tokens]
    # Remove stop words
    tokens = [token for token in tokens if token not in arabic_stopwords]
    # Join tokens back to text
    cleaned_text = ' '.join(tokens)
    # Remove extra spaces
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    return cleaned_text


In [4]:
df['News'] = df['News'].apply(preprocess_text)

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

X = df['News']
y = df['Type']

le = LabelEncoder()
y = le.fit_transform(y)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.reset_index(drop=True, inplace=True)
X_val.reset_index(drop=True, inplace=True)

### **TF-IDF, One-hot + Naive Base, Logistic Regression**

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=300)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_train_tfidf_df = pd.DataFrame(X_train_tfidf.toarray(), columns=vectorizer.get_feature_names_out())

X_val_tfidf = vectorizer.transform(X_val)
X_val_tfidf_df = pd.DataFrame(X_val_tfidf.toarray(), columns=vectorizer.get_feature_names_out())

**TF-IDF with Naive Base**

In [7]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

NB_classifier = GaussianNB()
NB_classifier.fit(X_train_tfidf_df, y_train)

y_pred = NB_classifier.predict(X_val_tfidf_df)

# Evaluate the model
accuracy = accuracy_score(y_val, y_pred)
print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(classification_report(y_val, y_pred))

Accuracy: 0.6
Classification Report:
              precision    recall  f1-score   support

           0       0.41      0.68      0.51       176
           1       0.97      0.52      0.68       647
           2       0.61      0.82      0.70       163
           3       0.07      0.79      0.14        14

    accuracy                           0.60      1000
   macro avg       0.52      0.70      0.51      1000
weighted avg       0.80      0.60      0.65      1000



**TF-IDF with Logistic Regression**

In [8]:
from sklearn.linear_model import LogisticRegression

LR_classifier = LogisticRegression(max_iter=1000)
LR_classifier.fit(X_train_tfidf_df, y_train)

y_pred_lr = LR_classifier.predict(X_val_tfidf_df)

# Evaluate the model
accuracy_lr = accuracy_score(y_val, y_pred_lr)
print(f'Accuracy (Logistic Regression): {accuracy_lr}')
print('Classification Report (Logistic Regression):')
print(classification_report(y_val, y_pred_lr))


Accuracy (Logistic Regression): 0.857
Classification Report (Logistic Regression):
              precision    recall  f1-score   support

           0       0.83      0.60      0.70       176
           1       0.85      0.96      0.90       647
           2       0.90      0.81      0.85       163
           3       0.00      0.00      0.00        14

    accuracy                           0.86      1000
   macro avg       0.65      0.59      0.61      1000
weighted avg       0.85      0.86      0.85      1000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


**One-hot encoding**

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
one_vec = CountVectorizer(binary=True, max_features=300)

X_train_one_hot_matrix  = one_vec.fit_transform(X_train)
X_train_one_hot_df = pd.DataFrame(X_train_one_hot_matrix.toarray(), columns=one_vec.get_feature_names_out())

X_val_one_hot_matrix  = one_vec.transform(X_val)
X_val_one_hot_df = pd.DataFrame(X_val_one_hot_matrix.toarray(), columns=one_vec.get_feature_names_out())

**One-hot wiht Naive Base**

In [10]:
NB_classifier = GaussianNB()
NB_classifier.fit(X_train_one_hot_df, y_train)

y_pred = NB_classifier.predict(X_val_one_hot_df)

# Evaluate the model
accuracy = accuracy_score(y_val, y_pred)
print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(classification_report(y_val, y_pred))

Accuracy: 0.57
Classification Report:
              precision    recall  f1-score   support

           0       0.38      0.64      0.48       176
           1       0.97      0.48      0.65       647
           2       0.57      0.82      0.67       163
           3       0.07      0.79      0.13        14

    accuracy                           0.57      1000
   macro avg       0.50      0.68      0.48      1000
weighted avg       0.79      0.57      0.61      1000



**One-hot wiht Logistic Regression**

In [11]:
LR_classifier = LogisticRegression(max_iter=1000)
LR_classifier.fit(X_train_one_hot_df, y_train)

y_pred_lr = LR_classifier.predict(X_val_one_hot_df)

# Evaluate the model
accuracy_lr = accuracy_score(y_val, y_pred_lr)
print(f'Accuracy (Logistic Regression): {accuracy_lr}')
print('Classification Report (Logistic Regression):')
print(classification_report(y_val, y_pred_lr))

Accuracy (Logistic Regression): 0.859
Classification Report (Logistic Regression):
              precision    recall  f1-score   support

           0       0.82      0.62      0.71       176
           1       0.85      0.95      0.90       647
           2       0.92      0.82      0.87       163
           3       0.00      0.00      0.00        14

    accuracy                           0.86      1000
   macro avg       0.65      0.60      0.62      1000
weighted avg       0.85      0.86      0.85      1000



### **Word2Vec [CBow, Skip Gram] + Naive Base, Logistic Regression**

In [12]:
import gensim

# Load pre-trained Word2Vec model
w2v_model_CBoW = gensim.models.Word2Vec.load('C:/Users/SaifD/Desktop/NLP-Final/wiki_cbow_300/wikipedia_cbow_300')

OOV_tokens_CBoW = []
train_tokens_CBoW = []
val_tokens_CBoW = []

def get_doc_vec_CBoW(sent, model, data_type):
    w2v_embeddings = []
    tokens = sent.split()
    for word in tokens:
        try:
            if data_type == 'train':
                w2v_embeddings.append(model.wv[word])
                train_tokens_CBoW.append(word)
            else:
                w2v_embeddings.append(model.wv[word])
                val_tokens_CBoW.append(word)
        except KeyError:
            OOV_tokens_CBoW.append(word)
            continue
    if len(w2v_embeddings) == 0:
        return None
    return np.mean(w2v_embeddings, axis=0)

# Generate embeddings for training and validation sets
X_train_w2v_embeddings_CBoW = X_train.apply(lambda sent: get_doc_vec_CBoW(sent, w2v_model_CBoW, 'train'))
X_val_w2v_embeddings_CBoW = X_val.apply(lambda sent: get_doc_vec_CBoW(sent, w2v_model_CBoW, 'test'))

# Initialize lists to store embeddings
X_train_w2v_embeddings_list_CBoW = []
X_val_w2v_embeddings_list_CBoW = []

# Convert embeddings from the pandas Series to lists and handle None values
zero_vector = np.zeros(w2v_model_CBoW.vector_size)

for embedding in X_train_w2v_embeddings_CBoW:
    if embedding is not None:
        X_train_w2v_embeddings_list_CBoW.append(embedding)
    else:
        X_train_w2v_embeddings_list_CBoW.append(zero_vector)

for embedding in X_val_w2v_embeddings_CBoW:
    if embedding is not None:
        X_val_w2v_embeddings_list_CBoW.append(embedding)
    else:
        X_val_w2v_embeddings_list_CBoW.append(zero_vector)

# Convert lists to numpy arrays
X_train_w2v_embeddings_array_CBoW = np.array(X_train_w2v_embeddings_list_CBoW)
X_val_w2v_embeddings_array_CBoW = np.array(X_val_w2v_embeddings_list_CBoW)


**Word2Vec CBoW with Naive Base**

In [13]:
NB_classifier = GaussianNB()
NB_classifier.fit(X_train_w2v_embeddings_array_CBoW, y_train)

y_pred = NB_classifier.predict(X_val_w2v_embeddings_array_CBoW)
accuracy = accuracy_score(y_val, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report(y_val, y_pred))

Accuracy: 0.84
Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.71      0.69       176
           1       0.92      0.86      0.89       647
           2       0.95      0.90      0.92       163
           3       0.16      0.64      0.25        14

    accuracy                           0.84      1000
   macro avg       0.68      0.78      0.69      1000
weighted avg       0.87      0.84      0.85      1000



**Word2Vec CBoW with Logistic Regression**

In [14]:
LR_classifier = LogisticRegression(max_iter=1000)
LR_classifier.fit(X_train_w2v_embeddings_array_CBoW, y_train)

y_pred_lr = LR_classifier.predict(X_val_w2v_embeddings_array_CBoW)

# Evaluate the model
accuracy_lr = accuracy_score(y_val, y_pred_lr)
print(f'Accuracy (Logistic Regression): {accuracy_lr}')
print('Classification Report (Logistic Regression):')
print(classification_report(y_val, y_pred_lr))

Accuracy (Logistic Regression): 0.874
Classification Report (Logistic Regression):
              precision    recall  f1-score   support

           0       0.77      0.68      0.72       176
           1       0.90      0.93      0.91       647
           2       0.90      0.92      0.91       163
           3       0.36      0.29      0.32        14

    accuracy                           0.87      1000
   macro avg       0.73      0.70      0.72      1000
weighted avg       0.87      0.87      0.87      1000



**Word2Vec with Skip Gram**

In [15]:
import gensim
import numpy as np

# Load the pre-trained Word2Vec Skip-Gram model
w2v_model_sg = gensim.models.Word2Vec.load('C:/Users/SaifD/Desktop/NLP-Final/wiki_sg_300/wikipedia_sg_300')

# Initialize lists for storing tokens and embeddings
OOV_tokens_sg = []
train_tokens_sg = []
val_tokens_sg = []

def get_doc_vec_sg(sent, model, data_type):
    w2v_embeddings = []
    tokens = sent.split()
    for word in tokens:
        try:
            if data_type == 'train':
                w2v_embeddings.append(model.wv[word])
                train_tokens_sg.append(word)
            else:
                w2v_embeddings.append(model.wv[word])
                val_tokens_sg.append(word)
        except KeyError:
            OOV_tokens_sg.append(word)
            continue
    if len(w2v_embeddings) == 0:
        return None
    return np.mean(w2v_embeddings, axis=0)

# Generate embeddings for training and validation sets
X_train_w2v_embeddings_sg = X_train.apply(lambda sent: get_doc_vec_sg(sent, w2v_model_sg, 'train'))
X_val_w2v_embeddings_sg = X_val.apply(lambda sent: get_doc_vec_sg(sent, w2v_model_sg, 'test'))

# Initialize lists to store embeddings
X_train_w2v_embeddings_list_sg = []
X_val_w2v_embeddings_list_sg = []

# Convert embeddings from the pandas Series to lists and handle None values
zero_vector = np.zeros(w2v_model_sg.vector_size)

for embedding in X_train_w2v_embeddings_sg:
    if embedding is not None:
        X_train_w2v_embeddings_list_sg.append(embedding)
    else:
        X_train_w2v_embeddings_list_sg.append(zero_vector)

for embedding in X_val_w2v_embeddings_sg:
    if embedding is not None:
        X_val_w2v_embeddings_list_sg.append(embedding)
    else:
        X_val_w2v_embeddings_list_sg.append(zero_vector)

# Convert lists to numpy arrays
X_train_w2v_embeddings_array_sg = np.array(X_train_w2v_embeddings_list_sg)
X_val_w2v_embeddings_array_sg = np.array(X_val_w2v_embeddings_list_sg)

**Word2Vec with Skip Gram with Naive Base**

In [16]:
NB_classifier = GaussianNB()
NB_classifier.fit(X_train_w2v_embeddings_array_sg, y_train)

y_pred = NB_classifier.predict(X_val_w2v_embeddings_array_sg)
accuracy = accuracy_score(y_val, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report(y_val, y_pred))

Accuracy: 0.88
Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.77      0.75       176
           1       0.93      0.90      0.91       647
           2       0.97      0.91      0.94       163
           3       0.31      0.64      0.42        14

    accuracy                           0.88      1000
   macro avg       0.73      0.81      0.75      1000
weighted avg       0.89      0.88      0.88      1000



**Word2Vec with Skip Gram with Logistic Regression**

In [17]:
LR_classifier = LogisticRegression(max_iter=1000)
LR_classifier.fit(X_train_w2v_embeddings_array_sg, y_train)

y_pred_lr = LR_classifier.predict(X_val_w2v_embeddings_array_sg)

# Evaluate the model
accuracy_lr = accuracy_score(y_val, y_pred_lr)
print(f'Accuracy (Logistic Regression): {accuracy_lr}')
print('Classification Report (Logistic Regression):')
print(classification_report(y_val, y_pred_lr))

Accuracy (Logistic Regression): 0.917
Classification Report (Logistic Regression):
              precision    recall  f1-score   support

           0       0.86      0.76      0.81       176
           1       0.93      0.96      0.94       647
           2       0.95      0.96      0.95       163
           3       0.62      0.36      0.45        14

    accuracy                           0.92      1000
   macro avg       0.84      0.76      0.79      1000
weighted avg       0.91      0.92      0.91      1000



### **BERT Word embedding**

In [18]:
from transformers import BertTokenizer, BertModel
import torch

# Load the tokenizer and model
model_name = "aubmindlab/bert-base-arabert"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

def get_bert_embeddings(text):
    # Tokenize input text
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    # Get BERT embeddings
    with torch.no_grad():
        outputs = model(**inputs)
    # Get the embeddings of the [CLS] token
    cls_embeddings = outputs.last_hidden_state[:, 0, :] # [batch_size, sequence_length, hidden_size]
    return cls_embeddings.numpy()



In [19]:
# Get BERT embeddings for train and val data
X_train_BERT_embeddings = np.vstack(X_train.apply(lambda x: get_bert_embeddings(x)).values)
X_val_BERT_embeddings = np.vstack(X_val.apply(lambda x: get_bert_embeddings(x)).values)

**BERT with Naive Base**

In [20]:
NB_classifier = GaussianNB()
NB_classifier.fit(X_train_BERT_embeddings, y_train)

y_pred = NB_classifier.predict(X_val_BERT_embeddings)
accuracy = accuracy_score(y_val, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report(y_val, y_pred))

Accuracy: 0.83
Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.73      0.66       176
           1       0.91      0.85      0.88       647
           2       0.97      0.87      0.92       163
           3       0.34      0.79      0.48        14

    accuracy                           0.83      1000
   macro avg       0.71      0.81      0.73      1000
weighted avg       0.86      0.83      0.84      1000



**BERT with Logistic Regression**

In [21]:
LR_classifier = LogisticRegression(max_iter=1000)
LR_classifier.fit(X_train_BERT_embeddings, y_train)

y_pred_lr = LR_classifier.predict(X_val_BERT_embeddings)

# Evaluate the model
accuracy_lr = accuracy_score(y_val, y_pred_lr)
print(f'Accuracy (Logistic Regression): {accuracy_lr}')
print('Classification Report (Logistic Regression):')
print(classification_report(y_val, y_pred_lr))

Accuracy (Logistic Regression): 0.918
Classification Report (Logistic Regression):
              precision    recall  f1-score   support

           0       0.84      0.82      0.83       176
           1       0.94      0.95      0.94       647
           2       0.95      0.96      0.95       163
           3       0.55      0.43      0.48        14

    accuracy                           0.92      1000
   macro avg       0.82      0.79      0.80      1000
weighted avg       0.92      0.92      0.92      1000



## **Models**

In [22]:
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.optimizers import Adam

In [23]:
vocab_size = 4000  
tokenizer = Tokenizer(num_words=vocab_size) 
tokenizer.fit_on_texts(df['News'])
seq = tokenizer.texts_to_sequences(df['News'])

max_length = 100  
pad_seq = pad_sequences(seq, maxlen=max_length, padding='pre')


label_encoder = LabelEncoder()
y_labels = label_encoder.fit_transform(df['Type'])
y_labels = tf.keras.utils.to_categorical(y_labels)  


pad_seq_X_train, pad_seq_X_test, y_train, y_test = train_test_split(pad_seq, y_labels, test_size=0.2, random_state=42)


**LSTM + TF-IDF**

In [24]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=300, input_length=max_length, weights=[X_train_tfidf_df], trainable=False))
model.add(Bidirectional(LSTM(100, return_sequences=False)))
model.add(Dense(4, activation='softmax'))  
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

history = model.fit(pad_seq_X_train, y_train, epochs=10, verbose=1, validation_data=(pad_seq_X_test, y_test))
print('-------------------------------------------------------------------------------------------------------------')
val_loss, val_accuracy = model.evaluate(pad_seq_X_test, y_test)
print(f'Validation Accuracy: {val_accuracy * 100:.2f}%')

y_pred = model.predict(pad_seq_X_test)

y_pred_labels = np.argmax(y_pred, axis=1)
y_true_labels = np.argmax(y_test, axis=1)

report = classification_report(y_true_labels, y_pred_labels, target_names=label_encoder.classes_)
print(report)

Epoch 1/10




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 26ms/step - accuracy: 0.5907 - loss: 1.0822 - val_accuracy: 0.6620 - val_loss: 0.8252
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 24ms/step - accuracy: 0.6831 - loss: 0.8069 - val_accuracy: 0.7440 - val_loss: 0.7035
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 25ms/step - accuracy: 0.7588 - loss: 0.6502 - val_accuracy: 0.7610 - val_loss: 0.6433
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 25ms/step - accuracy: 0.7981 - loss: 0.5563 - val_accuracy: 0.7800 - val_loss: 0.6252
Epoch 5/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 24ms/step - accuracy: 0.8072 - loss: 0.5135 - val_accuracy: 0.7770 - val_loss: 0.6309
Epoch 6/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 24ms/step - accuracy: 0.8272 - loss: 0.4897 - val_accuracy: 0.7820 - val_loss: 0.6033
Epoch 7/10
[1m125/125[0m [32m━

**LSTM + One-hot**

In [25]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=300, input_length=max_length, weights=[X_train_one_hot_df], trainable=False))
model.add(Bidirectional(LSTM(100, return_sequences=False)))
model.add(Dense(4, activation='softmax'))  
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

history = model.fit(pad_seq_X_train, y_train, epochs=10, verbose=1, validation_data=(pad_seq_X_test, y_test))
print('-------------------------------------------------------------------------------------------------------------')
val_loss, val_accuracy = model.evaluate(pad_seq_X_test, y_test)
print(f'Validation Accuracy: {val_accuracy * 100:.2f}%')

y_pred = model.predict(pad_seq_X_test)

y_pred_labels = np.argmax(y_pred, axis=1)
y_true_labels = np.argmax(y_test, axis=1)

report = classification_report(y_true_labels, y_pred_labels, target_names=label_encoder.classes_)
print(report)

Epoch 1/10




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 26ms/step - accuracy: 0.6052 - loss: 1.0317 - val_accuracy: 0.7220 - val_loss: 0.7298
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 25ms/step - accuracy: 0.7351 - loss: 0.6921 - val_accuracy: 0.7700 - val_loss: 0.6257
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 24ms/step - accuracy: 0.7890 - loss: 0.5702 - val_accuracy: 0.7770 - val_loss: 0.6080
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 25ms/step - accuracy: 0.8168 - loss: 0.4996 - val_accuracy: 0.7890 - val_loss: 0.5907
Epoch 5/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 24ms/step - accuracy: 0.8336 - loss: 0.4505 - val_accuracy: 0.7850 - val_loss: 0.5936
Epoch 6/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 25ms/step - accuracy: 0.8526 - loss: 0.3946 - val_accuracy: 0.7910 - val_loss: 0.6379
Epoch 7/10
[1m125/125[0m [32m━

**LSTM + word2Vec CBoW**

In [26]:
convert_to_nparray = np.concatenate(X_train_w2v_embeddings_CBoW)
X_train_w2v_embeddings_CBoW_nparray = convert_to_nparray.reshape(4000,300)

In [27]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=300, input_length=max_length, weights=[X_train_w2v_embeddings_CBoW_nparray], trainable=False))
model.add(Bidirectional(LSTM(100, return_sequences=False)))
model.add(Dense(4, activation='softmax'))  
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

history = model.fit(pad_seq_X_train, y_train, epochs=10, verbose=1, validation_data=(pad_seq_X_test, y_test))
print('-------------------------------------------------------------------------------------------------------------')
val_loss, val_accuracy = model.evaluate(pad_seq_X_test, y_test)
print(f'Validation Accuracy: {val_accuracy * 100:.2f}%')

y_pred = model.predict(pad_seq_X_test)

y_pred_labels = np.argmax(y_pred, axis=1)
y_true_labels = np.argmax(y_test, axis=1)

report = classification_report(y_true_labels, y_pred_labels, target_names=label_encoder.classes_)
print(report)

Epoch 1/10




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 26ms/step - accuracy: 0.6131 - loss: 1.0101 - val_accuracy: 0.6900 - val_loss: 0.7918
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 25ms/step - accuracy: 0.7406 - loss: 0.6944 - val_accuracy: 0.7740 - val_loss: 0.6111
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 25ms/step - accuracy: 0.8063 - loss: 0.5228 - val_accuracy: 0.8140 - val_loss: 0.5293
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 25ms/step - accuracy: 0.8673 - loss: 0.3705 - val_accuracy: 0.8370 - val_loss: 0.4797
Epoch 5/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 24ms/step - accuracy: 0.8980 - loss: 0.2929 - val_accuracy: 0.8450 - val_loss: 0.4996
Epoch 6/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 25ms/step - accuracy: 0.9302 - loss: 0.1991 - val_accuracy: 0.8550 - val_loss: 0.4608
Epoch 7/10
[1m125/125[0m [32m━

**LSTM + word2Vec Skip Gram**

In [28]:
convert_to_nparray = np.concatenate(X_train_w2v_embeddings_sg)
X_train_w2v_embeddings_sg_nparray = convert_to_nparray.reshape(4000,300)
# X_train_w2v_embeddings_sg_nparray

In [29]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=300, input_length=max_length, weights=[X_train_w2v_embeddings_sg_nparray], trainable=False))
model.add(Bidirectional(LSTM(100, return_sequences=False)))
model.add(Dense(4, activation='softmax'))  
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

history = model.fit(pad_seq_X_train, y_train, epochs=10, verbose=1, validation_data=(pad_seq_X_test, y_test))
print('-------------------------------------------------------------------------------------------------------------')
val_loss, val_accuracy = model.evaluate(pad_seq_X_test, y_test)
print(f'Validation Accuracy: {val_accuracy * 100:.2f}%')

y_pred = model.predict(pad_seq_X_test)

y_pred_labels = np.argmax(y_pred, axis=1)
y_true_labels = np.argmax(y_test, axis=1)

report = classification_report(y_true_labels, y_pred_labels, target_names=label_encoder.classes_)
print(report)

Epoch 1/10




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 27ms/step - accuracy: 0.6000 - loss: 1.0376 - val_accuracy: 0.6760 - val_loss: 0.9152
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 26ms/step - accuracy: 0.6541 - loss: 0.8948 - val_accuracy: 0.6910 - val_loss: 0.7774
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 25ms/step - accuracy: 0.7001 - loss: 0.7900 - val_accuracy: 0.7380 - val_loss: 0.7130
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 25ms/step - accuracy: 0.7233 - loss: 0.7110 - val_accuracy: 0.7640 - val_loss: 0.6499
Epoch 5/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 24ms/step - accuracy: 0.7606 - loss: 0.6389 - val_accuracy: 0.7070 - val_loss: 0.7645
Epoch 6/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 25ms/step - accuracy: 0.7498 - loss: 0.6458 - val_accuracy: 0.7710 - val_loss: 0.6335
Epoch 7/10
[1m125/125[0m [32m━

**LSTM + BERT**

In [30]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=768, input_length=max_length, weights=[X_train_BERT_embeddings], trainable=False))
model.add(Bidirectional(LSTM(100, return_sequences=False)))
model.add(Dense(4, activation='softmax'))  
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

history = model.fit(pad_seq_X_train, y_train, epochs=10, verbose=1, validation_data=(pad_seq_X_test, y_test))
print('-------------------------------------------------------------------------------------------------------------')
val_loss, val_accuracy = model.evaluate(pad_seq_X_test, y_test)
print(f'Validation Accuracy: {val_accuracy * 100:.2f}%')

y_pred = model.predict(pad_seq_X_test)

y_pred_labels = np.argmax(y_pred, axis=1)
y_true_labels = np.argmax(y_test, axis=1)

report = classification_report(y_true_labels, y_pred_labels, target_names=label_encoder.classes_)
print(report)

Epoch 1/10




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 50ms/step - accuracy: 0.6073 - loss: 1.0645 - val_accuracy: 0.6450 - val_loss: 0.9031
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 48ms/step - accuracy: 0.6306 - loss: 0.9182 - val_accuracy: 0.7090 - val_loss: 0.7948
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 47ms/step - accuracy: 0.7063 - loss: 0.7657 - val_accuracy: 0.7450 - val_loss: 0.6333
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 48ms/step - accuracy: 0.7789 - loss: 0.5992 - val_accuracy: 0.7730 - val_loss: 0.5772
Epoch 5/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 48ms/step - accuracy: 0.8301 - loss: 0.4827 - val_accuracy: 0.8180 - val_loss: 0.4924
Epoch 6/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 49ms/step - accuracy: 0.8491 - loss: 0.4037 - val_accuracy: 0.8110 - val_loss: 0.4992
Epoch 7/10
[1m125/125[0m [32m━

## **BERT Model**

In [31]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, AutoModelForSequenceClassification, AdamW

 
model_name = 'aubmindlab/bert-base-arabert'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4) # specifically designed for sequence classification tasks
# It includes a classification head on top of the pre-trained BERT model.

def tokenize(sentences, tokenizer, max_len):
    tokens = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt", max_length=max_len)
    return tokens['input_ids'], tokens['attention_mask']
 
train_sentences = X_train.tolist()
test_sentences = X_val.tolist()

# Find the maximum sequence length
max_sequence_len = max(len(x.split()) for x in train_sentences + test_sentences)

# Tokenize data
train_input_ids, train_attention_mask = tokenize(train_sentences, tokenizer, max_sequence_len)
test_input_ids, test_attention_mask = tokenize(test_sentences, tokenizer, max_sequence_len)

# Create TensorDatasets
train_dataset = TensorDataset(train_input_ids, train_attention_mask, torch.tensor(y_train))
test_dataset = TensorDataset(test_input_ids, test_attention_mask, torch.tensor(y_val))

# Create DataLoaders
# Use it to load data in batches
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
 
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
 
optimizer = AdamW(model.parameters(), lr=2e-5) # Adma with Weight Decay - provide better regularization
 
epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0 #-----------------
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids, attention_mask, labels = [item.to(device) for item in batch]
        # Perform forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item() #-----------------
        loss.backward()
        # Update model parameters
        optimizer.step()

    avg_train_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch+1}/{epochs}, Training Loss: {avg_train_loss:.2f}')
 
    model.eval()
    correct_predictions = 0
    total_predictions = 0
    with torch.no_grad():
        for batch in test_loader:
            input_ids, attention_mask, labels = [item.to(device) for item in batch]
            outputs = model(input_ids, attention_mask=attention_mask)
            # Get predictions by finding the index of the maximum logit
            predictions = torch.argmax(outputs.logits, dim=1)
            correct_predictions += (predictions == labels).sum().item()
            total_predictions += labels.size(0)
 
    accuracy = correct_predictions / total_predictions
    print(f'Epoch {epoch+1}/{epochs}, Accuracy: {accuracy:.2f}')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Training Loss: 0.19
Epoch 1/3, Accuracy: 0.93
Epoch 2/3, Training Loss: 0.08
Epoch 2/3, Accuracy: 0.94
Epoch 3/3, Training Loss: 0.05
Epoch 3/3, Accuracy: 0.95


In [33]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

def evaluate_model(model, test_loader, device, label_encoder):
    model.eval()
    all_labels = []
    all_predictions = []
    
    with torch.no_grad():
        for batch in test_loader:
            input_ids, attention_mask, labels = [item.to(device) for item in batch]
            outputs = model(input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=1)
            all_labels.extend(labels.cpu().numpy())
            all_predictions.extend(predictions.cpu().numpy())
    
    class_names = label_encoder.inverse_transform([0, 1, 2, 3])
    report = classification_report(all_labels, all_predictions, target_names=class_names)
    print(report)

# Call the function after training
evaluate_model(model, test_loader, device, label_encoder)


              precision    recall  f1-score   support

    economic       0.87      0.90      0.89       176
    politics       0.97      0.96      0.96       647
       sport       0.96      0.98      0.97       163
        tech       0.83      0.71      0.77        14

    accuracy                           0.95      1000
   macro avg       0.91      0.89      0.90      1000
weighted avg       0.95      0.95      0.95      1000

