### **Data loading and preprcessing**

In [None]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')

In [92]:
train_df = pd.read_excel('News_train.xlsx')
test_df = pd.read_excel('News_test.xlsx')

In [93]:
# Load Arabic stop words
arabic_stopwords = set(stopwords.words('arabic'))

def preprocess_text(text):
    # Remove English characters
    text = re.sub(r'[A-Za-z]', '', text)
    # Remove "ال"
    text = re.sub(r'\bال', '', text)
    # Remove Arabic diacritical marks (الحركات)
    diacritics = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
    text = re.sub(diacritics, '', text)
    # Remove punctuation and replace with space
    text = re.sub(f'[{re.escape(string.punctuation)}]', ' ', text)
    # Tokenize text
    tokens = word_tokenize(text)
    # Convert همزات / تاء مربوطة
    tokens = [re.sub("[إأٱآا]", "ا", token) for token in tokens]
    tokens = [re.sub("ؤ", "ء", token) for token in tokens]
    tokens = [re.sub("ئ", "ء", token) for token in tokens]
    tokens = [re.sub("ة", "ه", token) for token in tokens]
    # Remove stop words
    tokens = [token for token in tokens if token not in arabic_stopwords]
    # Join tokens back to text
    cleaned_text = ' '.join(tokens)
    # Remove extra spaces
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    return cleaned_text


In [94]:
train_df['News'] = train_df['News'].apply(preprocess_text)
test_df['News'] = test_df['News'].apply(preprocess_text)

In [95]:
X_train = train_df['News']
X_test = test_df['News']
y_train = train_df['Type']
y_test = test_df['Type']

In [100]:
from sklearn.preprocessing import LabelEncoder
import torch
# Encode labels
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_train = torch.tensor(y_train)

y_test = label_encoder.fit_transform(y_test)
y_test = torch.tensor(y_test)

### **Word2Vec Skip Gram + Logistic Regression**

In [101]:
import gensim
import numpy as np

# Load the pre-trained Word2Vec Skip-Gram model
w2v_model_sg = gensim.models.Word2Vec.load('C:/Users/SaifD/Desktop/NLP-Final/wiki_sg_300/wikipedia_sg_300')

# Initialize lists for storing tokens and embeddings
OOV_tokens_sg = []
train_tokens_sg = []
val_tokens_sg = []

def get_doc_vec_sg(sent, model, data_type):
    w2v_embeddings = []
    tokens = sent.split()
    for word in tokens:
        try:
            if data_type == 'train':
                w2v_embeddings.append(model.wv[word])
                train_tokens_sg.append(word)
            else:
                w2v_embeddings.append(model.wv[word])
                val_tokens_sg.append(word)
        except KeyError:
            OOV_tokens_sg.append(word)
            continue
    if len(w2v_embeddings) == 0:
        return None
    return np.mean(w2v_embeddings, axis=0)

# Generate embeddings for training and validation sets
X_train_w2v_embeddings_sg = X_train.apply(lambda sent: get_doc_vec_sg(sent, w2v_model_sg, 'train'))
X_val_w2v_embeddings_sg = X_test.apply(lambda sent: get_doc_vec_sg(sent, w2v_model_sg, 'test'))

# Initialize lists to store embeddings
X_train_w2v_embeddings_list_sg = []
X_test_w2v_embeddings_list_sg = []

# Convert embeddings from the pandas Series to lists and handle None values
zero_vector = np.zeros(w2v_model_sg.vector_size)

for embedding in X_train_w2v_embeddings_sg:
    if embedding is not None:
        X_train_w2v_embeddings_list_sg.append(embedding)
    else:
        X_train_w2v_embeddings_list_sg.append(zero_vector)

for embedding in X_val_w2v_embeddings_sg:
    if embedding is not None:
        X_test_w2v_embeddings_list_sg.append(embedding)
    else:
        X_test_w2v_embeddings_list_sg.append(zero_vector)

# Convert lists to numpy arrays
X_train_w2v_embeddings_array_sg = np.array(X_train_w2v_embeddings_list_sg)
X_test_w2v_embeddings_array_sg = np.array(X_test_w2v_embeddings_list_sg)

**Word2Vec with Skip Gram with Logistic Regression**

In [102]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

LR_classifier = LogisticRegression(max_iter=1000)
LR_classifier.fit(X_train_w2v_embeddings_array_sg, y_train)

y_pred_lr = LR_classifier.predict(X_test_w2v_embeddings_array_sg)

# Evaluate the model
accuracy_lr = accuracy_score(y_test, y_pred_lr)
print(f'Accuracy (Logistic Regression): {accuracy_lr}')
print('Classification Report (Logistic Regression):')
print(classification_report(y_test, y_pred_lr))

Accuracy (Logistic Regression): 0.862
Classification Report (Logistic Regression):
              precision    recall  f1-score   support

           0       0.81      0.72      0.76       200
           1       0.84      0.95      0.89       512
           2       0.95      0.95      0.95       200
           3       0.93      0.45      0.61        88

    accuracy                           0.86      1000
   macro avg       0.88      0.77      0.81      1000
weighted avg       0.87      0.86      0.85      1000



### **BERT Word embedding**

In [103]:
from transformers import BertTokenizer, BertModel
import torch

# Load the tokenizer and model
model_name = "aubmindlab/bert-base-arabert"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

def get_bert_embeddings(text):
    # Tokenize input text
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    # Get BERT embeddings
    with torch.no_grad():
        outputs = model(**inputs)
    # Get the embeddings of the [CLS] token
    cls_embeddings = outputs.last_hidden_state[:, 0, :] # [batch_size, sequence_length, hidden_size]
    return cls_embeddings.numpy()



In [104]:
X_train_BERT_embeddings = np.vstack(X_train.apply(lambda x: get_bert_embeddings(x)).values)
X_test_BERT_embeddings = np.vstack(X_test.apply(lambda x: get_bert_embeddings(x)).values)

In [105]:
LR_classifier = LogisticRegression(max_iter=1000)
LR_classifier.fit(X_train_BERT_embeddings, y_train)

y_pred_lr = LR_classifier.predict(X_test_BERT_embeddings)

# Evaluate the model
accuracy_lr = accuracy_score(y_test, y_pred_lr)
print(f'Accuracy (Logistic Regression): {accuracy_lr}')
print('Classification Report (Logistic Regression):')
print(classification_report(y_test, y_pred_lr))

Accuracy (Logistic Regression): 0.849
Classification Report (Logistic Regression):
              precision    recall  f1-score   support

           0       0.73      0.72      0.72       200
           1       0.85      0.92      0.88       512
           2       0.96      0.95      0.96       200
           3       0.91      0.47      0.62        88

    accuracy                           0.85      1000
   macro avg       0.86      0.77      0.80      1000
weighted avg       0.85      0.85      0.84      1000



## **BERT Model**

In [107]:
# Convert labels to torch.long
y_train = torch.tensor(y_train, dtype=torch.long)
y_test = torch.tensor(y_test, dtype=torch.long)

  y_train = torch.tensor(y_train, dtype=torch.long)
  y_test = torch.tensor(y_test, dtype=torch.long)


In [109]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, AutoModelForSequenceClassification, AdamW

 
model_name = 'aubmindlab/bert-base-arabert'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4)
 
def tokenize(sentences, tokenizer, max_len):
    tokens = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt", max_length=max_len)
    return tokens['input_ids'], tokens['attention_mask']
 
train_sentences = X_train.tolist()
test_sentences = X_test.tolist()

# Find the maximum sequence length
max_sequence_len = max(len(x.split()) for x in train_sentences + test_sentences)

# Tokenize data
train_input_ids, train_attention_mask = tokenize(train_sentences, tokenizer, max_sequence_len)
test_input_ids, test_attention_mask = tokenize(test_sentences, tokenizer, max_sequence_len)

# Create TensorDatasets
train_dataset = TensorDataset(train_input_ids, train_attention_mask, y_train)
test_dataset = TensorDataset(test_input_ids, test_attention_mask, y_test)

# Create DataLoaders
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
 
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
 
optimizer = AdamW(model.parameters(), lr=2e-5)
 
# Training loop
epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids, attention_mask, labels = [item.to(device) for item in batch]
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    avg_train_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch+1}/{epochs}, Training Loss: {avg_train_loss:.2f}')

    model.eval()
    correct_predictions = 0
    total_predictions = 0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids, attention_mask, labels = [item.to(device) for item in batch]
            outputs = model(input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=1)
            correct_predictions += (predictions == labels).sum().item()
            total_predictions += labels.size(0)
            all_preds.extend(predictions.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = correct_predictions / total_predictions
    print(f'Epoch {epoch+1}/{epochs}, Accuracy: {accuracy:.2f}')

# Print classification report
print(classification_report(all_labels, all_preds, target_names=label_encoder.classes_))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Training Loss: 0.35
Epoch 1/3, Accuracy: 0.91
Epoch 2/3, Training Loss: 0.15
Epoch 2/3, Accuracy: 0.89
Epoch 3/3, Training Loss: 0.08
Epoch 3/3, Accuracy: 0.88
              precision    recall  f1-score   support

    economic       0.71      0.89      0.79       200
    politics       0.92      0.90      0.91       512
       sport       0.98      0.97      0.98       200
        tech       0.96      0.55      0.70        88

    accuracy                           0.88      1000
   macro avg       0.90      0.83      0.85      1000
weighted avg       0.90      0.88      0.88      1000

