In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Read Data**

In [None]:
import pandas as pd

In [None]:
news_train=pd.read_excel('/content/drive/MyDrive/News_train.xlsx')

In [None]:
news_train.shape

(5000, 2)

# **Preprocessing**

### **re**

In [None]:
!pip install pyarabic



In [None]:
import pyarabic

In [None]:
import re
import unicodedata
from pyarabic.araby import tokenize

def normalize_arabic(text):
    # Normalize Arabic text
    text = ''.join(c for c in unicodedata.normalize('NFD', text) if unicodedata.category(c) != 'Mn') # remove diacritics
    text = re.sub("[إأٱآا]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("ؤ", "ء", text)
    text = re.sub("ئ", "ء", text)
    text = re.sub("ة", "ه", text)
    text = re.sub('/', ' ', text)
    text = re.sub('-', ' ', text)
    text = re.sub('_', ' ', text)
    text = re.sub(' و ', ' ', text)
    text = re.sub("'", ' ', text)
    text = re.sub("``", ' ', text)
    text = re.sub('"', ' ', text)
    text = re.sub('%', ' ', text)
    text = re.sub('»', ' ', text)
    text = re.sub('«', ' ', text)
    text = re.sub(r'\bال(\w+)\b', r'\1', text)
    text = re.sub(r'\bلل(\w+)\b', r'\1', text)
    text = re.sub(r'\bبال(\w+)\b', r'\1', text)
    text = re.sub(r'[A-Za-z0-9]', r'', text)#remove english characters
    text = re.sub(r'[0-9]', r'', text)#remove numbers
    text = re.sub(r'[^\w\s]', r'', text)#remove punctuation

    words = tokenize(text)

    normalized_text = ' '.join(words)

    return normalized_text

In [None]:
news_train['News'] = news_train['News'].apply(normalize_arabic)

In [None]:
news_train['News']

0       اشتباك حريري عون اتهامات لباسيل تمسك ثلث معطل ...
1                               عون حريري اصبح غريب اطوار
2       وزير خارجيه امريكي ندرس سحب كامل قواتنا من افغ...
3       افغانستان استعدادات حثيثه لاجتماع تركيا وكابل ...
4                   اندبندنت مفاوضات سريه كادت تنقذ قذافي
                              ...                        
4995                اوروبا تبدا احصاء خساير فيضانات مدمره
4996    قتل متظاهر رصاص خلال احتجاجات علي شح مياه في م...
4997    وسايل اعلام ايرانيه تتحدث عن اندلاع احتجاجات ف...
4998           مفاوضات افغانيه تتواصل في عاصمه قطريه دوحه
4999    تعليق مفاوضات افغانيه في دوحه موقتا لمزيد من م...
Name: News, Length: 5000, dtype: object

In [None]:
news_train.shape

(5000, 2)

### **nltk**

In [None]:
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
import re
from pyarabic.araby import tokenize
from nltk.corpus import stopwords
from nltk.stem import ISRIStemmer

stemmer = ISRIStemmer()

def preprocess_text(text):
    # Tokenization for Arabic text using PyArabic's tokenize function
    tokens = tokenize(text)

    # Get Arabic stop words from NLTK
    stop_words_arabic = set(stopwords.words('arabic'))

    # Remove stop words
    arabic_tokens = [token for token in tokens if token not in stop_words_arabic]

    # Stemming Arabic text
    arabic_tokens = [stemmer.stem(token) for token in arabic_tokens]

    # Join tokens back into text
    processed_text = ' '.join(arabic_tokens)

    return processed_text

In [None]:
preprocessed_documents_news_train = [preprocess_text(doc) for doc in news_train['News']]

In [None]:
len(preprocessed_documents_news_train)

5000

# **Splitting the data into training and validation**

In [None]:
news_train['processed_text'] = preprocessed_documents_news_train

In [None]:
news_train['processed_text']

0                 شبك حرر عون تهم اسل تمس ثلث عطل بقء حكم
1                                    عون حرر اصبح غرب طور
2                     وزر خرج امر درس سحب كمل قوت غنس حلل
3       غنس عدد حثث جمع ترك كبل تهم طلب ركز علي حرب بد...
4                            اندبندنت فاض سره كدت نقذ قذف
                              ...                        
4995                             ورب تبد حصء خسير فيض دمر
4996      قتل ظاهر رصص خلل حجج علي شح ياه طقه جنب غرب اير
4997                          سيل علم يرن حدث دلع حجج شرع
4998                              فاض فغن وصل عصم قطر دوح
4999                          علق فاض فغن دوح وقت زيد شار
Name: processed_text, Length: 5000, dtype: object

In [None]:
news_train.columns

Index(['Type', 'News', 'processed_text'], dtype='object')

In [None]:
y=news_train['Type']
X=news_train.drop(columns=['Type', 'News'], axis=1)

In [None]:
y

0       politics
1       politics
2       politics
3       politics
4       politics
          ...   
4995    politics
4996    politics
4997    politics
4998    politics
4999    politics
Name: Type, Length: 5000, dtype: object

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

In [None]:
len(X_train)

4250

In [None]:
import torch

# **Bert Embeddings and Bert Model**

Code1:

In [None]:
# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)

Using device: cpu


In [None]:
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer, AdamW
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
import numpy as np

label_mapping = {'economic': 0, 'politics': 1, 'sport': 2, 'tech': 3}

num_class = len(set(label_mapping.values()))

# Encode the categorical labels into numerical format using the mapping
y_train_encoded_manual = np.array([label_mapping[label] for label in y_train])
y_test_encoded_manual = np.array([label_mapping[label] for label in y_test])

y_train_one_hot = to_categorical(y_train_encoded_manual, num_classes=num_class)
y_test_one_hot = to_categorical(y_test_encoded_manual, num_classes=num_class)


In [None]:
# freeze bert parameters
for param in model.parameters():
    param.requires_grad = False

class BertClassifier(nn.Module):
    def __init__(self, bert_model):
        super(BertClassifier, self).__init__()
        self.bert = bert_model
        self.fc = nn.Linear(bert_model.config.hidden_size, 4)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        logits = self.fc(pooled_output)
        return logits

In [None]:
X_train_tokens = tokenizer(X_train['processed_text'].tolist(), padding=True, truncation=True, return_tensors="pt", max_length=128)
X_test_tokens = tokenizer(X_test['processed_text'].tolist(), padding=True, truncation=True, return_tensors="pt", max_length=128)

In [None]:
# Convert labels to tensor
y_train_tensor = torch.tensor(y_train_one_hot).to(device)
y_test_tensor = torch.tensor(y_test_one_hot).to(device)

num_labels = 4
classifier_model = BertClassifier(model)
classifier_model = classifier_model.to(device)

optimizer = AdamW(classifier_model.parameters(), lr=2e-5)



In [None]:
# Define batch size & epoch
batch_size = 8
num_epochs=8

In [None]:
# Training loop
for epoch in range(num_epochs):
    classifier_model.train()
    optimizer.zero_grad()


    train_dataset = TensorDataset(X_train_tokens['input_ids'], X_train_tokens['attention_mask'], y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    for input_ids, attention_mask, labels in train_loader:
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)


        outputs = classifier_model(input_ids=input_ids, attention_mask=attention_mask)


        loss = nn.CrossEntropyLoss()(outputs, labels.argmax(dim=1))


        loss.backward()


        optimizer.step()

    print(f'Epoch {epoch+1}: Loss = {loss.item()}')

Epoch 1: Loss = 1.950777292251587
Epoch 2: Loss = 0.8874014019966125
Epoch 3: Loss = 2.868354558944702
Epoch 4: Loss = 1.3071318864822388
Epoch 5: Loss = 1.0836200714111328
Epoch 6: Loss = 1.0743759870529175
Epoch 7: Loss = 0.8181892037391663
Epoch 8: Loss = 0.34849026799201965


In [None]:
classifier_model.eval()

with torch.no_grad():
    test_outputs = classifier_model(input_ids=X_test_tokens['input_ids'].to(device),
                                   attention_mask=X_test_tokens['attention_mask'].to(device))

test_loss = nn.CrossEntropyLoss()(test_outputs, y_test_tensor.argmax(dim=1))

#accuracy for model
_, predicted_labels = torch.max(test_outputs, 1)
correct = (predicted_labels == y_test_tensor.argmax(dim=1)).sum().item()
total = y_test_tensor.size(0)
accuracy = correct / total
print(f'test Accuracy: {accuracy * 100:.2f}%')


test Accuracy: 65.07%


In [None]:
from sklearn.metrics import classification_report
predicted_labels_cpu = predicted_labels.cpu().numpy()
y_test_cpu = y_test_tensor.argmax(dim=1).cpu().numpy()
report = classification_report(y_test_cpu, predicted_labels_cpu)
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       134
           1       0.65      1.00      0.79       488
           2       0.00      0.00      0.00       120
           3       0.00      0.00      0.00         8

    accuracy                           0.65       750
   macro avg       0.16      0.25      0.20       750
weighted avg       0.42      0.65      0.51       750



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Code 2:

In [None]:
import os
import shutil
import tarfile
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
import pandas as pd
from bs4 import BeautifulSoup
import re
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.offline as pyo
import plotly.graph_objects as go
from wordcloud import WordCloud, STOPWORDS
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


#Tokenize and encode the data using the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)


`resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



In [None]:

max_len = 128

# Tokenize and encode the sentences
X_train_encoded = tokenizer.batch_encode_plus(
    X_train['processed_text'].tolist(),  # Convert the specific column to list
    padding=True,
    truncation=True,
    max_length=max_len,
    return_tensors='tf'
)

X_test_encoded = tokenizer.batch_encode_plus(
    X_test['processed_text'].tolist(),  # Convert the specific column to list
    padding=True,
    truncation=True,
    max_length=max_len,
    return_tensors='tf'
)


In [None]:
# Intialize the model
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
!pip install tensorflow==2.11.0 transformers==4.26.0




In [None]:
# Correct imports
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf

# Load the pre-trained BERT model for sequence classification
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)

# Compile the model with an appropriate optimizer, loss function, and metrics
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss, metrics=[metric])


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Define the label encoder
label_encoder = LabelEncoder()

# Fit and transform the labels to numerical format
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [None]:
# Step 5: Train the model
history = model.fit(
	[X_train_encoded['input_ids'], X_train_encoded['token_type_ids'], X_train_encoded['attention_mask']],
	y_train_encoded,
	validation_data=(
	[X_test_encoded['input_ids'], X_test_encoded['token_type_ids'], X_test_encoded['attention_mask']], y_test_encoded),
	batch_size=16, 	epochs=3
)


Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
from sklearn.metrics import classification_report

# Assuming your model is named 'model'
test_loss, test_accuracy = model.evaluate(
    [X_test_encoded['input_ids'], X_test_encoded['token_type_ids'], X_test_encoded['attention_mask']],
    y_test_encoded
)

print(f'Test loss: {test_loss}, Test accuracy: {test_accuracy}')

# Get predictions
predictions = model.predict([X_test_encoded['input_ids'], X_test_encoded['token_type_ids'], X_test_encoded['attention_mask']])



Test loss: 0.39175188541412354, Test accuracy: 0.862666666507721


NameError: name 'np' is not defined