In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the spreadsheets read below are loaded from its MyDrive folder.
drive.mount('/content/drive')

Mounted at /content/drive


# **Read Data**

In [None]:
import pandas as pd

In [None]:
# Load the training spreadsheet from the mounted Drive (later cells use its 'Type' and 'News' columns).
news_train=pd.read_excel('/content/drive/MyDrive/News_train.xlsx')

In [None]:
news_train.shape

(5000, 2)

In [None]:
# Load the separate test spreadsheet from the mounted Drive (its 'News' column is normalised below).
news_test=pd.read_excel('/content/drive/MyDrive/News_test.xlsx')

In [None]:
news_test.shape

(1000, 2)

# **Preprocessing**

### **re**

In [None]:
import re
import nltk
import unicodedata
from nltk import word_tokenize
from nltk import download

# Fetch the NLTK resources used by the preprocessing cells: the 'punkt'
# tokenizer models and the 'stopwords' corpus.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

def normalize_arabic(text, min_stem_len=2):
    """Normalise one Arabic document into a clean, space-separated string.

    Steps, in order:
      1. collapse runs of whitespace;
      2. Unicode-decompose (NFD) and drop combining marks (category ``Mn``).
         This removes diacritics and also strips the hamza/madda from the
         precomposed letters, so alef-with-hamza/madda becomes bare alef,
         waw-with-hamza becomes waw and yeh-with-hamza becomes yeh;
      3. unify the remaining letter variants (alef wasla -> alef,
         alef maqsura -> yeh, teh marbuta -> heh);
      4. turn separator and quote characters into spaces and drop a
         free-standing conjunction waw;
      5. strip the prefixes "al", "ll" and "bal" from the start of a word;
      6. delete ASCII letters and all decimal digits;
      7. replace any remaining punctuation with a space.

    Args:
        text: The raw document. Non-string values are tolerated: ``None`` and
            float NaN (what pandas yields for an empty spreadsheet cell)
            become ``''``; anything else is converted with ``str``.
        min_stem_len: Minimum number of word characters that must remain
            after a prefix for the prefix to be stripped (expected >= 1).
            The default of 2 keeps short words such as "ila" (normalised to
            alef-lam-yeh) from being reduced to a single letter; pass 1 to
            strip whenever at least one character remains.

    Returns:
        The normalised text with tokens separated by single spaces.
    """
    if not isinstance(text, str):
        # Guard against missing values so Series.apply does not crash on them.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    # Collapse whitespace first so the ' waw ' pattern below sees single spaces.
    text = ' '.join(text.split())

    # Remove diacritics. Because of the NFD decomposition this also removes the
    # hamza carried by waw/yeh/alef, so no separate substitution is needed for them.
    text = ''.join(c for c in unicodedata.normalize('NFD', text) if unicodedata.category(c) != 'Mn')

    text = re.sub('[\u0625\u0623\u0671\u0622\u0627]', '\u0627', text)  # alef variants -> alef
    text = re.sub('\u0649', '\u064a', text)                            # alef maqsura -> yeh
    text = re.sub('\u0629', '\u0647', text)                            # teh marbuta -> heh

    text = re.sub(r'[/\-_]', ' ', text)
    text = re.sub(' \u0648 ', ' ', text)  # free-standing conjunction waw
    text = re.sub(r"``|['\"%»«]", ' ', text)

    # Strip "al", then "ll", then "bal" from the start of a word, but only when
    # at least `min_stem_len` characters remain after the prefix.
    for prefix in ('\u0627\u0644', '\u0644\u0644', '\u0628\u0627\u0644'):
        text = re.sub(rf'\b{prefix}(\w{{{min_stem_len},}})\b', r'\1', text)

    # \d also covers Arabic-Indic digits, which [0-9] would leave in place.
    text = re.sub(r'[A-Za-z\d]', '', text)
    # Replace (rather than delete) punctuation so that words separated only by
    # a punctuation mark are not glued together.
    text = re.sub(r'[^\w\s]', ' ', text)

    # Only word characters and whitespace remain, so a whitespace split is
    # enough to produce the final token sequence.
    return ' '.join(text.split())

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Normalise the training text. NOTE: this overwrites the 'News' column in place, so the raw text
# is lost and re-running this cell feeds already-normalised text through normalize_arabic again.
news_train['News'] = news_train['News'].apply(normalize_arabic)

In [None]:
news_train['News']

0       اشتباك حريري عون اتهامات لباسيل تمسك ثلث معطل ...
1                               عون حريري اصبح غريب اطوار
2       وزير خارجيه امريكي ندرس سحب كامل قواتنا من افغ...
3       افغانستان استعدادات حثيثه لاجتماع تركيا وكابل ...
4                   اندبندنت مفاوضات سريه كادت تنقذ قذافي
                              ...                        
4995                اوروبا تبدا احصاء خساير فيضانات مدمره
4996    قتل متظاهر رصاص خلال احتجاجات علي شح مياه في م...
4997    وسايل اعلام ايرانيه تتحدث عن اندلاع احتجاجات ف...
4998           مفاوضات افغانيه تتواصل في عاصمه قطريه دوحه
4999    تعليق مفاوضات افغانيه في دوحه موقتا لمزيد من م...
Name: News, Length: 5000, dtype: object

In [None]:
news_train.shape

(5000, 2)

In [None]:
# Apply the same normalisation to the test text. NOTE: this also overwrites 'News' in place,
# so re-running the cell normalises the column a second time.
news_test['News'] = news_test['News'].apply(normalize_arabic)

In [None]:
news_test['News']

0                            ميات قتلي في فيضانات اوروبا
1        جوله مفاوضات جديده تفشل في كسر جمود ازمه قبرصيه
2      قضاء فرنسي يفتح تحقيقا حول بيغاسوس بشان تجسس ع...
3        وزير دفاع اسراييلي يزور فرنسا لبحث قضيه بيغاسوس
4      بدا في عده دول سريان حظر شامل حد من انتشار فير...
                             ...                        
995    هواوي عقوبات اميركيه علي شركه مسووله جزييا عن ...
996          مجهر من هواتف محموله قديمه مدارس في تايلاند
997             هواتف ذكيه شبيهه ليجو تشق طريقها ي اسواق
998    بعد هواوي واشنطن تضيف شركه شياومي صينيه ي قايم...
999                      شياومي خارج قايمه سوداء اميركيه
Name: News, Length: 1000, dtype: object

In [None]:
news_test.shape

(1000, 2)

### **nltk**

In [None]:
# Text Mining
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer('arabic')


# Your list of Arabic stop words
additional_arabic_stopwords = ["تم", 'اي', 'لكن', 'أو', 'في', 'و','بدون', 'من', 'علي', 'إلي', 'التي']


def _build_arabic_stop_words():
    """Return the stop-word set used by ``preprocess_text``.

    The set contains the NLTK Arabic stop words plus ``additional_arabic_stopwords``
    in their raw spelling, and additionally every one of them passed through
    ``normalize_arabic``. The documents reach ``preprocess_text`` already
    normalised, so a raw spelling that normalisation changes (for example one
    written with a hamza) would otherwise never match a token.
    """
    raw_words = set(stopwords.words('arabic')) | set(additional_arabic_stopwords)
    normalized_words = set()
    for word in raw_words:
        # A stop word may normalise to nothing or to several tokens; split() handles both.
        normalized_words.update(normalize_arabic(word).split())
    return frozenset(raw_words | normalized_words)


# Built once when the cell runs instead of on every preprocess_text call.
# Re-run this cell after editing additional_arabic_stopwords.
STOP_WORDS_ARABIC = _build_arabic_stop_words()


def preprocess_text(text):
    """Tokenise, remove stop words from, and stem one Arabic document.

    Args:
        text: The document as a string (in this notebook, the output of
            ``normalize_arabic``).

    Returns:
        The surviving stems joined by single spaces. Stems that do not start
        with a word character (e.g. punctuation-only tokens or empty stems)
        are dropped.
    """
    # Tokenization for Arabic text using NLTK word_tokenize
    tokens = word_tokenize(text)

    # Drop stop words before stemming so the comparison is made on whole words
    kept_tokens = (token for token in tokens if token not in STOP_WORDS_ARABIC)

    # Stemming in Arabic text
    stems = (stemmer.stem(token) for token in kept_tokens)

    # Remove punctuation-only / empty results and join back into text
    return ' '.join(stem for stem in stems if re.match(r'\w', stem))


In [None]:
# Stop-word removal + stemming for every (already normalised) training document; order follows news_train.
preprocessed_documents_news_train = [preprocess_text(doc) for doc in news_train['News']]

In [None]:
len(preprocessed_documents_news_train)

5000

In [None]:
# Same stop-word removal + stemming for every (already normalised) test document; order follows news_test.
preprocessed_documents_news_test = [preprocess_text(doc) for doc in news_test['News']]

In [None]:
len(preprocessed_documents_news_test)

1000

# **Splitting the data into training and validation**

In [None]:
# Attach the stemmed text as a new column; the list was built in news_train row order, so rows line up.
news_train['processed_text'] = preprocessed_documents_news_train

In [None]:
news_train['processed_text']

0          اشتب حرير عون اتهام باسيل تمس ثلث معطل قاء حكم
1                                عون حرير اصبح غريب اطوار
2             زير خارج امر ندرس سحب كامل قوا افغانست حلول
3       افغانست استعداد حثيث اجتماع ترك كابل تتهم طالب...
4                          اندبندن مفاوض سر كاد تنقذ قذاف
                              ...                        
4995                      اوروب تبد احصاء خساير يضان مدمر
4996    قتل متظاهر رصاص خلال احتجاج شح ميا منطق جنوب غ...
4997           سايل اعلام ايران تتحدث اندلاع احتجاج شوارع
4998                      مفاوض افغان تتواصل عاصم قطر دوح
4999                 تعليق مفاوض افغان دوح موق مزيد مشاور
Name: processed_text, Length: 5000, dtype: object

In [None]:
news_train.columns

Index(['Type', 'News', 'processed_text'], dtype='object')

In [None]:
# Features: every column except the label ('Type') and the un-stemmed text ('News').
X = news_train.drop(columns=['Type', 'News'])
# Target: the 'Type' label of each row.
y = news_train['Type']

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 15% of the rows of news_train as the validation split (named X_test / y_test from here on);
# random_state=42 makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

In [None]:
len(X_train)

4250

# **Word Embedding**

### **TF-IDF**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer


# Learn the vocabulary (top 300 terms) and IDF weights from the training split only.
vectorizer = TfidfVectorizer(max_features=300)
X_train_tfidf = vectorizer.fit_transform(X_train['processed_text'])
# Reuse the fitted vocabulary for the validation split. Calling fit_transform here
# would relearn the vocabulary from the validation text, so its columns would no
# longer correspond to the training columns and validation statistics would leak
# into the features.
X_test_tfidf = vectorizer.transform(X_test['processed_text'])

# One shared list of column names: both matrices are expressed in the training vocabulary.
tfidf_feature_names = vectorizer.get_feature_names_out()
tfidf_train_feature = pd.DataFrame(X_train_tfidf.toarray(), columns=tfidf_feature_names)
tfidf_test_feature = pd.DataFrame(X_test_tfidf.toarray(), columns=tfidf_feature_names)

In [None]:
# Print, for every vocabulary term, its TF-IDF values across the training documents.
for word, term_column in tfidf_train_feature.items():
    embedding = term_column.values
    print("Word:", word, ", vector:", embedding)

Word: ابطال , vector: [0. 0. 0. ... 0. 0. 0.]
Word: اتحاد , vector: [0. 0. 0. ... 0. 0. 0.]
Word: اتفاق , vector: [0.         0.         0.         ... 0.         0.38085582 0.        ]
Word: اتهام , vector: [0. 0. 0. ... 0. 0. 0.]
Word: اثيوب , vector: [0. 0. 0. ... 0. 0. 0.]
Word: اجتماع , vector: [0. 0. 0. ... 0. 0. 0.]
Word: اجنب , vector: [0. 0. 0. ... 0. 0. 0.]
Word: احتجاج , vector: [0. 0. 0. ... 0. 0. 0.]
Word: احتلال , vector: [0. 0. 0. ... 0. 0. 0.]
Word: احد , vector: [0. 0. 0. ... 0. 0. 0.]
Word: اخر , vector: [0. 0. 0. ... 0. 0. 0.]
Word: ادار , vector: [0. 0. 0. ... 0. 0. 0.]
Word: اراض , vector: [0. 0. 0. ... 0. 0. 0.]
Word: ارتفاع , vector: [0. 0. 0. ... 0. 0. 0.]
Word: ارد , vector: [0. 0. 0. ... 0. 0. 0.]
Word: اردن , vector: [0. 0. 0. ... 0. 0. 0.]
Word: اريس , vector: [0. 0. 0. ... 0. 0. 0.]
Word: ازم , vector: [0. 0. 0. ... 0. 0. 0.]
Word: استثمار , vector: [0. 0. 0. ... 0. 0. 0.]
Word: اسراييل , vector: [0. 0. 0. ... 0. 0. 0.]
Word: اسعار , vector: [0. 0. 0. ... 0

### **Bag Of Words**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Count vectorizer over bigrams (ngram_range=(2,2)), keeping the 300 most frequent.
# NOTE: this rebinds `vectorizer`, which until now referred to the TF-IDF vectorizer.
vectorizer = cv_uni_vec = CountVectorizer(max_features=300,
                                 stop_words=None,
                                 ngram_range=(2,2))
# Learn the bigram vocabulary from the training split only.
X_train_uni = cv_uni_vec.fit_transform(X_train['processed_text'])
# Reuse that vocabulary for the validation split; refitting here would give the
# validation matrix a different set of columns than the training matrix.
X_test_uni = cv_uni_vec.transform(X_test['processed_text'])

# One shared list of column names: both matrices are expressed in the training vocabulary.
bow_feature_names = cv_uni_vec.get_feature_names_out()
BOW_train_feature = pd.DataFrame(X_train_uni.toarray(), columns=bow_feature_names)
BOW_test_feature = pd.DataFrame(X_test_uni.toarray(), columns=bow_feature_names)

In [None]:
# Print, for every bigram in the vocabulary, its counts across the training documents.
for word, term_column in BOW_train_feature.items():
  embedding = term_column.values
  print("Word:", word, ", vector:", embedding)

Word: ابطال افريق , vector: [0 0 0 ... 0 0 0]
Word: ابطال اوروب , vector: [0 0 0 ... 0 0 0]
Word: اتحاد اوروب , vector: [0 0 0 ... 0 0 0]
Word: اتحاد طيرا , vector: [0 0 0 ... 0 0 0]
Word: اتفاق تجار , vector: [0 0 0 ... 0 0 0]
Word: اتفاق سلام , vector: [0 0 0 ... 0 0 0]
Word: اتفاق نوو , vector: [0 0 0 ... 0 0 0]
Word: اثناء مبار , vector: [0 0 0 ... 0 0 0]
Word: احتلال اسراييل , vector: [0 0 0 ... 0 0 0]
Word: احتلال نفذ , vector: [0 0 0 ... 0 0 0]
Word: اداء امر , vector: [0 0 0 ... 0 0 0]
Word: ادار ايد , vector: [0 0 0 ... 0 0 0]
Word: ادريس ديب , vector: [0 0 0 ... 0 0 0]
Word: ادن اجور , vector: [0 0 0 ... 0 0 0]
Word: ارتفاع اسعار , vector: [0 0 0 ... 0 0 0]
Word: اريس سان , vector: [0 0 0 ... 0 0 0]
Word: ازم سد , vector: [0 0 0 ... 0 0 0]
Word: ازم كور , vector: [0 0 0 ... 0 0 0]
Word: استشهاد سطين , vector: [0 0 0 ... 0 0 0]
Word: اسراييل سطين , vector: [0 0 0 ... 0 0 0]
Word: اسراييل غزه , vector: [0 0 0 ... 0 0 0]
Word: اسراييل فلسطين , vector: [0 0 0 ... 0 0 0]
Word: اسر

### **FastText**

In [None]:
pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/68.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.12.0-py3-none-any.whl (234 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp310-cp310-linux_x86_64.whl size=4227138 sha256=2a03d415b433c1bc7ad6c776c88b2a1934383501470ed29a35064bc0db250b01
  Stored in directory: /root/.cache/pip/wheels/a5/13/75/f811c84a8ab36eedbaef977a6a58a98990e8e0f1967f98f394
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.12.0


In [None]:
!wget "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ar.300.bin.gz"
input_gz_file = '/content/cc.ar.300.bin.gz'
output_bin_file = 'cc.ar.300.bin'

--2024-05-22 17:32:01--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ar.300.bin.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 3.163.189.96, 3.163.189.108, 3.163.189.51, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|3.163.189.96|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4500982519 (4.2G) [application/octet-stream]
Saving to: ‘cc.ar.300.bin.gz’


2024-05-22 17:32:47 (93.7 MB/s) - ‘cc.ar.300.bin.gz’ saved [4500982519/4500982519]



In [None]:
import gzip

def decompress_gz_to_bin(gz_file_path, bin_file_path, chunk_size=1024 * 1024):
    """Decompress a gzip archive to a file, streaming it in chunks.

    Errors are reported with ``print`` instead of being raised (best-effort,
    as a notebook convenience). If decompression fails after the output file
    has been opened, the incomplete output is removed so that a later cell
    cannot load a truncated file by mistake.

    Args:
        gz_file_path: Path of the ``.gz`` file to read.
        bin_file_path: Path of the file to create (overwritten if it exists).
        chunk_size: Number of decompressed bytes copied per read. The 1 MiB
            default keeps memory use small while avoiding the overhead of
            very many tiny reads on multi-gigabyte files.

    Returns:
        None.
    """
    import os
    import shutil

    output_opened = False
    try:
        with gzip.open(gz_file_path, 'rb') as gz_file, open(bin_file_path, 'wb') as bin_file:
            output_opened = True
            shutil.copyfileobj(gz_file, bin_file, chunk_size)
    except FileNotFoundError as e:
        # e.filename names whichever path was actually missing (input file or output directory).
        print(f"Error: File not found: {e.filename or gz_file_path}")
    except Exception as e:
        print(f"Error: {e}")
    else:
        print(f"Decompression successful: {gz_file_path} -> {bin_file_path}")
        return

    # Failure path: discard a partially written output file.
    if output_opened:
        try:
            os.remove(bin_file_path)
        except OSError:
            pass

decompress_gz_to_bin(input_gz_file, output_bin_file)

Decompression successful: /content/cc.ar.300.bin.gz -> cc.ar.300.bin


In [None]:
import fasttext

# Load the decompressed FastText binary model written by decompress_gz_to_bin (output_bin_file = 'cc.ar.300.bin').
ft = fasttext.load_model(output_bin_file)#('cc.ar.300.bin')



In [None]:
# Bookkeeping lists filled as a side effect of get_doc1_vec.
OOV_tokens = []          # words the model raised KeyError for
train_tokens = []        # words embedded while data == 'train'
test_tokens = []         # words embedded for any other value of data
test_ft_embeddings = []

def get_doc1_vec(sent, model, data):
  """Return the average word vector of a whitespace-tokenised document.

  Args:
    sent: The document as a string; it is split on whitespace.
    model: Object exposing ``get_word_vector(word)`` (e.g. the loaded FastText model).
    data: ``'train'`` records the embedded words in ``train_tokens``; any other
      value records them in ``test_tokens``.

  Returns:
    The element-wise mean of the word vectors, or ``None`` when no word of the
    document could be embedded (including an empty document).
  """
  # Compare the `data` argument (not the news_train DataFrame) to pick the log list.
  seen_tokens = train_tokens if data == 'train' else test_tokens
  ft_embeddings = []
  for word in sent.split():
    try:
      vector = model.get_word_vector(word)
    except KeyError:
      # Only a failed lookup counts as out-of-vocabulary; other errors are real
      # bugs and must surface instead of silently emptying every document.
      OOV_tokens.append(word) #if found any -> go back to pre-processing the data
      continue
    ft_embeddings.append(vector)
    seen_tokens.append(word)
  if len(ft_embeddings) == 0:
    return None
  return sum(ft_embeddings)/len(ft_embeddings)

In [None]:
# One averaged FastText vector (or None) per document; the last argument tells
# get_doc1_vec which token log ('train' or 'test') to fill.
X_train_ft_embeddings = X_train['processed_text'].apply(get_doc1_vec, args=(ft, 'train'))
X_test_ft_embeddings = X_test['processed_text'].apply(get_doc1_vec, args=(ft, 'test'))

In [None]:
import numpy as np


def _fill_missing_embeddings(doc_vectors, dim):
    """Return the vectors as a list, replacing each ``None`` with a zero vector of length ``dim``."""
    return [vec if vec is not None else np.zeros(dim, dtype=np.float32) for vec in doc_vectors]


# Documents for which get_doc1_vec returned None get a single zero vector of the
# model's dimensionality, so every entry has the same shape. (np.zeros_like on the
# whole Series would instead create an array with one element per document.)
ft_embedding_dim = ft.get_dimension()

X_train_ft_embeddings_list = _fill_missing_embeddings(X_train_ft_embeddings, ft_embedding_dim)
X_test_ft_embeddings_list = _fill_missing_embeddings(X_test_ft_embeddings, ft_embedding_dim)


### **Bert**

In [None]:
!pip install transformers



In [None]:
import tensorflow as tf
from nltk.tokenize import word_tokenize
import torch


In [None]:
from transformers import AutoTokenizer, AutoModel
# Specify the pre-trained model name (a Hugging Face Hub identifier)
model_name = "aubmindlab/bert-base-arabert"
# Load the tokenizer and the encoder that belong to that checkpoint;
# both are downloaded from the Hub on first use.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
#X_test['processed_text']

In [None]:
#preprocessed_documents

In [None]:
# Tokenize every training document into vocabulary ids.
# NOTE(review): no [CLS]/[SEP] special tokens are added here - confirm that is intended.
# Sequences are capped at the model's maximum number of positions so an unusually
# long document cannot index past the position-embedding table.
max_positions = model.config.max_position_embeddings
token_ids_list = [
    tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentence))[:max_positions]
    for sentence in X_train['processed_text']
]

# Find the maximum length of tokenized documents
max_length = max(len(token_ids) for token_ids in token_ids_list)

# Pad the sequences on the right up to max_length
padded_token_ids_list = [token_ids + [tokenizer.pad_token_id] * (max_length - len(token_ids)) for token_ids in token_ids_list]
token_ids_tensor = torch.tensor(padded_token_ids_list)

# 1 for real tokens, 0 for padding. Without this mask the model attends to the
# padding and the sentence average below would include padding positions, making
# each embedding depend on how long the longest document in the split is.
token_counts = torch.tensor([len(token_ids) for token_ids in token_ids_list])
attention_mask_train = (torch.arange(max_length).unsqueeze(0) < token_counts.unsqueeze(1)).long()

# Pass token IDs through the model in batches so the whole split does not have
# to go through a single forward pass.
BERT_BATCH_SIZE = 64
hidden_state_batches = []
with torch.no_grad():
    for start in range(0, token_ids_tensor.shape[0], BERT_BATCH_SIZE):
        batch = slice(start, start + BERT_BATCH_SIZE)
        outputs = model(input_ids=token_ids_tensor[batch],
                        attention_mask=attention_mask_train[batch])
        hidden_state_batches.append(outputs.last_hidden_state)

# Per-token embeddings: (num_documents, max_length, hidden_size)
embeddings_bert_X_train = torch.cat(hidden_state_batches, dim=0)

# Sentence embedding = mean over the real (non-padding) tokens of each document.
# clamp(min=1) avoids dividing by zero for a document with no tokens.
token_weights = attention_mask_train.unsqueeze(-1).to(embeddings_bert_X_train.dtype)
mean_embeddings_bert_X_train = (embeddings_bert_X_train * token_weights).sum(dim=1) / token_weights.sum(dim=1).clamp(min=1)

# Print the shape of the embeddings
print(mean_embeddings_bert_X_train.shape)  # torch.Size([num_documents, hidden_size])
print(mean_embeddings_bert_X_train)
#i need to do the mean for each sentence (tokens) from lab 6

torch.Size([4250, 768])
tensor([[-0.1490,  1.1868,  0.2667,  ...,  0.2583,  0.2940,  0.0725],
        [-0.2271,  0.6840, -0.1094,  ..., -0.1598,  0.3695,  0.3029],
        [-0.3242,  1.3685,  0.4638,  ...,  0.5177,  0.2432,  0.7376],
        ...,
        [-0.1692,  0.4645,  0.1335,  ..., -0.3017,  0.5016,  0.3602],
        [-0.1568,  0.5496,  0.0083,  ..., -0.2557,  0.3265,  0.4287],
        [-0.3076,  1.1313,  0.4750,  ...,  0.4929,  0.2663,  0.1684]])


In [None]:
# Tokenize every validation document into vocabulary ids.
# NOTE(review): no [CLS]/[SEP] special tokens are added here - confirm that is intended.
# Sequences are capped at the model's maximum number of positions so an unusually
# long document cannot index past the position-embedding table.
max_positions = model.config.max_position_embeddings
token_ids_list = [
    tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentence))[:max_positions]
    for sentence in X_test['processed_text']
]

# Find the maximum length of tokenized documents
max_length = max(len(token_ids) for token_ids in token_ids_list)

# Pad the sequences on the right up to max_length
padded_token_ids_list = [token_ids + [tokenizer.pad_token_id] * (max_length - len(token_ids)) for token_ids in token_ids_list]
token_ids_tensor = torch.tensor(padded_token_ids_list)

# 1 for real tokens, 0 for padding, so the model does not attend to padding positions.
token_counts = torch.tensor([len(token_ids) for token_ids in token_ids_list])
attention_mask_test = (torch.arange(max_length).unsqueeze(0) < token_counts.unsqueeze(1)).long()

# Pass token IDs through the model in batches so the whole split does not have
# to go through a single forward pass.
BERT_BATCH_SIZE = 64
hidden_state_batches = []
with torch.no_grad():
    for start in range(0, token_ids_tensor.shape[0], BERT_BATCH_SIZE):
        batch = slice(start, start + BERT_BATCH_SIZE)
        outputs = model(input_ids=token_ids_tensor[batch],
                        attention_mask=attention_mask_test[batch])
        hidden_state_batches.append(outputs.last_hidden_state)

# Per-token embeddings: (num_documents, max_length, hidden_size)
embeddings_bert_X_test = torch.cat(hidden_state_batches, dim=0)


In [None]:
# Sentence embedding = mean over the real tokens only. token_ids_tensor still holds
# the padded validation ids from the previous cell, so positions equal to the pad id
# are excluded; averaging over them would make each embedding depend on how much
# padding the longest document forced onto it.
real_token_weights = (token_ids_tensor != tokenizer.pad_token_id).unsqueeze(-1).to(embeddings_bert_X_test.dtype)
# clamp(min=1) avoids dividing by zero for a document with no tokens.
mean_embeddings_bert_X_test = (embeddings_bert_X_test * real_token_weights).sum(dim=1) / real_token_weights.sum(dim=1).clamp(min=1)

# Print the shape of the embeddings
print(mean_embeddings_bert_X_test.shape)  # torch.Size([num_documents, hidden_size])
print(mean_embeddings_bert_X_test)

torch.Size([750, 768])
tensor([[ 0.0963,  0.4743,  0.0387,  ..., -0.2549,  0.4874,  0.1438],
        [ 0.0346,  0.7638,  0.5607,  ..., -0.4772,  0.5471,  0.3590],
        [-0.3641,  1.1202,  0.5697,  ...,  0.3550,  0.4005,  0.2886],
        ...,
        [-0.4267,  0.4340,  0.3929,  ...,  0.0027,  0.0734,  0.0695],
        [-0.1716,  0.3511, -0.1088,  ..., -0.3545,  0.2532,  0.2152],
        [-0.6457,  1.0996,  0.3639,  ...,  0.4253,  0.2711,  0.5477]])


# **Models**

## **LSTM**

### **LSTM TF-IDF**

### **LSTM Bag-Of-Words**

### **LSTM FastText**

### **LSTM Bert**