# Arabic Text Preprocessing

### Install Necessary Libraries

In [None]:
!pip install python-docx
!pip install pyarabic
!pip install qalsadi
!pip install camel-tools
!pip install camel-tools --upgrade

### Import Required Libraries

In [None]:
import docx
import pandas as pd
import numpy as np
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem.isri import ISRIStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
import qalsadi.lemmatizer as ql
from camel_tools.morphology.database import MorphologyDB
from camel_tools.morphology.analyzer import Analyzer
from camel_tools.sentiment import SentimentAnalyzer
import os
from google.colab import drive

### Mount Google Drive and Set Up Environment

In [None]:
# Mount Google Drive at /gdrive so files stored there are visible to the notebook.
drive.mount('/gdrive')
# Set CAMELTOOLS_DATA to a folder on the mounted drive (presumably where the
# CAMeL Tools data/models live — verify the folder exists).
# NOTE(review): camel_tools is already imported by the time this runs; if the
# library reads CAMELTOOLS_DATA at import time this assignment comes too late —
# confirm, and if so set the variable before the camel_tools imports.
os.environ['CAMELTOOLS_DATA'] = '/gdrive/MyDrive/camel_tools'

### Functions

Read .docx File Function

In [None]:
def read_docx_file(file_path):
    """Return the text of a .docx file, one paragraph per line.

    The document at ``file_path`` is opened with ``docx.Document`` and the
    ``text`` of each entry in its ``paragraphs`` is joined with newlines.
    """
    document = docx.Document(file_path)
    return '\n'.join(para.text for para in document.paragraphs)

# Read the transcript and hold it in a one-row DataFrame with a single
# 'arabic_text' column; the later cleaning steps rewrite that column.
docx_file_path = '/path/to/transcipt.docx'
arabic_text = read_docx_file(docx_file_path)
df = pd.DataFrame({'arabic_text': [arabic_text]})
df

Arabic and English Punctuation Removal Function

In [None]:
# Characters to strip: Arabic-specific marks (including the tatweel) plus
# every ASCII punctuation character.
arab_Punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
english_Punctuations = string.punctuation
listOfPunctuations = arab_Punctuations + english_Punctuations

# Deletion table built once from listOfPunctuations instead of on every call.
# Re-run this cell if listOfPunctuations is edited so the table is rebuilt.
_PUNCTUATION_TABLE = str.maketrans('', '', listOfPunctuations)


def Remove_Punctuations_ArEng(text):
    """Delete every Arabic and English punctuation character from ``text``.

    Characters listed in ``listOfPunctuations`` are removed outright (not
    replaced by a space); all other characters, including whitespace, are
    returned unchanged.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` without punctuation characters.
    """
    return text.translate(_PUNCTUATION_TABLE)

# Strip punctuation from every row of the text column, then display the frame.
df["arabic_text"] = df["arabic_text"].map(Remove_Punctuations_ArEng)
df

Remove English Text and Numbers Function

In [None]:
def Remove_EnglishText(text):
    """Drop Latin letters and digits from ``text`` and tidy the whitespace.

    Two deletions run in sequence: runs of ASCII letters/digits, then runs of
    any remaining digits matched by ``\\d`` (which, for ``str`` patterns, also
    covers non-ASCII digits). Finally every run of whitespace is collapsed to
    a single space and the ends are trimmed.
    """
    latin_or_ascii_digits = r'[a-zA-Z0-9]+'
    any_digits = r'\d+'
    for pattern in (latin_or_ascii_digits, any_digits):
        text = re.sub(pattern, '', text)
    return ' '.join(text.split())

# Remove Latin letters and digits from every row of the text column, then
# display the frame.
df["arabic_text"] = df["arabic_text"].map(Remove_EnglishText)
df

Remove Arabic Stopwords Function

In [None]:
# Download the NLTK 'stopwords' corpus that stopwords.words('arabic') reads below.
nltk.download('stopwords')

def Remove_ArabicStopWords(text, stop_words=None):
    """Remove Arabic stop words from a whitespace-separated string.

    The previous version iterated over ``text`` directly, which walks the
    string character by character and returns a list of characters; the
    string-based steps that follow (regex normalisation, tokenisation) cannot
    work on that. This version filters whole words and returns a string.

    Args:
        text: The string to filter.
        stop_words: Optional iterable of words to drop. Defaults to NLTK's
            Arabic stop-word list (``stopwords.words('arabic')``).

    Returns:
        ``text`` with stop words removed and the remaining words joined by
        single spaces.
    """
    if stop_words is None:
        stop_words = stopwords.words('arabic')
    # A set gives constant-time membership checks for each word.
    blocked = set(stop_words)
    return ' '.join(word for word in text.split() if word not in blocked)

# Run stop-word removal over every row of the text column, then display the
# frame.
df["arabic_text"] = df["arabic_text"].map(Remove_ArabicStopWords)
df

Arabic Text Normalization Function

In [None]:
def Normalize_Arabic(text):
    """Return ``text`` with the alef variants إ, أ and آ replaced by bare alef (ا)."""
    alef_forms = re.compile(r'[إأآا]')
    return alef_forms.sub('ا', text)

# Normalise alef variants in every row of the text column, then display the
# frame.
df["arabic_text"] = df["arabic_text"].map(Normalize_Arabic)
df

Arabic Text Tokenization Function

In [None]:
# word_tokenize needs NLTK's Punkt tokenizer data. Recent NLTK releases look
# for the 'punkt_tab' resource instead of the older 'punkt' package, so both
# are requested; an identifier unknown to the installed NLTK is reported by
# the downloader without raising.
nltk.download('punkt')
nltk.download('punkt_tab')


def tokenize_arabic(text):
    """Split ``text`` into word tokens using NLTK's ``word_tokenize``.

    Args:
        text: The string to tokenise.

    Returns:
        The list of tokens produced by ``word_tokenize(text)``.
    """
    return word_tokenize(text)

# Store the token list for each row in a new 'tokenized_text' column, then
# display the frame.
df["tokenized_text"] = df["arabic_text"].map(tokenize_arabic)
df

### Collect All Words and Build a Word-Frequency Table

In [None]:
from collections import Counter

# Flatten the per-row token lists, keeping only tokens for which
# str.isalnum() is true.
all_words = [
    token
    for tokens in df["tokenized_text"]
    for token in tokens
    if token.isalnum()
]

print(all_words)

# Count each word and tabulate the counts as (Word, Frequency) rows.
word_freq = Counter(all_words)
word_freq_df = pd.DataFrame(list(word_freq.items()), columns=['Word', 'Frequency'])
word_freq_df

### Sentiment Analysis Using CAMeL Tools

In [None]:
# Load the default pretrained CAMeL Tools sentiment model.
sa = SentimentAnalyzer.pretrained()

# Join all rows into one string and split it into sentences.
# NOTE(review): the cleaning cells above strip punctuation and collapse
# newlines, so sent_tokenize has little to split on and this is likely to
# yield a single "sentence" — consider sentence-splitting the raw text before
# cleaning.
sentences = sent_tokenize(" ".join(str(text) for text in df["arabic_text"]))

# SentimentAnalyzer.predict() takes a *list* of sentences and returns one
# label per sentence. Calling it with a single string (as before) makes it
# treat the string as a sequence of one-character sentences, so the labels
# never matched 'positive'/'negative' and every score came out as 0.
sentiments = list(sa.predict(sentences)) if sentences else []

# Map labels to numeric scores: positive -> 1, negative -> -1, anything else -> 0.
_score_by_label = {'positive': 1, 'negative': -1}
sentiment_scores = [_score_by_label.get(label, 0) for label in sentiments]

sentiment_df = pd.DataFrame(
    list(zip(sentences, sentiments, sentiment_scores)),
    columns=['Sentence', 'Sentiment', 'Score'],
)

# NOTE(review): `pretrained_list` does not appear in the documented
# SentimentAnalyzer API; the call is guarded so the cell does not fail with
# AttributeError when the method is absent.
_list_pretrained = getattr(SentimentAnalyzer, 'pretrained_list', None)
if callable(_list_pretrained):
    print(_list_pretrained())

### Topic Modeling Using LDA

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# CountVectorizer accepts only the string 'english' as a built-in stop list;
# stop_words='arabic' raises ValueError. Pass the NLTK Arabic stop-word list
# explicitly instead.
arabic_stop_words = stopwords.words('arabic')

# Bag-of-words counts per sentence. max_df=0.95 drops terms found in more than
# 95% of sentences; min_df=2 drops terms found in fewer than two sentences.
# NOTE(review): with min_df=2 this needs at least two sentences that share
# vocabulary; fit_transform raises ValueError otherwise.
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words=arabic_stop_words)
doc_term_matrix = vectorizer.fit_transform(sentences)

# Fit a 5-topic LDA model (fixed seed for reproducible results).
LDA = LatentDirichletAllocation(n_components=5, random_state=42)
LDA.fit(doc_term_matrix)

# Assign each sentence the topic with the highest weight.
topics = LDA.transform(doc_term_matrix).argmax(axis=1)
topic_df = pd.DataFrame(list(zip(sentences, topics)), columns=['Sentence', 'Topic'])
topic_df

### Named Entity Recognition Using CAMeL Tools

In [None]:
from camel_tools.ner import NERecognizer


def _group_entities(tokens, labels):
    """Merge BIO-tagged tokens into ``(entity_text, entity_type)`` pairs.

    A ``B-<TYPE>`` label starts an entity, following ``I-<TYPE>`` labels of
    the same type extend it, and any other label (e.g. ``O``) closes it. An
    ``I-`` label with no open entity of that type starts a new one.
    """
    grouped = []
    span_tokens, span_type = [], None
    for token, label in zip(tokens, labels):
        prefix, _, entity_type = label.partition('-')
        if prefix == 'I' and span_tokens and entity_type == span_type:
            span_tokens.append(token)
            continue
        # Any other label ends the entity currently being collected.
        if span_tokens:
            grouped.append((' '.join(span_tokens), span_type))
            span_tokens, span_type = [], None
        if prefix in ('B', 'I') and entity_type:
            span_tokens, span_type = [token], entity_type
    if span_tokens:
        grouped.append((' '.join(span_tokens), span_type))
    return grouped


ner = NERecognizer.pretrained()

# NERecognizer.predict() expects a list of sentences, each given as a list of
# tokens, and returns one list of BIO labels per sentence. It does not accept
# a single joined string and does not return dicts with 'text'/'type' keys.
tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]
tokenized_sentences = [tokens for tokens in tokenized_sentences if tokens]
entities = ner.predict(tokenized_sentences) if tokenized_sentences else []

# Pair each token with its label and collapse the labels into entity spans.
entity_list = [
    entity
    for tokens, labels in zip(tokenized_sentences, entities)
    for entity in _group_entities(tokens, labels)
]
entity_df = pd.DataFrame(entity_list, columns=['Entity', 'Type'])
entity_df

### Export Results to CSV

In [None]:
# Write each result table to its own CSV file in the working directory,
# leaving out the DataFrame index column.
csv_options = {'index': False}
word_freq_df.to_csv('word_frequency.csv', **csv_options)
sentiment_df.to_csv('sentiments.csv', **csv_options)
topic_df.to_csv('topics.csv', **csv_options)
entity_df.to_csv('entities.csv', **csv_options)