# Arabic Topic Modeling

### Install Necessary Libraries

In [None]:
!pip install gensim nltk pyarabic matplotlib pyLDAvis

### Import Libraries

In [None]:
import nltk
from nltk.corpus import stopwords
import pyarabic.araby as araby
from gensim import corpora, models

# Fetch the NLTK 'stopwords' corpus; preprocess() below reads its Arabic
# word list through stopwords.words('arabic').
nltk.download('stopwords')

### Sample Data 

In [None]:
# Sample corpus: five short Arabic sentences about artificial intelligence in
# business. Each string is treated as one document by the cells below.
documents = [
    "الذكاء الاصطناعي يحلل البيانات الضخمة لاتخاذ قرارات استراتيجية في الأعمال",
    "تستخدم الشركات المساعدات الذكية لتحسين خدمة العملاء وزيادة الكفاءة",
    "يمكن للذكاء الاصطناعي التنبؤ بالاتجاهات السوقية وتحليل سلوك المستهلكين بدقة",
    "تعتمد الشركات على الذكاء الاصطناعي لتحسين إدارة المخزون وتقليل التكاليف التشغيلية",
    "يساهم الذكاء الاصطناعي في تحويل الأعمال الرقمية وتقديم منتجات جديدة مبتكرة"
]

### Text Preprocessing Function

In [None]:
def preprocess(text):
    """Tokenize an Arabic string and return its cleaned content tokens.

    The text is tokenized with pyarabic, diacritics (tashkeel) are stripped
    from every token, and then NLTK Arabic stopwords and tokens that pyarabic
    does not recognize as Arabic words are dropped.

    Args:
        text: Raw Arabic text of a single document.

    Returns:
        A list of token strings in their original order.
    """
    # Build the stopword set once per call. Evaluating
    # stopwords.words('arabic') inside the filter re-requested the whole list
    # and scanned it linearly for every single token.
    arabic_stopwords = set(stopwords.words('arabic'))

    # Strip diacritics before filtering so the stopword check sees the bare form.
    stripped = (araby.strip_tashkeel(word) for word in araby.tokenize(text))
    return [
        word
        for word in stripped
        if word not in arabic_stopwords and araby.is_arabicword(word)
    ]

### Preprocess Text

In [None]:
# Run every raw document through preprocess(): one token list per document
texts = list(map(preprocess, documents))

### Create Dictionary and Corpus

In [None]:
# Build the vocabulary (word -> integer id) from the token lists
dictionary = corpora.Dictionary(texts)
# Encode each token list in bag-of-words format using that vocabulary
corpus = list(map(dictionary.doc2bow, texts))

### Apply LDA (Latent Dirichlet Allocation) for Topic Modeling

In [None]:
# Fit an LDA model with 3 topics on the bag-of-words corpus
# (15 training passes, fixed random_state)
ldamodel = models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=3,
    passes=15,
    random_state=1,
)

### Print the Top Words Associated with Each Topic

In [None]:
# Print the top words of every topic.
# show_topics(formatted=False) yields (topic_id, [(word, weight), ...]) pairs,
# so the words are read directly instead of being parsed back out of the
# '0.123*"word" + ...' display string produced by print_topics().
NUM_TOP_WORDS = 4
topics = ldamodel.show_topics(
    num_topics=ldamodel.num_topics,  # list every topic the model has
    num_words=NUM_TOP_WORDS,
    formatted=False,
)
for topic_id, word_weights in topics:
    top_words = [word for word, _weight in word_weights]
    print(f"Topic {topic_id}: {', '.join(top_words)}")