## Import necessary libraries

In [None]:
import os
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

## Getting The Data

In [None]:
# Function to load data from folders
def load_data_from_folders(main_folder_path):
    """Load a folder-per-category text corpus into a DataFrame.

    Every sub-directory of ``main_folder_path`` is treated as one category
    (the directory name is the label) and every regular file inside it as one
    document, decoded as UTF-8.

    Args:
        main_folder_path: Directory whose sub-directories each hold the
            documents of one category.

    Returns:
        pandas.DataFrame with one row per document and the columns ``text``
        (file content) and ``category`` (name of the containing folder).
        Both columns exist even when no document is found.

    Raises:
        OSError: If ``main_folder_path`` cannot be listed or a document
            cannot be opened.
        UnicodeDecodeError: If a document is not valid UTF-8.
    """
    data = []
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order (and therefore any seeded split done later) reproducible.
    for category_folder in sorted(os.listdir(main_folder_path)):
        category_path = os.path.join(main_folder_path, category_folder)
        # Stray files next to the category folders are not categories and
        # would make the inner os.listdir call fail.
        if not os.path.isdir(category_path):
            continue
        for file_name in sorted(os.listdir(category_path)):
            file_path = os.path.join(category_path, file_name)
            # Nested directories cannot be opened as documents; skip them.
            if not os.path.isfile(file_path):
                continue
            with open(file_path, 'r', encoding='utf-8') as file:
                data.append({'text': file.read(), 'category': category_folder})
    # Explicit columns keep the schema stable for an empty corpus.
    return pd.DataFrame(data, columns=['text', 'category'])

# Root directory of the dataset: one sub-folder per category
main_folder_path = '/kaggle/input/sanad-dataset'

# Read every document under the root into a single DataFrame
WholeData = load_data_from_folders(main_folder_path=main_folder_path)

In [None]:
# Preview the first ten loaded documents
WholeData.head(n=10)

Unnamed: 0,text,category
0,"دبي - ""الخليج"":حصدت شعاع كابيتال جائزة ""أفضل ش...",Finance
1,أبوظبي - علي أسعد: تراجعت أسواق المال في الدول...,Finance
2,استأنفت أسواق الأسهم المحلية أمس تحركها باتجاه...,Finance
3,دبي «الخليج»: أعلنت شركة تكافل الإمارات عن بدء...,Finance
4,تحتفل شركة طاقة الخليج البحرية، التي تتخذ من د...,Finance
5,تأجيل عمومية الدارأبوظبي - الخليج: تأجل اجتماع...,Finance
6,غلبت السلبية والإغلاقات الحمراء على مؤشرات أسو...,Finance
7,أبوظبي «الخليج»: ارتفع مؤشر سوق الإمارات المال...,Finance
8,أبوظبي:«الخليج» واصل مؤشر سوق أبوظبي للأوراق ا...,Finance
9,أبوظبي الخليج: أعاد بنك الشارقة شراء 5،5 مليون...,Finance


## The Preprocessing Stage

### preprocessText Function

In [None]:
# Arabic stop words from the NLTK stopwords corpus, held in a set for
# membership tests during token filtering.
StopWords = set(stopwords.words('arabic'))

# Alternation of Arabic diacritic marks plus the tatweel (kashida).
# NOTE: the layout whitespace and the "# ..." annotations below are part of
# the string itself, so the pattern only has its intended meaning when it is
# compiled/applied with re.VERBOSE; without that flag every alternative is a
# long literal that includes the spaces and the annotation text.
ArabicDiacritics = r"""
                             ّ    | # Tashdid
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                            ـ    | # Tatwil/Kashida
                     """

# Emoji code-point ranges concatenated WITHOUT surrounding brackets; the
# value is meant to be embedded in a regex character class, e.g.
# f"[{RegrexPattern}]".
RegrexPattern = (
    "\U0001F600-\U0001F64F"+  # emoticons {😀 , 😆}
    "\U0001F300-\U0001F5FF"+  # symbols & pictographs {🌍 , 🌞}
    "\U0001F680-\U0001F6FF"+  # transport & map symbols {🚌 , 🚕 }
    "\U0001F1E0-\U0001F1FF"   # flags (iOS) { 🇺🇸 , 🇨🇦 }
)

# List copy of StopWords (element order is the set's iteration order).
stop_words = list(StopWords)

In [None]:
def preprocessText(text):
    """Normalise one raw document into a space-separated string of tokens.

    Steps, in order: delete punctuation and symbols, delete Arabic diacritics
    and tatweel, delete emoji, tokenize with ``word_tokenize``, then drop
    stop words and single-character tokens.

    Args:
        text: Raw document text.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # Remove special characters {& $ @} and punctuation {. , ? !}
    # NOTE: matches are deleted rather than replaced by a space, so two words
    # separated only by punctuation end up fused into one token.
    text = re.sub(r'[^\w\s]', '', text)

    # Remove Arabic diacritics and tatweel.
    # ArabicDiacritics is written with layout whitespace and inline "# ..."
    # annotations, so it must be applied in verbose mode; without the flag
    # the alternatives are literal multi-word strings and nothing is removed.
    text = re.sub(ArabicDiacritics, '', text, flags=re.VERBOSE)

    # Remove emoji characters
    text = re.sub(f"[{RegrexPattern}]", '', text)

    # Tokenize the sentence into tokens
    Tokens = word_tokenize(text)

    # Keep only informative tokens: no stop words, no single characters
    Tokens = [word for word in Tokens if word not in StopWords and len(word) > 1]

    return ' '.join(Tokens)

### Now Apply the preprocessText Function to the Whole Data

In [None]:
# Clean every raw document and store the result next to the original text
WholeData['PreprocessedText'] = WholeData['text'].map(preprocessText)

In [None]:
# Keep only the label and the cleaned text. The explicit copy makes CleanData
# an independent frame, so adding columns to it later does not trigger
# pandas' SettingWithCopyWarning.
CleanData = WholeData[['category', 'PreprocessedText']].copy()

In [None]:
# Inspect five random rows of the cleaned data
CleanData.sample(n=5)

### Encode the Category

In [None]:
label_encoder = LabelEncoder()

# assign() builds a new frame instead of writing a column into CleanData in
# place, which stays warning-free even when CleanData is a selection taken
# from another DataFrame.
CleanData = CleanData.assign(
    Category=label_encoder.fit_transform(CleanData['category'])
)

# classes_[i] is the original label that was encoded as integer i, so the
# mapping comes straight from the fitted encoder (one entry per class,
# ordered by code) rather than from a pass over every row.
LabelMapping = dict(enumerate(label_encoder.classes_.tolist()))
print(f"Labels With Category :\n{LabelMapping}")

In [None]:
# Modelling table: integer label plus cleaned text
FinalData = CleanData.loc[:, ['Category', 'PreprocessedText']]
FinalData.sample(n=10)

### Splitting The Data

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable for the same input order.
X_train, X_test, y_train, y_test = train_test_split(
    FinalData['PreprocessedText'],
    FinalData['Category'],
    test_size=0.2,
    random_state=42,
)

# Report the size of each partition
print("X_train set shape:", X_train.shape)
print("X_test set shape:", X_test.shape)
print("y_train set shape:", y_train.shape)
print("y_test set shape:", y_test.shape)

In [None]:
# Keywords by category. Built once at definition time instead of on every
# call; the insertion order also decides how ties are broken.
_CATEGORY_KEYWORDS = {
    'Finance': ['اقتصاد', 'بنك', 'المالية', 'استثمار', 'سوق'],
    'Sports': ['كرة القدم', 'الألعاب الرياضية', 'دوري', 'مباراة', 'رياضة'],
    'Tech': ['تكنولوجيا', 'برمجيات', 'الذكاء الاصطناعي', 'حوسبة', 'شبكات'],
    'Medical': ['طب', 'صحة', 'دواء', 'علاج', 'مستشفى'],
    'Politics': ['سياسة', 'انتخابات', 'حكومة', 'مجلس', 'قانون'],
    'Culture': ['فن', 'مسرح', 'ثقافة', 'أدب', 'تاريخ'],
    'Religion': ['دين', 'إسلام', 'مسجد', 'شريعة', 'صلاة']
}


def Dictionary_based_Classifier(text):
    """Classify a document by counting category keywords in its tokens.

    Each occurrence of a keyword adds one point to its category. Keywords
    made of several words are matched as consecutive tokens, so phrases such
    as 'كرة القدم' can score (comparing them with single tokens never
    matches).

    Args:
        text: Document text to classify.

    Returns:
        The name of the category with the highest score. Ties, including the
        case where no keyword occurs at all, resolve to the category listed
        first in the keyword table ('Finance').
    """
    # Tokenize the text
    tokens = word_tokenize(text)

    # Score each category based on occurrence of keywords
    scores = {}
    for category, keywords in _CATEGORY_KEYWORDS.items():
        hits = 0
        for keyword in keywords:
            phrase = keyword.split()
            width = len(phrase)
            # Slide a window of the phrase's length over the tokens; for a
            # one-word keyword this is a plain per-token equality count.
            hits += sum(
                tokens[start:start + width] == phrase
                for start in range(len(tokens) - width + 1)
            )
        scores[category] = hits

    # Determine the best category (first one wins on equal scores)
    return max(scores, key=scores.get)

In [None]:
# Run the dictionary-based classifier over every held-out document
Dictionary_based_predictions = list(map(Dictionary_based_Classifier, X_test))

# Translate the predicted category names into the integer codes of y_test
Dictionary_based_predictions_encoded = label_encoder.transform(
    Dictionary_based_predictions
)

# Build and show the classification report for the dictionary-based classifier
Dictionary_based_report = classification_report(
    y_true=y_test,
    y_pred=Dictionary_based_predictions_encoded,
)
print("\nDictionary-Based Classification Report:")
print(Dictionary_based_report)

## The Modelling Stage

### Logistic Regression

In [None]:
# TF-IDF features feeding a logistic-regression classifier, both constructed
# with their default settings
pipeLine = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', LogisticRegression()),
    ]
)

#### Train The Model

In [None]:
# Fit the vectorizer and the classifier on the training split
pipeLine.fit(X=X_train, y=y_train)

#### Evaluate The Model

In [None]:
# Predict the held-out documents and score the pipeline on the same split
Predictions = pipeLine.predict(X_test)
Accuracy = pipeLine.score(X_test, y_test)
print(f"Accuracy:{round(Accuracy*100, 2)}%")

# Classification report for the pipeline's predictions
Report = classification_report(y_true=y_test, y_pred=Predictions)
print("\nClassification Report:")
print(Report)

## Team Members

- **Rayan Beshawri**
- **Abdulaziz Dawood**
- **Ahmed Salem**
- **Hassan Kalantan**