### Upload and Extract a Zip File

In [2]:

import zipfile
import os

# Extract a ZIP file
with zipfile.ZipFile('Khaleej-2004.zip', 'r') as zip_ref:
    zip_ref.extractall('destination_folder')



### Extract and List the Contents of a Zip File

In [3]:
import zipfile
import os

# Specify the path to the ZIP file and the extraction directory
zip_file_path = r"C:\Users\shath\Downloads\Khaleej-2004.zip"  # Update this to the full path of your ZIP file
extract_dir = r"C:\Users\shath\Downloads\Khaleej-2004"  # Update this to the desired extraction folder

try:
    # Check if the ZIP file exists
    if not os.path.exists(zip_file_path):
        print(f"File '{zip_file_path}' does not exist.")
    else:
        # Extract the contents of the ZIP file
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            # Create the extraction directory if it doesn't exist
            os.makedirs(extract_dir, exist_ok=True)
            zip_ref.extractall(extract_dir)
            print(f"Files have been extracted to: '{extract_dir}'")

        # List the extracted files
        extracted_files = os.listdir(extract_dir)
        print(f"Number of extracted files: {len(extracted_files)}")
        if extracted_files:
            print("First 5 files:", extracted_files[:5])
        else:
            print("No files were extracted. Please check the contents of the ZIP file.")

        # Check for subdirectories
        for folder in extracted_files:
            folder_path = os.path.join(extract_dir, folder)
            if os.path.isdir(folder_path):
                print(f"Subdirectory: {folder}")

except zipfile.BadZipFile:
    print(f"The file '{zip_file_path}' is corrupted or not a valid ZIP file.")
except Exception as e:
    print(f"An error occurred: {e}")



Files have been extracted to: 'C:\Users\shath\Downloads\Khaleej-2004'
Number of extracted files: 2
First 5 files: ['Khaleej-2004', 'Khaleej-2004.zip']
Subdirectory: Khaleej-2004


In [4]:
import os

# Path to the directory containing the categories
directory_path = r"C:\Users\shath\Downloads\Khaleej-2004\Khaleej-2004"  # Update this path if needed

try:
    # List all folders in the directory
    categories = [folder for folder in os.listdir(directory_path) if os.path.isdir(os.path.join(directory_path, folder))]
    
    # Print the categories
    print(f"Number of Categories: {len(categories)}")
    for category in categories:
        print(f"Category: {category}")

except FileNotFoundError:
    print(f"The directory '{directory_path}' does not exist.")
except Exception as e:
    print(f"An error occurred: {e}")

Number of Categories: 4
Category: Economy
Category: International news
Category: Local News
Category: Sports


### Load and Count Files in Categories

In [6]:
import os
import glob  # تأكد من استيراد glob

# التحقق من وجود المجلد
if not os.path.exists(directory_path):
    print(f"المجلد '{directory_path}' غير موجود.")
else:
    # تحميل جميع الملفات النصية
    categories = {}
    total_files = 0

    for folder in os.listdir(directory_path):
        folder_path = os.path.join(directory_path, folder)
        if os.path.isdir(folder_path):
            # البحث عن ملفات HTML فقط
            html_files = glob.glob(os.path.join(folder_path, "*.html"))
            if html_files:  # التأكد من وجود ملفات
                categories[folder] = html_files
                total_files += len(html_files)

    # عرض عدد الملفات لكل فئة
    print(f"إجمالي عدد الملفات: {total_files}")
    print("عدد الملفات حسب الفئة:")
    for category, files in categories.items():
        print(f"الفئة: {category}, عدد الملفات: {len(files)}")
        print(f"أول 5 ملفات في الفئة '{category}': {files[:5]}")  # عرض أول 5 ملفات

    # معالجة حالة عدم وجود ملفات HTML
    if total_files == 0:
        print("لم يتم العثور على أي ملفات HTML في المجلدات.")


إجمالي عدد الملفات: 5690
عدد الملفات حسب الفئة:
الفئة: Economy, عدد الملفات: 909
أول 5 ملفات في الفئة 'Economy': ['C:\\Users\\shath\\Downloads\\Khaleej-2004\\Khaleej-2004\\Economy\\arc_Articlesww0221.html', 'C:\\Users\\shath\\Downloads\\Khaleej-2004\\Khaleej-2004\\Economy\\arc_Articlesww0313.html', 'C:\\Users\\shath\\Downloads\\Khaleej-2004\\Khaleej-2004\\Economy\\arc_Articlesww03de.html', 'C:\\Users\\shath\\Downloads\\Khaleej-2004\\Khaleej-2004\\Economy\\arc_Articlesww04a5.html', 'C:\\Users\\shath\\Downloads\\Khaleej-2004\\Khaleej-2004\\Economy\\arc_Articlesww0521.html']
الفئة: International news, عدد الملفات: 953
أول 5 ملفات في الفئة 'International news': ['C:\\Users\\shath\\Downloads\\Khaleej-2004\\Khaleej-2004\\International news\\arc_Articlesww02c5.html', 'C:\\Users\\shath\\Downloads\\Khaleej-2004\\Khaleej-2004\\International news\\arc_Articlesww05b8.html', 'C:\\Users\\shath\\Downloads\\Khaleej-2004\\Khaleej-2004\\International news\\arc_Articlesww0621.html', 'C:\\Users\\shath\\Do

In [7]:
categories = {}
for folder in os.listdir(directory_path):
    folder_path = os.path.join(directory_path, folder)
    # استبعاد المجلد الجذر Khaleej-2004
    if os.path.isdir(folder_path) and folder != 'Khaleej-2004':
        categories[folder] = []
        for file_path in os.listdir(folder_path):
            full_path = os.path.join(folder_path, file_path)
            if os.path.isfile(full_path):
                with open(full_path, 'r', encoding='utf-8') as f:
                    categories[folder].append(f.read())


### Read and Preview a Sample Document from Each Category

In [8]:
# Read a sample document from each category
for category, files in categories.items():
    if files:  # Check if there are files in the category
        print(f"Category: {category}")
        try:
            # Open and read the first document
            with open(files[0], 'r', encoding='utf-8') as f:
                sample_content = f.read()[:500]  # Read the first 500 characters
                print("Document Content (First 500 characters):")
                print(sample_content)
                print(f"Character count: {len(sample_content)}")
        except Exception as e:
            print(f"Error reading the file in category '{category}': {e}")

        print(f"Remaining files in category '{category}': {len(files) - 1}")
        print("\n---\n")


Category: Economy
Error reading the file in category 'Economy': [Errno 2] No such file or directory: '  جمعية الاقتصاديين الدور المأمول   حسنا فعلت جمعية الاقتصاديين البحرينية بتنظيم المؤتمر الاقتصادي الأول والذي يركز على الرؤية المستقبلية للاقتصاد الخليجي برمته وليس على اقتصاد البحرين فحسب ان اختيار موضوع المؤتمر واتجاهه الاستراتيجي يعبر عن آفاق ايجابية مشكورة لدى الجمعية حيث انها منذ نشأتها عام لم تتول مسئولية عمل بهذا الحجم والاتجاه   ان تركيز المؤتمر على مستقبل الاقتصاد الخليجي في نصف العقد أو العقد القادم هو اتجاه حميد حيث ان تطور آليات عمل مجلس التعاون تتطلب التركيز على الآفاق الاستراتيجية لخلق اقتصاد خليجي موحد وعدم الاقتصار على التركيز على تعزيز التكامل الاقتصادي الخليجي أو تكامل الأنشطة الاقتصادية الخليجية ذلك ان نقارن بعين استراتيجيا بين خلق قاعدة للاندماج الاقتصادي الخليجي أو التكامل الاقتصادي الخليجي وما تتجه نحو جهود هذا المؤتمر هو تجاوز مرحلة التكامل والبدء بمناقشة متطلبات الاندماج والوحدة في سوق مشتركة الجهود الرسمية كافة ذات الصلة بالاقتصاد الخليجي قد تركزت على التكامل 

### Text Preprocessing, Feature Engineering, and TF-IDF Conversion



In [11]:
import os
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse as sp


categories = {}

# قراءة الملفات وتصنيفها حسب المجلد
for folder in os.listdir(directory_path):
    folder_path = os.path.join(directory_path, folder)
    if os.path.isdir(folder_path):
        categories[folder] = []
        for file_path in os.listdir(folder_path):
            full_path = os.path.join(folder_path, file_path)
            try:
                if os.path.isfile(full_path):
                    with open(full_path, 'r', encoding='utf-8') as f:
                        categories[folder].append(f.read())
            except Exception as e:
                print(f"Error reading file: {full_path} - {e}")

# تحويل النصوص إلى DataFrame
data = [{"Category": category, "Text": text} for category, texts in categories.items() for text in texts]
df = pd.DataFrame(data)

# تنظيف النصوص
def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)      # Remove numbers
    text = re.sub(r'[إأآا]', 'ا', text)
    text = re.sub(r'ى', 'ي', text)
    text = re.sub(r'ؤ', 'و', text)
    text = re.sub(r'ة', 'ه', text)
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

# Apply text cleaning
df['Cleaned_Text'] = df['Text'].apply(clean_text)

# Tokenize the cleaned text into words
df['Tokens'] = df['Cleaned_Text'].apply(lambda x: x.split())

# Calculate basic handcrafted features
df['Word_Count'] = df['Tokens'].apply(len)  # Count the number of words
df['Char_Count'] = df['Cleaned_Text'].apply(len)  # Count the number of characters
df['Contains_Economy'] = df['Cleaned_Text'].apply(lambda x: 1 if 'اقتصاد' in x else 0)  # Check if "Economy" exists in text

# List of keywords to use for additional features
keywords = [
    'البورصة', 'الاقتصاد', 'الدولار', 'الأسهم', 'السوق', 'التجارة', 'المال', 'الميزانية', 'النقد', 'الاستثمار',
    'الإنتاج', 'الكرة', 'المباراة', 'الفريق', 'الدوري', 'الكأس', 'الأهداف', 'المنتخب', 'اللعبة', 'الرياضة',
    'التنس', 'الرئيس', 'الحكومة', 'البرلمان', 'مجلس الأمن', 'الأمم', 'الدولة', 'السلطة', 'السياسة', 'القانون',
    'الانتخابات', 
]

# Create Presence and Frequency features for all keywords
presence_features = pd.DataFrame({
    f'Presence_{keyword}': df['Cleaned_Text'].str.contains(keyword).astype(int) for keyword in keywords
})
frequency_features = pd.DataFrame({
    f'Frequency_{keyword}': df['Cleaned_Text'].apply(lambda x: x.count(keyword)) for keyword in keywords
})

# Add all new handcrafted features to the DataFrame
df = pd.concat([df, presence_features, frequency_features], axis=1)

# Extract TF-IDF features
vectorizer = TfidfVectorizer(max_features=5000, min_df=2, max_df=0.85)
tfidf_features = vectorizer.fit_transform(df['Cleaned_Text'])

# Combine TF-IDF features with handcrafted features
crafted_features = sp.csr_matrix(df[
    ['Word_Count', 'Char_Count'] +
    [f'Presence_{keyword}' for keyword in keywords] +
    [f'Frequency_{keyword}' for keyword in keywords]
].values)
final_features = sp.hstack([tfidf_features, crafted_features])

# عرض الإطار مع الميزات
print("Dataset with Tokens and Crafted Features:")
display(df.head())

# عرض الكلمات المستخدمة
print(f"Total Keywords Used: {len(keywords)}")
print(f"Keywords: {keywords}")



Dataset with Tokens and Crafted Features:


Unnamed: 0,Category,Text,Cleaned_Text,Tokens,Word_Count,Char_Count,Contains_Economy,Presence_البورصة,Presence_الاقتصاد,Presence_الدولار,...,Frequency_الرئيس,Frequency_الحكومة,Frequency_البرلمان,Frequency_مجلس الأمن,Frequency_الأمم,Frequency_الدولة,Frequency_السلطة,Frequency_السياسة,Frequency_القانون,Frequency_الانتخابات
0,Economy,جمعية الاقتصاديين الدور المأمول حسنا فعلت ...,جمعيه الاقتصاديين الدور المامول حسنا فعلت جمعي...,"[جمعيه, الاقتصاديين, الدور, المامول, حسنا, فعل...",489,3088,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,Economy,سهم المجموعة العربية للتأمين يرتفع قياسيا و...,سهم المجموعه العربيه للتامين يرتفع قياسيا ويسج...,"[سهم, المجموعه, العربيه, للتامين, يرتفع, قياسي...",267,1617,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Economy,افتتاح مركز النقد لجيسيك آند ديفريانت في الب...,افتتاح مركز النقد لجيسيك اند ديفريانت في البحر...,"[افتتاح, مركز, النقد, لجيسيك, اند, ديفريانت, ف...",579,3497,1,0,1,1,...,1,0,0,0,0,0,0,0,0,0
3,Economy,أسعار النفط تتراجع في نوفمبر بعد ارتفاعاتها...,اسعار النفط تتراجع في نوفمبر بعد ارتفاعاتها ال...,"[اسعار, النفط, تتراجع, في, نوفمبر, بعد, ارتفاع...",340,2008,1,0,1,0,...,1,0,0,0,0,0,0,0,0,0
4,Economy,خلال الفترة بين من الشهر الجاري الاجتماع ال...,خلال الفتره بين من الشهر الجاري الاجتماع السنو...,"[خلال, الفتره, بين, من, الشهر, الجاري, الاجتما...",197,1254,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Total Keywords Used: 31
Keywords: ['البورصة', 'الاقتصاد', 'الدولار', 'الأسهم', 'السوق', 'التجارة', 'المال', 'الميزانية', 'النقد', 'الاستثمار', 'الإنتاج', 'الكرة', 'المباراة', 'الفريق', 'الدوري', 'الكأس', 'الأهداف', 'المنتخب', 'اللعبة', 'الرياضة', 'التنس', 'الرئيس', 'الحكومة', 'البرلمان', 'مجلس الأمن', 'الأمم', 'الدولة', 'السلطة', 'السياسة', 'القانون', 'الانتخابات']


### Selected Top 20 Feature


In [12]:
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import LabelEncoder
import scipy.sparse as sp

# Combine handcrafted features and TF-IDF features
crafted_features = sp.csr_matrix(df[['Word_Count', 'Char_Count', 'Contains_Economy']].values)
final_features = sp.hstack([tfidf_features, crafted_features])

# Encode the target labels into numerical format
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['Category'])

# Apply SelectKBest to select top 20 features
selector = SelectKBest(score_func=chi2, k=20)
selected_features = selector.fit_transform(final_features, y)

# Get the scores and indices of the selected features
feature_scores = selector.scores_
selected_indices = selector.get_support(indices=True)

# Display the selected feature indices and scores
print("Selected Top 20 Feature Indices:", selected_indices)
print("Selected Top 20 Feature Scores:", feature_scores[selected_indices])

# Display the names of the selected features
feature_names = list(vectorizer.get_feature_names_out()) + ['Word_Count', 'Char_Count', 'Contains_Economy']
selected_feature_names = [feature_names[i] for i in selected_indices]
print("Selected Feature Names:", selected_feature_names)


Selected Top 20 Feature Indices: [ 155  279  383  506  592  833 1296 1298 1406 1443 1469 1549 1815 2276
 2293 3414 3774 5000 5001 5002]
Selected Top 20 Feature Scores: [1.62295172e+02 1.25897579e+02 1.48455513e+02 1.83086384e+02
 1.75642401e+02 1.44931088e+02 1.63974486e+02 1.33690145e+02
 1.76313719e+02 1.82270107e+02 1.28675740e+02 1.70958537e+02
 1.76802308e+02 1.34469261e+02 1.68853516e+02 1.65953428e+02
 1.75517557e+02 4.96019842e+04 2.78521135e+05 8.92468678e+02]
Selected Feature Names: ['اسرائيل', 'الاتحاد', 'الاسرائيلي', 'الانتخابات', 'البطوله', 'الجيش', 'العراق', 'العراقيه', 'الفلسطينيه', 'القدم', 'القوات', 'المباراه', 'المنتخب', 'بطوله', 'بغداد', 'غزه', 'لكره', 'Word_Count', 'Char_Count', 'Contains_Economy']


# Import required libraries

In [13]:
# Import required libraries
import pandas as pd
import numpy as np
import scipy.sparse as sp
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report


### Combine All Features (TF-IDF + Handcrafted Features)

In [14]:
# Combine all feature names (TF-IDF + handcrafted features)
feature_names = list(vectorizer.get_feature_names_out()) + ['Word_Count', 'Char_Count', 'Contains_Economy']

# Extract all selected feature names
selected_feature_names = [feature_names[i] for i in selected_indices]
print("Selected Feature Names:", selected_feature_names)

# Combine TF-IDF features with handcrafted features
crafted_features = sp.csr_matrix(df[['Word_Count', 'Char_Count', 'Contains_Economy']].values)
final_features = sp.hstack([tfidf_features, crafted_features])


Selected Feature Names: ['اسرائيل', 'الاتحاد', 'الاسرائيلي', 'الانتخابات', 'البطوله', 'الجيش', 'العراق', 'العراقيه', 'الفلسطينيه', 'القدم', 'القوات', 'المباراه', 'المنتخب', 'بطوله', 'بغداد', 'غزه', 'لكره', 'Word_Count', 'Char_Count', 'Contains_Economy']


### Select the Best Features

In [15]:
# Use selected features for training
selected_features = selector.fit_transform(final_features, y)

# Display the selected feature names
print(f"Selected Top {len(selected_indices)} Feature Names:", selected_feature_names)


Selected Top 20 Feature Names: ['اسرائيل', 'الاتحاد', 'الاسرائيلي', 'الانتخابات', 'البطوله', 'الجيش', 'العراق', 'العراقيه', 'الفلسطينيه', 'القدم', 'القوات', 'المباراه', 'المنتخب', 'بطوله', 'بغداد', 'غزه', 'لكره', 'Word_Count', 'Char_Count', 'Contains_Economy']


### Train-Test Split

In [17]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(selected_features, y, test_size=0.2, random_state=42)
print("Training and testing data split successfully.")


Training and testing data split successfully.


### Convert the preprocessed data into a dense matrix for lazy classifiers

In [18]:
# Convert the preprocessed data into a dense matrix for lazy classifiers
import scipy.sparse as sp

# Combine TF-IDF and handcrafted features
crafted_features = sp.csr_matrix(df[['Word_Count', 'Char_Count', 'Contains_Economy']].values)
final_features = sp.hstack([tfidf_features, crafted_features]).toarray()  # Convert to dense

# Encode labels
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['Category'])


Train Naive Bayes

In [22]:
from lazypredict.Supervised import LazyClassifier


X_train_dense = X_train  
X_test_dense = X_test

# Initialize LazyClassifier
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)

# Train and evaluate models
models, predictions = clf.fit(X_train_dense, X_test_dense, y_train, y_test)

# Display results
print("Model Performance Summary:")
print(models)


 97%|█████████▋| 30/31 [24:37<00:46, 46.41s/it]  

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.123105 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 244904
[LightGBM] [Info] Number of data points in the train set: 4552, number of used features: 4975
[LightGBM] [Info] Start training from score -1.863707
[LightGBM] [Info] Start training from score -1.790004
[LightGBM] [Info] Start training from score -0.854426
[LightGBM] [Info] Start training from score -1.376675


100%|██████████| 31/31 [25:08<00:00, 48.66s/it]

Model Performance Summary:
                               Accuracy  Balanced Accuracy ROC AUC  F1 Score  \
Model                                                                          
LogisticRegression                 0.94               0.94    None      0.94   
LGBMClassifier                     0.94               0.94    None      0.94   
XGBClassifier                      0.94               0.93    None      0.94   
PassiveAggressiveClassifier        0.93               0.93    None      0.93   
NearestCentroid                    0.93               0.93    None      0.93   
Perceptron                         0.93               0.93    None      0.93   
ExtraTreesClassifier               0.93               0.93    None      0.93   
BernoulliNB                        0.92               0.92    None      0.92   
LinearSVC                          0.92               0.92    None      0.92   
CalibratedClassifierCV             0.93               0.92    None      0.93   
RandomForestC




In [28]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# Encode labels if necessary
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(final_features, y, test_size=0.2, random_state=42)

# Initialize classifiers
classifiers = {
    "Naive Bayes": MultinomialNB(),
    "LGBM Classifier": LGBMClassifier(),
    "XGB Classifier": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
}

# Ensure that the label_encoder.classes_ are strings
class_names = [str(label) for label in label_encoder.classes_]

# Train and evaluate classifiers
results = {}
for name, model in classifiers.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Evaluate model
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, target_names=class_names)

    # Save results
    results[name] = {"Accuracy": accuracy, "Report": report}

    # Print results
    print(f"\n{name} Results:")
    print(f"Accuracy: {accuracy:.2f}")
    print(report)
    print("\n---\n")

Training Naive Bayes...

Naive Bayes Results:
Accuracy: 0.91
              precision    recall  f1-score   support

           0       0.88      0.71      0.79       203
           1       0.95      0.96      0.95       193
           2       0.86      0.94      0.90       461
           3       0.99      0.97      0.98       281

    accuracy                           0.91      1138
   macro avg       0.92      0.90      0.90      1138
weighted avg       0.91      0.91      0.91      1138


---

Training LGBM Classifier...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.137979 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 240006
[LightGBM] [Info] Number of data points in the train set: 4552, number of used features: 4975
[LightGBM] [Info] Start training from score -1.863707
[LightGBM] [Info] Start training from score -1.790004
[LightGBM] [Info] Start training from score -0.854426
[LightGBM] [Info

### Prediction

In [40]:
# Import necessary libraries
import re
import numpy as np
import scipy.sparse as sp
from sklearn.metrics import classification_report, accuracy_score

# New articles
new_articles = [
    "قالت الحكومة إن الاقتصاد الوطني شهد نموًا كبيرًا في السنوات الأخيرة.",
    "حقق المنتخب الوطني فوزًا مذهلًا في المباراة النهائية لكأس البطولة.",
    "أعلنت الأمم المتحدة عن مبادرات جديدة لدعم الأمن والسلام الدولي."
]

# Function to clean text
def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)      # Remove numbers
    text = re.sub(r'[إأآا]', 'ا', text)
    text = re.sub(r'ى', 'ي', text)
    text = re.sub(r'ؤ', 'و', text)
    text = re.sub(r'ة', 'ه', text)
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

# Clean new articles
cleaned_articles = [clean_text(article) for article in new_articles]

# Transform new articles into features
# TF-IDF features
tfidf_features_new = vectorizer.transform(cleaned_articles)

# Handcrafted features
new_handcrafted_features = np.array([
    [len(article.split()), len(article), 1 if 'اقتصاد' in article else 0]
    for article in cleaned_articles
])

# Combine TF-IDF and handcrafted features
final_features_new = sp.hstack([tfidf_features_new, sp.csr_matrix(new_handcrafted_features)]).toarray()
# Predict using all models
predictions = {}
for name, model in classifiers.items():
    model_predictions = model.predict(final_features_new)
    # Decode numerical predictions back to string categories
    predicted_categories = label_encoder.inverse_transform(model_predictions)
    predictions[name] = predicted_categories

# Display results
for i, article in enumerate(new_articles):
    print(f"Article {i + 1}: {article}")
    for name, predicted_categories in predictions.items():
        print(f"Predicted Category ({name}): {predicted_categories[i]}")
    print("\n---\n")


Article 1: قالت الحكومة إن الاقتصاد الوطني شهد نموًا كبيرًا في السنوات الأخيرة.
Predicted Category (Naive Bayes): Economy
Predicted Category (LGBM Classifier): Local News
Predicted Category (XGB Classifier): Local News

---

Article 2: حقق المنتخب الوطني فوزًا مذهلًا في المباراة النهائية لكأس البطولة.
Predicted Category (Naive Bayes): Sports
Predicted Category (LGBM Classifier): Sports
Predicted Category (XGB Classifier): Sports

---

Article 3: أعلنت الأمم المتحدة عن مبادرات جديدة لدعم الأمن والسلام الدولي.
Predicted Category (Naive Bayes): International news
Predicted Category (LGBM Classifier): Local News
Predicted Category (XGB Classifier): Local News

---



In [25]:
! pip install flask-ngrok flask


Collecting flask-ngrok
  Downloading flask_ngrok-0.0.25-py3-none-any.whl.metadata (1.8 kB)
Downloading flask_ngrok-0.0.25-py3-none-any.whl (3.1 kB)
Installing collected packages: flask-ngrok
Successfully installed flask-ngrok-0.0.25


In [27]:
import joblib
# Get the trained Naive Bayes model from the 'classifiers' dictionary
naive_bayes_model = classifiers["Naive Bayes"]

# Save the trained model, vectorizer, and label encoder
joblib.dump(naive_bayes_model, "naive_bayes_model.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")
joblib.dump(label_encoder, "label_encoder.pkl")


['label_encoder.pkl']