In [2]:
import os
import re
import numpy as np
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import LabelEncoder

# Input corpus folder (one sub-folder per class label) and the folder that
# receives the cleaned copies.
#
# Raw strings are used so the leading backslash stays literal: '\e' and '\o'
# are not valid escape sequences and trigger a SyntaxWarning on recent Python
# versions. The resulting values are unchanged.
# NOTE(review): on Windows a path starting with a single backslash resolves
# against the root of the current drive - confirm that is the intended location.

# Alternative location kept for reference: DATASET_DIR = 'dataset/'
DATASET_DIR = r'\examples'


# Alternative location kept for reference: OUTPUT_DIR = 'output/'
OUTPUT_DIR = r'\outputexamples'

# Ensure the output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Custom list of Arabic stopwords, passed to remove_stopwords_and_affixes as
# its `stopwords` argument.
#
# Notes for maintainers:
# - Entries are spelled in the same folded form that normalize_arabic produces
#   (bare Alef, Ya instead of Alef Maksura, Ha instead of Ta Marbuta, no
#   Hamza), so the list is meant to be matched against normalized text.
# - The second half of the list repeats many entries with a leading Waw
#   ("and") attached, because the stopword test is an exact string match.
# - A few entries occur more than once; this is harmless for membership tests.
arabic_stopwords = [
    'و', 'في', 'من', 'علي', 'عن', 'ما', 'هذا', 'تلك', 'كما', 
    'هي', 'هو', 'انا', 'نحن', 'ان', 'اذا', 'او', 'لكن', 'ثم', 'بل', 'الذي', 
    'التي', 'الذين', 'كل', 'اي', 'بعد', 'قبل', 'كان', 'قد', 'ليس', 'مع', 
    'عند', 'هنا', 'هناك', 'اذ', 'الي', 'ب', 'لم', 'لن', 'لا', 'ان', 'انه', 
    'ايضا', 'هذه', 'هولا', 'انت', 'هم', 'كن', 'انتم', 'اين', 'حين', 
    'كيف', 'كلما', 'بين', 'ام', 'ما', 'انما', 'بعض', 'مثل', 
    'نفس', 'بلا', 'دون', 'حتي', 'اما', 'عبر', 'كلا', 'امام', 'حيث', 
    'اذما', 'سوف', 'عندما', 'كلما', 'الذي', 'اللذين', 'اللتين', 'اللتان', 
    'الاولي', 'ابدا', 'اثنا', 'الان', 'اقل', 'بسبب', 'بما', 'بماذا', 
    'تحت', 'حسب', 'خلال', 'اكثر', 'اضافه', 'عليها', 'فيه', 'اليهم', 
    'علينا', 'ذلك', 'الذي', 'لقد', 'والتي','والذي','وهو','وهي',
    'والذين','ومع','وحين','وحيث','ولقد','وذلك','وبسبب','وفيه','وماذا',
    'وبعض','وتحت','وهنا','ومن','وان','وامام','وكلا','وعلي','واين','وكما',
    'ولكن','وانما','والان','واثنا','وعند','وسوف','وبما','وتلك','وهذا',
    'وما','وعن','وفي','وبل','وليس','واذا','ونحن','وانا','وقد','وكان',
    'وقبل','وبعد','وكل','وانه','ولا','ولن','ولم','والي','واذ','وهناك'
    ,'واين','وانتم','وكن','وهم','وانت','انتي','وانتي','وهولا'
]

# Affix tables handed to remove_stopwords_and_affixes.
# Both tuples are scanned in the order written, so the order is significant.

# Definite-article prefixes (bare article plus forms with an attached particle).
larkey_defarticles = (
    "ال",
    "وال",
    "بال",
    "كال",
    "فال",
    "لل",
)

# Word-final suffixes; the two-letter forms come before the one-letter form.
larkey_suffixes = (
    "ها",
    "ان",
    "ات",
    "ون",
    "ين",
    "يه",
    "ه",
)

# Function to normalize Arabic text
def normalize_arabic(text):
    """Return ``text`` after a fixed sequence of regex substitutions.

    The rules below are applied one after another, top to bottom. Note that
    the last rule deletes every character that is neither a word character
    nor whitespace, so sentence punctuation does not survive this function.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    # (pattern, replacement) pairs in application order.
    rules = (
        (r'[\u064B-\u0652]', ''),  # strip diacritics in U+064B..U+0652
        (r'[إأآا]', 'ا'),  # fold the Alef variants into bare Alef
        (r'ة', 'ه'),  # Ta Marbuta becomes Ha
        (r'ى', 'ي'),  # Alef Maksura becomes Ya
        (r'ء', ''),  # drop the standalone Hamza
        (r'ؤ', 'و'),  # Waw carrying a Hamza becomes plain Waw
        (r'[0-9٠-٩]', ''),  # drop ASCII and Arabic-Indic digits
        (r'[^\w\s]', ''),  # drop anything that is not a word char or whitespace
    )

    cleaned = text
    for pattern, replacement in rules:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

# Custom Arabic sentence tokenizer based on punctuation
def custom_arabic_sent_tokenize(text):
    """Split ``text`` into trimmed, non-empty segments at punctuation marks.

    A segment ends at any of ``. ! : ,`` or the Arabic question mark / Arabic
    comma; whitespace directly after the delimiter is consumed with it.

    Args:
        text: The string to split.

    Returns:
        A list of segments with surrounding whitespace removed. Segments that
        are empty after trimming are left out.
    """
    segments = []
    for piece in re.split(r'[.!؟،:,]\s*', text):
        trimmed = piece.strip()
        if trimmed:
            segments.append(trimmed)
    return segments

def remove_stopwords_and_affixes(sentence, stopwords, prefixes, suffixes, min_stem_length=2):
    """Drop stopwords from ``sentence`` and strip one prefix and one suffix per word.

    The sentence is split on whitespace. Words found in ``stopwords`` are
    discarded (the test is made on the word as written, before any affix is
    removed). For every remaining word, the first entry of ``prefixes`` that
    can be removed is stripped, then the first entry of ``suffixes`` that can
    be removed is stripped.

    An affix is only removed when at least ``min_stem_length`` characters
    remain afterwards. Without this guard short words are reduced to a single
    letter or vanish entirely; keeping at least two characters follows the
    usual light-stemming convention.

    Args:
        sentence: Whitespace-separated text.
        stopwords: Iterable of words to drop (exact match).
        prefixes: Iterable of prefixes, tried in order.
        suffixes: Iterable of suffixes, tried in order.
        min_stem_length: Minimum number of characters that must remain after
            removing an affix. Pass 0 to strip affixes unconditionally.

    Returns:
        The surviving words joined by single spaces ('' if none survive).
    """
    # Set lookup instead of scanning the stopword list once per word.
    if isinstance(stopwords, (set, frozenset)):
        stopword_set = stopwords
    else:
        stopword_set = set(stopwords)

    filtered_words = []

    for word in sentence.split():
        if word in stopword_set:
            continue

        # Strip the first prefix that matches and leaves a long enough stem.
        for prefix in prefixes:
            if prefix and word.startswith(prefix) and len(word) - len(prefix) >= min_stem_length:
                word = word[len(prefix):]
                break

        # Same for suffixes. The `suffix and` guard matters: slicing with
        # word[:-0] would return an empty string for an empty suffix.
        for suffix in suffixes:
            if suffix and word.endswith(suffix) and len(word) - len(suffix) >= min_stem_length:
                word = word[:len(word) - len(suffix)]
                break

        # Only reachable as empty when min_stem_length is 0.
        if word:
            filtered_words.append(word)

    return ' '.join(filtered_words)


# Function to process text files and extract features
def process_files():
    """Clean every ``.txt`` file under DATASET_DIR and mirror it under OUTPUT_DIR.

    Each sub-folder of DATASET_DIR is treated as a class label. For every
    ``.txt`` file inside it, the text is split into sentences, each sentence
    is normalized and stripped of stopwords/affixes, and the result is written
    to ``OUTPUT_DIR/<label>/processed_<file>`` with one sentence per line.

    Sentence splitting is done on the raw text, before normalization:
    normalize_arabic removes all punctuation, so splitting afterwards would
    never find a delimiter and every file would collapse into one sentence.

    Returns:
        A ``(all_sentences, all_labels)`` pair of parallel lists. Each entry of
        ``all_sentences`` is the cleaned text of one file (sentences joined by
        spaces) and the matching entry of ``all_labels`` is its folder name.
        Files that end up with no text are written out (empty) but are not
        included in the returned lists.
    """
    all_sentences = []
    all_labels = []

    # os.listdir returns entries in arbitrary order; sort for reproducible output.
    for label_folder in sorted(os.listdir(DATASET_DIR)):
        label_folder_path = os.path.join(DATASET_DIR, label_folder)
        if not os.path.isdir(label_folder_path):
            continue

        output_label_folder = os.path.join(OUTPUT_DIR, label_folder)
        os.makedirs(output_label_folder, exist_ok=True)

        for txt_file in sorted(os.listdir(label_folder_path)):
            if not txt_file.endswith('.txt'):
                continue

            txt_file_path = os.path.join(label_folder_path, txt_file)
            with open(txt_file_path, 'r', encoding='utf-8') as f:
                text = f.read()

            # Split first (punctuation still present), then clean each sentence.
            cleaned_sentences = []
            for raw_sentence in custom_arabic_sent_tokenize(text):
                cleaned = remove_stopwords_and_affixes(
                    normalize_arabic(raw_sentence),
                    arabic_stopwords,
                    larkey_defarticles,
                    larkey_suffixes,
                )
                # Sentences made only of digits/stopwords clean down to ''.
                if cleaned:
                    cleaned_sentences.append(cleaned)

            if cleaned_sentences:
                all_sentences.append(' '.join(cleaned_sentences))
                all_labels.append(label_folder)

            # The output file is written even when nothing survived cleaning.
            output_file_path = os.path.join(output_label_folder, f'processed_{txt_file}')
            with open(output_file_path, 'w', encoding='utf-8') as output_file:
                output_file.write('\n'.join(cleaned_sentences))

    return all_sentences, all_labels


# Run the preprocessing pipeline. Besides returning the cleaned texts and their
# labels, this call reads DATASET_DIR and writes the processed files under
# OUTPUT_DIR as a side effect.
all_sentences, all_labels = process_files()


In [3]:
import os

def _arff_quote(value):
    """Return ``value`` as a single-quoted ARFF string with ``\\`` and ``'`` escaped."""
    return "'" + value.replace('\\', '\\\\').replace("'", "\\'") + "'"


def _arff_nominal(value):
    """Return a nominal label, quoted only if it contains ARFF-special characters."""
    if value and not any(ch.isspace() or ch in ",'\"{}%\\" for ch in value):
        return value
    return _arff_quote(value)


def text_to_arff(output_file, dataset_folder):
    """Collect the ``.txt`` files of a labelled folder tree into one ARFF file.

    Every sub-folder of ``dataset_folder`` is a class; every ``.txt`` file in
    it becomes one data row ``'<text>',<class>``. Line breaks inside a file are
    flattened (``\\n`` becomes a space, ``\\r`` is dropped).

    Args:
        output_file: Path of the ARFF file to create (overwritten if present).
        dataset_folder: Folder containing one sub-folder per class.

    Notes:
        - Backslashes and single quotes in the text are escaped; written raw
          they would terminate the quoted string and corrupt the row.
        - The class attribute always lists the six default classes in their
          original order. Any other sub-folder found is appended, so every
          label used in the data section is also declared in the header.
        - The dataset folder is listed before the output file is opened, so a
          missing dataset folder no longer leaves a truncated ARFF behind.
    """
    default_classes = ['Culture', 'Economy', 'Local', 'International', 'Religion', 'Sports']

    # Sorted for a reproducible row order (os.listdir order is arbitrary).
    categories = sorted(
        name for name in os.listdir(dataset_folder)
        if os.path.isdir(os.path.join(dataset_folder, name))
    )
    class_names = default_classes + [name for name in categories if name not in default_classes]
    class_spec = ', '.join(_arff_nominal(name) for name in class_names)

    with open(output_file, 'w', encoding='utf-8') as arff_file:
        # ARFF header
        arff_file.write("@relation arabic_text_classification\n\n")
        arff_file.write("@attribute text string\n")
        arff_file.write(f"@attribute class {{{class_spec}}}\n\n")
        arff_file.write("@data\n")

        for category in categories:
            category_folder = os.path.join(dataset_folder, category)
            for filename in sorted(os.listdir(category_folder)):
                if not filename.endswith('.txt'):
                    continue
                file_path = os.path.join(category_folder, filename)
                with open(file_path, 'r', encoding='utf-8') as file:
                    text = file.read().replace('\n', ' ').replace('\r', '')
                arff_file.write(f"{_arff_quote(text)},{_arff_nominal(category)}\n")

# Input folder (one sub-folder per class, holding the processed .txt files)
# and the ARFF file to create from it.
# NOTE(review): these are absolute paths on one specific machine, and the
# preprocessing cell writes to OUTPUT_DIR ('\outputexamples'), which is not
# obviously the same directory as dataset_folder below - confirm they match.
# Alternative input kept for reference:
# dataset_folder = r'C:\Users\Lapto\OneDrive\سطح المكتب\NLP assignment\output'
dataset_folder = r'C:\Users\Lapto\OneDrive\سطح المكتب\NLP assignment\outputexamples'

# Alternative output kept for reference:
# output_file = r'C:\Users\Lapto\OneDrive\سطح المكتب\NLP assignment\ar_text_dataset.arff'
output_file = r'C:\Users\Lapto\OneDrive\سطح المكتب\NLP assignment\example_articles.arff'

# Convert the dataset to ARFF format (overwrites output_file if it exists)
text_to_arff(output_file, dataset_folder)

# Confirmation message showing where the ARFF file was written
print(f"Dataset has been saved to {output_file}")


Dataset has been saved to C:\Users\Lapto\OneDrive\سطح المكتب\NLP assignment\example_articles.arff
