In [1]:
import json
import re
import time
from multiprocessing import Pool, cpu_count
from tqdm import tqdm
from zeyrek import MorphAnalyzer
from nltk.corpus import stopwords

# File-based logging, restricted to the most severe records so that logging
# adds as little overhead as possible during processing.
import logging

logging.basicConfig(
    level=logging.CRITICAL,  # same numeric level as logging.FATAL
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename='process_preprocessing_v3.log',
    filemode='w',  # start from an empty log file on every run
)

# Turkish stop words from the NLTK corpus; when the corpus is not available
# locally (LookupError), download it once and read it again.
try:
    nltk_stop_words = stopwords.words('turkish')
except LookupError:
    import nltk
    nltk.download('stopwords')
    nltk_stop_words = stopwords.words('turkish')

# Additional words removed alongside the NLTK list.
custom_stop_words = [
    'karar', 'mahkemesi', 'edildiği', 'şöyledir', 'hakkının', 'hukuk',
    'hakkında', 'ifade', 'tarihli', 'olarak', 'ceza', 'kararı', 'başvuru',
    'başvurucunun', 'üzerine', 'yapılan', 'tarafından', 'kabul',
    'verilmiştir', 'kanun', 'tarihinde', 'dava', 'nedeniyle', 'ilişkin',
    'maddesinin', 'ilgili', 'başvurucu', 'yer', 'terör', 'sayılı', 'olan',
    'ihlal', 'olduğu', 'nin',
]

# De-duplicated union of both lists, compiled into one case-insensitive
# pattern that matches any stop word as a whole word.
all_stop_words = set(nltk_stop_words + custom_stop_words)
_stop_word_alternation = '|'.join(re.escape(word) for word in all_stop_words)
stop_words_regex = re.compile(
    r'\b(?:' + _stop_word_alternation + r')\b',
    flags=re.IGNORECASE,
)

# Module-level slot for the MorphAnalyzer used during lemmatization.
# It is None at import time; worker_init() assigns a MorphAnalyzer instance
# to it, and lemmatize_text() reads it.
analyzer = None


def load_json_file(file_path):
    """Read the UTF-8 encoded file at *file_path* and return its parsed JSON content."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        raw = handle.read()
    return json.loads(raw)


def save_json_file(data, file_path):
    """Write *data* as JSON to *file_path*, replacing any existing file.

    The output is UTF-8 encoded, indented by four spaces, and keeps
    non-ASCII characters as-is instead of escaping them.
    """
    with open(file_path, mode='w', encoding='utf-8') as handle:
        json.dump(data, handle, indent=4, ensure_ascii=False)


def clean_text(text):
    """Remove digits, punctuation and stop words from *text* and lowercase it.

    Steps, in order:
      1. every run of digits is replaced by a single space;
      2. every character that is neither a word character nor whitespace
         is replaced by a space;
      3. every match of the module-level ``stop_words_regex`` is removed;
      4. the result is lowercased using Turkish casing rules.

    Args:
        text: The input string.

    Returns:
        The cleaned, lowercased string. Whitespace is not collapsed.
    """
    text = re.sub(r'\d+', ' ', text)  # Remove digits and numbers
    text = re.sub(r'[^\w\s]', ' ', text)  # Remove punctuation
    text = stop_words_regex.sub('', text)  # Remove stop words
    # str.lower() is locale-independent: it maps 'İ' to 'i' followed by the
    # combining mark U+0307 and 'I' to 'i'. Turkish needs 'İ' -> 'i' and
    # 'I' -> 'ı', so map the two capitals explicitly before lowercasing.
    return text.replace('İ', 'i').replace('I', 'ı').lower()


def lemmatize_text(text):
    """Clean *text* and replace each remaining word with a lemma.

    The text is passed through clean_text() and split on whitespace. Each
    token is handed to the module-level ``analyzer``; when its lemmatize()
    result is non-empty, the element at ``[0][1][0]`` (the first lemma) is
    used, otherwise the token is kept unchanged.

    Returns:
        The resulting words joined by single spaces.
    """
    tokens = clean_text(text).split()
    analyses = (analyzer.lemmatize(token) for token in tokens)
    return ' '.join(
        analysis[0][1][0] if analysis else token
        for token, analysis in zip(tokens, analyses)
    )


def process_texts_batch(batch):
    """Lemmatize the 'text' and 'Başvuru Konusu' fields of every item in *batch*.

    Returns:
        A new list with one dict per input item, in the same order, holding
        only those two fields with their lemmatized values.
    """
    target_fields = ('text', 'Başvuru Konusu')
    return [
        {name: lemmatize_text(record[name]) for name in target_fields}
        for record in batch
    ]


def worker_init():
    """Create a MorphAnalyzer and store it in the module-level ``analyzer``.

    Passed as the pool ``initializer`` by process_dataset_in_batches().
    """
    global analyzer
    fresh_analyzer = MorphAnalyzer()
    analyzer = fresh_analyzer


def process_dataset_in_batches(dataset, batch_size=100):
    """Lemmatize *dataset* in parallel, one batch per task.

    The dataset is sliced into consecutive batches of at most *batch_size*
    items. Each batch is handed to process_texts_batch() in a worker
    process initialised by worker_init(). ``pool.imap`` yields results in
    submission order, so the returned list lines up with *dataset*.

    Args:
        dataset: A sliceable sequence of items.
        batch_size: Maximum number of items per batch; must be >= 1.

    Returns:
        A flat list with one processed item per input item, in input order.
        An empty dataset returns ``[]`` without starting any worker.

    Raises:
        ValueError: If *batch_size* is smaller than 1.
    """
    # A negative step would make range() yield nothing and silently drop
    # every item, so reject invalid sizes up front.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    if len(dataset) == 0:
        return []

    # Split dataset into batches
    batches = [dataset[i:i + batch_size] for i in range(0, len(dataset), batch_size)]

    # Every worker runs worker_init(), so never start more workers than
    # there are batches to process.
    worker_count = min(cpu_count(), len(batches))

    with Pool(processes=worker_count, initializer=worker_init) as pool:
        # Parallel processing; results are collected before the pool exits.
        results = list(tqdm(pool.imap(process_texts_batch, batches),
                            total=len(batches), desc="Processing Dataset"))

    # Flatten results
    return [item for batch in results for item in batch]


def main():
    """Load the train/test/dev datasets, lemmatize them and save the results.

    All three JSON files are loaded first. Each dataset is then processed
    with process_dataset_in_batches(); the 'text' and 'Başvuru Konusu'
    fields of every item are overwritten with the processed values, and the
    dataset is written to '<split>_processed.json' in the working directory.
    The total elapsed time is printed at the end.
    """
    started_at = time.time()

    # (split name, input path) pairs, loaded in this order.
    sources = (
        ("train", 'Dataset/train.json'),
        ("test", 'Dataset/test.json'),
        ("dev", 'Dataset/dev.json'),
    )
    datasets = {split: load_json_file(path) for split, path in sources}

    for split, records in datasets.items():
        lemmatized = process_dataset_in_batches(records)

        # Results come back in input order, so pair them up positionally.
        for record, cleaned in zip(records, lemmatized):
            record['text'] = cleaned['text']
            record['Başvuru Konusu'] = cleaned['Başvuru Konusu']

        save_json_file(records, f'{split}_processed.json')

    print(f"Processing completed in {time.time() - started_at:.2f} seconds.")


if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        # Write the message and full traceback to the log file, then
        # re-raise so the failure is visible to the caller instead of the
        # run ending as if it had succeeded.
        logging.critical("Critical error occurred: %s", e, exc_info=True)
        raise


Processing Dataset: 100%|██████████| 120/120 [23:47<00:00, 11.90s/it] 
Processing Dataset: 100%|██████████| 7/7 [01:59<00:00, 17.08s/it] 
Processing Dataset: 100%|██████████| 7/7 [02:00<00:00, 17.25s/it] 

Processing completed in 1671.29 seconds.



