In [4]:
import os
import json
import re
from tqdm import tqdm  # For progress bar
from num2words import num2words
import dateparser  # For time-related text conversion
import hashlib
from collections import defaultdict

# Input tree holding the raw conversation JSON files, and the JSONL file that
# receives the cleaned conversations.
root_dir, output_file = "../dataset/text", "../dataset/text_cleaned/data.jsonl"

# Create the folder for the output file up front (no error if it already exists).
os.makedirs(os.path.split(output_file)[0], exist_ok=True)

# Characters stripped from the raw text; remove_unwanted_symbols() replaces each
# occurrence with a space.
UNWANTED_SYMBOLS = [
    "$",
    "#",
    "*",
    ">",
    "<",
    "\n",
    "-",
]

# Punctuation marks that normalize_punctuation() collapses when several follow
# one another.
PUNCTUATION_CHARS = ",.!?;:…"

# Characters that close a sentence: the first letter after one of them is
# capitalised by normalize_capitalization().
SENTENCE_SEPARATORS = ".!?\n"

# Numeric list markers ("1.", "2.", ...) mapped to the Russian ordinal adverbs
# (with trailing comma) that replace them. Every entry except the first starts
# with "; " so consecutive list items are separated in the flattened text.
ORDINAL_MAP = dict(
    [
        ("1", "во-первых,"),
        ("2", "; во-вторых,"),
        ("3", "; в-третьих,"),
        ("4", "; в-четвёртых,"),
        ("5", "; в-пятых,"),
        ("6", "; в-шестых,"),
        ("7", "; в-седьмых,"),
        ("8", "; в-восьмых,"),
        ("9", "; в-девятых,"),
        ("10", "; в-десятых,"),
    ]
)

# Code points of the Unicode "Han" script, used to detect Chinese text.
# Each entry is either an inclusive [first, last] code-point range (a list) or a
# single code point (an int); build_re() turns them into one regex character
# class. The trailing comments give the Unicode category, the size of the range
# and the names of its first and last characters.
# NOTE(review): the range ends (e.g. 0x4DB5, 0x9FC3, 0x2A6D6) look like they were
# copied from an older Unicode Scripts.txt, so ideographs added to these blocks
# later would not be detected — confirm whether that matters for this dataset.
LHan = [[0x2E80, 0x2E99],    # Han # So  [26] CJK RADICAL REPEAT, CJK RADICAL RAP
        [0x2E9B, 0x2EF3],    # Han # So  [89] CJK RADICAL CHOKE, CJK RADICAL C-SIMPLIFIED TURTLE
        [0x2F00, 0x2FD5],    # Han # So [214] KANGXI RADICAL ONE, KANGXI RADICAL FLUTE
        0x3005,              # Han # Lm       IDEOGRAPHIC ITERATION MARK
        0x3007,              # Han # Nl       IDEOGRAPHIC NUMBER ZERO
        [0x3021, 0x3029],    # Han # Nl   [9] HANGZHOU NUMERAL ONE, HANGZHOU NUMERAL NINE
        [0x3038, 0x303A],    # Han # Nl   [3] HANGZHOU NUMERAL TEN, HANGZHOU NUMERAL THIRTY
        0x303B,              # Han # Lm       VERTICAL IDEOGRAPHIC ITERATION MARK
        [0x3400, 0x4DB5],    # Han # Lo [6582] CJK UNIFIED IDEOGRAPH-3400, CJK UNIFIED IDEOGRAPH-4DB5
        [0x4E00, 0x9FC3],    # Han # Lo [20932] CJK UNIFIED IDEOGRAPH-4E00, CJK UNIFIED IDEOGRAPH-9FC3
        [0xF900, 0xFA2D],    # Han # Lo [302] CJK COMPATIBILITY IDEOGRAPH-F900, CJK COMPATIBILITY IDEOGRAPH-FA2D
        [0xFA30, 0xFA6A],    # Han # Lo  [59] CJK COMPATIBILITY IDEOGRAPH-FA30, CJK COMPATIBILITY IDEOGRAPH-FA6A
        [0xFA70, 0xFAD9],    # Han # Lo [106] CJK COMPATIBILITY IDEOGRAPH-FA70, CJK COMPATIBILITY IDEOGRAPH-FAD9
        [0x20000, 0x2A6D6],  # Han # Lo [42711] CJK UNIFIED IDEOGRAPH-20000, CJK UNIFIED IDEOGRAPH-2A6D6
        [0x2F800, 0x2FA1D]]  # Han # Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800, CJK COMPATIBILITY IDEOGRAPH-2FA1D

# Full-width / CJK punctuation that is treated as part of a Chinese segment.
# The four curly quotation marks are written as escapes on purpose: typed as
# plain ASCII quotes they end the string literal, Python then concatenates two
# adjacent literals, and the set silently gains the ASCII apostrophe (so an
# apostrophe in Russian text is classified as Chinese) while containing no
# quotation marks at all.
chinese_punctuation = (
    "，。！？；："
    "\u201c\u201d\u2018\u2019"  # left/right double and single curly quotes
    "「」【】《》〈〉（）［］｛｝…～"
)

def build_re():
    """Compile the regex that recognises a single Chinese character.

    The pattern is one character class made of every entry in ``LHan`` (ranges
    become ``first-last``, single code points become the character itself)
    followed by every character of ``chinese_punctuation``, each escaped.

    Returns:
        The compiled pattern object.
    """
    print("Building Chinese character detection regex...")

    def _class_item(entry):
        # A list is an inclusive [first, last] code-point range; anything else
        # is a single code point.
        if isinstance(entry, list):
            first, last = entry
            return f'{chr(first)}-{chr(last)}'
        return chr(entry)

    class_items = [_class_item(entry) for entry in LHan]
    # Escape each punctuation mark so none of them is read as class syntax.
    class_items.extend(re.escape(mark) for mark in chinese_punctuation)

    return re.compile('[%s]' % ''.join(class_items), re.UNICODE)

# Compile the Chinese-character regex once at import time; is_chinese_char()
# reuses this module-level pattern for every character it tests.
chinese_detector = build_re()

def is_chinese_char(char):
    """Tell whether ``char`` counts as Chinese.

    A character qualifies when ``chinese_detector`` matches it or when it
    occurs in ``chinese_punctuation``.

    Returns:
        True or False.
    """
    if chinese_detector.match(char) is not None:
        return True
    return char in chinese_punctuation

def remove_pinyin_guides(text):
    """Strip parenthesised romanisation such as ``(zài jiàn)`` from ``text``.

    Any parenthesised group made up only of Latin letters, whitespace,
    combining diacritics (U+0300-U+036F) and the listed accented vowels is
    deleted. Afterwards every run of whitespace is collapsed into a single
    space, which also tidies the gap left by a removed group.

    Returns:
        The cleaned string.
    """
    guide_re = re.compile(r'\([a-zA-Z\s\u0300-\u036Fàáèéìíòóùúüāēěīōūǎǐǒǔǚǜ]+\)')
    whitespace_run_re = re.compile(r'\s+')

    without_guides = guide_re.sub('', text)
    return whitespace_run_re.sub(' ', without_guides)

def split_by_language(text):
    """Cut ``text`` into alternating Chinese and non-Chinese segments.

    The text is scanned once. A Chinese run is a maximal sequence of characters
    accepted by ``is_chinese_char`` plus any ASCII punctuation (``!,.?;:``)
    directly after it; everything else up to the next Chinese character forms a
    run tagged ``"ru"``. Neighbouring runs of the same language are joined, the
    text of each result is stripped, and segments that are empty or consist
    only of dashes, ASCII punctuation and spaces are dropped.

    Returns:
        A list of ``{"text": ..., "lang": "zh-cn" | "ru"}`` dicts in reading order.
    """
    trailing_punct = "!,.?;:"
    total = len(text)

    # Pass 1: raw runs, exactly as they appear in the text.
    raw_runs = []
    pos = 0
    while pos < total:
        begin = pos
        pos += 1
        if is_chinese_char(text[begin]):
            # Extend over the rest of the Chinese run ...
            while pos < total and is_chinese_char(text[pos]):
                pos += 1
            # ... and keep directly following ASCII punctuation with it.
            while pos < total and text[pos] in trailing_punct:
                pos += 1
            lang = "zh-cn"
        else:
            while pos < total and not is_chinese_char(text[pos]):
                pos += 1
            lang = "ru"
        raw_runs.append({"text": text[begin:pos], "lang": lang})

    # Pass 2: join neighbours that share a language (two Chinese runs can be
    # adjacent when the first one ended on absorbed ASCII punctuation).
    joined = []
    for run in raw_runs:
        if joined and joined[-1]["lang"] == run["lang"]:
            joined[-1]["text"] += run["text"]
        else:
            joined.append({"text": run["text"], "lang": run["lang"]})

    # Pass 3: trim, then discard empty and punctuation-only segments.
    ignorable = "-−------!,.?;: "
    result = []
    for entry in joined:
        trimmed = entry["text"].strip()
        if not trimmed:
            continue
        if all(ch in ignorable for ch in trimmed):
            continue
        result.append({"text": trimmed, "lang": entry["lang"]})

    return result

# Replace numeric list markers (e.g. "1.") with their textual equivalents.
def replace_ordinal_bullets(text):
    """Turn list markers such as ``1.`` into the phrases from ``ORDINAL_MAP``.

    A marker is a whole number followed by a period. Numbers with no entry in
    ``ORDINAL_MAP`` are left exactly as they were.

    The period must not be followed by a digit and the number must not directly
    follow ``<digit>.``; without these guards the integer part of a decimal
    such as ``2.5`` (or a component of ``1.2.3``) would be rewritten as a list
    marker before the number-to-words step ever sees it.

    Returns:
        The text with recognised markers replaced.
    """
    return re.sub(
        r'(?<!\d\.)\b(\d+)\.(?!\d)',
        lambda marker: ORDINAL_MAP.get(marker.group(1), marker.group(0)),
        text,
    )

# Spell out numbers as Russian words.
def convert_numbers_to_words(text):
    """Replace every number in ``text`` with its Russian spelling.

    Decimals are rounded to the nearest whole number first, so the output never
    contains phrases like "four point zero". The rounding works on the matched
    digits directly instead of going through ``float``:

    * exact halves round up ("2.5" becomes three), whereas ``round()`` rounds
      them to the nearest even number ("2.5" becomes two);
    * long digit runs keep every digit instead of being truncated to float
      precision.

    Returns:
        The text with each number replaced by ``num2words(value, lang="ru")``.
    """
    def _spell(number):
        whole, _, fraction = number.group().partition(".")
        value = int(whole)
        # Only the first fractional digit decides between rounding down and up.
        if fraction and int(fraction[0]) >= 5:
            value += 1
        return num2words(value, lang="ru")

    return re.sub(r'\d+(\.\d+)?', _spell, text)

# Rewrite clock times in 12-hour form.
def convert_time_to_text(text):
    """Rewrite ``H:MM`` / ``HH:MM`` times as ``%I:%M %p`` (e.g. ``02:30 PM``).

    Each match is parsed with ``dateparser.parse``. When the parser cannot make
    sense of it (for example ``25:99``) it returns ``None``; such matches are
    left unchanged instead of raising ``AttributeError`` and aborting the whole
    conversation.

    NOTE(review): process_content() calls convert_numbers_to_words() before this
    function, so by then the digits appear to be spelled out already and this
    pattern would not match — confirm the intended order of the two steps.

    Returns:
        The text with parseable times reformatted.
    """
    def _reformat(found):
        parsed = dateparser.parse(found.group())
        if parsed is None:
            return found.group()
        return parsed.strftime("%I:%M %p")

    return re.sub(r'\b\d{1,2}:\d{2}\b', _reformat, text)

# Strip unwanted symbols from the text.
def remove_unwanted_symbols(text):
    """Replace every entry of ``UNWANTED_SYMBOLS`` with a space.

    Returns:
        The resulting text with leading and trailing whitespace removed.
    """
    cleaned = text
    for unwanted in UNWANTED_SYMBOLS:
        cleaned = cleaned.replace(unwanted, " ")
    return cleaned.strip()

# Collapse runs of whitespace.
def replace_multiple_spaces(text):
    """Replace every run of two or more whitespace characters with one space.

    A single whitespace character of any kind is left untouched.
    """
    whitespace_run = re.compile(r'\s{2,}')
    return whitespace_run.sub(' ', text)

# Remove whitespace that precedes certain symbols.
def remove_spaces_before_symbols(text, symbols=",.!?;:"):
    """Delete whitespace standing directly before any character of ``symbols``.

    Args:
        text: The string to clean.
        symbols: Characters that must not be preceded by whitespace.

    Returns:
        The text with that whitespace removed; the symbols themselves stay.
    """
    # The symbols are escaped so none of them is read as character-class syntax.
    gap_before_symbol = re.compile(rf'\s+([{re.escape(symbols)}])')
    return gap_before_symbol.sub(lambda found: found.group(1), text)

# Collapse consecutive punctuation into the last mark plus a space.
def normalize_punctuation(text):
    """Reduce every run of punctuation to its final mark followed by a space.

    A run is one character from ``PUNCTUATION_CHARS`` followed by one or more
    characters that are whitespace or further punctuation. The whole run is
    replaced by its last punctuation character and a single space.

    Returns:
        The text with punctuation runs collapsed.
    """
    punct_class = re.escape(PUNCTUATION_CHARS)
    run_pattern = r'([' + punct_class + r'][\s' + punct_class + r']+)'

    def _collapse(found):
        run = found.group(1)
        # Walk backwards to the last real punctuation mark, skipping whitespace.
        final_mark = next(
            (symbol for symbol in reversed(run) if symbol in PUNCTUATION_CHARS),
            None,
        )
        return final_mark + " " if final_mark else run

    return re.sub(run_pattern, _collapse, text)

# Escape sentence splitters.
def escape_sentence_splitters(text):
    r"""Put a backslash in front of sentence-ending punctuation followed by a space.

    ``". "`` becomes ``"\. "``, ``"! "`` becomes ``"\! "`` and ``"? "`` becomes
    ``"\? "``. A mark that is not followed by a space is left alone.

    Returns:
        The text with those splitters escaped.
    """
    # One pass covers all three marks: a match is always "<mark><space>", so
    # matches cannot overlap and the result equals three separate substitutions.
    return re.sub(r'([.!?]) ', r'\\\1 ', text)

# Lower-case everything except the first letter of each sentence.
def normalize_capitalization(text):
    """Capitalise sentence starts and lower-case every other letter.

    The first alphabetic character of the text, and the first alphabetic
    character after any character from ``SENTENCE_SEPARATORS``, is upper-cased.
    Every other upper-case character is lower-cased; non-letters pass through
    unchanged.

    The previous version had an extra branch meant to keep a letter as-is when
    the preceding character was a separator. That branch could never run,
    because a separator always arms the capitalise-next flag, which is tested
    first. It has been removed; results are unchanged.

    Returns:
        The re-cased text, or ``text`` itself when it is empty or ``None``.
    """
    if not text:
        return text

    pieces = []
    capitalize_next = True  # the very first letter starts a sentence

    for char in text:
        if capitalize_next and char.isalpha():
            pieces.append(char.upper())
            capitalize_next = False
        elif char.isupper():
            pieces.append(char.lower())
        else:
            pieces.append(char)

        # A separator arms the flag for the next letter, however many
        # non-letters (spaces, quotes, backslashes) come in between.
        if char in SENTENCE_SEPARATORS:
            capitalize_next = True

    return ''.join(pieces)

# Main content-cleaning routine.
def process_content(content):
    """Run one message through the whole cleaning pipeline and return the result.

    The steps run in the order listed; each receives the output of the
    previous one.
    """
    pipeline = (
        remove_unwanted_symbols,       # strip unwanted symbols
        replace_multiple_spaces,       # collapse whitespace runs
        replace_ordinal_bullets,       # "1." -> textual ordinal
        convert_numbers_to_words,      # digits -> Russian words
        convert_time_to_text,          # clock times -> 12-hour text
        remove_spaces_before_symbols,  # no whitespace before punctuation
        remove_pinyin_guides,          # drop "(pinyin)" guides
        normalize_punctuation,         # collapse consecutive punctuation
        escape_sentence_splitters,     # backslash-escape ". ", "! ", "? "
        normalize_capitalization,      # fix capitalisation
    )
    for step in pipeline:
        content = step(content)
    return content

# Clean a message and split it into language segments.
def process_and_segment_content(content):
    """Clean ``content`` and split the cleaned text by language.

    Returns:
        A dict with ``"processed_content"`` (the output of ``process_content``)
        and ``"language_segments"`` (the output of ``split_by_language`` applied
        to that cleaned text).
    """
    cleaned = process_content(content)
    return {
        "processed_content": cleaned,
        "language_segments": split_by_language(cleaned),
    }

# All conversations that survive de-duplication and cleaning.
all_conversations = []

# Collect every JSON file below root_dir, except system_prompt.json.
# The paths are sorted because os.walk() yields directory entries in arbitrary,
# filesystem-dependent order, and the de-duplication further down keeps whichever
# copy of a conversation it meets first. Sorting makes that choice (and therefore
# the output file) the same on every run.
file_paths = sorted(
    os.path.join(subdir, file)
    for subdir, _, files in os.walk(root_dir)
    for file in files
    if file.endswith(".json") and file != "system_prompt.json"
)

print(f"Found {len(file_paths)} JSON files to process")

# Number of kept conversations that contain at least one Chinese segment.
chinese_files_count = 0

# De-duplication settings.
REJECT_IF_FIRST_MESSAGE_DUPLICATE = True  # Reject if first message matches another conversation
DUPLICATE_THRESHOLD = 1  # Maximum number of duplicate messages allowed per conversation

# De-duplication state shared by the whole run.
first_message_hashes = set()  # MD5 digests of first messages seen so far
message_hashes = defaultdict(int)  # MD5 digest of a message -> times it was seen
duplicate_counts_first_message = 0  # conversations rejected for a repeated first message
duplicate_counts_threshold = 0  # conversations rejected for too many repeated messages


# Main pass: load each conversation file, reject duplicates, clean every message
# and collect the surviving conversations in all_conversations.
for file_path in tqdm(file_paths, desc="Processing files"):
    # The try block sits inside the `with`, so the file handle is still open
    # when the except handler rewinds and re-reads the file for its dump.
    with open(file_path, "r", encoding="utf-8") as f:
        try:
            data = json.load(f)
            # Ensure JSON data is a list
            if isinstance(data, list) and data:  # Check that data is not empty
                has_chinese = False
                skip_conversation = False

                # Check if first message is a duplicate
                # (the extra `data and` test is redundant here: the enclosing
                # `if` has already established that the list is non-empty)
                if REJECT_IF_FIRST_MESSAGE_DUPLICATE and data and "content" in data[0]:
                    first_message_hash = hashlib.md5(data[0]["content"].encode('utf-8')).hexdigest()
                    if first_message_hash in first_message_hashes:
                        duplicate_counts_first_message += 1
                        skip_conversation = True
                    else:
                        first_message_hashes.add(first_message_hash)

                # Count duplicate messages in this conversation
                if not skip_conversation:
                    duplicate_message_count = 0

                    # Check each message in the conversation
                    for item in data:
                        if isinstance(item, dict) and "content" in item:
                            content_hash = hashlib.md5(item["content"].encode('utf-8')).hexdigest()

                            # A message counts as a duplicate when its hash was
                            # already recorded, by an earlier file or earlier in
                            # this same conversation.
                            if message_hashes[content_hash] > 0:
                                duplicate_message_count += 1

                            # NOTE(review): the hash is recorded before the
                            # threshold check below, so messages of a conversation
                            # that ends up rejected still count against later
                            # conversations — confirm this is intended.
                            message_hashes[content_hash] += 1

                    # Skip if exceeds duplicate threshold
                    if duplicate_message_count > DUPLICATE_THRESHOLD:
                        duplicate_counts_threshold += 1
                        skip_conversation = True

                # Process the conversation if it wasn't skipped
                if not skip_conversation:
                    # Process each message
                    # Messages are modified in place: "content" is replaced by
                    # the cleaned text and "language_segments" is added.
                    for item in data:
                        if isinstance(item, dict) and "content" in item:
                            result = process_and_segment_content(item["content"])
                            item["content"] = result["processed_content"]
                            item["language_segments"] = result["language_segments"]

                            # Check for Chinese content
                            for segment in result["language_segments"]:
                                if segment["lang"] == "zh-cn":
                                    has_chinese = True

                    all_conversations.append(data)

                    if has_chinese:
                        chinese_files_count += 1
            else:
                raise ValueError("Unexpected JSON format (not a list or empty list)")
        except Exception as e:
            # Any failure (invalid JSON, unexpected structure, an error raised
            # while cleaning) is reported and the file is left out of the output;
            # the loop then continues with the next file.
            # Print error information
            print(f"\nError processing file: {file_path}")
            print(f"Error: {e}")
            try:
                # Rewind and print the complete raw file to help diagnose it.
                f.seek(0)
                print(f"Content:\n{f.read()}\n")
            except Exception as read_error:
                print(f"Could not read file content: {read_error}\n")


# Run summary: how many conversations were kept and how many were rejected.
for summary_line in (
    f"Processing complete. Found {len(all_conversations)} conversations",
    f"Files containing Chinese characters: {chinese_files_count}",
    f"Removed {duplicate_counts_first_message} conversations with duplicate first messages",
    f"Removed {duplicate_counts_threshold} conversations exceeding duplicate message threshold",
    f"Total removed conversations: {duplicate_counts_first_message + duplicate_counts_threshold}",
):
    print(summary_line)

# Write the cleaned conversations as JSON Lines: one conversation per line,
# with non-ASCII characters kept as-is rather than escaped.
with open(output_file, "w", encoding="utf-8") as jsonl_out:
    for kept_conversation in tqdm(all_conversations, desc="Saving conversations"):
        print(json.dumps(kept_conversation, ensure_ascii=False), file=jsonl_out)

print(f"Processed conversations saved to {output_file}")

Building Chinese character detection regex...
Found 11817 JSON files to process


Processing files: 100%|██████████| 11817/11817 [00:06<00:00, 1745.55it/s]


Processing complete. Found 4240 conversations
Files containing Chinese characters: 4182
Removed 7569 conversations with duplicate first messages
Removed 8 conversations exceeding duplicate message threshold
Total removed conversations: 7577


Saving conversations: 100%|██████████| 4240/4240 [00:00<00:00, 15271.63it/s]

Processed conversations saved to ../dataset/text_cleaned/data.jsonl





In [7]:
# Display one processed conversation to eyeball the cleaned text and its
# language segments.
all_conversations[2001]

[{'role': 'user',
  'content': 'Как можно использовать выражение "谢谢" в разных ситуациях\\? Есть ли формальные и неформальные варианты благодарности?',
  'language_segments': [{'text': 'Как можно использовать выражение "',
    'lang': 'ru'},
   {'text': '谢谢', 'lang': 'zh-cn'},
   {'text': '" в разных ситуациях\\? Есть ли формальные и неформальные варианты благодарности?',
    'lang': 'ru'}]},
 {'role': 'assistant',
  'content': 'Выражение "谢谢" (спасибо) можно использовать в различных ситуациях\\. Вот несколько примеров: во-первых, неформальная благодарность: когда друг помогает вам: 朋友帮我，我说：谢谢！ при получении подарка: 收到礼物，我说：谢谢！; во-вторых, формальная благодарность: в разговоре с учителем или начальником: 老师帮助我，我说：谢谢您。 при обращении к незнакомцу: 乘客帮我，我说：谢谢您。; в-третьих, дополнительные варианты: "多谢" (большое спасибо) — можно использовать в неформальной обстановке, когда хотите выразить большую благодарность\\. "谢谢你" (спасибо тебе) — более личное, для друзей и близких\\. Таким образом,