In [None]:
!pip install bs4

Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing collected packages: bs4
Successfully installed bs4-0.0.2


In [None]:
import zipfile
import os
import re
import csv
from bs4 import BeautifulSoup, NavigableString
from collections import namedtuple

# Container and record type for per-file statistics
# (output path, language code, token count, line count).
metadata = []
FileInfo = namedtuple('FileInfo', ['file_path', 'lang', 'tokens', 'lines'])

# Source archives and, position by position, the directory each one is unpacked into.
archives = ['hye-rus.zip', 'rus-hye.zip']
extract_dirs = ['hye-rus', 'rus-hye']

for archive_name, target_dir in zip(archives, extract_dirs):
    with zipfile.ZipFile(archive_name, mode='r') as opened_archive:
        opened_archive.extractall(path=target_dir)


def extract_texts_from_xml_bs4(file_path):
    """Extract the Russian and Armenian sentence texts from one XML file.

    The file is read as UTF-8 and parsed with BeautifulSoup's 'xml' parser.
    For every <para> element:

    * each <se lang="rus"> contributes its complete text (skipped when empty);
    * each <se lang="hye"> contributes the concatenation of its direct string
      children and the text of its direct <w> children. Other child tags are
      ignored; the strings between tags are kept as-is so the original
      punctuation is preserved.

    Args:
        file_path: Path of the XML file to read.

    Returns:
        A ``(rus_text, hye_text)`` tuple in which the sentences of each
        language are joined with newlines (an empty string when no sentence
        was collected). ``(None, None)`` is returned, after printing the
        error, when reading or parsing the file raises.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as xml_file:
            soup = BeautifulSoup(xml_file.read(), 'xml')
    except Exception as e:
        print(f"Ошибка при обработке {file_path}: {e}")
        return None, None

    rus_sentences = []
    hye_sentences = []

    for paragraph in soup.find_all('para'):
        # Russian: whole-sentence text, empty results dropped.
        russian_candidates = (
            sentence.get_text(strip=False)
            for sentence in paragraph.find_all('se', lang='rus')
        )
        rus_sentences.extend(candidate for candidate in russian_candidates if candidate)

        # Armenian: rebuild each sentence from its direct children only.
        for sentence in paragraph.find_all('se', lang='hye'):
            fragments = []
            for node in sentence.children:
                if not isinstance(node, NavigableString):
                    # Among tag children only <w> (word) elements carry sentence text.
                    if node.name == 'w':
                        fragments.append(node.get_text(strip=False))
                    continue
                # Plain string between tags: kept verbatim to preserve punctuation.
                literal = str(node)
                if literal:
                    fragments.append(literal)
            if fragments:
                hye_sentences.append(''.join(fragments))

    return '\n'.join(rus_sentences), '\n'.join(hye_sentences)

# Create the folder that receives the extracted plain-text files
output_dir = 'texts'
os.makedirs(output_dir, exist_ok=True)


def process_all_xmls_bs4(folder):
    """Convert every XML file below ``folder`` into plain-text files.

    ``folder`` is walked recursively. Each file whose name ends in ``.xml``
    (case-insensitive) is passed to ``extract_texts_from_xml_bs4``; the Russian
    text is written to ``<output_dir>/<relative path without extension>_ru.txt``
    and the Armenian text to the matching ``..._am.txt``. A file is written
    only when its text is non-empty, so XML files that failed to parse
    (``None``) or hold no sentences for a language produce no output for it.

    NOTE: the output location depends only on the path relative to ``folder``,
    so two source folders containing the same relative path write to the same
    output file and the later call overwrites the earlier one.

    Args:
        folder: Root directory to scan for XML files.

    Returns:
        None. Results are written below the module-level ``output_dir``.
    """
    for root_dir, _, files in os.walk(folder):
        for file in files:
            if not file.lower().endswith('.xml'):
                continue

            xml_path = os.path.join(root_dir, file)
            rus_text, hye_text = extract_texts_from_xml_bs4(xml_path)

            # Mirror the source folder's layout below output_dir.
            base_name = os.path.splitext(os.path.relpath(xml_path, folder))[0]
            output_base = os.path.join(output_dir, base_name)
            os.makedirs(os.path.dirname(output_base), exist_ok=True)

            # One output file per language; empty or missing text is skipped.
            for text, suffix in ((rus_text, '_ru.txt'), (hye_text, '_am.txt')):
                if not text:
                    continue
                with open(output_base + suffix, 'w', encoding='utf-8') as f_out:
                    f_out.write(text)

# Process the extracted contents of both archives
for folder in extract_dirs:
    process_all_xmls_bs4(folder)



In [None]:
# Bundle every file below output_dir into one deflate-compressed archive;
# entries are stored under their path relative to output_dir.
zip_output_path = 'result.zip'
with zipfile.ZipFile(zip_output_path, mode='w', compression=zipfile.ZIP_DEFLATED) as bundle:
    for current_dir, _, names in os.walk(output_dir):
        for name in names:
            source_path = os.path.join(current_dir, name)
            bundle.write(source_path, arcname=os.path.relpath(source_path, start=output_dir))