# In [None]:
import os
import zipfile
from bs4 import BeautifulSoup

def clean_html_text(raw_html):
    """Return the visible text of an HTML fragment.

    The markup is parsed with BeautifulSoup's built-in ``html.parser`` and the
    text is collected via ``get_text(separator=' ', strip=True)``.

    Any exception raised while parsing or extracting is caught; in that case
    a string of the form ``"HTML parsing failed: <message>"`` is returned
    instead of the text.
    """
    try:
        parsed = BeautifulSoup(raw_html, 'html.parser')
        # Both parsing and text extraction stay inside the try so that either
        # failure is reported through the same fallback string.
        plain_text = parsed.get_text(separator=' ', strip=True)
    except Exception as exc:
        return f"HTML parsing failed: {exc!s}"
    return plain_text

def retrieve_and_clean_text(block):
    """Extract and clean the content of the first <TEXT>...</TEXT> section.

    The opening tag is located first, and the closing tag is then searched
    for only *after* it, so a stray ``</TEXT>`` earlier in the input cannot
    produce an empty or inverted slice. Tag matching is case-sensitive, and
    only the first section is extracted.

    Args:
        block: String that may contain a ``<TEXT>...</TEXT>`` section.

    Returns:
        The result of ``clean_html_text`` applied to the content between the
        tags, or the string ``"Unable to locate <TEXT> tags"`` when no
        opening tag, or no closing tag after it, is found.
    """
    open_tag = '<TEXT>'
    close_tag = '</TEXT>'
    try:
        open_idx = block.index(open_tag) + len(open_tag)
        # Start the search at open_idx: a closing tag that precedes the
        # opening tag does not delimit this section.
        close_idx = block.index(close_tag, open_idx)
    except ValueError:
        return "Unable to locate <TEXT> tags"
    return clean_html_text(block[open_idx:close_idx])

def collect_and_consolidate_reports(zip_archive, output_text_file):
    """Consolidate AMZN 10-K submissions from a zip archive into one text file.

    Every archive member named ``AMZN/10-K/<folder>/full-submission.txt`` is
    read, passed through ``retrieve_and_clean_text`` and written to
    *output_text_file* under a ``### AMZN/10-K/<folder> ###`` heading, in the
    order the members appear in the archive.

    Args:
        zip_archive: Path of the zip archive to read (passed to
            ``zipfile.ZipFile``).
        output_text_file: Path of the text file to create or overwrite; it is
            written as UTF-8.

    Errors raised while opening the archive or the output file are not
    caught here and propagate to the caller.
    """
    submission_name = 'full-submission.txt'
    with zipfile.ZipFile(zip_archive, 'r') as archive:
        # Match the submission files themselves rather than directory
        # entries: zip archives are not required to store explicit directory
        # members, and member names always use '/' regardless of platform.
        # Exactly three slashes means the file sits directly inside a
        # first-level folder under AMZN/10-K/.
        submissions = [
            name for name in archive.namelist()
            if name.startswith('AMZN/10-K/')
            and name.endswith('/' + submission_name)
            and name.count('/') == 3
        ]

        # Explicit UTF-8 keeps the output independent of the platform's
        # default encoding.
        with open(output_text_file, 'w', encoding='utf-8') as output:
            for file_name in submissions:
                with archive.open(file_name) as submission:
                    # Replace undecodable bytes instead of aborting the whole
                    # run on a single malformed byte sequence.
                    raw_content = submission.read().decode('utf-8', errors='replace')
                cleaned_content = retrieve_and_clean_text(raw_content)
                folder_id = file_name[:-len(submission_name)].strip('/')
                output.write(f'### {folder_id} ###\n{cleaned_content}\n\n')

# Usage: these statements run as soon as this cell/module is executed.
# The archive path is hard-coded (presumably a Google Colab working directory;
# confirm before running elsewhere). The output path is relative, so the
# consolidated file is created in the current working directory.
zip_archive = '/content/filings.zip'
output_text_file = 'consolidated_html_reports.txt'
collect_and_consolidate_reports(zip_archive, output_text_file)
