# Working With DOCX Files

## 1. Module Import

In [35]:
from docx import Document
import os
from pathlib import Path
from docx.shared import Inches
from docx.oxml.ns import qn
from docx.shared import RGBColor
import json


## 2. Handling Data

## Task 1

In [36]:
# Input document and the root folder that receives every export of this notebook.
doc_path = Path('../data_import/docx_mock_file.docx')
target_path = Path('../data_export/file_docx')

# One sub-folder per kind of output.
target_text_path = target_path / 'text'
target_paragraph_details_path = target_path / 'paragraph_details'
target_convert_uppercase_path = target_path / 'convert_uppercase'

# Create every output folder up front. The text folder is included because the
# "Extract Text" cell opens a file inside it and open() does not create
# missing parent directories.
for export_dir in (
    target_path,
    target_text_path,
    target_paragraph_details_path,
    target_convert_uppercase_path,
):
    os.makedirs(export_dir, exist_ok=True)

### Extract Text

In [37]:
# Folder that receives the plain-text export written by this cell.
target_text_path = target_path.joinpath('text')

def extract_text_and_tables(doc_path):
    """Return the text of a document's paragraphs and tables as one string.

    The document is opened with ``Document(doc_path)``. Each entry of
    ``doc.paragraphs`` contributes one line. After all paragraphs, each row
    of each entry of ``doc.tables`` contributes one line in which the cell
    texts are separated by tab characters. Lines are joined with ``'\\n'``.

    Args:
        doc_path: Location of the document, passed unchanged to ``Document``.

    Returns:
        str: Paragraph lines followed by table-row lines.
    """
    document = Document(doc_path)

    # Paragraph lines come first; table rows are appended after them.
    lines = [paragraph.text for paragraph in document.paragraphs]
    lines.extend(
        '\t'.join(cell.text for cell in row.cells)
        for table in document.tables
        for row in table.rows
    )

    return '\n'.join(lines)

# Extract the document text and store it as a UTF-8 text file.
text = extract_text_and_tables(doc_path)
text_output_path = target_text_path.joinpath('extracted_text_and_tables.txt')
text_output_path.write_text(text, encoding='utf-8')

print(f"Text and tables have been extracted to {text_output_path}")

Text and tables have been extracted to ../data_export/file_docx/text/extracted_text_and_tables.txt


### Extract Paragraph Details

In [38]:
def rgb_to_hex(rgb):
    """Format an RGB triple as a lowercase six-digit hex string.

    Args:
        rgb: Indexable value; items 0, 1 and 2 are read as the red, green
            and blue components.

    Returns:
        str: Each component rendered as two-digit, zero-padded lowercase hex,
        concatenated without a separator (e.g. ``'ff0010'``).
    """
    red, green, blue = rgb[0], rgb[1], rgb[2]
    return f'{red:02x}{green:02x}{blue:02x}'

def extract_paragraph_details(doc_path):
    """Collect the text and per-run font attributes of every body paragraph.

    Only ``doc.paragraphs`` is read; tables are not visited by this version.

    Args:
        doc_path: Location of the document, passed unchanged to ``Document``.

    Returns:
        list[dict]: One dict per paragraph with the keys ``"text"`` (the
        paragraph text) and ``"runs"`` (a list of dicts, one per run, with
        the keys ``text``, ``font_name``, ``font_size``, ``bold``,
        ``italic``, ``underline`` and ``color``).
    """
    def describe_run(run):
        # Summarise a single run; "000000" is used when no RGB colour is set.
        font = run.font
        rgb = font.color.rgb if font.color else None
        return {
            "text": run.text,
            "font_name": font.name,
            "font_size": font.size.pt if font.size else None,
            "bold": font.bold,
            "italic": font.italic,
            "underline": font.underline,
            "color": rgb_to_hex(rgb) if rgb else "000000",
        }

    return [
        {
            "text": paragraph.text,
            "runs": [describe_run(run) for run in paragraph.runs],
        }
        for paragraph in Document(doc_path).paragraphs
    ]

def save_paragraph_details(paragraph_details, output_path):
    """Write ``paragraph_details`` to ``output_path`` as JSON.

    The file is opened for writing as UTF-8. The JSON is indented by four
    spaces and non-ASCII characters are written as-is rather than escaped.

    Args:
        paragraph_details: JSON-serialisable object to store.
        output_path: Destination file, passed to ``open``.
    """
    dump_options = {"ensure_ascii": False, "indent": 4}
    with open(output_path, mode='w', encoding='utf-8') as out_file:
        json.dump(paragraph_details, out_file, **dump_options)

# Extract the per-paragraph details and store them as JSON.
paragraph_details = extract_paragraph_details(doc_path)
paragraph_details_output_path = target_paragraph_details_path.joinpath('paragraph_details.json')
save_paragraph_details(paragraph_details, output_path=paragraph_details_output_path)
print(f"Paragraph details have been extracted to {paragraph_details_output_path}")

Paragraph details have been extracted to ../data_export/file_docx/paragraph_details/paragraph_details.json


In [39]:
def rgb_to_hex(rgb):
    """Return ``rgb`` as a six-digit lowercase hex string such as ``'ff0010'``.

    Items 0, 1 and 2 of ``rgb`` are read by index; each one is rendered as
    two-digit, zero-padded lowercase hex and the three parts are concatenated.
    """
    channels = (rgb[0], rgb[1], rgb[2])
    return ''.join(format(channel, '02x') for channel in channels)

def extract_paragraph_details(doc_path):
    """Collect paragraph details from the document body and from its tables.

    Body paragraphs (``doc.paragraphs``) are processed first. The paragraphs
    of every cell in every row of every entry of ``doc.tables`` follow. Each
    paragraph's runs and text are handed to
    ``extract_run_details_from_element`` and the returned entries are
    concatenated.

    Args:
        doc_path: Location of the document, passed unchanged to ``Document``.

    Returns:
        list: All entries produced by ``extract_run_details_from_element``,
        in the visiting order described above.
    """
    def iter_paragraphs(document):
        # Body paragraphs first, then table-cell paragraphs row by row.
        yield from document.paragraphs
        for table in document.tables:
            for row in table.rows:
                for cell in row.cells:
                    yield from cell.paragraphs

    collected = []
    for paragraph in iter_paragraphs(Document(doc_path)):
        collected.extend(
            extract_run_details_from_element(paragraph.runs, paragraph.text)
        )
    return collected

def extract_run_details_from_element(runs, text):
    """Describe one text element and the font attributes of its runs.

    Args:
        runs: Iterable of run objects; each must expose ``text`` and ``font``
            (with ``name``, ``size``, ``bold``, ``italic``, ``underline``
            and ``color``).
        text: Full text of the element the runs belong to.

    Returns:
        list[dict]: A single-item list. The item has the keys ``"text"``
        (the ``text`` argument) and ``"runs"`` (one dict per run with the
        keys ``text``, ``font_name``, ``font_size``, ``bold``, ``italic``,
        ``underline`` and ``color``).
    """
    run_entries = []
    for run in runs:
        font = run.font
        rgb = font.color.rgb if font.color else None
        run_entries.append({
            "text": run.text,
            "font_name": font.name,
            "font_size": font.size.pt if font.size else None,
            "bold": font.bold,
            "italic": font.italic,
            "underline": font.underline,
            # "000000" is used when no RGB colour is set on the run.
            "color": rgb_to_hex(rgb) if rgb else "000000",
        })

    return [{"text": text, "runs": run_entries}]

def save_paragraph_details(paragraph_details, output_path):
    """Serialise ``paragraph_details`` as JSON into the file ``output_path``.

    The file is opened in write mode with UTF-8 encoding. The JSON uses a
    four-space indent and keeps non-ASCII characters unescaped.

    Args:
        paragraph_details: JSON-serialisable object to store.
        output_path: Destination file, passed to ``open``.
    """
    json_file = open(output_path, 'w', encoding='utf-8')
    with json_file:
        json.dump(paragraph_details, fp=json_file, indent=4, ensure_ascii=False)

# Same output file as the previous cell; this run also covers table paragraphs.
paragraph_details_output_path = target_paragraph_details_path.joinpath(
    'paragraph_details.json'
)
paragraph_details = extract_paragraph_details(doc_path)
save_paragraph_details(
    paragraph_details=paragraph_details,
    output_path=paragraph_details_output_path,
)
print(f"Paragraph details have been extracted to {paragraph_details_output_path}")

Paragraph details have been extracted to ../data_export/file_docx/paragraph_details/paragraph_details.json


### Convert To Uppercase

In [40]:
# Destination file for the upper-cased copy of the source document.
output_doc_path = target_convert_uppercase_path.joinpath('converted_docx_mock_file.docx')

def create_uppercase_doc(doc_path, output_doc_path):
    """Write a copy of a document in which all run text is upper-cased.

    A new, empty document is created. Every body paragraph of the source is
    copied first (run by run), then every table. For each run the text is
    upper-cased and bold, italic, underline, font size, font name and RGB
    colour are carried over. Paragraph-level formatting is not copied, and
    all tables end up after all body paragraphs.

    Args:
        doc_path: Location of the source document, passed to ``Document``.
        output_doc_path: Where the new document is saved.
    """
    doc = Document(doc_path)
    new_doc = Document()

    def copy_run_formatting(src_run, dst_run):
        # Copy character formatting from src_run to dst_run. Size, name and
        # colour are only written when the source sets them.
        dst_run.bold = src_run.bold
        dst_run.italic = src_run.italic
        dst_run.underline = src_run.underline
        if src_run.font.size:
            dst_run.font.size = src_run.font.size
        if src_run.font.name:
            dst_run.font.name = src_run.font.name
        if src_run.font.color and src_run.font.color.rgb:
            dst_run.font.color.rgb = src_run.font.color.rgb

    def copy_runs_uppercase(src_para, dst_para):
        # Append an upper-cased, equally formatted copy of every run.
        for run in src_para.runs:
            new_run = dst_para.add_run(run.text.upper())
            copy_run_formatting(run, new_run)

    for para in doc.paragraphs:
        copy_runs_uppercase(para, new_doc.add_paragraph())

    for table in doc.tables:
        new_table = new_doc.add_table(rows=len(table.rows), cols=len(table.columns))
        new_table.style = table.style
        for row_idx, row in enumerate(table.rows):
            for cell_idx, cell in enumerate(row.cells):
                new_cell = new_table.cell(row_idx, cell_idx)
                for para_idx, para in enumerate(cell.paragraphs):
                    # python-docx creates each new table cell with one empty
                    # paragraph. Reuse it for the first source paragraph so the
                    # copied cell does not start with a spurious blank line.
                    if para_idx == 0 and new_cell.paragraphs:
                        new_para = new_cell.paragraphs[0]
                    else:
                        new_para = new_cell.add_paragraph()
                    copy_runs_uppercase(para, new_para)

    new_doc.save(output_doc_path)

# Build the upper-cased copy and report where it was saved.
create_uppercase_doc(doc_path=doc_path, output_doc_path=output_doc_path)
print(f"The document with uppercase text has been created at {output_doc_path}")

The document with uppercase text has been created at ../data_export/file_docx/convert_uppercase/converted_docx_mock_file.docx
