# Working With DOCX Files

## 1. Module Import

In [2]:
from docx import Document
import docx2txt
import os
from pathlib import Path

## 2. Handling Data

## Task 1

In [3]:
# Source document to read, and the root folder that receives this notebook's exports.
doc_path = '../data_import/docx_mock_file.docx'
target_path = Path('../data_export/file_docx')

# Create the export root (and any missing parents); re-running the cell is harmless.
target_path.mkdir(parents=True, exist_ok=True)

#### Extract Text

In [4]:
# Sub-folder of the export root that holds the plain-text outputs.
target_text_path = target_path / 'text'
# Opening a file in 'w' mode does not create missing parent folders, so make
# sure the folder exists before anything is written into it. Without this the
# write below fails with FileNotFoundError on a fresh checkout.
target_text_path.mkdir(parents=True, exist_ok=True)

def extract_text(doc_path) -> str:
    """Return the text of every paragraph in a .docx file, one per line.

    Args:
        doc_path: Location of the document; handed unchanged to ``Document``.

    Returns:
        The ``text`` of each entry in ``Document(doc_path).paragraphs``,
        joined with newline characters. Empty paragraphs contribute an
        empty line. Table content is not included.
    """
    document = Document(doc_path)
    paragraph_lines = [paragraph.text for paragraph in document.paragraphs]
    return '\n'.join(paragraph_lines)

# Extract the paragraph text from the source document and save it as UTF-8.
text = extract_text(doc_path)
text_output_path = target_text_path / 'extracted_text.txt'
with text_output_path.open(mode='w', encoding='utf-8') as f:
    f.write(text)

print(f"Text has been extracted to {text_output_path}")

Text has been extracted to ../data_export/file_docx/text/extracted_text.txt


In [5]:
# Sub-folder of the export root that holds the plain-text outputs.
target_text_path = target_path / 'text'
# Opening a file in 'w' mode does not create missing parent folders, so make
# sure the folder exists before this cell writes into it. exist_ok makes the
# call a no-op when the folder is already there.
target_text_path.mkdir(parents=True, exist_ok=True)

def extract_text_and_tables(doc_path) -> str:
    """Return paragraph text followed by table contents from a .docx file.

    Args:
        doc_path: Location of the document; handed unchanged to ``Document``.

    Returns:
        A newline-joined string. The ``text`` of every entry in the
        document's ``paragraphs`` comes first, then one line per table row
        with that row's cell texts separated by tab characters.

    Note:
        All paragraph lines are emitted before any table line, so the
        result does not reflect where tables sit between paragraphs in the
        document.
    """
    document = Document(doc_path)

    paragraph_lines = [paragraph.text for paragraph in document.paragraphs]

    # One tab-separated line per row, tables taken in the order given by
    # ``document.tables``.
    table_lines = [
        '\t'.join(cell.text for cell in row.cells)
        for table in document.tables
        for row in table.rows
    ]

    return '\n'.join(paragraph_lines + table_lines)

# Extract paragraph and table text from the source document and save it as UTF-8.
text = extract_text_and_tables(doc_path)
text_output_path = target_text_path / 'extracted_text_and_tables.txt'
with text_output_path.open(mode='w', encoding='utf-8') as f:
    f.write(text)

print(f"Text and tables have been extracted to {text_output_path}")

Text and tables have been extracted to ../data_export/file_docx/text/extracted_text_and_tables.txt
