
# Academy data pipeline -- Data extraction from text to CSV

<h3>Extracting data from module text and saving it in a DataFrame: Approach</h3>
<ol type="1">
  <li> Reading the Document </li>
    <li> Structure the data: parse the document following its hierarchical structure (sections, slides, and media), keeping the text style </li>
    <li> Create DataFrame: convert this structured data into a pandas DataFrame </li>
    
</ol>

In [2]:
# Import libraries 
from docx import Document 
import pandas as pd
import os
import re

Read the doc

In [2]:
# Location of the sample .docx module used for testing
docx_path = 'Modules/sea-test.docx'

# Open the Word file with python-docx
document = Document(docx_path)

# Gather the plain text of every paragraph, in document order
doc_text = [paragraph.text for paragraph in document.paragraphs]

# Display the extracted text, one paragraph per line
print('\n'.join(doc_text))

#5.3.2 Energy Transition

*Slide* [Innovation in the Just Energy Transition]

**Photo**
img=access-1

**Text-narrow**

Energy lorem ipsum alo vil cu li li oppo energy sustainable etc Energy lorem ipsum alo vil cu li li oppo energy sustainable etc Energy lorem ipsum alo vil cu li li oppo energy sustainable 


**Text-wide**
Energy lorem ipsum alo vil cu li li oppo 
Energy lorem ipsum alo vil cu li li oppo energy sustainable etc
Energy lorem ipsum alo vil cu li li oppo energy sustainable etc

*Slide* [Energy Access]

**Photo-Box**

img=access-3

**Text**

Energy lorem ipsum alo vil 

**Diagram**

dgm=transition-2

Here the diagram shows that renewable energy is good.


**Text-dark-6**

Energy lorem ipsum alo vil cu li li oppo energy sustainable etc
Energy lorem ipsum alo vil cu li li oppo energy sustainable etc
Energy lorem ipsum alo vil cu li li oppo energy sustainable etc


**Text-dark-4**

Energy lorem ipsum alo vil cu li li oppo energy sustainable etc Energy lorem ipsum alo vil cu li 

### Parsing

In [23]:
def extract_text_with_styles(doc):
    """Extracts text from a DOCX document, preserving some style elements, including text size.

    Every run is wrapped in inline HTML for the formatting it carries:
    ``<b>``, ``<i>``, ``<u>``, a colour ``<span>`` when the run has an explicit
    RGB colour, and a font-size ``<span>`` when the run's size is above 12pt.
    Tags are closed in reverse order of opening so the markup nests correctly.

    Paragraphs whose style name starts with ``'Heading'`` and ends in a numeric
    level (e.g. ``'Heading 2'``) are wrapped in ``<hN>...</hN>``. Heading styles
    without a numeric level are emitted as plain paragraph text rather than as
    an invalid tag such as ``<hHeading>``.

    Args:
        doc: A document object exposing ``paragraphs``; each paragraph exposes
            ``runs`` and ``style.name``, and each run exposes ``text``, ``bold``,
            ``italic``, ``underline``, ``font.color.rgb`` and ``font.size``.

    Returns:
        str: One line per paragraph, joined with ``'\\n'``.
    """
    text = []
    for para in doc.paragraphs:
        para_text = ""
        for run in para.runs:
            style_prefix = ""
            style_suffix = ""
            # Basic styles; each closing tag is prepended so tags close in reverse order
            if run.bold:
                style_prefix += "<b>"
                style_suffix = "</b>" + style_suffix
            if run.italic:
                style_prefix += "<i>"
                style_suffix = "</i>" + style_suffix
            if run.underline:
                style_prefix += "<u>"
                style_suffix = "</u>" + style_suffix
            # Text color
            if run.font.color.rgb:
                color = run.font.color.rgb  # Color is an RGBColor object
                style_prefix += f'<span style="color:#{color}">'
                style_suffix = "</span>" + style_suffix
            # Text size
            if run.font.size:  # Font size is reported in Pt
                size_pt = run.font.size.pt
                if size_pt > 12:  # Assuming default text size is 12pt, customize as needed
                    style_prefix += f'<span style="font-size:{size_pt}pt;">'
                    style_suffix = "</span>" + style_suffix

            para_text += f"{style_prefix}{run.text}{style_suffix}"

        # Heading handling: only build <hN> when the style name carries a numeric level
        style_name = para.style.name
        level = style_name.split()[-1] if style_name.startswith('Heading') else ''
        if level.isdigit():
            text.append(f'<h{level}>{para_text}</h{level}>')
        else:
            text.append(para_text)
    return '\n'.join(text)


def parse_document(document_text):
    """Parse module text into a DataFrame with one row per content line.

    The text is processed line by line:

    * A line matching ``# <a.b.c> <title>`` starts a new section. It sets the
      section number/title, resets the slide counter to 0 and emits no row.
    * A line containing ``*Slide*`` starts a new slide. It increments the slide
      counter, takes the slide title from the first ``[...]`` on the line
      (empty string if there is none) and emits no row.
    * A line containing ``img=`` or ``dgm=`` emits a ``Photo`` / ``Diagram`` row
      whose ``Code`` is everything after that marker, stripped.
    * Every other line, including blank lines, emits a ``Text`` row holding the
      line unchanged.

    Args:
        document_text (str): Newline-separated document text.

    Returns:
        pandas.DataFrame: Columns ``Type``, ``Code``, ``Section Number``,
        ``Section Title``, ``Slide Number``, ``Slide Title``, ``Text`` and
        ``Order``. ``Order`` is a 1-based running counter over emitted rows
        that is not reset between sections or slides.
    """
    # Initialize variables
    data = []
    section_num, section_title, slide_title = '', '', ''
    slide_number = 0  # Slide numbering starts at 0 and increments each time a new slide is found
    order = 1  # Must exist before the loop: it is read and incremented for every emitted row

    for line in document_text.split('\n'):
        # Handle section headers
        section_header_match = re.match(r'#\s*(\d+\.\d+\.\d+)\s+(.*)', line)
        if section_header_match:
            section_num = section_header_match.group(1)
            section_title = section_header_match.group(2)
            slide_number = 0  # Reset slide number for each new section
            continue  # Skip adding a row for the section header

        if '*Slide*' in line:
            # Increment slide number for each new slide
            slide_number += 1
            # A slide marker without a [title] gets an empty title instead of raising
            title_match = re.search(r'\[([^\]]+)\]', line)
            slide_title = title_match.group(1) if title_match else ''
            continue  # Skip adding a row for the slide title

        # Determine type based on content
        if 'img=' in line or 'dgm=' in line:
            # Split once on the marker itself so additional '=' characters on the
            # line (inside the code or in inline style attributes) cannot break
            # the unpacking or the type detection.
            marker = 'img=' if 'img=' in line else 'dgm='
            code = line.split(marker, 1)[1]
            row_type = 'Photo' if marker == 'img=' else 'Diagram'
            data.append({
                'Type': row_type,
                'Code': code.strip(),
                'Section Number': section_num,
                'Section Title': section_title,
                'Slide Number': slide_number,
                'Slide Title': slide_title,
                'Text': '',
                'Order': order
            })
        else:
            # Add regular text content
            data.append({
                'Type': 'Text',
                'Code': '',
                'Section Number': section_num,
                'Section Title': section_title,
                'Slide Number': slide_number,
                'Slide Title': slide_title,
                'Text': line,
                'Order': order
            })
        order += 1

    return pd.DataFrame(data)

# Example of reading and processing a DOCX file:
# load the document, render it to styled text, then parse that text into a DataFrame.
doc_path = '../03_Inputs/Modules/sea-test.docx'  # Adjust this path if necessary
doc = Document(doc_path)
document_text = extract_text_with_styles(doc)
df = parse_document(document_text)
# Bare last expression so the notebook renders the resulting DataFrame as the cell output
df


Unnamed: 0,Type,Code,Section Number,Section Title,Slide Number,Slide Title,Text,Order
0,Text,,5.3.2,Energy Transition,0,,,1
1,Text,,5.3.2,Energy Transition,1,Innovation in the Just Energy Transition,,2
2,Text,,5.3.2,Energy Transition,1,Innovation in the Just Energy Transition,,3
3,Text,,5.3.2,Energy Transition,1,Innovation in the Just Energy Transition,**Photo**,4
4,Text,,5.3.2,Energy Transition,1,Innovation in the Just Energy Transition,,5
5,Photo,access-1,5.3.2,Energy Transition,1,Innovation in the Just Energy Transition,,6
6,Text,,5.3.2,Energy Transition,1,Innovation in the Just Energy Transition,,7
7,Text,,5.3.2,Energy Transition,1,Innovation in the Just Energy Transition,**Text-narrow**,8
8,Text,,5.3.2,Energy Transition,1,Innovation in the Just Energy Transition,,9
9,Text,,5.3.2,Energy Transition,1,Innovation in the Just Energy Transition,Energy lorem ipsum alo vil cu li li oppo energ...,10
