# In [4]:
import re
import json
import pandas as pd

def parse_document(text: str) -> dict:
    """Parse numbered document text into a mapping of sections to entries.

    Three kinds of non-blank lines are recognised (after stripping):

    * Section headings such as ``"1. Vehicle type"``. Each one starts a new
      key ``"<num>. <title>"`` in the result whose value is a list.
    * Subsection headings such as ``"1.1 Sedan"``. Each one appends
      ``{"sub": "<num> <content>", "values": []}`` to the current section.
      A subsection seen before any section heading is ignored.
    * Any other line is split into columns on tabs or runs of two or more
      whitespace characters. The columns are appended to the ``"values"`` of
      the current subsection if there is one, otherwise directly to the
      current section's list. Lines seen before any section are ignored.

    Args:
        text: The full document text; lines are separated with
            ``str.splitlines``.

    Returns:
        A dict keyed by section heading. Each value is a list holding
        subsection dicts and/or column lists, in document order. A repeated
        section heading replaces the earlier entry.
    """
    section_re = re.compile(r'^(\d+)\.\s*(.+)')
    subsection_re = re.compile(r'^(\d+\.\d+)\s*(.+)')
    column_split_re = re.compile(r'\t+|\s{2,}')

    result = {}
    current_section = ""
    current_subsection = ""

    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue

        # The subsection pattern must be tried first: the section pattern
        # also matches "1.1 foo" (as section "1" titled "1 foo"), which would
        # otherwise make this branch unreachable.
        # NOTE(review): a data row that starts with a decimal number
        # (e.g. "12.50 ...") also matches here; confirm against real data.
        subsection_match = subsection_re.match(line)
        if subsection_match:
            if current_section:
                sub_num = subsection_match.group(1)
                content = subsection_match.group(2).strip()
                current_subsection = f"{sub_num} {content}"
                result[current_section].append({
                    "sub": current_subsection,
                    "values": [],
                })
            continue

        section_match = section_re.match(line)
        if section_match:
            section_num = section_match.group(1)
            title = section_match.group(2).strip()
            current_section = f"{section_num}. {title}"
            result[current_section] = []
            # A new section has no subsection yet; without this reset, rows
            # that follow would be attributed to the previous section's
            # last subsection state and then dropped.
            current_subsection = ""
            continue

        # Plain data row: split by tab characters or double spaces.
        cols = column_split_re.split(line)
        if current_subsection:
            # While a subsection is active, the section list's last element
            # is always that subsection's dict.
            result[current_section][-1]["values"].append(cols)
        elif current_section:
            # No subsection yet: attach the row directly to the section.
            result[current_section].append(cols)

    return result

# Read every sheet of the workbook into a {sheet_name: DataFrame} mapping.
# A forward slash is used as the separator because it works on both Windows
# and POSIX, whereas a backslash is only a separator on Windows.
file_path = "Data real/Credit Policy - CTVGMHL.xlsx"
# header=None keeps the first row of each sheet as data. With the default
# (header=0) that row becomes the column labels, which to_string(header=False)
# below then discards, silently losing the first line of every sheet.
excel_data = pd.read_excel(file_path, sheet_name=None, header=None)

# Render each sheet as plain text (one line per row) and concatenate them.
# Empty sheets are skipped: to_string() on an empty frame returns an
# "Empty DataFrame ..." placeholder that would otherwise be parsed as data.
# NOTE(review): empty cells are rendered as "NaN" by to_string(); confirm
# whether those tokens are wanted in the parsed output.
text_data = "".join(
    df.to_string(index=False, header=False) + "\n"
    for df in excel_data.values()
    if not df.empty
)

# Turn the flat text into the nested section/subsection structure.
structured_data = parse_document(text_data)

# Save as UTF-8 JSON; ensure_ascii=False keeps non-ASCII text readable.
with open("structured_loan_data.json", "w", encoding="utf-8") as f:
    json.dump(structured_data, f, ensure_ascii=False, indent=2)
