In [1]:
import re
import os


In [2]:
#Define Basic Cleaning Functions

def remove_span_tags(text):
    """
    Strip empty page-anchor span elements from the text.

    Every empty span whose id begins with "page-", for example
       <span id="page-12-3"></span>
    is deleted. All other text (including spans that have content or a
    different id) is returned untouched.
    """
    empty_page_span = re.compile(r'<span\s+id="page-[^"]+"\s*>\s*</span>')
    return empty_page_span.sub('', text)

def remove_inline_anchors(text):
    """
    Delete in-document anchor targets such as  (#page-48-0)  from the text.

    Any whitespace directly in front of the parenthesised anchor is removed
    together with it; parentheses that do not start with "#page-" are kept.
    """
    page_anchor = re.compile(r'\s*\(#page-[^)]+\)')
    return page_anchor.sub('', text)




In [3]:
# Process Image Captions

def process_image_captions(lines):
    """
    Merge figure captions into the alt text of the image they follow.

    For every line that starts with a markdown image (``![alt](path)``), the
    next non-blank line is inspected. If that line starts with "Figure"
    (case-insensitive), optionally preceded by a page-anchor span
    (``<span id="page-...">`` with or without its closing ``</span>``), the
    image is rewritten as ``![IMG_TITLE: <caption>](path)`` — e.g.
    ``![IMG_TITLE: Figure 4.10 : WAT vs COMPASS hours](img.png)`` — and the
    caption line, together with the blank lines in between, is dropped.
    The image's original alt text is discarded in that case. Any text that
    follows the image on the same line is preserved.

    Args:
        lines: list of markdown lines (without line terminators).

    Returns:
        A new list of lines; the input list is not modified.
    """
    image_re = re.compile(r'^!\[(.*?)\]\((.*?)\)')
    # The anchor span may be left unclosed (wrapping the caption) or be an
    # empty "<span ...></span>" placed directly in front of the caption.
    caption_re = re.compile(
        r'^(?:<span\s+id="page-[^"]+"\s*>\s*(?:</span>\s*)?)?(Figure.*)',
        re.IGNORECASE,
    )

    new_lines = []
    i = 0
    while i < len(lines):
        line = lines[i]
        img_match = image_re.match(line)
        if img_match:
            img_path = img_match.group(2)
            # Look ahead for the next nonempty line.
            j = i + 1
            while j < len(lines) and lines[j].strip() == "":
                j += 1
            if j < len(lines):
                caption_match = caption_re.match(lines[j].strip())
                if caption_match:
                    caption_text = caption_match.group(1).strip()
                    # Keep whatever followed the image on its line (a second
                    # image, attributes, text) instead of silently dropping it.
                    trailing = line[img_match.end():]
                    new_lines.append(
                        f'![IMG_TITLE: {caption_text}]({img_path}){trailing}'
                    )
                    i = j + 1  # Skip the blank lines and the caption line.
                    continue
        new_lines.append(line)
        i += 1
    return new_lines


In [4]:
# Process Table Titles and Column Headers

def _nonempty_cells(row_text):
    """Split a pipe-delimited row (or row fragment) into stripped, non-empty cells."""
    return [c.strip() for c in row_text.strip().strip("|").split("|") if c.strip()]


def _single_table_markers(table_block):
    """Build marker lines for a table whose header row carries a "Column:" title cell."""
    marker_lines = []
    header_line = table_block[0].strip()
    first_cell = header_line.strip("|").split("|")[0].strip()
    table_title = ""
    if first_cell.startswith("Column:"):
        table_title = first_cell.replace("Column:", "").strip()
    # The third row (after header and separator) is assumed to hold the column headers.
    col_cells = _nonempty_cells(table_block[2]) if len(table_block) >= 3 else []
    if table_title:
        marker_lines.append(f"TABLE_TITLE: {table_title}")
    if col_cells:
        marker_lines.append("TABLE_COLUMNS: " + ", ".join(col_cells))
    return marker_lines


def _side_by_side_markers(table_block):
    """Build marker lines for sub-tables separated by a double pipe ("||" or "| |")."""
    double_pipe = r'\|\s*\|'
    header_line = table_block[0].strip()
    parts = [p.strip() for p in re.split(double_pipe, header_line) if p.strip()]
    if len(table_block) >= 3:
        col_parts = [
            p.strip()
            for p in re.split(double_pipe, table_block[2].strip())
            if p.strip()
        ]
    else:
        col_parts = []

    marker_lines = []
    for idx, part in enumerate(parts):
        sub_cells = _nonempty_cells(part)
        table_title = sub_cells[0] if sub_cells else ""
        col_cells = []
        if idx < len(col_parts):
            # Drop the row's outer pipes first so they never end up as
            # bogus "|" column names.
            fragment = col_parts[idx].strip("|")
            if "|" in fragment:
                # The sub-table has real cells: one column per cell.
                col_cells = _nonempty_cells(fragment)
            else:
                # Single cell: assume the column names are space separated.
                col_cells = fragment.split()
        if table_title:
            marker_lines.append(f"TABLE_TITLE: {table_title}")
        if col_cells:
            marker_lines.append("TABLE_COLUMNS: " + ", ".join(col_cells))
    return marker_lines


def process_table_titles(lines):
    """
    Process table blocks to extract each table's title and column headers.

    A table block is a run of consecutive lines whose stripped text starts
    with "|". Marker lines are inserted directly above each block; the block
    itself is passed through unchanged.

    -- Case 1 (Single table):
         The header row contains "Column:".
         - The title is the first cell with "Column:" removed (only when the
           first cell starts with "Column:").
         - The third row is assumed to hold the column headers.
         - Inserted marker lines:
              TABLE_TITLE: <table title>
              TABLE_COLUMNS: <col1>, <col2>, ...

    -- Case 2 (Side-by-side tables):
         The header row has no "Column:" but contains a double pipe
         ("||" or "| |") separating several sub-tables.
         - The header row is split on the double pipe; the first non-empty
           cell of each part is that sub-table's title.
         - The third row is split the same way; each fragment yields the
           sub-table's columns (one per cell, or whitespace-separated names
           when the fragment is a single cell). Pipe delimiters are never
           emitted as column names.
         - For each sub-table, inserted marker lines:
              TABLE_TITLE: <title>
              TABLE_COLUMNS: <col1>, <col2>, ...

    Tables matching neither case get no marker lines.

    Args:
        lines: list of markdown lines (without line terminators).

    Returns:
        A new list of lines with marker lines inserted.
    """
    new_lines = []
    i = 0
    while i < len(lines):
        if not lines[i].strip().startswith("|"):
            new_lines.append(lines[i])
            i += 1
            continue

        # Gather the entire table block.
        table_block = []
        while i < len(lines) and lines[i].strip().startswith("|"):
            table_block.append(lines[i])
            i += 1

        header_line = table_block[0].strip()
        if "Column:" in header_line:
            marker_lines = _single_table_markers(table_block)
        elif re.search(r'\|\s*\|', header_line):
            marker_lines = _side_by_side_markers(table_block)
        else:
            # Header matches no expected pattern: emit the table as-is.
            marker_lines = []

        new_lines.extend(marker_lines)
        new_lines.extend(table_block)
    return new_lines




In [5]:
# Combine Preprocessing Steps

def preprocess_markdown(md_text):
    """
    Run the full cleaning pipeline over a markdown document.

    Stages, in order:
      1. Strip empty page-anchor span tags.
      2. Strip inline "(#page-...)" anchors.
      3. Fold figure captions into the alt text of the preceding image.
      4. Insert TABLE_TITLE / TABLE_COLUMNS marker lines above table blocks.

    The text is split with str.splitlines() and re-joined with "\\n", so the
    result has no trailing newline. Returns the cleaned markdown text.
    """
    # Text-level cleanup runs on the whole document before it is split.
    without_anchors = remove_inline_anchors(remove_span_tags(md_text))

    # Line-level passes run in sequence; each takes and returns a list of lines.
    md_lines = without_anchors.splitlines()
    for line_pass in (process_image_captions, process_table_titles):
        md_lines = line_pass(md_lines)

    return "\n".join(md_lines)




In [6]:
# Path of the original markdown file to preprocess.
input_file_path = r'D:\ML\thesis_chatbot\Data\out\Cost_Estimation_with_Machine_learning_Piyush_Chikhale\Cost_Estimation_with_Machine_learning_Piyush_Chikhale.md'
# Destination of the cleaned markdown (an absolute path, not the working directory).
output_file_path = r"D:\ML\thesis_chatbot\Data\out\preprocessed_md.md"

# Load the source document as UTF-8 text.
with open(input_file_path, encoding='utf-8') as md_in:
    original_md = md_in.read()

# Run the cleaning pipeline.
processed_md = preprocess_markdown(original_md)

# Persist the result, overwriting any previous output file.
with open(output_file_path, 'w', encoding='utf-8') as md_out:
    md_out.write(processed_md)

print(f"Preprocessing complete. Output written to {output_file_path}")


Preprocessing complete. Output written to D:\ML\thesis_chatbot\Data\out\preprocessed_md.md
