<h1 style="text-align: center; font-size: 50px;"> Run Workflow </h1>

# Notebook Overview
- Configure the Environment
- Define Constants and Paths
- Load Configuratons and Secrets
- Extract Markdown Files with Placeholders
- Parse Markdown Files
- Chunk Markdown Content
- Initialize Model
- Invoke Model on Each Chunk
- Save Results
- Log Execution Time

## Step 0: Configure the Environment

In [1]:
import logging
import time

# Configure logger
logger: logging.Logger = logging.getLogger("run_workflow_logger")
logger.setLevel(logging.INFO)
logger.propagate = False  # Prevent duplicate logs from parent loggers

# Set formatter
formatter: logging.Formatter = logging.Formatter(
    fmt="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S"
)

# Configure and attach stream handler
stream_handler: logging.StreamHandler = logging.StreamHandler()
stream_handler.setFormatter(formatter)
logger.addHandler(stream_handler)

In [2]:
start_time = time.time()  

logger.info("Notebook execution started.")

2025-08-08 08:47:35 - INFO - Notebook execution started.


In [3]:
%%time

# Required Libraries
%pip install -r ../requirements.txt --quiet

Note: you may need to restart the kernel to use updated packages.
CPU times: user 58 ms, sys: 31.6 ms, total: 89.5 ms
Wall time: 2.27 s


In [4]:
# Standard Libraries
import os
import sys
import difflib
import json
import re
import yaml
from datetime import datetime
from collections import defaultdict
from typing import List
from pathlib import Path

# Add src directory to system path
sys.path.append(str(Path("..").resolve() / "src"))

# Internal Modules
from github_extractor import GitHubMarkdownProcessor
from utils import load_config_and_secrets, initialize_llm
from parser import parse_md_for_grammar_correction, restore_placeholders
from chunker import chunk_markdown
from prompt_templates import get_markdown_correction_prompt
from markdown_correction_service import MarkdownCorrectionService

# Other modules
import mlflow
from mlflow.models import evaluate

In [5]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


### Define Constants and Paths

In [6]:
CONFIG_PATH = Path("../configs/configs.yaml")
SECRETS_PATH = Path("../configs/secrets.yaml")
LOCAL_MODEL_PATH = Path("/home/jovyan/datafabric/meta-llama3.1-8b-Q8/Meta-Llama-3.1-8B-Instruct-Q8_0.gguf")

### Load Configurations and Secrets

In [7]:
with open(CONFIG_PATH, "r") as f:
    config = yaml.safe_load(f)

# Load secrets 
github_token = os.getenv("AIS_GITHUB_ACCESS_TOKEN")
if github_token:
    secrets = {"AIS_GITHUB_ACCESS_TOKEN": github_token}
    logger.info("Loaded GITHUB_ACCESS_TOKEN from environment variable.")
else:
    secrets_path = SECRETS_PATH
    if os.path.exists(secrets_path):
        with open(secrets_path, "r") as f:
            secrets = yaml.safe_load(f)
        logger.info(f"Loaded secrets from {secrets_path}.")
    else:
        # If no token is found anywhere, initialize with an empty dict or handle as an error
        secrets = {}
        logger.warning("No GITHUB_ACCESS_TOKEN found in environment or secrets.yaml.")



## Step 1: Extracting and Parsing Markdown Files From GitHub Repositories

### Extract Markdown Files

In [8]:
# Define repo URL and GitHub access token
repo_url = "https://github.com/hp-david/test/tree/main" #"https://github.com/HPInc/AI-Blueprints" 
access_token = secrets.get("AIS_GITHUB_ACCESS_TOKEN")

# Create processor instance
processor = GitHubMarkdownProcessor(repo_url=repo_url, access_token=access_token)

# Run preprocessing workflow
markdowns = processor.run()

2025-08-08 08:47:43 - INFO - Repository visibility: public
2025-08-08 08:47:43 - INFO - Repository visibility: public
2025-08-08 08:47:45 - INFO - Raw markdown extraction complete.
2025-08-08 08:47:45 - INFO - Raw markdown extraction complete.


### Parse Markdown Files with Placeholders

In [9]:
parsed_markdowns = {}
placeholder_maps = {}

for filename, content in markdowns.items():
    # Parse the content and get placeholder map
    placeholder_map, processed_content = parse_md_for_grammar_correction(content)
    
    # Store the processed content (maintains dictionary structure for chunker)
    parsed_markdowns[filename] = processed_content
    
    # Store the placeholder map for restoration
    placeholder_maps[filename] = placeholder_map

logger.info(f"Parsed {len(parsed_markdowns)} files successfully")

2025-08-08 08:47:45 - INFO - Parsed 5 files successfully
2025-08-08 08:47:45 - INFO - Parsed 5 files successfully


### Chunk Markdown Content

In [10]:
all_chunks = {}  

# Chunk each file's content and store the results in a dictionary
for file_name, content in parsed_markdowns.items():
    chunks = chunk_markdown(content)
    all_chunks[file_name] = chunks

### Display Chunks (Optional)

In [11]:
'''
for file_name, chunks in all_chunks.items():
    logger.info(f"\n===== {file_name} =====\n")
    for i, chunk in enumerate(chunks):
        logger.info(f"\n--- Chunk {i+1} ---\n")
        logger.info(chunk)
        logger.info("\n" + "-" * 40 + "\n")
'''

'\nfor file_name, chunks in all_chunks.items():\n    logger.info(f"\n===== {file_name} =====\n")\n    for i, chunk in enumerate(chunks):\n        logger.info(f"\n--- Chunk {i+1} ---\n")\n        logger.info(chunk)\n        logger.info("\n" + "-" * 40 + "\n")\n'

## Step 2: Correct Markdown Files with LLM

In [12]:
# Get markdown correction prompt from prompt_templates module
correction_prompt = get_markdown_correction_prompt()

### Initialize Mode

In [13]:
%%time

# Set the model source from the configs
if "model_source" in config:
    model_source = config["model_source"]

# Initialize llm 
llm = initialize_llm(model_source, secrets, str(LOCAL_MODEL_PATH))

# Create the LLM chain with the correction prompt
llm_chain = correction_prompt | llm

                low_vram was transferred to model_kwargs.
                Please confirm that low_vram is what you intended.
  exec(code, glob, local_ns)
                rope_scaling was transferred to model_kwargs.
                Please confirm that rope_scaling is what you intended.
  exec(code, glob, local_ns)
                num_threads was transferred to model_kwargs.
                Please confirm that num_threads is what you intended.
  exec(code, glob, local_ns)


CPU times: user 3.44 s, sys: 3.65 s, total: 7.09 s
Wall time: 1min 45s


### Invoke Model on Each Chunk

In [14]:
%%time

results = []
count = 0

# Process each chunk through the language model and store the results
for file_name, chunks in all_chunks.items():  
    for chunk in chunks:
        # Send the chunks to the llm for correction
        response = llm_chain.invoke({"markdown": chunk})

        # Store the file name, original text, and corrected text
        results.append({
            "file": file_name,
            "original": chunk,
            "corrected": response
        })

        # Log progress (optional)
        logger.info(f"chunk {count} done")
        count += 1

2025-08-08 08:49:33 - INFO - chunk 0 done
2025-08-08 08:49:33 - INFO - chunk 0 done
2025-08-08 08:49:36 - INFO - chunk 1 done
2025-08-08 08:49:36 - INFO - chunk 1 done
2025-08-08 08:49:38 - INFO - chunk 2 done
2025-08-08 08:49:38 - INFO - chunk 2 done
2025-08-08 08:49:42 - INFO - chunk 3 done
2025-08-08 08:49:42 - INFO - chunk 3 done
2025-08-08 08:49:46 - INFO - chunk 4 done
2025-08-08 08:49:46 - INFO - chunk 4 done
2025-08-08 08:49:48 - INFO - chunk 5 done
2025-08-08 08:49:48 - INFO - chunk 5 done
2025-08-08 08:49:50 - INFO - chunk 6 done
2025-08-08 08:49:50 - INFO - chunk 6 done
2025-08-08 08:49:57 - INFO - chunk 7 done
2025-08-08 08:49:57 - INFO - chunk 7 done
2025-08-08 08:50:01 - INFO - chunk 8 done
2025-08-08 08:50:01 - INFO - chunk 8 done
2025-08-08 08:50:17 - INFO - chunk 9 done
2025-08-08 08:50:17 - INFO - chunk 9 done
2025-08-08 08:50:19 - INFO - chunk 10 done
2025-08-08 08:50:19 - INFO - chunk 10 done
2025-08-08 08:50:21 - INFO - chunk 11 done
2025-08-08 08:50:21 - INFO - ch

CPU times: user 2min 54s, sys: 1.35 s, total: 2min 56s
Wall time: 2min 56s


### Display Corrected Chunks (Optional)

In [15]:
'''
for result in results:
    original_text = result["original"]
    corrected_text = result["corrected"]
    
    original_tokens = len(llm.client.tokenize(original_text.encode("utf-8")))
    corrected_tokens = len(llm.client.tokenize(corrected_text.encode("utf-8")))

    logger.info(f"\n===== {result['file']} =====\n")
    logger.info(f"--- Original ({original_tokens} tokens) ---\n")
    logger.info(original_text)
    logger.info(f"\n--- Corrected ({corrected_tokens} tokens) ---\n")
    logger.info(corrected_text)
    logger.info("\n" + "=" * 60 + "\n")
'''

'\nfor result in results:\n    original_text = result["original"]\n    corrected_text = result["corrected"]\n\n    original_tokens = len(llm.client.tokenize(original_text.encode("utf-8")))\n    corrected_tokens = len(llm.client.tokenize(corrected_text.encode("utf-8")))\n\n    logger.info(f"\n===== {result[\'file\']} =====\n")\n    logger.info(f"--- Original ({original_tokens} tokens) ---\n")\n    logger.info(original_text)\n    logger.info(f"\n--- Corrected ({corrected_tokens} tokens) ---\n")\n    logger.info(corrected_text)\n    logger.info("\n" + "=" * 60 + "\n")\n'

### Save Results

#### Save Raw Corrrected Markdowns

In [16]:
# Helper: Safe chunk joiner.
def safe_join_chunks(chunks: List[str]) -> str:
    """Rejoins a list of text chunks into a single string, preserving formatting and sentence boundaries.

    Ensures that chunks split mid-sentence get a space inserted appropriately.

    Args:
        chunks (List[str]): A list of processed text segments.

    Returns:
        str: The reassembled markdown text.
    """
    joined = ""
    for i, chunk in enumerate(chunks):
        if i == 0:
            joined += chunk
        else:
            prev = chunks[i - 1].rstrip()
            curr = chunk

            # Heuristic: Detect if a sentence was split across two chunks.
            if prev.endswith('.') and re.match(r'^[A-Z\"]', curr.lstrip()):
                # If it's a sentence break, add a single space to separate them.
                joined += ' ' + curr.lstrip()
            else:
                # Otherwise, join the chunk directly.
                joined += curr  
    return joined


# Group corrected chunks by file
corrected_chunks_by_file = defaultdict(list)
for result in results:
    corrected_chunks_by_file[result["file"]].append(result["corrected"])

# Rebuild each file from its corrected chunks with smart joining
rebuilt_corrected_files = {
    file_name: safe_join_chunks(chunks)
    for file_name, chunks in corrected_chunks_by_file.items()
}

# Create output directory
output_dir = Path("corrected")
output_dir.mkdir(parents=True, exist_ok=True)

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Restore placeholders and write final output
for file_name, corrected_content in rebuilt_corrected_files.items():
    placeholder_map = placeholder_maps.get(file_name, {})
    restored_content = restore_placeholders(corrected_content, placeholder_map)

    file_path = Path(file_name)

    # Construct the output path while preserving the original directory structure.
    output_path = output_dir / file_path.parent / f"{file_path.stem}_{timestamp}{file_path.suffix}"

    # This line is now crucial for creating the subdirectories.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(restored_content)

#### Save Corrected Markdowns with Diff

In [17]:
# Cell 2 (Corrected)

diff_output_dir = Path("corrected-diffs")
diff_output_dir.mkdir(parents=True, exist_ok=True)

for file_name, corrected_content in rebuilt_corrected_files.items():
    placeholder_map = placeholder_maps.get(file_name, {})
    restored_content = restore_placeholders(corrected_content, placeholder_map)

    # Get original content from markdowns dict
    original_content = markdowns.get(file_name)
    if original_content is None:
        logger.info(f"Warning: No original content for file {file_name}")
        continue

    # Create unified diff view (HTML side-by-side)
    differ = difflib.HtmlDiff(tabsize=4, wrapcolumn=80)
    diff_html = differ.make_file(
        original_content.splitlines(),
        restored_content.splitlines(),
        fromdesc=f"Original: {file_name}",
        todesc=f"Corrected: {file_name}",
        context=True,
        numlines=3
    )

    file_path = Path(file_name)

    # Construct the diff path while preserving the original directory structure.
    diff_path = diff_output_dir / file_path.parent / f"{file_path.stem}_{timestamp}{file_path.suffix}.html"
    
    # Ensure the subdirectories exist in the diffs folder too.
    diff_path.parent.mkdir(parents=True, exist_ok=True)
    
    with open(diff_path, "w", encoding="utf-8") as f:
        f.write(diff_html)

#### Save Chunks Into a JSON for Evaluation

In [18]:
results_path = Path("results.json")
with results_path.open("w", encoding="utf-8") as f:
    json.dump(results, f)

### Log Execution Time

In [19]:
end_time: float = time.time()
elapsed_time: float = end_time - start_time
elapsed_minutes: int = int(elapsed_time // 60)
elapsed_seconds: float = elapsed_time % 60

logger.info(f"⏱️ Total execution time: {elapsed_minutes}m {elapsed_seconds:.2f}s")
logger.info("✅ Notebook execution completed successfully.")

2025-08-08 08:52:27 - INFO - ⏱️ Total execution time: 4m 52.07s
2025-08-08 08:52:27 - INFO - ⏱️ Total execution time: 4m 52.07s
2025-08-08 08:52:27 - INFO - ✅ Notebook execution completed successfully.
2025-08-08 08:52:27 - INFO - ✅ Notebook execution completed successfully.
