# English Correction with LangChain

# Overview

## Step 0: Configuring the Environment 

In [None]:
# Required Libraries
!pip install -r ../requirements.txt --quiet

In [None]:
# Standard Libraries
import os
import sys
from collections import defaultdict

# Add src directory to system path
sys.path.append(os.path.abspath('../src'))

# Internal Modules
from github_extractor import GitHubMarkdownProcessor
from utils import load_config_and_secrets
from utils import (
    load_config_and_secrets,
    initialize_llm,
)

### Define Constants and Paths

In [None]:
# Relative paths below are resolved against the notebook's working directory.
# Configuration file handed to load_config_and_secrets()
CONFIG_PATH = "../configs/configs.yaml"
# Secrets file handed to load_config_and_secrets()
SECRETS_PATH = "../configs/secrets.yaml"
# Model file handed to initialize_llm() together with the configured model source
# NOTE(review): presumably only used when the model source is local - confirm in utils.initialize_llm
LOCAL_MODEL_PATH = "/home/jovyan/datafabric/llama2-7b/ggml-model-f16-Q5_K_M.gguf"

### Configuration and Secrets Loading

In [None]:
# Load both files in one call. Later cells read "model_source" from `config`
# and "GITHUB_ACCESS_TOKEN" from `secrets`.
config, secrets = load_config_and_secrets(CONFIG_PATH, SECRETS_PATH)

## Step 1: Extracting and Parsing Markdown Files From GitHub Repositories

In [None]:
# Repo URL and token
repo_url = "https://github.com/hp-david/test/tree/main"
# NOTE(review): if `secrets` is a plain dict, .get() yields None when the key is
# absent - confirm GitHubMarkdownProcessor copes with a missing token.
access_token = secrets.get("GITHUB_ACCESS_TOKEN")

# Create processor instance
processor = GitHubMarkdownProcessor(repo_url=repo_url, access_token=access_token)

# Run preprocessing workflow
# `chunks` is consumed in Step 2 as a mapping of file path -> sequence of chunks.
chunks = processor.run()

## Step 2: Correct Markdown Files with an LLM

In [None]:
from core.prompt_templates import get_markdown_correction_prompt

# Get markdown correction prompt from prompt_templates module.
# The prompt is piped into the LLM below and invoked with a "markdown" input key.
correction_prompt = get_markdown_correction_prompt()

In [None]:
# The model source must come from the configuration. Fail immediately with a
# clear message instead of leaving `model_source` unbound, which would only
# surface later as an unrelated NameError when the LLM is initialized.
if "model_source" not in config:
    raise KeyError(
        "Missing required 'model_source' entry in the loaded configuration"
    )
model_source = config["model_source"]

# Initialize the LLM for the configured source; the local model path is passed
# along so initialize_llm() can use it when needed.
llm = initialize_llm(model_source, secrets, LOCAL_MODEL_PATH)

# Create the LLM chain with the correction prompt (prompt output feeds the LLM)
llm_chain = correction_prompt | llm

In [None]:
# Send every chunk of every file through the correction chain. Each record
# keeps the source file and the chunk's position so the corrected pieces can
# be stitched back together in their original order afterwards.
results = []

for source_file, pieces in chunks.items():
    results.extend(
        {
            "file": source_file,
            "chunk_index": position,
            "original": piece,
            "corrected": llm_chain.invoke({"markdown": piece}),
        }
        for position, piece in enumerate(pieces)
    )

In [None]:
# Print results during testing.
# Dumps the full list of records (file, chunk_index, original, corrected) for manual inspection.
print(results)

In [None]:
# Group corrected chunks by the file they came from, remembering each chunk's
# original position so the file can be reassembled in order.
grouped_chunks = defaultdict(list)

for item in results:
    grouped_chunks[item["file"]].append((item["chunk_index"], item["corrected"]))

output_dir = "corrected_output"

# Write one merged corrected markdown file per original file.
# The loop variable is deliberately not named `chunks`: that name holds the
# Step 1 extraction result, and overwriting it would break any re-run of the
# correction cell.
for file_path, indexed_chunks in grouped_chunks.items():
    # Sort on the chunk index only, so corrected values are never compared
    ordered = sorted(indexed_chunks, key=lambda pair: pair[0])
    # NOTE(review): join() requires the corrected values to be strings -
    # confirm the chain returns str for every configured model source.
    merged_markdown = "\n\n".join(text for _, text in ordered)

    # Insert the suffix just before the extension. str.replace(".md", ...)
    # would rewrite every ".md" occurrence in the path and would leave files
    # with a different extension unrenamed.
    root, ext = os.path.splitext(file_path)
    output_path = os.path.join(output_dir, f"{root}_corrected{ext}")

    # Create the full parent directory, not just the top-level output folder,
    # so files that live in sub-directories of the repository can be written.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(merged_markdown)