# English Correction with LangChain

# Overview

## Step 0: Configuring the Environment 

In [None]:
# Required Libraries
# Installs every package listed in ../requirements.txt (path is relative to
# this notebook). `--quiet` trims pip's output so the cell stays readable.
!pip install -r ../requirements.txt --quiet

In [None]:
# Standard Libraries
import os
import sys
import logging
from collections import defaultdict

# Add src directory to system path
sys.path.append(os.path.abspath('../src'))

# Internal Modules
from github_extractor import GitHubMarkdownProcessor
from utils import load_config_and_secrets
from utils import (
    load_config_and_secrets,
    initialize_llm,
)
from parser import parse_md_for_grammar_correction
from chunker import chunk_markdown
from core.prompt_templates import get_markdown_correction_prompt
from core.markdown_correction_service import MarkdownCorrectionService

# Other modules
import mlflow
from mlflow.models import evaluate
import pandas as pd

### Define Constants and Paths

In [None]:
# Absolute path to the quantized GGUF model file, passed to initialize_llm()
# and to MarkdownCorrectionService.log_model() further down.
LOCAL_MODEL_PATH = "/home/jovyan/datafabric/llama2-7b/ggml-model-f16-Q5_K_M.gguf"

# YAML files read by load_config_and_secrets(); paths are relative to the notebook.
SECRETS_PATH = "../configs/secrets.yaml"
CONFIG_PATH = "../configs/configs.yaml"

### Configuration and Secrets Loading

In [None]:
# Load both YAML files in one call, then split the returned pair:
# `config` is consulted for "model_source" later on, and `secrets` supplies
# the GitHub token and is forwarded to initialize_llm().
_loaded_settings = load_config_and_secrets(CONFIG_PATH, SECRETS_PATH)
config, secrets = _loaded_settings

In [None]:
# Create Logger
# A dedicated, named logger keeps the notebook's messages apart from the
# root logger used by third-party libraries.
logger = logging.getLogger("english-correction-notebook")
logger.setLevel(logging.INFO)

formatter = logging.Formatter(
    "%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

stream_handler = logging.StreamHandler()
stream_handler.setFormatter(formatter)

# logging.getLogger() returns the same object on every call, so re-running
# this cell would otherwise stack one more handler each time and print every
# message multiple times. Drop whatever is attached before adding ours.
for _existing_handler in list(logger.handlers):
    logger.removeHandler(_existing_handler)
logger.addHandler(stream_handler)

# Do not forward records to the root logger; that would duplicate output
# whenever the root logger has handlers of its own.
logger.propagate = False

## Step 1: Extracting and Parsing Markdown Files From GitHub Repositories

### Extract Markdown Files

In [None]:
# Credentials come from the secrets file; .get() yields None when the key is absent.
access_token = secrets.get("GITHUB_ACCESS_TOKEN")

# Repository (branch view) whose markdown files will be corrected.
repo_url = "https://github.com/hp-david/test/tree/main"

# Build the processor and run its preprocessing workflow in one go.
# `markdowns` is iterated with .items() as (filename, content) pairs below.
processor = GitHubMarkdownProcessor(
    repo_url=repo_url,
    access_token=access_token,
)
markdowns = processor.run()

### Parse Markdown Files with Placeholders

In [None]:
# Per-file outputs of the parsing step, both keyed by filename:
#   parsed_markdowns -> processed text that is handed to the chunker
#   placeholder_maps -> placeholder map kept for later restoration
parsed_markdowns, placeholder_maps = {}, {}

for filename, content in markdowns.items():
    # The parser returns (placeholder_map, processed_content), in that order.
    (
        placeholder_maps[filename],
        parsed_markdowns[filename],
    ) = parse_md_for_grammar_correction(content)

logger.info(f"Parsed {len(parsed_markdowns)} files successfully")

### Chunk Markdown Content

In [None]:
# Split every parsed file into chunks of at most 100 tokens, keyed by file name.
all_chunks = {
    file_name: chunk_markdown(content, max_tokens=100)
    for file_name, content in parsed_markdowns.items()
}

# Print chunks during testing
divider = "\n" + "-" * 40 + "\n"
for file_name, chunks in all_chunks.items():
    logger.info(f"\n===== {file_name} =====\n")
    # Chunks are numbered from 1 in the log output.
    for number, chunk in enumerate(chunks, start=1):
        logger.info(f"\n--- Chunk {number} ---\n")
        logger.info(chunk)
        logger.info(divider)

## Step 2: Correct Markdown Files with LLM

In [None]:
# Get markdown correction prompt from prompt_templates module
# The returned prompt is later composed with the LLM (`correction_prompt | llm`)
# and the resulting chain is invoked with a {"markdown": <chunk>} mapping.
# NOTE(review): presumably the template declares a `markdown` input variable;
# confirm in core.prompt_templates.
correction_prompt = get_markdown_correction_prompt()

### Initialize Model

In [None]:
# Resolve which model backend to use from the loaded configuration.
# Fail fast when the key is missing: previously `model_source` was simply left
# unbound, which surfaced as a NameError on the initialize_llm() call or, in a
# notebook, silently reused a stale value from an earlier run.
if "model_source" not in config:
    raise KeyError(
        f"'model_source' is missing from the configuration loaded from {CONFIG_PATH}"
    )
model_source = config["model_source"]

# Initialize llm
# NOTE(review): LOCAL_MODEL_PATH is always passed; presumably it is only used
# when model_source selects a local model. Confirm in utils.initialize_llm.
llm = initialize_llm(model_source, secrets, LOCAL_MODEL_PATH)

# Create the LLM chain with the correction prompt: the `|` operator pipes the
# prompt's output into the model, so the chain is invoked with prompt inputs.
llm_chain = correction_prompt | llm

### Invoke Model on Each Chunk

In [None]:
# One record per chunk: source file, text sent to the model, and the model's reply.
results = []

for file_name, chunks in all_chunks.items():
    # extend() consumes the generator item by item, so records produced before
    # a failing invoke() call are still kept in `results`.
    results.extend(
        {
            "file": file_name,
            "original": chunk,
            "corrected": llm_chain.invoke({"markdown": chunk}),
        }
        for chunk in chunks
    )

In [None]:
# Print results during testing
# Each record is emitted as six log messages: file banner, "Original" label,
# original text, "Corrected" label, corrected text, closing rule.
closing_rule = "\n" + "=" * 60 + "\n"

for entry in results:
    messages = (
        f"\n===== {entry['file']} =====\n",
        "--- Original ---\n",
        entry["original"],
        "\n--- Corrected ---\n",
        entry["corrected"],
        closing_rule,
    )
    for message in messages:
        logger.info(message)

## MLflow Logging and Evaluation

### Register the Model with MLflow

In [None]:
# Name under which the model is stored in the MLflow Model Registry.
# Defined once so the register call and the log message cannot drift apart.
registered_model_name = "MarkdownCorrector"

mlflow.set_experiment("markdown-correction-experiment")

with mlflow.start_run(run_name="markdown-correction-run") as run:
    # Log the correction service together with the model file and the
    # config/secrets YAML paths it needs.
    MarkdownCorrectionService.log_model(
        llm_artifact=LOCAL_MODEL_PATH,
        config_yaml=CONFIG_PATH,
        secrets_yaml=SECRETS_PATH,
    )

    # NOTE(review): assumes log_model() stores the model under the artifact
    # path "markdown_corrector" -- confirm in MarkdownCorrectionService.
    model_uri = f"runs:/{run.info.run_id}/markdown_corrector"
    mlflow.register_model(model_uri, registered_model_name)

    logger.info(f"Model registered: {registered_model_name}")

### Testing MLflow LLM Evaluation

In [None]:
from mlflow.metrics import ari_grade_level, flesch_kincaid_grade_level, exact_match, rouge1, rougeL

# One evaluation row per chunk: the text sent to the model ("markdown") and
# the correction obtained earlier from the LLM chain ("corrected").
# NOTE(review): the targets are earlier model outputs, not human-written
# references, so exact_match/rouge measure agreement with that earlier run
# rather than correctness -- confirm this is the intended evaluation.
eval_df = pd.DataFrame([
    {"markdown": item["original"], "corrected": item["corrected"]}
    for item in results
])

# Run the evaluation and the metric logging inside one explicit run (the run
# that logged the model). Calling mlflow.log_metrics() with no active run
# implicitly starts a new run that is never ended.
with mlflow.start_run(run_id=run.info.run_id):
    # Stored under its own name: assigning to `results` would overwrite the
    # per-chunk list and break this cell (and the print cell) on re-run.
    eval_results = mlflow.evaluate(
        model=model_uri,
        data=eval_df,
        targets="corrected",
        extra_metrics=[
            ari_grade_level(),
            flesch_kincaid_grade_level(),
            exact_match(),
            rouge1(),
            rougeL(),
        ],
    )

    logger.info("Evaluation results:")
    logger.info(eval_results.metrics)

    # Kept from the original notebook. NOTE(review): mlflow.evaluate() is
    # documented to log its metrics to the active run already, so this call
    # may be redundant -- confirm before removing.
    mlflow.log_metrics(eval_results.metrics)