# In [None]:
import json
import re
from pathlib import Path

# Step 1: Load the input JSONL file (one JSON object per line)
input_path = Path("qa_confidence_gte_085.jsonl")
qa_data = []
with input_path.open("r", encoding="utf-8") as f:
    for line in f:
        # Blank lines (typically a trailing newline at the end of the file)
        # are not valid JSON documents and would make json.loads raise, so
        # they are skipped instead of aborting the whole load.
        if not line.strip():
            continue
        qa_data.append(json.loads(line))

# Step 2: Utility functions for extraction and filtering
def extract_context_and_question(entry_input):
    """Pull the context and the question text out of a formatted prompt.

    The context is whatever sits between a ``### Context:`` line and the
    blank line preceding ``### Question:`` (it may span several lines).
    The question is the remainder of the line that starts with
    ``question:`` directly under the ``### Question:`` header.

    Returns:
        A ``(context, question)`` tuple of stripped strings; a part whose
        pattern is not found is returned as ``""``.
    """
    def _first_capture(pattern, flags=0):
        # Stripped first capture group, or "" when the pattern is absent.
        found = re.search(pattern, entry_input, flags)
        if found is None:
            return ""
        return found.group(1).strip()

    # DOTALL lets the lazy group run across newlines inside the context.
    context = _first_capture(r"### Context:\n(.*?)\n\n### Question:", re.DOTALL)
    # No DOTALL here: the capture stops at the end of the question line.
    question = _first_capture(r"### Question:\nquestion:\s*(.*)")
    return context, question

# Short acronyms also occur inside ordinary English words ("ue" in "value",
# "nr" in "unrelated"), so they only count when they are not embedded in a
# longer alphabetic word.  An optional plural "s" is accepted ("UEs", "gNBs").
# The pattern is applied to lower-cased text.
_TELECOM_ACRONYM_RE = re.compile(r"(?<![a-z])(?:ue|gnb|amf|qos|nr)s?(?![a-z])")

# Keywords matched as plain substrings, so variants such as "5GC",
# "networking" or "cellular" are still recognised.
_TELECOM_SUBSTRING_KEYWORDS = (
    "5g", "network", "radio", "cell", "coverage", "mobility", "handover",
    "frequency", "interface", "uplink", "downlink",
)

# Answers rejected outright (compared against the lower-cased answer).
_UNINFORMATIVE_ANSWERS = frozenset(
    {"yes", "no", "none", "unknown", "user equipment"}
)


def is_good_qa_pair(context, question, answer):
    """Apply quality checks on question, answer, and context.

    A pair is accepted only when all of the following hold:

    * question and answer are both non-empty;
    * the question has at least 4 whitespace-separated words and the answer
      at least 2;
    * the lower-cased answer is not one of the uninformative answers;
    * the answer occurs verbatim (case-sensitive) in the context;
    * the context mentions at least one telecom keyword.

    Short acronyms (ue, gnb, amf, qos, nr) must appear as standalone tokens,
    so that words like "value" or "unrelated" do not satisfy the keyword
    check; the remaining keywords are matched as case-insensitive substrings.

    Args:
        context: Passage the answer is expected to be taken from.
        question: Question text.
        answer: Answer text.

    Returns:
        True if the pair passes every check, otherwise False.
    """
    if not question or not answer:
        return False
    if len(question.split()) < 4 or len(answer.split()) < 2:
        return False
    if answer.lower() in _UNINFORMATIVE_ANSWERS:
        return False
    if answer not in context:
        return False

    # Lower-case once instead of once per keyword.
    lowered_context = context.lower()
    if _TELECOM_ACRONYM_RE.search(lowered_context):
        return True
    return any(kw in lowered_context for kw in _TELECOM_SUBSTRING_KEYWORDS)

def contains_figure_or_header_ref(question, context):
    """Report whether the pair refers to a figure or a numbered header.

    Returns True when the question contains the word "figure" (any casing)
    or when the context contains a dotted number such as ``6.7.3.2-1``.
    Note that the pattern needs only one dotted group, so a plain decimal
    like ``3.5`` in the context also counts.  Otherwise returns False.
    """
    # Cheap substring test first; the regex only runs when it fails.
    return (
        "figure" in question.lower()
        or re.search(r"\b\d+(\.\d+){1,5}(-\d+)?\.?\b", context) is not None
    )

# Step 3: Keep only the entries that pass every quality gate
final_filtered_qa = []
for entry in qa_data:
    context, question = extract_context_and_question(entry["input"])
    answer = entry["output"].strip()

    # Guard clauses: drop the entry as soon as one gate rejects it.
    if not is_good_qa_pair(context, question, answer):
        continue
    if contains_figure_or_header_ref(question, context):
        continue

    # Provenance fields are optional in the input; fall back to defaults.
    record = {
        "question": question,
        "answer": answer,
        "context": context,
        "source_doc": entry.get("source_doc", ""),
        "chunk_id": entry.get("chunk_id", -1),
    }
    final_filtered_qa.append(record)

# Step 4: Write the surviving QA pairs to a new JSONL file, one object per line
output_path = Path("final_filtered_qa.jsonl")  # destination file
with output_path.open("w", encoding="utf-8") as f:
    f.writelines(json.dumps(item) + "\n" for item in final_filtered_qa)

# Summary for the operator: how many pairs were kept and where they went.
print(f"✅ Done. Total high-quality QA pairs: {len(final_filtered_qa)}")
print(f"📁 Output saved to: {output_path.resolve()}")