# Notebook Overview

This notebook implements a pipeline for generating high-quality question–answer (QA) pairs from technical telecom documents (in .docx format). The process is designed to prepare training data for extractive QA tasks using domain-specific text. The workflow includes the following steps:

Text Cleaning:
Raw text from a Word document is cleaned to remove boilerplate content such as figure/table captions, headers, page numbers, and irrelevant metadata (e.g., “draft”, “confidential”). This ensures that only meaningful technical sentences remain.

Text Chunking
The cleaned text is split into overlapping word-based chunks (default: 150 words with 30-word overlap). This helps models handle long documents while preserving context continuity.

Model Loading:

A T5-based Question Generation (QG) model generates candidate questions by highlighting key sentences in each chunk.

A RoBERTa-based Question Answering (QA) model (fine-tuned on telecom data) extracts precise answer spans from the same context.

QA Pair Generation
For each chunk:

Candidate sentences are selected and highlighted.

The QG model proposes questions.

The QA model extracts answer spans.

Generated pairs are validated and low-quality outputs (too short, low confidence, trivial answers like “yes/no”) are skipped.
Each valid pair is saved with metadata such as confidence score and source document.

Filtering High-Quality QA Pairs
After generation, the pairs are further filtered using rules (minimum confidence, context–answer consistency, vague question checks). Only “good” pairs are retained.

Output
The final QA pairs are saved in JSONL format, ready for downstream tasks such as fine-tuning extractive QA models or building telecom-specific LLM datasets.

In [None]:
# Install Libraries
import re
import json
from pathlib import Path
from typing import List
from docx import Document
from tqdm import tqdm
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering

In [2]:
# Step 1: Clean Text
def clean_technical_text_v2(raw_text: str) -> str:
    cleaned_lines = []
    for line in raw_text.split("\n"):
        line = line.strip()
        if not line or len(line.split()) < 5:
            continue
        if re.match(r"^(figure|fig\.|table)\s*\d+", line, re.IGNORECASE):
            continue
        if re.match(r"^\d+(\.\d+){0,4}\s+[A-Z]", line):
            continue
        if any(kw in line.lower() for kw in [
            "3gpp", "etsi", "confidential", "page", "table of contents", 
            "index", "appendix", "draft", "document history"
        ]):
            continue
        if line.isupper() and len(line.split()) < 10:
            continue
        line = re.sub(r"\[\d+\]", "", line)
        line = re.sub(r"\(ETSI[^)]+\)", "", line)
        cleaned_lines.append(line)
    return " ".join(cleaned_lines)

In [3]:
# Step 2: Chunking
def split_with_overlap(text: str, max_words: int = 150, overlap: int = 30) -> List[str]:
    words = text.split()
    chunks = []
    i = 0
    while i < len(words):
        chunk = words[i:i + max_words]
        chunks.append(" ".join(chunk))
        i += max_words - overlap
    return chunks

def prepare_chunks_from_docx(docx_path: str, max_words=150, overlap=30) -> List[str]:
    doc = Document(docx_path)
    raw_text = "\n".join(p.text for p in doc.paragraphs if p.text.strip())
    cleaned = clean_technical_text_v2(raw_text)
    chunks = split_with_overlap(cleaned, max_words=max_words, overlap=overlap)
    return chunks

In [4]:
# Step 3: Load Models
qg_model = pipeline("text2text-generation", model="mrm8488/t5-base-finetuned-question-generation-ap", device=0, batch_size=8)
model_path = "/home/ec2-user/qa_roberta_telecom"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForQuestionAnswering.from_pretrained(model_path)
qa_model = pipeline("question-answering", model=model, tokenizer=tokenizer, device=0, batch_size=16)

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://g

In [5]:
# Step 4: Generate QA
def generate_qa_pairs_from_chunks(chunks: List[str], source: str, batch_size=16) -> List[dict]:
    qa_pairs = []
    counters = {"total_chunks": len(chunks), "qa_generated": 0, "qa_skipped": 0, "errors": 0}

    for i in tqdm(range(0, len(chunks), batch_size), desc="Generating QA pairs"):
        batch = chunks[i:i+batch_size]
        q_inputs = []
        for chunk in batch:
            sentences = re.split(r'(?<=[.?!])\s+(?=[A-Z])', chunk.strip())
            sentences = [s.strip() for s in sentences if len(s.split()) >= 5]
            if len(sentences) < 2:
                continue
            top_sentences = sorted(sentences, key=len, reverse=True)[:2]
            for sent in top_sentences:
                highlighted = chunk.replace(sent, f"<hl> {sent} <hl>")
                q_inputs.append(f"highlight: {highlighted}")

        if not q_inputs:
            continue

        try:
            questions = qg_model(q_inputs, max_new_tokens=64)
        except Exception as e:
            print(f"[ERROR] QG batch failed: {e}")
            counters["errors"] += len(q_inputs)
            continue

        qa_inputs = [
            {"question": q["generated_text"], "context": ctx}
            for q, ctx in zip(questions, batch * 2)
            if not q["generated_text"].lower().startswith("what is the highlight")
        ]

        try:
            question_list = [qa["question"] for qa in qa_inputs]
            context_list = [qa["context"] for qa in qa_inputs]
            answers = qa_model(question=question_list, context=context_list)
            if isinstance(answers, dict):
                answers = [answers]
        except Exception as e:
            print(f"[ERROR] QA batch failed: {e}")
            counters["errors"] += len(qa_inputs)
            continue

        for question, context, result in zip(question_list, context_list, answers):
            answer = result["answer"].strip()
            score = result["score"]

            if score < 0.2 or len(answer) < 4 or not any(c.isalnum() for c in answer):
                counters["qa_skipped"] += 1
                continue
            if answer.lower() in ["yes", "no", "maybe"]:
                counters["qa_skipped"] += 1
                continue
            if answer.lower() in question.lower():
                counters["qa_skipped"] += 1
                continue
            if answer.lower() not in context.lower():
                counters["qa_skipped"] += 1
                continue

            qa = {
                "instruction": "Extract the correct answer span from the telecom document context.",
                "input": f"### Task: extractive_qa\n### Context:\n{context}\n\n### Question:\n{question}\n\n### Answer:",
                "output": answer,
                "source_doc": source,
                "confidence": round(score, 3)
            }
            qa_pairs.append(qa)
            counters["qa_generated"] += 1

    print(f" Total Chunks: {counters['total_chunks']}")
    print(f" QA Pairs Generated: {counters['qa_generated']}")
    print(f" Skipped: {counters['qa_skipped']} |  Errors: {counters['errors']}")
    return qa_pairs

In [6]:
# Step 5: Full Pipeline Runner
def process_docx_file(input_docx_path: str, output_jsonl_path: str):
    input_path = Path(input_docx_path)
    chunks = prepare_chunks_from_docx(input_docx_path)
    qa_pairs = generate_qa_pairs_from_chunks(chunks, source=input_path.name)

    with open(output_jsonl_path, 'w', encoding='utf-8') as f:
        for pair in qa_pairs:
            f.write(json.dumps(pair, ensure_ascii=False) + '\n')

    print(f" Saved {len(qa_pairs)} QA pairs to {output_jsonl_path}")

In [7]:
process_docx_file("/mnt/data/Datasets/38104-j00.docx", "/mnt/data/qa_output_refactored.jsonl")

Generating QA pairs:  38%|████████▍             | 10/26 [00:35<00:59,  3.74s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Generating QA pairs: 100%|██████████████████████| 26/26 [01:35<00:00,  3.66s/it]

🔍 Total Chunks: 415
✅ QA Pairs Generated: 48
🚫 Skipped: 732 | ❌ Errors: 0
✅ Saved 48 QA pairs to /mnt/data/qa_output_refactored.jsonl





In [8]:
import pandas as pd

def filter_good_qa_pairs(input_jsonl_path: str, output_jsonl_path: str, min_confidence: float = 0.4):
    """
    Filter high-quality QA pairs using confidence, answer-context match, and vague question checks.
    Saves only 'good' pairs to a new JSONL file.
    """
    qa_pairs = []
    with open(input_jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            qa_pairs.append(json.loads(line))

    qa_df = pd.DataFrame(qa_pairs)

    # Extract question and context for filtering
    qa_df["question"] = qa_df["input"].str.extract(r"### Question:\n(.+?)\n\n### Answer:")
    qa_df["context"] = qa_df["input"].str.extract(r"### Context:\n(.+?)\n\n### Question:", flags=re.DOTALL)

    # Flags
    qa_df["low_confidence"] = qa_df["confidence"].astype(float) < min_confidence
    qa_df["mismatch"] = ~qa_df.apply(lambda row: row["output"].lower() in row["context"].lower(), axis=1)
    qa_df["vague_q"] = qa_df["question"].str.lower().str.startswith("what is the highlight")
    qa_df["short_q"] = qa_df["question"].str.split().str.len() < 4
    qa_df["answer_in_question"] = qa_df.apply(lambda row: row["output"].lower() in row["question"].lower(), axis=1)

    # Final flag
    qa_df["quality"] = "good"
    qa_df.loc[
        qa_df["low_confidence"] | qa_df["mismatch"] | qa_df["vague_q"] | qa_df["short_q"] | qa_df["answer_in_question"],
        "quality"
    ] = "poor"

    # Filter and save
    good_df = qa_df[qa_df["quality"] == "good"]
    with open(output_jsonl_path, "w", encoding="utf-8") as fout:
        for record in good_df.to_dict(orient="records"):
            json.dump(record, fout, ensure_ascii=False)
            fout.write("\n")

    print(f"Filtered {len(good_df)} high-quality QA pairs saved to {output_jsonl_path}")


In [9]:
filter_good_qa_pairs(
    input_jsonl_path="/mnt/data/qa_output_refactored.jsonl",
    output_jsonl_path="/mnt/data/qa_output_filtered_good.jsonl"
)


✅ Filtered 32 high-quality QA pairs saved to /mnt/data/qa_output_filtered_good.jsonl
