# Downloading Llama-2-7b-hf model

In [None]:
from huggingface_hub import snapshot_download

In [None]:
snapshot_download(
    repo_id="meta-llama/Llama-2-7b-hf",
    local_dir="/mnt/data/llama2-model",
    token=True
)

In [None]:
# Check for the downloads in EC2
ls -lh /mnt/data/llama2-model

# Test Pipeline 1 for Question Answer Generation Pipeline
Notebook Overview

This notebook builds a pipeline for generating and filtering question–answer (QA) pairs from technical telecom documents (DOCX format). The main steps are:

Model Loading

A T5-based Question Generation (QG) model generates candidate questions from text chunks.

A fine-tuned RoBERTa QA model (trained on telecom data) extracts answer spans from the same context.

Text Preprocessing

Raw text is cleaned to remove boilerplate (tables, figures, metadata, headers, etc.).

The text is split into overlapping chunks to preserve context for both QG and QA models.

QA Pair Generation

For each chunk, the longest sentences are highlighted to guide the QG model.

Generated questions are passed to the QA model to extract precise answers.

Low-confidence, trivial, or invalid answers are discarded.

Valid pairs are saved with metadata (context, question, answer, confidence).

Filtering for Quality

A post-processing step applies heuristics (confidence threshold, answer-in-context checks, vague question removal).

Only “good” QA pairs are retained and saved in JSONL format.

Output

Two JSONL files are produced:

Raw QA pairs (qa_output.jsonl)

Filtered high-quality QA pairs (qa_output_filtered_good.jsonl)

This ensures that only reliable, domain-grounded QA pairs are preserved for downstream use, such as fine-tuning telecom-specific extractive QA models.

In [1]:
import json
from pathlib import Path
from transformers import pipeline
from tqdm import tqdm
import nltk
import re
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from docx import Document
import nltk
nltk.download("punkt")

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Load models
qg_model = pipeline("text2text-generation", model="mrm8488/t5-base-finetuned-question-generation-ap", device=0, batch_size=8)
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# Load fine-tuned telecom QA model
model_path = "/home/ec2-user/qa_roberta_telecom"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForQuestionAnswering.from_pretrained(model_path)

qa_model = pipeline("question-answering", model=model, tokenizer=tokenizer, device=0, batch_size=16)

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://g

In [3]:
# Clean DOCX text
def clean_technical_text(raw_text: str) -> str:
    cleaned_lines = []
    for line in raw_text.split("\n"):
        line = line.strip()
        if not line or len(line.split()) < 5:
            continue
        if re.match(r"^(figure|fig\.|table)\s*\d+", line, re.IGNORECASE):
            continue
        if re.match(r"^\d+(\.\d+){0,4}\s+[A-Z]", line):
            continue
        if any(kw in line.lower() for kw in ["3gpp", "etsi", "confidential", "page", "table of contents", "index", "appendix"]):
            continue
        if line.isupper() and len(line.split()) < 10:
            continue
        cleaned_lines.append(line)
    return " ".join(cleaned_lines)

In [4]:
# Chunking with overlap
def split_with_overlap(text, max_words=150, overlap=30):
    words = text.split()
    chunks = []
    i = 0
    while i < len(words):
        chunk = words[i:i + max_words]
        chunks.append(" ".join(chunk))
        i += max_words - overlap
    return chunks

In [5]:
import re

def select_highlight_sentence(chunk):
    # Split by punctuation followed by a space and uppercase letter
    sentences = re.split(r'(?<=[.?!])\s+(?=[A-Z])', chunk.strip())
    sentences = [s.strip() for s in sentences if len(s.strip()) > 10]

    if not sentences:
        return None
    if len(sentences) == 1:
        return sentences[0]

    # Fallback scoring: use longest sentence
    highlight = max(sentences, key=lambda s: len(s))
    return highlight

In [None]:
def generate_qa_pairs_from_text(text: str, source: str):
    chunks = split_with_overlap(text, max_words=250, overlap=50)
    qa_pairs = []

    for i, chunk in enumerate(tqdm(chunks, desc="Generating QA pairs")):
        # Skip likely TOC or meta sections
        if re.search(r'\d+\.\d+.*?:?', chunk) and len(chunk) < 400:
            continue
        if sum(c.isdigit() for c in chunk) / max(len(chunk), 1) > 0.3:
            continue

        # Try top 2 longest sentences as highlight options
        sentences = re.split(r'(?<=[.?!])\s+(?=[A-Z])', chunk.strip())
        sentences = [s.strip() for s in sentences if len(s.strip()) > 10]
        top_sentences = sorted(sentences, key=len, reverse=True)[:2]

        for highlight_sent in top_sentences:
            q_input = f"highlight: {chunk.replace(highlight_sent, f'<hl> {highlight_sent} <hl>')}"

            try:
                question = qg_model(q_input, max_new_tokens=64)[0]['generated_text']

                # Skip vague prompts
                if question.lower().startswith("what is the highlight"):
                    continue

                result = qa_model({'question': question, 'context': chunk})

                if result['score'] < 0.2:
                    continue
                answer = result['answer'].strip()

                # Answer must be extractive (in context) and non-trivial
                if len(answer) < 3 or answer.lower() not in chunk.lower():
                    continue

            except Exception as e:
                print(f"[ERROR] Chunk {i}: {e}")
                continue

            qa_pairs.append({
                "instruction": "Extract the correct answer span from the telecom document context.",
                "input": f"### Task: extractive_qa\n### Context:\n{chunk}\n\n### Question:\n{question}\n\n### Answer:",
                "output": answer,
                "source_doc": source,
                "chunk_id": i,
                "confidence": round(result['score'], 3)
            })

    return qa_pairs

In [None]:
# Process DOCX file
def process_docx_file(input_docx_path, output_jsonl_path):
    input_path = Path(input_docx_path)
    doc = Document(input_path)
    full_text = "\n".join([p.text for p in doc.paragraphs])
    cleaned = clean_technical_text(full_text)
    qa_pairs = generate_qa_pairs_from_text(cleaned, source=input_path.name)

    with open(output_jsonl_path, 'w', encoding='utf-8') as f:
        for pair in qa_pairs:
            f.write(json.dumps(pair, ensure_ascii=False) + '\n')

    print(f" Saved {len(qa_pairs)} QA pairs to {output_jsonl_path}")

In [8]:
process_docx_file("/mnt/data/Datasets/38104-j00.docx", "/mnt/data/qa_output.jsonl")

Generating QA pairs:  15%|███▏                 | 34/228 [00:10<01:11,  2.72it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Generating QA pairs: 100%|████████████████████| 228/228 [04:09<00:00,  1.10s/it]

🔍 Total Paragraphs: 3634
✅ QA Pairs: 275
🚫 Skipped: 1486 | ❌ Errors: 0
✅ Saved 275 QA pairs to /mnt/data/qa_output.jsonl





In [9]:
# STEP: Filter high-quality QA pairs and save to new JSONL file
import pandas as pd

def filter_good_qa_pairs(input_jsonl_path, output_jsonl_path):
    # Load QA pairs
    qa_pairs = []
    with open(input_jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            qa_pairs.append(json.loads(line))

    qa_df = pd.DataFrame(qa_pairs)

    # Extract question and context for filtering
    qa_df["question"] = qa_df["input"].str.extract(r"### Question:\n(.+?)\n\n### Answer:")
    qa_df["context"] = qa_df["input"].str.extract(r"### Context:\n(.+?)\n\n### Question:", flags=re.DOTALL)

    # Heuristic filters
    qa_df["low_confidence_flag"] = qa_df["confidence"].astype(float) < 0.4
    qa_df["mismatch_flag"] = ~qa_df.apply(lambda row: row["output"].lower() in row["context"].lower(), axis=1)
    qa_df["vague_question_flag"] = qa_df["question"].str.lower().str.startswith("what is the highlight")

    # Final label
    qa_df["quality"] = "good"
    qa_df.loc[
        qa_df["low_confidence_flag"] | qa_df["mismatch_flag"] | qa_df["vague_question_flag"],
        "quality"
    ] = "poor"

    # Filter and save
    good_qa_df = qa_df[qa_df["quality"] == "good"]
    with open(output_jsonl_path, "w", encoding="utf-8") as fout:
        for record in good_qa_df.to_dict(orient="records"):
            json.dump(record, fout, ensure_ascii=False)
            fout.write("\n")

    print(f" Saved {len(good_qa_df)} good QA pairs to {output_jsonl_path}")

# Run it on the generated file
filter_good_qa_pairs("/mnt/data/qa_output.jsonl", "/mnt/data/qa_output_filtered_good.jsonl")

✅ Saved 154 good QA pairs to /mnt/data/qa_output_filtered_good.jsonl


# Test Pipeline 2

In [6]:
def generate_qa_pairs_from_paragraphs(paragraphs: list[str], source: str, batch_size=16):
    qa_pairs = []
    counters = {
        "total_paragraphs": len(paragraphs),
        "qa_generated": 0,
        "qa_fallback": 0,
        "skipped_filters": 0,
        "errors": 0
    }

    for i in tqdm(range(0, len(paragraphs), batch_size), desc="Generating QA pairs"):
        batch_paras = paragraphs[i:i+batch_size]
        q_inputs = []
        batch_indices = []

        for j, para in enumerate(batch_paras):
            idx = i + j
            if len(para.split()) < 5:
                continue
            if re.search(r'\d+\.\d+.*?:?', para) and len(para) < 400:
                continue
            if sum(c.isdigit() for c in para) / max(len(para), 1) > 0.3:
                continue

            sentences = re.split(r'(?<=[.?!])\s+(?=[A-Z])', para)
            if len(sentences) < 2:
                sentences = para.split(".")

            sentences = [s.strip() for s in sentences if len(s.strip().split()) >= 5]
            top_sentences = sorted(sentences, key=len, reverse=True)[:2]

            for sent in top_sentences:
                highlight = para.replace(sent, f"<hl> {sent} <hl>")
                q_inputs.append(f"highlight: {highlight}")
                batch_indices.append((idx, para, sent, sentences))

        # Generate questions
        if not q_inputs:
            continue

        try:
            questions = qg_model(q_inputs, max_new_tokens=64)
        except Exception as e:
            print(f"[ERROR] QG batch failed: {e}")
            counters["errors"] += len(q_inputs)
            continue

        # Format QA batch inputs
        qa_inputs = [
            {"question": q["generated_text"], "context": para}
            for (_, para, _, _), q in zip(batch_indices, questions)
            if not q["generated_text"].lower().startswith("what is the highlight")
        ]

        if not qa_inputs:
            continue

        try:
            questions_list = [qa["question"] for qa in qa_inputs]
            contexts_list = [qa["context"] for qa in qa_inputs]

            answers = qa_model(question=questions_list, context=contexts_list)

            if isinstance(answers, dict):
                answers = [answers]
        except Exception as e:
            print(f"[ERROR] QA batch failed: {e}")
            counters["errors"] += len(qa_inputs)
            continue

        for (idx, para, sent, sentences), question, result in zip(batch_indices, questions, answers):
            question_text = question["generated_text"]
            if question_text.lower().startswith("what is the highlight"):
                continue

            answer = result["answer"].strip()
            score = result["score"]

            # Apply filters
            if score < 0.15 or len(answer) < 4 or not any(c.isalnum() for c in answer):
                counters["skipped_filters"] += 1
                continue
            if answer.lower() in ["yes", "no", "maybe"]:
                counters["skipped_filters"] += 1
                continue
            if not any(answer.lower() in s.lower() for s in sentences):
                counters["skipped_filters"] += 1
                continue

            qa = {
                "instruction": "Extract the correct answer span from the telecom document context.",
                "input": f"### Task: extractive_qa\n### Context:\n{para}\n\n### Question:\n{question_text}\n\n### Answer:",
                "output": answer,
                "source_doc": source,
                "chunk_id": idx,
                "confidence": round(score, 3)
            }
            qa_pairs.append(qa)
            counters["qa_generated"] += 1

    print(f" Total Paragraphs: {counters['total_paragraphs']}")
    print(f" QA Pairs: {counters['qa_generated']}")
    print(f" Skipped: {counters['skipped_filters']} |  Errors: {counters['errors']}")
    return qa_pairs


In [7]:
# DOCX Processing Wrapper
def process_docx_file(input_docx_path, output_jsonl_path):
    input_path = Path(input_docx_path)
    doc = Document(input_path)
    paragraphs = [p.text.strip() for p in doc.paragraphs if p.text.strip()]
    qa_pairs = generate_qa_pairs_from_paragraphs(paragraphs, source=input_path.name)

    with open(output_jsonl_path, 'w', encoding='utf-8') as f:
        for pair in qa_pairs:
            f.write(json.dumps(pair, ensure_ascii=False) + '\n')

    print(f" Saved {len(qa_pairs)} QA pairs to {output_jsonl_path}")