# Test Pipeline 2

In [1]:
import json
import re
from pathlib import Path
from docx import Document
from tqdm import tqdm
import nltk
import torch
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import (
    T5Tokenizer, T5ForConditionalGeneration,
    AutoTokenizer, AutoModelForQuestionAnswering,
    pipeline
)

nltk.download("punkt")

# Load Models
# Question-generation side: T5 tokenizer and seq2seq model fetched by name,
# with the model moved to the CUDA device.
qg_tokenizer = T5Tokenizer.from_pretrained("mrm8488/t5-base-finetuned-question-generation-ap")
qg_model = T5ForConditionalGeneration.from_pretrained("mrm8488/t5-base-finetuned-question-generation-ap").to("cuda")

# Extractive QA side: tokenizer and model loaded from a filesystem path
# (presumably a RoBERTa model fine-tuned on telecom text, judging by the
# directory name -- confirm).
qa_tokenizer = AutoTokenizer.from_pretrained("/home/ec2-user/qa_roberta_telecom")
qa_model = AutoModelForQuestionAnswering.from_pretrained("/home/ec2-user/qa_roberta_telecom").to("cuda")
# device=0 puts the pipeline on the first CUDA device (the cell output reports cuda:0).
qa_pipeline = pipeline("question-answering", model=qa_model, tokenizer=qa_tokenizer, device=0)

# Text Cleaning
def clean_technical_text(raw_text: str) -> str:
    """Strip boilerplate from document text and flatten it into one string.

    The text is processed line by line. A line is dropped when it:

    * has fewer than five whitespace-separated words (this includes blanks),
    * contains any boilerplate keyword as a case-insensitive substring,
    * starts with a figure/table caption prefix followed by a number, or
    * is entirely upper-case and shorter than ten words.

    A line that looks like a numbered heading (``4.2 Title ...``) is not
    emitted on its own; it is remembered and prepended, followed by ". ", to
    the next line that survives the filters.

    Args:
        raw_text: Newline-separated text of the document.

    Returns:
        The surviving lines joined by single spaces.
    """
    boilerplate_keywords = (
        "confidential", "etsi", "3gpp", "table of contents",
        "appendix", "index", "page", "internal use only",
    )
    # Compiled once instead of being looked up again for every line.
    caption_re = re.compile(r"^(figure|fig\.|table)\s*\d+", re.IGNORECASE)
    heading_re = re.compile(r"^\d+(\.\d+){0,4}\s+[A-Z]")

    cleaned_lines = []
    last_heading = None

    for line in raw_text.split("\n"):
        line = line.strip()
        word_count = len(line.split())
        # An empty line has zero words, so this also discards blanks.
        if word_count < 5:
            continue
        lowered = line.lower()
        if any(kw in lowered for kw in boilerplate_keywords):
            continue
        if caption_re.match(line):
            continue
        if heading_re.match(line):
            # NOTE(review): the five-word minimum above runs first, so
            # headings with fewer than five words never reach this branch.
            # The former "< 3 words" and repeated boilerplate checks here
            # were unreachable for the same reason and have been removed.
            last_heading = line
            continue
        if line.isupper() and word_count < 10:
            continue
        if last_heading:
            line = f"{last_heading}. {line}"
            last_heading = None
        cleaned_lines.append(line)

    return " ".join(cleaned_lines)

# Chunking
def split_with_overlap(text, max_words=200, overlap=20):
    """Split *text* into overlapping windows of whitespace-separated words.

    Args:
        text: Text to split; words are whatever ``str.split()`` yields.
        max_words: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared between consecutive chunks. Must
            satisfy ``0 <= overlap < max_words``.

    Returns:
        List of chunk strings (words joined by single spaces). Empty when
        *text* contains no words.

    Raises:
        ValueError: If ``max_words`` is not positive or ``overlap`` is outside
            ``[0, max_words)``. With such values the window would never
            advance (previously an endless loop) or would skip words.
    """
    if max_words <= 0:
        raise ValueError("max_words must be a positive integer")
    if not 0 <= overlap < max_words:
        raise ValueError("overlap must satisfy 0 <= overlap < max_words")

    words = text.split()
    step = max_words - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + max_words]))
        # Once a window reaches the end of the text, any further window would
        # contain only words already present in this one, so stop here.
        if start + max_words >= len(words):
            break
    return chunks

# Sentence Selector
def select_highlight_sentence(chunk, strategy="length"):
    """Choose the sentence of *chunk* that will be highlighted.

    The chunk is split where sentence punctuation (``.``, ``?``, ``!``) is
    followed by whitespace and an upper-case ASCII letter. Only pieces with
    more than three words are considered.

    Args:
        chunk: Text to pick a sentence from.
        strategy: ``"length"`` returns the longest candidate; ``"tfidf"``
            returns the candidate with the highest mean TF-IDF weight and
            falls back to the longest one if scoring raises.

    Returns:
        The selected sentence, or ``None`` when no candidate qualifies.

    Raises:
        ValueError: If candidates exist and *strategy* is not recognised.
    """
    candidates = []
    for piece in re.split(r'(?<=[.?!])\s+(?=[A-Z])', chunk.strip()):
        piece = piece.strip()
        if len(piece.split()) > 3:
            candidates.append(piece)

    # The empty case is decided before the strategy is looked at.
    if not candidates:
        return None

    if strategy not in ("length", "tfidf"):
        raise ValueError("Invalid strategy for sentence selection")

    longest = max(candidates, key=len)
    if strategy == "length":
        return longest

    try:
        vectorizer = TfidfVectorizer().fit(candidates)
        weights = vectorizer.transform(candidates)
        mean_scores = weights.mean(axis=1).A1
        return candidates[mean_scores.argmax()]
    except Exception:
        # Scoring is best-effort; any failure falls back to the longest sentence.
        return longest

# Question Generation
def batch_generate_questions(inputs, max_length=64):
    """Generate one question per prompt with the module-level T5 model.

    Args:
        inputs: List of prompt strings for ``qg_tokenizer`` / ``qg_model``.
        max_length: Maximum length passed to ``generate`` for each question.

    Returns:
        List of decoded question strings, in the same order as *inputs*.
        An empty *inputs* yields an empty list without touching the model.
    """
    if not inputs:
        return []

    # Follow whatever device the model lives on instead of assuming "cuda".
    tokenized = qg_tokenizer(
        inputs, padding=True, truncation=True, max_length=512, return_tensors="pt"
    ).to(qg_model.device)
    with torch.no_grad():
        output_ids = qg_model.generate(**tokenized, max_length=max_length)
    return [qg_tokenizer.decode(ids, skip_special_tokens=True) for ids in output_ids]

# QA Generation
def generate_qa_pairs_from_paragraphs(paragraphs: list[str], source: str, batch_size=16, strategy="length"):
    """Build extractive-QA training records from text chunks.

    Chunks are processed in batches. Within a batch each chunk is
    pre-filtered, one sentence is wrapped in ``<hl>`` markers and sent to
    ``batch_generate_questions``, the resulting question is answered against
    its own chunk by ``qa_pipeline``, and the answer is filtered.

    Each chunk, question and answer travel together in one tuple, so dropping
    a vague question can no longer shift later answers onto the wrong chunk.
    Every chunk is counted as generated, skipped or errored, so the three
    printed counts are meant to add up to the number of chunks.

    Args:
        paragraphs: Text chunks to generate questions from.
        source: Value stored in each record's ``source_doc`` field.
        batch_size: Number of chunks handled per model call.
        strategy: Passed to ``select_highlight_sentence``.

    Returns:
        List of dicts with keys ``instruction``, ``input``, ``output``,
        ``source_doc``, ``chunk_id`` and ``confidence``.
    """
    qa_pairs = []
    counters = {"total_paragraphs": len(paragraphs), "qa_generated": 0, "qa_fallback": 0, "skipped_filters": 0, "errors": 0}

    for i in tqdm(range(0, len(paragraphs), batch_size), desc="Generating QA pairs"):
        # (chunk index, chunk text, question-generation prompt)
        candidates = []
        for idx, para in enumerate(paragraphs[i:i + batch_size], start=i):
            digit_ratio = sum(c.isdigit() for c in para) / max(len(para), 1)
            if (
                len(para.split()) < 5
                or (re.search(r'\d+\.\d+.*?:?', para) and len(para) < 400)
                or digit_ratio > 0.3
            ):
                counters["skipped_filters"] += 1
                continue

            sent = select_highlight_sentence(para, strategy=strategy)
            if not sent:
                counters["skipped_filters"] += 1
                continue
            highlight = para.replace(sent, f"<hl> {sent} <hl>")
            candidates.append((idx, para, f"highlight: {highlight}"))

        if not candidates:
            continue

        try:
            questions = batch_generate_questions([prompt for _, _, prompt in candidates])
        except Exception as e:
            print(f"[ERROR] QG batch failed: {e}")
            counters["errors"] += len(candidates)
            continue

        # Filter chunks and questions together so that answers[k] always
        # belongs to answerable[k].
        answerable = []
        for (idx, para, _), question_text in zip(candidates, questions):
            if question_text.lower().startswith("what is the highlight"):
                counters["skipped_filters"] += 1
                continue
            answerable.append((idx, para, question_text))

        if not answerable:
            continue

        try:
            answers = qa_pipeline(
                question=[question_text for _, _, question_text in answerable],
                context=[para for _, para, _ in answerable],
            )
            # A single input comes back as a bare dict rather than a list.
            if isinstance(answers, dict):
                answers = [answers]
        except Exception as e:
            print(f"[ERROR] QA batch failed: {e}")
            counters["errors"] += len(answerable)
            continue

        for (idx, para, question_text), result in zip(answerable, answers):
            answer, score = result["answer"].strip(), result["score"]
            answer_lower = answer.lower()
            rejected = (
                score < 0.15
                or len(answer) < 4
                or not any(c.isalnum() for c in answer)
                or answer_lower in ("yes", "no", "maybe")
                # The answer must sit inside a single "."-delimited piece of
                # the chunk. NOTE(review): this also rejects answers that
                # themselves contain a "." -- confirm that is intended.
                or not any(answer_lower in piece.lower() for piece in para.split("."))
            )
            if rejected:
                counters["skipped_filters"] += 1
                continue

            qa_pairs.append({
                "instruction": "Extract the correct answer span from the telecom document context.",
                "input": f"### Task: extractive_qa\n### Context:\n{para}\n\n### Question:\n{question_text}\n\n### Answer:",
                "output": answer,
                "source_doc": source,
                "chunk_id": idx,
                "confidence": round(score, 3)
            })
            counters["qa_generated"] += 1

    print(f"\U0001F50D Total Paragraphs: {counters['total_paragraphs']}")
    print(f" QA Pairs: {counters['qa_generated']}")
    print(f" Skipped: {counters['skipped_filters']} |  Errors: {counters['errors']}")
    return qa_pairs

# DOCX Processing
def process_docx_file(input_docx_path, output_jsonl_path, strategy="length", max_words=200, overlap=20):
    """Turn a DOCX file into a JSONL file of generated QA pairs.

    The non-empty paragraphs of the document are joined, cleaned with
    ``clean_technical_text``, chunked with ``split_with_overlap`` and passed
    to ``generate_qa_pairs_from_paragraphs``. One JSON object is written per
    line.

    Args:
        input_docx_path: Path of the DOCX file to read.
        output_jsonl_path: Path of the JSONL file to write. Missing parent
            directories are created.
        strategy: Sentence-selection strategy forwarded to QA generation.
        max_words: Chunk size in words forwarded to ``split_with_overlap``.
        overlap: Chunk overlap in words forwarded to ``split_with_overlap``.
    """
    input_path = Path(input_docx_path)
    doc = Document(input_path)
    # Only body paragraphs are read here; doc.paragraphs is the sole source.
    raw_text = "\n".join(p.text.strip() for p in doc.paragraphs if p.text.strip())
    cleaned = clean_technical_text(raw_text)
    paragraphs = split_with_overlap(cleaned, max_words=max_words, overlap=overlap)
    qa_pairs = generate_qa_pairs_from_paragraphs(paragraphs, source=input_path.name, strategy=strategy)

    output_path = Path(output_jsonl_path)
    # Avoid losing a finished run because the target directory is missing.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(pair, ensure_ascii=False) + '\n' for pair in qa_pairs)

    print(f"✅ Saved {len(qa_pairs)} QA pairs to {output_jsonl_path}")

# Filtering Good QA Pairs
def filter_good_qa_pairs(input_jsonl_path, output_jsonl_path):
    """Copy the QA records judged "good" from one JSONL file to another.

    A record is "poor" when its confidence is below 0.4, when its answer does
    not occur (case-insensitively) in the context embedded in ``input``, or
    when its question starts with "what is the highlight". Everything else is
    "good" and is written out together with the derived ``question``,
    ``context``, flag and ``quality`` columns.

    Args:
        input_jsonl_path: JSONL file of QA records; blank lines are ignored.
        output_jsonl_path: JSONL file to write. It is created (empty) even
            when there is nothing to keep.
    """
    with open(input_jsonl_path, "r", encoding="utf-8") as f:
        qa_pairs = [json.loads(line) for line in f if line.strip()]

    if not qa_pairs:
        # An empty frame has no "input" column, so handle this case up front.
        with open(output_jsonl_path, "w", encoding="utf-8"):
            pass
        print(f" Saved 0 good QA pairs to {output_jsonl_path}")
        return

    qa_df = pd.DataFrame(qa_pairs)
    # Rows whose prompt does not match the template get "" rather than NaN,
    # so the string operations below cannot hit a float.
    qa_df["question"] = qa_df["input"].str.extract(
        r"### Question:\n(.+?)\n\n### Answer:", expand=False
    ).fillna("")
    qa_df["context"] = qa_df["input"].str.extract(
        r"### Context:\n(.+?)\n\n### Question:", flags=re.DOTALL, expand=False
    ).fillna("")

    qa_df["low_confidence_flag"] = qa_df["confidence"].astype(float) < 0.4
    qa_df["mismatch_flag"] = [
        str(answer).lower() not in context.lower()
        for answer, context in zip(qa_df["output"], qa_df["context"])
    ]
    qa_df["vague_question_flag"] = qa_df["question"].str.lower().str.startswith("what is the highlight")

    qa_df["quality"] = "good"
    qa_df.loc[qa_df["low_confidence_flag"] | qa_df["mismatch_flag"] | qa_df["vague_question_flag"], "quality"] = "poor"

    good_qa_df = qa_df[qa_df["quality"] == "good"]
    with open(output_jsonl_path, "w", encoding="utf-8") as fout:
        for record in good_qa_df.to_dict(orient="records"):
            json.dump(record, fout, ensure_ascii=False)
            fout.write("\n")

    print(f" Saved {len(good_qa_df)} good QA pairs to {output_jsonl_path}")

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
Device set to use cuda:0


In [2]:
# Run Pipeline 2 on the 38104-j00 DOCX; no strategy is given, so the
# function's default ("length") is used.
process_docx_file(
    input_docx_path="/mnt/data/Datasets/38104-j00.docx",
    output_jsonl_path="/mnt/data/qa_output.jsonl"
)

Generating QA pairs:  56%|████████████▏         | 10/18 [00:08<00:06,  1.26it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Generating QA pairs: 100%|██████████████████████| 18/18 [00:14<00:00,  1.24it/s]

🔍 Total Paragraphs: 287
✅ QA Pairs: 83
🚫 Skipped: 204 | ❌ Errors: 0
✅ Saved 83 QA pairs to /mnt/data/qa_output.jsonl





In [3]:
# Keep only the "good" records from the file written by the previous cell.
filter_good_qa_pairs(
    input_jsonl_path="/mnt/data/qa_output.jsonl",
    output_jsonl_path="/mnt/data/qa_output_filtered_good.jsonl"
)

✅ Saved 48 good QA pairs to /mnt/data/qa_output_filtered_good.jsonl


In [4]:
# Same document again, this time selecting the highlight sentence by TF-IDF
# and writing to a separate output file.
process_docx_file(
    input_docx_path="/mnt/data/Datasets/38104-j00.docx",
    output_jsonl_path="/mnt/data/qa_output_tfidf.jsonl",
    strategy="tfidf"
)

Generating QA pairs: 100%|██████████████████████| 18/18 [00:13<00:00,  1.29it/s]

🔍 Total Paragraphs: 287
✅ QA Pairs: 81
🚫 Skipped: 206 | ❌ Errors: 0
✅ Saved 81 QA pairs to /mnt/data/qa_output_tfidf.jsonl





In [5]:
# Keep only the "good" records from the TF-IDF run.
filter_good_qa_pairs(
    input_jsonl_path="/mnt/data/qa_output_tfidf.jsonl",
    output_jsonl_path="/mnt/data/tfidf_qa_output_filtered_good.jsonl"
)

✅ Saved 45 good QA pairs to /mnt/data/tfidf_qa_output_filtered_good.jsonl


# Test Pipeline 3

In [None]:
import json
import re
from pathlib import Path
from docx import Document
from tqdm import tqdm
import nltk
import torch
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import (
    T5Tokenizer, T5ForConditionalGeneration,
    AutoTokenizer, AutoModelForQuestionAnswering,
    pipeline
)

nltk.download("punkt")

In [None]:
# Load Models
# Question-generation side: T5 tokenizer and seq2seq model fetched by name,
# with the model moved to the CUDA device.
qg_tokenizer = T5Tokenizer.from_pretrained("mrm8488/t5-base-finetuned-question-generation-ap")
qg_model = T5ForConditionalGeneration.from_pretrained("mrm8488/t5-base-finetuned-question-generation-ap").to("cuda")

# Extractive QA side: tokenizer and model loaded from a filesystem path
# (presumably a RoBERTa model fine-tuned on telecom text, judging by the
# directory name -- confirm).
qa_tokenizer = AutoTokenizer.from_pretrained("/home/ec2-user/qa_roberta_telecom")
qa_model = AutoModelForQuestionAnswering.from_pretrained("/home/ec2-user/qa_roberta_telecom").to("cuda")
# device=0 requests the first CUDA device for the pipeline.
qa_pipeline = pipeline("question-answering", model=qa_model, tokenizer=qa_tokenizer, device=0)

In [None]:
# Text Cleaning
def clean_technical_text(raw_text: str) -> str:
    """Strip boilerplate from document text and flatten it into one string.

    The text is processed line by line. A line is dropped when it:

    * is empty, or has fewer than four words and contains no ":",
    * contains any boilerplate keyword as a case-insensitive substring,
    * starts with a figure/table caption prefix followed by a number,
    * looks like a numbered heading but has fewer than three words, or
    * is entirely upper-case and shorter than ten words.

    A numbered heading with at least three words is not emitted on its own;
    it is remembered and prepended, followed by ". ", to the next surviving
    line. Bracketed numeric references such as ``[3]`` are removed from
    surviving lines (the surrounding whitespace is left as is).

    Args:
        raw_text: Newline-separated text of the document.

    Returns:
        The surviving lines joined by single spaces.
    """
    boilerplate_keywords = (
        "confidential", "etsi", "3gpp", "table of contents",
        "appendix", "index", "page", "internal use only",
    )
    # Compiled once instead of being looked up again for every line.
    caption_re = re.compile(r"^(figure|fig\.|table)\s*\d+", re.IGNORECASE)
    heading_re = re.compile(r"^\d+(\.\d+){0,4}\s+[A-Z]")
    reference_re = re.compile(r"\[\d+\]")

    cleaned_lines = []
    last_heading = None

    for line in raw_text.split("\n"):
        line = line.strip()
        word_count = len(line.split())
        if not line or (word_count < 4 and ":" not in line):
            continue
        lowered = line.lower()
        if any(kw in lowered for kw in boilerplate_keywords):
            continue
        if caption_re.match(line):
            continue
        if heading_re.match(line):
            # Very short headings are dropped instead of being carried over.
            # (A second boilerplate check used to sit here; it could never
            # fire because the same test has already run on this line.)
            if word_count >= 3:
                last_heading = line
            continue
        if line.isupper() and word_count < 10:
            continue

        # Remove bracketed numeric references.
        # NOTE(review): the former "(ETSI ...)" citation stripping was
        # dropped because every line containing "etsi" is already discarded
        # by the boilerplate filter, so it could never match.
        line = reference_re.sub("", line)

        if last_heading:
            line = f"{last_heading}. {line}"
            last_heading = None
        cleaned_lines.append(line)

    return " ".join(cleaned_lines)

In [None]:
# Chunking
def split_with_overlap(text, max_words=200, overlap=20):
    """Split *text* into overlapping windows of whitespace-separated words.

    Args:
        text: Text to split; words are whatever ``str.split()`` yields.
        max_words: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared between consecutive chunks. Must
            satisfy ``0 <= overlap < max_words``.

    Returns:
        List of chunk strings (words joined by single spaces). Empty when
        *text* contains no words.

    Raises:
        ValueError: If ``max_words`` is not positive or ``overlap`` is outside
            ``[0, max_words)``. With such values the window would never
            advance (previously an endless loop) or would skip words.
    """
    if max_words <= 0:
        raise ValueError("max_words must be a positive integer")
    if not 0 <= overlap < max_words:
        raise ValueError("overlap must satisfy 0 <= overlap < max_words")

    words = text.split()
    step = max_words - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + max_words]))
        # Once a window reaches the end of the text, any further window would
        # contain only words already present in this one, so stop here.
        if start + max_words >= len(words):
            break
    return chunks

In [None]:
# Sentence Selector

def select_highlight_sentence(chunk, strategy="length"):
    """Choose the sentence of *chunk* that will be highlighted.

    The chunk is split where sentence punctuation (``.``, ``?``, ``!``) is
    followed by whitespace and an upper-case ASCII letter. Only pieces with
    more than three words are considered.

    Args:
        chunk: Text to pick a sentence from.
        strategy: ``"length"`` returns the longest candidate; ``"tfidf"``
            returns the candidate with the highest mean TF-IDF weight and
            falls back to the longest one if scoring raises.

    Returns:
        The selected sentence, or ``None`` when no candidate qualifies.

    Raises:
        ValueError: If candidates exist and *strategy* is not recognised.
    """
    pieces = (p.strip() for p in re.split(r'(?<=[.?!])\s+(?=[A-Z])', chunk.strip()))
    candidates = [p for p in pieces if len(p.split()) > 3]

    # The empty case is decided before the strategy is looked at.
    if not candidates:
        return None

    if strategy == "length":
        return max(candidates, key=len)

    if strategy != "tfidf":
        raise ValueError("Invalid strategy for sentence selection")

    try:
        vectorizer = TfidfVectorizer().fit(candidates)
        weights = vectorizer.transform(candidates)
        mean_scores = weights.mean(axis=1).A1
        best = mean_scores.argmax()
        return candidates[best]
    except Exception:
        # Scoring is best-effort; any failure falls back to the longest sentence.
        return max(candidates, key=len)


In [None]:
# Question Generation
def batch_generate_questions(inputs, max_length=64):
    """Generate one question per prompt with the module-level T5 model.

    Args:
        inputs: List of prompt strings for ``qg_tokenizer`` / ``qg_model``.
        max_length: Maximum length passed to ``generate`` for each question.

    Returns:
        List of decoded question strings, in the same order as *inputs*.
        An empty *inputs* yields an empty list without touching the model.
    """
    if not inputs:
        return []

    # Follow whatever device the model lives on instead of assuming "cuda".
    tokenized = qg_tokenizer(
        inputs, padding=True, truncation=True, max_length=512, return_tensors="pt"
    ).to(qg_model.device)
    with torch.no_grad():
        output_ids = qg_model.generate(**tokenized, max_length=max_length)
    return [qg_tokenizer.decode(ids, skip_special_tokens=True) for ids in output_ids]

In [None]:
# QA Generation
def generate_qa_pairs_from_paragraphs(paragraphs: list[str], source: str, batch_size=16, strategy="length"):
    """Build extractive-QA training records from text chunks.

    Chunks are processed in batches. Within a batch each chunk is
    pre-filtered, one sentence is wrapped in ``<hl>`` markers and sent to the
    T5 question generator (beam search with two beams), the resulting
    question is answered against its own chunk by ``qa_pipeline``, and the
    answer is filtered.

    Each chunk, question and answer travel together in one tuple, so dropping
    a vague question can no longer shift later answers onto the wrong chunk.
    Every chunk is counted as generated, skipped or errored, so the three
    printed counts are meant to add up to the number of chunks.

    Args:
        paragraphs: Text chunks to generate questions from.
        source: Value stored in each record's ``source_doc`` field.
        batch_size: Number of chunks handled per model call.
        strategy: Passed to ``select_highlight_sentence``.

    Returns:
        List of dicts with keys ``instruction``, ``input``, ``output``,
        ``source_doc``, ``chunk_id`` and ``confidence``.
    """
    qa_pairs = []
    counters = {"total_paragraphs": len(paragraphs), "qa_generated": 0, "qa_fallback": 0, "skipped_filters": 0, "errors": 0}

    for i in tqdm(range(0, len(paragraphs), batch_size), desc="Generating QA pairs"):
        # (chunk index, chunk text, question-generation prompt)
        candidates = []
        for idx, para in enumerate(paragraphs[i:i + batch_size], start=i):
            digit_ratio = sum(c.isdigit() for c in para) / max(len(para), 1)
            if (
                len(para.split()) < 3
                or (re.search(r'\d+\.\d+.*?:?', para) and len(para) < 400)
                or digit_ratio > 0.6
            ):
                counters["skipped_filters"] += 1
                continue

            sent = select_highlight_sentence(para, strategy=strategy)
            if not sent:
                counters["skipped_filters"] += 1
                continue
            highlight = para.replace(sent, f"<hl> {sent} <hl>")
            candidates.append((idx, para, f"highlight: {highlight}"))

        if not candidates:
            continue

        try:
            prompts = [prompt for _, _, prompt in candidates]
            # Follow whatever device the model lives on instead of assuming "cuda".
            tokenized = qg_tokenizer(
                prompts, padding=True, truncation=True, max_length=512, return_tensors="pt"
            ).to(qg_model.device)
            with torch.no_grad():
                output_ids = qg_model.generate(**tokenized, max_length=64, num_beams=2)
            questions = [qg_tokenizer.decode(ids, skip_special_tokens=True) for ids in output_ids]
        except Exception as e:
            print(f"[ERROR] QG batch failed: {e}")
            counters["errors"] += len(candidates)
            continue

        # Filter chunks and questions together so that answers[k] always
        # belongs to answerable[k].
        answerable = []
        for (idx, para, _), question_text in zip(candidates, questions):
            if question_text.lower().startswith("what is the highlight"):
                counters["skipped_filters"] += 1
                continue
            answerable.append((idx, para, question_text))

        if not answerable:
            continue

        try:
            answers = qa_pipeline(
                question=[question_text for _, _, question_text in answerable],
                context=[para for _, para, _ in answerable],
            )
            # A single input comes back as a bare dict rather than a list.
            if isinstance(answers, dict):
                answers = [answers]
        except Exception as e:
            print(f"[ERROR] QA batch failed: {e}")
            counters["errors"] += len(answerable)
            continue

        for (idx, para, question_text), result in zip(answerable, answers):
            answer, score = result["answer"].strip(), result["score"]
            answer_lower = answer.lower()
            rejected = (
                score < 0.15
                or len(answer) < 4
                or not any(c.isalnum() for c in answer)
                or answer_lower in ("yes", "no", "maybe")
                # The answer must sit inside a single "."-delimited piece of
                # the chunk. NOTE(review): this also rejects answers that
                # themselves contain a "." -- confirm that is intended.
                or not any(answer_lower in piece.lower() for piece in para.split("."))
            )
            if rejected:
                counters["skipped_filters"] += 1
                continue

            qa_pairs.append({
                "instruction": "Extract the correct answer span from the telecom document context.",
                "input": f"### Task: extractive_qa\n### Context:\n{para}\n\n### Question:\n{question_text}\n\n### Answer:",
                "output": answer,
                "source_doc": source,
                "chunk_id": idx,
                "confidence": round(score, 3)
            })
            counters["qa_generated"] += 1

    print(f"\U0001F50D Total Paragraphs: {counters['total_paragraphs']}")
    print(f" QA Pairs: {counters['qa_generated']}")
    print(f" Skipped: {counters['skipped_filters']} |  Errors: {counters['errors']}")
    return qa_pairs

In [None]:
# DOCX Processing
def process_docx_file(input_docx_path, output_jsonl_path, strategy="length", max_words=150, overlap=50):
    """Turn a DOCX file into a JSONL file of generated QA pairs.

    The non-empty paragraphs of the document are joined, cleaned with
    ``clean_technical_text``, chunked with ``split_with_overlap`` and passed
    to ``generate_qa_pairs_from_paragraphs``. One JSON object is written per
    line.

    Args:
        input_docx_path: Path of the DOCX file to read.
        output_jsonl_path: Path of the JSONL file to write. Missing parent
            directories are created.
        strategy: Sentence-selection strategy forwarded to QA generation.
        max_words: Chunk size in words (default: 150).
        overlap: Words shared by consecutive chunks (default: 50).
    """
    input_path = Path(input_docx_path)
    doc = Document(input_path)

    # Combine and clean text; only doc.paragraphs is read here.
    raw_text = "\n".join(p.text.strip() for p in doc.paragraphs if p.text.strip())
    cleaned = clean_technical_text(raw_text)

    # Chunk size and overlap are caller-tunable; the defaults keep the
    # 150-word / 50-word setting this pipeline used before.
    paragraphs = split_with_overlap(cleaned, max_words=max_words, overlap=overlap)

    # Generate QA pairs from processed paragraphs
    qa_pairs = generate_qa_pairs_from_paragraphs(paragraphs, source=input_path.name, strategy=strategy)

    # Save as JSONL, creating the target directory first so a finished run
    # is not lost to a missing folder.
    output_path = Path(output_jsonl_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(pair, ensure_ascii=False) + '\n' for pair in qa_pairs)

    print(f" Saved {len(qa_pairs)} QA pairs to {output_jsonl_path}")

In [None]:
# Filtering Good QA Pairs
def filter_good_qa_pairs(input_jsonl_path, output_jsonl_path):
    """Copy the QA records judged "good" from one JSONL file to another.

    A record is "poor" when its confidence is below 0.4, when its answer does
    not occur (case-insensitively) in the context embedded in ``input``, or
    when its question starts with "what is the highlight". Everything else is
    "good" and is written out together with the derived ``question``,
    ``context``, flag and ``quality`` columns.

    Args:
        input_jsonl_path: JSONL file of QA records; blank lines are ignored.
        output_jsonl_path: JSONL file to write. It is created (empty) even
            when there is nothing to keep.
    """
    with open(input_jsonl_path, "r", encoding="utf-8") as f:
        qa_pairs = [json.loads(line) for line in f if line.strip()]

    if not qa_pairs:
        # An empty frame has no "input" column, so handle this case up front.
        with open(output_jsonl_path, "w", encoding="utf-8"):
            pass
        print(f" Saved 0 good QA pairs to {output_jsonl_path}")
        return

    qa_df = pd.DataFrame(qa_pairs)
    # Rows whose prompt does not match the template get "" rather than NaN,
    # so the string operations below cannot hit a float.
    qa_df["question"] = qa_df["input"].str.extract(
        r"### Question:\n(.+?)\n\n### Answer:", expand=False
    ).fillna("")
    qa_df["context"] = qa_df["input"].str.extract(
        r"### Context:\n(.+?)\n\n### Question:", flags=re.DOTALL, expand=False
    ).fillna("")

    qa_df["low_confidence_flag"] = qa_df["confidence"].astype(float) < 0.4
    qa_df["mismatch_flag"] = [
        str(answer).lower() not in context.lower()
        for answer, context in zip(qa_df["output"], qa_df["context"])
    ]
    qa_df["vague_question_flag"] = qa_df["question"].str.lower().str.startswith("what is the highlight")

    qa_df["quality"] = "good"
    qa_df.loc[qa_df["low_confidence_flag"] | qa_df["mismatch_flag"] | qa_df["vague_question_flag"], "quality"] = "poor"

    good_qa_df = qa_df[qa_df["quality"] == "good"]
    with open(output_jsonl_path, "w", encoding="utf-8") as fout:
        for record in good_qa_df.to_dict(orient="records"):
            json.dump(record, fout, ensure_ascii=False)
            fout.write("\n")

    print(f" Saved {len(good_qa_df)} good QA pairs to {output_jsonl_path}")

In [None]:
# Run Pipeline 3 with TF-IDF sentence selection.
# NOTE: this writes to the same /mnt/data/qa_output.jsonl path used by the
# Pipeline 2 "length" run above, so that earlier file is overwritten.
process_docx_file(
    input_docx_path="/mnt/data/Datasets/38104-j00.docx",
    output_jsonl_path="/mnt/data/qa_output.jsonl",
    strategy="tfidf"
)

In [None]:
# Keep only the "good" records from the Pipeline 3 run; the output path is
# the same one Pipeline 2's filter cell wrote to.
filter_good_qa_pairs(
    input_jsonl_path="/mnt/data/qa_output.jsonl",
    output_jsonl_path="/mnt/data/qa_output_filtered_good.jsonl"
)