# Semantic Redundancy Detection Using FAISS + SBERT

In [1]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import json
import re
from pathlib import Path
from tqdm import tqdm
import time

# Extract question from 'input' field
def extract_question(input_text):
    match = re.search(r"### Question:\n(?:question:\s*)?(.*)", input_text)
    return match.group(1).strip() if match else ""

# Load QA Dataset
qa_path = Path("/mnt/data/First_RUN_final_3gpp_qa_filtered.jsonl")
qa_data = [json.loads(line) for line in qa_path.open("r", encoding="utf-8")]
questions = [extract_question(q["input"]) for q in qa_data]

print(f"Loaded {len(questions)} questions...")

# Load SBERT Model on GPU for fast embedding
model = SentenceTransformer("all-MiniLM-L6-v2", device="cuda")

# Generate Embeddings
print("Encoding questions...")
embeddings = model.encode(questions, convert_to_numpy=True, show_progress_bar=True)

# Normalize for cosine similarity
faiss.normalize_L2(embeddings)

# Build FAISS CPU Index
print("Building FAISS CPU index...")
dim = embeddings.shape[1]
index = faiss.IndexFlatIP(dim)  # Inner Product for cosine similarity
index.add(embeddings)

# Chunked CPU-based Search to avoid RAM spikes
print("Running chunked FAISS search on CPU...")
batch_size = 20000
all_scores = []
all_indices = []

start = time.time()
for i in tqdm(range(0, len(embeddings), batch_size), desc="Searching..."):
    batch = embeddings[i:i + batch_size]
    D, I = index.search(batch, 2)  # Top-2 similarity
    all_scores.append(D)
    all_indices.append(I)
end = time.time()
print(f"Search completed in {end - start:.2f} seconds.")

# Merge Results
D = np.vstack(all_scores)
I = np.vstack(all_indices)

# Analyze Redundant Pairs
threshold = 0.95
redundant_pairs = []
for i in range(len(questions)):
    j = I[i][1]  # skip self-match
    score = D[i][1]
    if score > threshold:
        redundant_pairs.append((i, j, score))

# Summary Stats
print("\n=== Semantic Redundancy (FAISS CPU) ===")
print(f"Total QA pairs: {len(questions)}")
print(f"Highly similar (cosine > {threshold}): {len(redundant_pairs)}")
print(f"Redundancy rate: {len(redundant_pairs) / len(questions) * 100:.2f}%")

# Show Top 10 Redundant Question Pairs
print("\n=== Top 10 Most Similar Question Pairs ===")
top_pairs = sorted(redundant_pairs, key=lambda x: -x[2])[:10]
for i, j, score in top_pairs:
    print(f"[{score:.3f}] Q{i}: {questions[i]}\n        Q{j}: {questions[j]}\n")

Loaded 218347 questions...
Encoding questions...


Batches:   0%|          | 0/6824 [00:00<?, ?it/s]

Building FAISS CPU index...
Running chunked FAISS search on CPU...


Searching...: 100%|██████████████████████████| 11/11 [07:07<00:00, 38.83s/it]


Search completed in 427.10 seconds.

=== Semantic Redundancy (FAISS CPU) ===
Total QA pairs: 218347
Highly similar (cosine > 0.95): 108948
Redundancy rate: 49.90%

=== Top 10 Most Similar Question Pairs ===
[1.000] Q189687: What does the VLR support but does not support the Operator Determined Barring category?
        Q95485: What does the VLR support but does not support the Operator Determined Barring category?

[1.000] Q66668: What clause states that the message "Mess_Sync_Lost" is sent only at the first five occurrences of errors in Tandem Free Operation (TFO) Frames or loss of synchronisation?
        Q66668: What clause states that the message "Mess_Sync_Lost" is sent only at the first five occurrences of errors in Tandem Free Operation (TFO) Frames or loss of synchronisation?

[1.000] Q95485: What does the VLR support but does not support the Operator Determined Barring category?
        Q95485: What does the VLR support but does not support the Operator Determined Barring cate

# Model-Based QA Answer Accuracy (EM & F1)

In [3]:
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering
from tqdm import tqdm
import json
from pathlib import Path
import numpy as np
import re

# Load fine-tuned telecom QA model
model_path = "/home/ec2-user/qa_roberta_telecom"
qa_pipeline = pipeline(
    "question-answering",
    model=AutoModelForQuestionAnswering.from_pretrained(model_path),
    tokenizer=AutoTokenizer.from_pretrained(model_path),
    device=0
)

# Load data
qa_path = Path("/mnt/data/First_RUN_final_3gpp_qa_filtered.jsonl")
qa_data = [json.loads(line) for line in qa_path.open("r", encoding="utf-8")]

# Prepare inputs
inputs = []
gold_answers = []
for item in qa_data:
    input_text = item.get("input", "")
    match = re.search(r"### Context:\n(.*?)\n\n### Question:\n(?:question:\s*)?(.*)", input_text, re.DOTALL)
    if match:
        context = match.group(1).strip()
        question = match.group(2).strip()
    else:
        continue  # skip malformed entries
    gold = item["output"].strip()
    if context and question and gold:
        inputs.append({"context": context, "question": question})
        gold_answers.append(gold)

# Chunked QA inference
print("Running chunked QA inference...")
predictions = []
batch_size = 512  # adjust to 256/128 if OOM
for i in tqdm(range(0, len(inputs), batch_size)):
    batch = inputs[i:i + batch_size]
    preds = qa_pipeline(batch)
    predictions.extend(preds)

# Evaluation
def compute_f1(pred, truth):
    pred_tokens = pred.lower().split()
    truth_tokens = truth.lower().split()
    common = set(pred_tokens) & set(truth_tokens)
    if not common:
        return 0.0
    precision = len(common) / len(pred_tokens)
    recall = len(common) / len(truth_tokens)
    return 2 * precision * recall / (precision + recall)

em_scores = []
f1_scores = []
for pred, gold in zip(predictions, gold_answers):
    p = pred["answer"].strip()
    g = gold.strip()
    em_scores.append(int(p.lower() == g.lower()))
    f1_scores.append(compute_f1(p, g))

# Final report
print("\n=== Batch QA Evaluation Results ===")
print(f"Total evaluated: {len(em_scores)}")
print(f"Exact Match (EM): {np.mean(em_scores):.4f}")
print(f"F1 Score:          {np.mean(f1_scores):.4f}")

Device set to use cuda:0


Running chunked QA inference...


  2%|▉                                      | 10/427 [00:41<28:40,  4.13s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████████████████████████████████| 427/427 [31:30<00:00,  4.43s/it]



=== Batch QA Evaluation Results ===
Total evaluated: 218347
Exact Match (EM): 0.9543
F1 Score:          0.9651
