# Preprocessing Code for TeleQnA, for Fine-Tuning LLaMA-2

In [11]:
import json

input_path = r"C:\Users\Rishabh\OneDrive\Desktop\THESIS\Dataset\TeleQnA Dataset\TeleQnA.json"
output_path = r"C:\Users\Rishabh\OneDrive\Desktop\THESIS\Fine Tunning Dataset\teleqna_llama2_with_explanations.jsonl"


def _clean(value):
    """Return ``value`` as stripped text, mapping ``None`` (JSON null) to ''."""
    return "" if value is None else str(value).strip()


def _numbered_options(entry):
    """Collect the non-empty ``"option N"`` fields of ``entry``.

    Returns a list of ``(N, text)`` pairs sorted by ``N``. The original
    number is kept (instead of re-enumerating after dropping blanks) and no
    upper bound is placed on ``N``, so the option labels in the prompt stay
    aligned with the dataset's own numbering.
    NOTE(review): presumably the "answer" field refers to options by number
    (e.g. "option 3: ...") -- confirm against the dataset.
    """
    found = []
    for key, value in entry.items():
        label, _, number = key.partition(" ")
        if label == "option" and number.isdecimal():
            text = _clean(value)
            if text:
                found.append((int(number), text))
    return sorted(found)


processed_data = []

# Load input JSON (a mapping of question id -> question entry)
with open(input_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Convert each question entry into one LLaMA-2 training example
for q_id, entry in data.items():
    question = _clean(entry.get("question"))
    answer = _clean(entry.get("answer"))
    explanation = _clean(entry.get("explanation"))

    # An entry without a question or an answer cannot be used for training.
    if not question or not answer:
        continue

    # Construct prompt: the question followed by one "Option N: ..." line each
    option_lines = [
        f"Option {number}: {text}" for number, text in _numbered_options(entry)
    ]
    mcq_text = "\n".join([question, *option_lines])

    # Format assistant response: the answer, plus the explanation if present
    response = answer
    if explanation:
        response += f"\n\nExplanation: {explanation}"

    # LLaMA-2 chat format. The system block must be closed with "<</SYS>>";
    # "[/SYS]" is not a tag of the Llama-2 template.
    full_prompt = (
        "<s>[INST] <<SYS>>\n"
        "You are a helpful assistant specialized in telecommunications.\n"
        "<</SYS>>\n\n"
        f"{mcq_text} [/INST]\n"
        f"{response}</s>"
    )

    processed_data.append({"text": full_prompt})

# Save as JSONL: one {"text": ...} object per line
with open(output_path, "w", encoding="utf-8") as f:
    for item in processed_data:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f" Saved {len(processed_data)} examples with explanations to {output_path}")

✅ Saved 10000 examples with explanations to C:\Users\Rishabh\OneDrive\Desktop\THESIS\Fine Tunning Dataset\teleqna_llama2_with_explanations.jsonl


# Preprocessing Code for Tabular TeleQuAD, for Fine-Tuning LLaMA-2

In [15]:
from pathlib import Path
import json

# Paths
input_path = Path(r"C:\Users\Rishabh\OneDrive\Desktop\THESIS\Dataset\TeleQuAD\TeleQuAD-v1-full-Tabular.json")
output_path = Path(r"C:\Users\Rishabh\OneDrive\Desktop\THESIS\Fine Tunning Dataset\telequad_tabular_llama2_preprocessed.jsonl")

# Load top-level JSON object; the QA records are read from raw["data"][*]["questions"]
with input_path.open("r", encoding="utf-8") as f:
    raw = json.load(f)

converted = []
for block in raw.get("data", []):
    for q in block.get("questions", []):
        # `or ""` also covers fields that are present but null in the JSON,
        # which would otherwise raise AttributeError on .strip().
        context = (q.get("context") or "").strip()
        question = (q.get("question") or "").strip()
        answer = (q.get("answer") or "").strip()

        # Skip records missing any of the three parts of an extractive-QA example.
        if not context or not question or not answer:
            continue

        # Format prompt
        user_prompt = f"### Task: extractive_qa\n### Context:\n{context}\n### Question:\n{question}"
        response = answer

        # LLaMA-2 chat format. The system block must be closed with
        # "<</SYS>>"; "[/SYS]" is not a tag of the Llama-2 template.
        full_prompt = (
            "<s>[INST] <<SYS>>\n"
            "You are a helpful assistant specialized in telecommunications.\n"
            "<</SYS>>\n\n"
            f"{user_prompt} [/INST]\n"
            f"{response}</s>"
        )

        converted.append({"text": full_prompt})

# Save as JSONL: one {"text": ...} object per line
with output_path.open("w", encoding="utf-8") as f:
    for item in converted:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f" Successfully saved {len(converted)} QA examples to {output_path}")

✅ Successfully saved 278 QA examples to C:\Users\Rishabh\OneDrive\Desktop\THESIS\Fine Tunning Dataset\telequad_tabular_llama2_preprocessed.jsonl


# Preprocessing Code for TeleQuAD-v4-full.json

In [16]:
from pathlib import Path
import json

# Paths
input_path = Path(r"C:\Users\Rishabh\OneDrive\Desktop\THESIS\Dataset\TeleQuAD\TeleQuAD-v4-full.json")
output_path = Path(r"C:\Users\Rishabh\OneDrive\Desktop\THESIS\Fine Tunning Dataset\telequad_v4_llama2_preprocessed.jsonl")

# Load file; records are read from raw["data"][*]["paragraphs"][*]["qas"]
with input_path.open("r", encoding="utf-8") as f:
    raw = json.load(f)

converted = []

for doc in raw["data"]:
    for para in doc.get("paragraphs", []):
        # `or ""` also covers fields that are present but null in the JSON,
        # which would otherwise raise AttributeError on .strip().
        context = (para.get("context") or "").strip()
        if not context:
            continue

        for qa in para.get("qas", []):
            # Questions flagged as unanswerable have no target text to learn.
            if qa.get("is_impossible", False):
                continue

            question = (qa.get("question") or "").strip()
            answer_list = qa.get("answers") or []
            if not question or not answer_list:
                continue

            # Only the first listed answer is used as the training target.
            answer = (answer_list[0].get("text") or "").strip()
            if not answer:
                continue

            # Format instruction prompt
            user_prompt = f"### Task: extractive_qa\n### Context:\n{context}\n### Question:\n{question}"
            response = answer

            # LLaMA-2 chat format. The system block must be closed with
            # "<</SYS>>"; "[/SYS]" is not a tag of the Llama-2 template.
            full_prompt = (
                "<s>[INST] <<SYS>>\n"
                "You are a helpful assistant specialized in telecommunications.\n"
                "<</SYS>>\n\n"
                f"{user_prompt} [/INST]\n"
                f"{response}</s>"
            )

            converted.append({"text": full_prompt})

# Save to JSONL: one {"text": ...} object per line
with output_path.open("w", encoding="utf-8") as f:
    for item in converted:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f" Saved {len(converted)} entries to {output_path}")

✅ Saved 4262 entries to C:\Users\Rishabh\OneDrive\Desktop\THESIS\Fine Tunning Dataset\telequad_v4_llama2_preprocessed.jsonl


# Preprocessing Code for 3GPP QA JSONL

In [2]:
from pathlib import Path
import json

input_path = Path(r"C:\Users\Rishabh\OneDrive\Desktop\First_RUN_final_3gpp_qa_filtered.jsonl")
output_path = Path(r"C:\Users\Rishabh\OneDrive\Desktop\THESIS\Fine Tunning Dataset\3gpp_testing.jsonl")

converted = []

# Read the input JSONL: one JSON object with "input" and "output" per line
with input_path.open("r", encoding="utf-8") as f:
    for line in f:
        # Blank lines (e.g. trailing newlines at the end of the file) are not
        # valid JSON and would make json.loads raise; skip them.
        if not line.strip():
            continue

        item = json.loads(line)

        # `or ""` also covers fields that are present but null in the JSON,
        # which would otherwise raise AttributeError on .strip().
        raw_input = (item.get("input") or "").strip()
        answer = (item.get("output") or "").strip()

        if not raw_input or not answer:
            continue

        # LLaMA-2 chat format. The system block must be closed with
        # "<</SYS>>"; "[/SYS]" is not a tag of the Llama-2 template.
        full_prompt = (
            "<s>[INST] <<SYS>>\n"
            "You are a helpful assistant specialized in telecommunications.\n"
            "<</SYS>>\n\n"
            f"{raw_input} [/INST]\n"
            f"{answer}</s>"
        )

        converted.append({"text": full_prompt})

# Save as JSONL: one {"text": ...} object per line
with output_path.open("w", encoding="utf-8") as f:
    for item in converted:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f" Saved {len(converted)} entries to {output_path}")

✅ Saved 218347 entries to C:\Users\Rishabh\OneDrive\Desktop\THESIS\Fine Tunning Dataset\3gpp_testing.jsonl
