In [None]:
# -*- coding: utf-8 -*-
"""
Data Pipeline: Extract → Clean → Chunk → Generate Real Q&A → Save JSON + Cleaned Text
Only REAL Q&A | JSON format with supporting_passages | Cleaned text: 1 sentence per line
"""

# =============================================
# STEP 0: SETUP (Run once)
# =============================================
!pip install -q PyPDF2 transformers torch sentence-transformers pandas tqdm nltk

import PyPDF2
import re
import json
import os
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
from tqdm import tqdm
from google.colab import files
import nltk

nltk.download('punkt_tab', quiet=True)
from nltk.tokenize import sent_tokenize

# Clear GPU cache
torch.cuda.empty_cache()
print("Setup complete!")



[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hSetup complete!


In [None]:
# =============================================
# STEP 1: UPLOAD YOUR PDF
# =============================================
# Open the Colab upload dialog; the result is used below as a mapping keyed by
# the uploaded file name(s).
uploaded = files.upload()  # Upload the source document (.pdf or .txt)
if not uploaded:
    # Nothing came back (e.g. the dialog was dismissed). Fail with a clear
    # message instead of an IndexError when picking the first file name.
    raise RuntimeError("No file was uploaded. Re-run this cell and select a .pdf or .txt file.")
pdf_path = next(iter(uploaded))  # only the first uploaded file is processed
print(f"Uploaded: {pdf_path}")

# Pipeline configuration
CHUNK_SIZE          = 500       # words per chunk
OVERLAP             = 100       # words shared between consecutive chunks
QUESTIONS_PER_CHUNK = 7         # NOTE(review): not referenced by the visible code; the prompt in Step 7 asks for 7 pairs on its own
MAX_NEW_TOKENS      = 350       # generation budget passed to generate() as max_new_tokens

def is_file_pdf(name):
    """Return True when ``name`` ends in a ``.pdf`` extension (case-insensitive)."""
    _, extension = os.path.splitext(name)
    return extension.lower() == '.pdf'
def is_file_txt(name):
    """Return True when ``name`` ends in a ``.txt`` extension (case-insensitive)."""
    _, extension = os.path.splitext(name)
    return extension.lower() == '.txt'



Saving amanah_bank_policy.pdf to amanah_bank_policy.pdf
Uploaded: amanah_bank_policy.pdf


In [None]:
# =============================================
# STEP 2: EXTRACT TEXT FROM PDF/TXT
# =============================================
def extract_text_from_pdf(file_path):
    """Return the text content of a ``.pdf`` or ``.txt`` file.

    Despite the name, plain-text files are accepted as well; the file type is
    decided from the extension via ``is_file_pdf`` / ``is_file_txt``.

    Args:
        file_path: Path of the file to read.

    Returns:
        For PDFs, the text of every page joined with newlines. For text
        files, the full UTF-8 decoded content.

    Raises:
        ValueError: If the extension is neither ``.pdf`` nor ``.txt``.
    """
    if is_file_pdf(file_path):
        with open(file_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            # `or ""` substitutes an empty string for pages that yield no text,
            # so the join never sees a non-string value.
            return "\n".join(page.extract_text() or "" for page in reader.pages)
    if is_file_txt(file_path):
        # Use a context manager so the handle is closed deterministically.
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()
    raise ValueError(f"Unsupported file type: {file_path}")

# Extract the uploaded document once and report its size as a quick sanity
# check (whitespace-separated word count and character count).
raw_text = extract_text_from_pdf(pdf_path)
print(f"Extracted {len(raw_text.split())} words ({len(raw_text)} chars).")



Extracted 46224 words (374721 chars).


In [None]:
# =============================================
# STEP 3: CLEAN TEXT
# =============================================
def clean_text(text):
    """Normalise extracted text to a restricted ASCII character set.

    Keeps letters, digits, spaces and the punctuation ``. , ; : ! ? ( ) $ % -``;
    every other character is deleted (this includes apostrophes, quotes and
    slashes, so ``customer's`` becomes ``customers``). All whitespace runs,
    including newlines and tabs, are collapsed to a single space.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text with no leading/trailing whitespace and no
        consecutive spaces.
    """
    # Delete disallowed characters first. Whitespace is kept at this stage so
    # that words separated only by a newline or tab are not glued together.
    text = re.sub(r'[^a-zA-Z0-9.,;:!?()$%\-\s]', '', text)
    # Collapse whitespace last: removing a symbol that sat between two spaces
    # (e.g. a bullet) would otherwise leave a double space behind.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Clean the raw extraction; the word count is printed so it can be compared
# with the count reported after extraction.
cleaned_text = clean_text(raw_text)
print(f"Cleaned text: {len(cleaned_text.split())} words")



Cleaned text: 46183 words


In [None]:
# =============================================
# STEP 4: SAVE CLEANED TEXT – ONE SENTENCE PER LINE
# =============================================
def save_cleaned_text_one_sentence_per_line(text, filename="cleaned_bank_corpus.txt"):
    """Write ``text`` to ``filename`` with each sentence on its own line.

    Sentences come from ``sent_tokenize``; each one is stripped and blank
    results are not written. The printed count is the number of sentences
    returned by the tokenizer.
    """
    sentences = sent_tokenize(text)
    stripped = (sentence.strip() for sentence in sentences)
    with open(filename, "w", encoding="utf-8") as out_file:
        out_file.writelines(line + "\n" for line in stripped if line)
    print(f"Cleaned corpus saved → {filename} ({len(sentences)} sentences)")

# Write the corpus using the function's default filename
# ("cleaned_bank_corpus.txt"); the download step refers to that same name.
save_cleaned_text_one_sentence_per_line(cleaned_text)



Cleaned corpus saved → cleaned_bank_corpus.txt (2443 sentences)


In [None]:
# =============================================
# STEP 5: CHUNK TEXT
# =============================================
def chunk_text(text, chunk_size=CHUNK_SIZE, overlap=OVERLAP):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
    chunk repeats the last ``overlap`` words of the previous one.

    Args:
        text: Text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by neighbouring chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings (words joined by single spaces). Empty or
        whitespace-only input yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. Without
            this check a non-positive step would loop forever.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got overlap={overlap}, chunk_size={chunk_size}"
        )

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a chunk reaches the end of the text, any further window would
        # lie entirely inside this chunk's tail and add no new content.
        if start + chunk_size >= len(words):
            break
    return chunks

# Chunk the cleaned corpus with the configured window size and overlap.
chunks = chunk_text(cleaned_text, chunk_size=CHUNK_SIZE, overlap=OVERLAP)
print(f"Created {len(chunks)} chunks ({CHUNK_SIZE} words, {OVERLAP} overlap)")



Created 116 chunks (500 words, 100 overlap)


In [None]:
# =============================================
# STEP 6: LOAD Phi-2 FOR Q/A GENERATION
# =============================================
# Load the Phi-2 tokenizer and causal LM used for Q&A generation.
# NOTE(review): trust_remote_code=True presumably allows code shipped with the
# model repository to run locally; confirm it is still required for this model.
phi_model_name = "microsoft/phi-2"
phi_tokenizer = AutoTokenizer.from_pretrained(phi_model_name, trust_remote_code=True)
phi_model = AutoModelForCausalLM.from_pretrained(
    phi_model_name,
    torch_dtype=torch.float16,  # half precision; the recorded cell output warns that `torch_dtype` is deprecated in favour of `dtype`
    device_map="auto",
    trust_remote_code=True
)

# The transformers `pipeline` wrapper is intentionally not used: Step 7 calls
# phi_model.generate() directly.

# Reuse the EOS token as the padding token so the tokenizer and the model
# config agree on a pad id.
phi_tokenizer.pad_token = phi_tokenizer.eos_token
phi_model.config.pad_token_id = phi_tokenizer.eos_token_id
print("Phi-2 loaded for Q&A generation")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Phi-2 loaded for Q&A generation


In [None]:
# =============================================
# STEP 7: BUILD PROMPT & GENERATE Q&A
# =============================================
def build_prompt(chunk, num_questions=7):
    """Build the instruction prompt asking the model for Q&A pairs about ``chunk``.

    The prompt contains a one-shot example, the stripped passage, and an
    output-format template using ``Q<n>:`` / ``A<n>:`` lines.

    Args:
        chunk: Passage text; surrounding whitespace is stripped.
        num_questions: Number of Q&A pairs to request. Defaults to 7, which
            reproduces the original fixed prompt exactly.

    Returns:
        The complete prompt string.

    Raises:
        ValueError: If ``num_questions`` is less than 1.
    """
    if num_questions < 1:
        raise ValueError(f"num_questions must be at least 1, got {num_questions}")

    example = """
Example:
Passage: A dormant account has no transactions for 12 months.
Q1: What is a dormant account?
A1: An account is classified as dormant if there are no customer-initiated transactions for a continuous period of twelve months.
"""

    def slot(index):
        # One placeholder pair of the output template.
        return f"Q{index}: <question>?\nA{index}: <answer>"

    # Short templates are spelled out in full; longer ones show the first two
    # pairs, an ellipsis, and the last pair.
    if num_questions <= 3:
        shown = [slot(k) for k in range(1, num_questions + 1)]
    else:
        shown = [slot(1), slot(2), "...", slot(num_questions)]
    format_spec = "\n\n".join(shown)

    noun = "pair" if num_questions == 1 else "pairs"

    return f"""You are a banking expert. Generate EXACTLY {num_questions} Q&A {noun} from the passage below.

{example}

Passage:
{chunk.strip()}

OUTPUT ONLY THE {num_questions} {noun.upper()} IN THIS FORMAT (no extra text, no explanations):

{format_spec}
"""
def generate_qa_from_chunk(chunk):
    """Generate a raw Q&A text block for one passage with the Phi-2 model.

    Builds the prompt with ``build_prompt``, samples up to ``MAX_NEW_TOKENS``
    tokens with ``phi_model.generate`` and decodes only the newly generated
    part.

    Args:
        chunk: Passage text to generate questions and answers from.

    Returns:
        The stripped generated text, or ``""`` when generation raised an
        exception or produced no text. This is best-effort: generation errors
        are printed, not re-raised, so one bad chunk does not stop the loop.
    """
    prompt = build_prompt(chunk)

    # Reserve room for the generated tokens: prompt length plus MAX_NEW_TOKENS
    # should stay within the model's context window. The limit is read from
    # the model config, falling back to the previously hard-coded 2048.
    context_window = getattr(phi_model.config, "max_position_embeddings", 2048)
    max_prompt_tokens = max(1, context_window - MAX_NEW_TOKENS)

    # Tokenize
    inputs = phi_tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_prompt_tokens
    ).to(phi_model.device)
    prompt_length = inputs['input_ids'].shape[1]

    try:
        # Direct model call, no pipeline wrapper
        with torch.no_grad():
            output_ids = phi_model.generate(
                **inputs,
                max_new_tokens=MAX_NEW_TOKENS,
                temperature=0.3,
                top_p=0.92,
                do_sample=True,
                pad_token_id=phi_tokenizer.eos_token_id,
                eos_token_id=phi_tokenizer.eos_token_id
            )

        # Decode only the new part (everything after the prompt tokens)
        generated = phi_tokenizer.decode(
            output_ids[0][prompt_length:],
            skip_special_tokens=True
        ).strip()
    except Exception as e:
        print(f"Generation error: {e}")
        return ""

    if not generated:
        # Do not reference notebook globals here: the caller knows the chunk
        # index, this function does not.
        print("Warning: Empty output for this chunk")

    return generated

In [None]:
# =============================================
# STEP 8: GENERATE Q&A FROM CHUNKS
# =============================================
# Collect one raw generation per chunk. Chunks whose generation came back
# empty are skipped, so len(qa_pairs) can be smaller than the number of chunks
# processed; "chunk_id" keeps the original chunk index.
qa_pairs = []
print("Generating Q&A pairs from chunks...")
for i, chunk in enumerate(tqdm(chunks[:50])):  # Only the first 50 chunks; remove [:50] to process all
    raw_qa = generate_qa_from_chunk(chunk)
    if raw_qa:
        qa_pairs.append({"chunk_id": i, "chunk_text": chunk, "raw_qa": raw_qa})

print(f"Generated {len(qa_pairs)} raw Q&A blocks")



Generating Q&A pairs from chunks...


 62%|██████▏   | 31/50 [05:23<02:44,  8.66s/it]



100%|██████████| 50/50 [08:42<00:00, 10.45s/it]

Generated 49 raw Q&A blocks





In [None]:
# =============================================
# STEP 9: PARSE Q&A INTO STRUCTURED FORMAT
# =============================================
def parse_qa_block(block):
    """Parse generated text into ``(question, answer)`` tuples.

    A pair is recognised when a ``Q<n>`` line is immediately followed (blank
    lines ignored) by an ``A<n>`` line carrying the same number. Matching is
    case-insensitive and tolerant of spaces or ``:``, ``.``, ``-`` after the
    label. The question text is kept as written, including any question
    marks; a missing trailing ``?`` is accepted.

    Args:
        block: Raw model output.

    Returns:
        A list of ``(question, answer)`` string tuples in order of
        appearance. Pairs with an empty question or answer are skipped.
        Only the single line after each question is used as the answer.
    """
    # Relaxed matching: allow spaces, dots, missing ?, case-insensitive
    q_pattern = re.compile(r'^\s*Q\s*(\d+)[\s:.-]*(.*?)\s*$', re.IGNORECASE)
    a_pattern = re.compile(r'^\s*A\s*(\d+)[\s:.-]*\s*(.+)$', re.IGNORECASE)

    lines = [line.strip() for line in block.split('\n') if line.strip()]
    pairs = []
    i = 0
    while i < len(lines) - 1:
        q_match = q_pattern.match(lines[i])
        a_match = a_pattern.match(lines[i + 1])
        if q_match and a_match and q_match.group(1) == a_match.group(1):
            question = q_match.group(2).strip()
            answer = a_match.group(2).strip()
            if question and answer:  # Ensure non-blank
                pairs.append((question, answer))
            # Both lines are consumed even when the pair was blank.
            i += 2
            continue
        i += 1
    return pairs


# Build final JSON dataset: one record per parsed pair, with the chunk the
# pair was generated from attached as its supporting passage.
final_dataset = []
for item in qa_pairs:
    parsed = parse_qa_block(item["raw_qa"])
    for q, a in parsed:
        final_dataset.append({
            "question": q,
            "answer": a,
            "supporting_passages": [item["chunk_text"]]  # Original chunk as evidence
        })

print(f"Parsed {len(final_dataset)} real Q&A pairs with supporting passages")


# Answer text of the one-shot example embedded in build_prompt(). The model
# sometimes repeats that example in its output; such a pair is not derived
# from the chunk it would be attached to, so it must not enter the dataset.
# Keep this string in sync with the example in build_prompt().
FEW_SHOT_EXAMPLE_ANSWER = (
    "An account is classified as dormant if there are no customer-initiated "
    "transactions for a continuous period of twelve months."
)

# After building final_dataset
cleaned_dataset = []
for item in final_dataset:
    q = item["question"].strip()
    a = item["answer"].strip()
    # Remove entries with placeholders
    if "<question>" in q or "<answer>" in a or not q or not a:
        continue
    # Remove verbatim echoes of the prompt's example
    if a == FEW_SHOT_EXAMPLE_ANSWER:
        continue
    cleaned_dataset.append(item)

final_dataset = cleaned_dataset
print(f"After cleaning: {len(final_dataset)} valid Q&A pairs")


Parsed 111 real Q&A pairs with supporting passages
After cleaning: 99 valid Q&A pairs


In [None]:
# =============================================
# STEP 10: SAVE TO JSON (List of dictionaries)
# =============================================
# Output path for the Q&A dataset; also used by the download step.
json_output_file = "bank_qa_dataset.json"

# Serialise the whole dataset as one JSON array. ensure_ascii=False writes
# non-ASCII characters as-is instead of \u escapes.
with open(json_output_file, "w", encoding="utf-8") as f:
    json.dump(final_dataset, f, indent=2, ensure_ascii=False)

print(f"Saved {len(final_dataset)} real Q&A records → {json_output_file}")

# Optional: Show first 2 entries
# (each entry includes its full supporting passage, so this output is long)
print("\nSample JSON entries:")
for entry in final_dataset[:2]:
    print(json.dumps(entry, indent=2))



Saved 99 real Q&A records → bank_qa_dataset.json

Sample JSON entries:
{
  "question": "What is a dormant account",
  "answer": "An account is classified as dormant if there are no customer-initiated transactions for a continuous period of twelve months.",
  "supporting_passages": [
    "before issuing cheques or authorizing payments. Joint account holders must clearly communicate authority levels and transaction permissions. Customers are required to review account statements regularly and report discrepancies within thirty days. 6. Bank Responsibilities Amanah Bank shall process account opening applications within three business days of receiving complete documentation. The Bank will provide clear communication about account features, fees, interest rates, and terms and conditions at the time of account opening. Monthly statements will be provided through the customers preferred channel (physical or electronic) within five business days of month-end. The Bank shall maintain strict co

In [None]:
# =============================================
# STEP 11: DOWNLOAD FILES
# =============================================
# Trigger browser downloads of both artefacts. The corpus filename is
# hard-coded here and must match the default filename of
# save_cleaned_text_one_sentence_per_line().
files.download(json_output_file)
files.download("cleaned_bank_corpus.txt")

print("\nPipeline complete!")
print(f"→ {json_output_file} (real Q&A with supporting passages)")
print(f"→ cleaned_bank_corpus.txt (1 sentence per line)")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Pipeline complete!
→ bank_qa_dataset.json (real Q&A with supporting passages)
→ cleaned_bank_corpus.txt (1 sentence per line)
