In [15]:
import fitz  # PyMuPDF
from langchain.text_splitter import RecursiveCharacterTextSplitter
from colorama import Fore, Style
from pydantic import BaseModel
from typing import List
import json
import os

In [16]:
import sys
sys.path.append('..')

from scripts.llm_call import llm_call

In [17]:
class QAPair(BaseModel):
    question: str
    answer: str

In [18]:
class GeneratedData(BaseModel):
    generated: List[QAPair]

In [19]:
print("QAPair schema:", QAPair.model_json_schema())
print("\nGeneratedData schema:", GeneratedData.model_json_schema())

QAPair schema: {'properties': {'question': {'title': 'Question', 'type': 'string'}, 'answer': {'title': 'Answer', 'type': 'string'}}, 'required': ['question', 'answer'], 'title': 'QAPair', 'type': 'object'}

GeneratedData schema: {'$defs': {'QAPair': {'properties': {'question': {'title': 'Question', 'type': 'string'}, 'answer': {'title': 'Answer', 'type': 'string'}}, 'required': ['question', 'answer'], 'title': 'QAPair', 'type': 'object'}}, 'properties': {'generated': {'items': {'$ref': '#/$defs/QAPair'}, 'title': 'Generated', 'type': 'array'}}, 'required': ['generated'], 'title': 'GeneratedData', 'type': 'object'}


In [24]:
source_pdf = "../data/bloomberg-terminal-1.pdf"
output_file = "../data/generated_qa_pairs.json"

generator_model = "ollama/llama3.1:8b"

print(f"Source PDF: {source_pdf}")
print(f"Output JSON: {output_file}")
print(f"Generator Model: {generator_model}")

# Check if source PDF exists
if os.path.exists(source_pdf):
    print(f"✅ Source PDF found!")
else:
    print(f"❌ Source PDF not found at {source_pdf}")

Source PDF: ../data/bloomberg-terminal-1.pdf
Output JSON: ../data/generated_qa_pairs.json
Generator Model: ollama/llama3.1:8b
✅ Source PDF found!


In [44]:
def create_prompt(data_chunk, num_records=5):
    """Create the prompt template for generating Q&A pairs"""
    return f"""You are an expert data curator assisting a machine learning engineer in creating a high-quality instruction tuning dataset. Your task is to transform 
    the provided data chunk into diverse question and answer (Q&A) pairs that will be used to fine-tune a language model for Bloomberg Terminal usage. 
    For each of the {num_records} entries, generate one or two well-structured questions that reflect different aspects of the information in the chunk. 
    Ensure a mix of longer and shorter questions. Each Q&A pair should be concise yet informative, capturing key insights from the data.
    Structure your output in JSON format, where each object contains 'question' and 'answer' fields. The JSON structure should look like this:
        "question": "Your question here...",
        "answer": "Your answer here..."
    Focus on creating clear, relevant, and varied questions that encourage the model to learn from diverse perspectives. Avoid any sensitive or biased 
    content, ensuring answers are accurate and neutral. You are allowed to use your own knowledge to add context and enhance the answers, but do not fabricate information.
    
    Data Chunk:
    ---
    {data_chunk}
    ---
    """

In [45]:
# Test the prompt function with sample data
sample_prompt = create_prompt("This is sample Bloomberg Terminal data about trading.", 3)
print("Sample prompt preview:")
print(sample_prompt + "...")

Sample prompt preview:
You are an expert data curator assisting a machine learning engineer in creating a high-quality instruction tuning dataset. Your task is to transform 
    the provided data chunk into diverse question and answer (Q&A) pairs that will be used to fine-tune a language model for Bloomberg Terminal usage. 
    For each of the 3 entries, generate one or two well-structured questions that reflect different aspects of the information in the chunk. 
    Ensure a mix of longer and shorter questions. Each Q&A pair should be concise yet informative, capturing key insights from the data.
    Structure your output in JSON format, where each object contains 'question' and 'answer' fields. The JSON structure should look like this:
        "question": "Your question here...",
        "answer": "Your answer here..."
    Focus on creating clear, relevant, and varied questions that encourage the model to learn from diverse perspectives. Avoid any sensitive or biased 
    content, en

In [46]:
print(f"Step 1: Loading and extracting text from PDF...")

# Initialize empty string to store all text
full_text = ""

# Open PDF and extract text from all pages
with fitz.open(source_pdf) as doc:
    print(f"PDF has {len(doc)} pages")
    
    for page_num, page in enumerate(doc):
        page_text = page.get_text()
        full_text += page_text
        print(f"Extracted text from page {page_num + 1}: {len(page_text)} characters")

print(f"\nTotal text extracted: {len(full_text)} characters")
print(f"First 300 characters preview:\n{full_text[:300]}...")




Step 1: Loading and extracting text from PDF...
PDF has 28 pages
Extracted text from page 1: 131 characters
Extracted text from page 2: 200 characters
Extracted text from page 3: 1027 characters
Extracted text from page 4: 758 characters
Extracted text from page 5: 651 characters
Extracted text from page 6: 596 characters
Extracted text from page 7: 1254 characters
Extracted text from page 8: 1885 characters
Extracted text from page 9: 1790 characters
Extracted text from page 10: 846 characters
Extracted text from page 11: 550 characters
Extracted text from page 12: 740 characters
Extracted text from page 13: 1428 characters
Extracted text from page 14: 581 characters
Extracted text from page 15: 1340 characters
Extracted text from page 16: 2100 characters
Extracted text from page 17: 1216 characters
Extracted text from page 18: 2310 characters
Extracted text from page 19: 1317 characters
Extracted text from page 20: 2445 characters
Extracted text from page 21: 889 characters
Extracted

In [47]:
print(f"Step 2: Splitting text into manageable chunks...")

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)

chunks = text_splitter.split_text(full_text)
print(f"Total chunks created: {len(chunks)}")
print(f"First chunk preview:\n{chunks[0][:300]}...")



Step 2: Splitting text into manageable chunks...
Total chunks created: 42
First chunk preview:
A Bloomberg Professional Services Offering
A Bloomberg Professional Services Offering
 Getting started on the 
Bloomberg Terminal.
Contents
02	 Bloomberg Terminal
06	 Functions & securities
09	 Navigation
13	 Performing analysis
17	 Exporting data
20	 Bloomberg Market Concepts (BMC)
21	 Getting help...


In [48]:
print(f"{Fore.YELLOW}Step 3: Initializing dataset storage...{Style.RESET_ALL}")

# Initialize empty dataset dictionary
dataset = {}

print(f"Dataset initialized. Will process {len(chunks)} chunks.")
print(f"Expected to generate ~{len(chunks) * 5} Q&A pairs total (5 per chunk)")

[33mStep 3: Initializing dataset storage...[0m
Dataset initialized. Will process 42 chunks.
Expected to generate ~210 Q&A pairs total (5 per chunk)


In [49]:
print(f"{Fore.YELLOW}Step 4: Generating Q&A pairs for each chunk...{Style.RESET_ALL}")

for i, chunk in enumerate(chunks):
    print(f"{Fore.MAGENTA}\n--- Processing Chunk {i+1}/{len(chunks)} ---{Style.RESET_ALL}")
    print(f"Chunk length: {len(chunk)} characters")
    print(f"Chunk preview: {chunk[:150]}...")

    prompt = create_prompt(chunk, num_records=5)

    try:
        data = llm_call(
            model=generator_model,
            prompt=prompt,
            json_schema=json.dumps(GeneratedData.model_json_schema())
        )
        
        # Check if data was generated successfully
        if data and "generated" in data:
            dataset[i] = {
                "generated": data["generated"], 
                "context": chunk
            }
            print(f"{Fore.GREEN}✅ Successfully generated {len(data['generated'])} Q&A pairs for chunk {i+1}{Style.RESET_ALL}")
            
            # Show a preview of generated Q&A pairs
            for j, qa_pair in enumerate(data["generated"][:2]):  # Show first 2 pairs
                print(f"  Q{j+1}: {qa_pair.get('question', 'N/A')[:100]}...")
                print(f"  A{j+1}: {qa_pair.get('answer', 'N/A')[:100]}...")
        else:
            print(f"{Fore.RED}❌ Failed to generate Q&A for chunk {i+1}{Style.RESET_ALL}")
            
    except Exception as e:
        print(f"{Fore.RED}❌ Error processing chunk {i+1}: {str(e)}{Style.RESET_ALL}")


[33mStep 4: Generating Q&A pairs for each chunk...[0m
[35m
--- Processing Chunk 1/42 ---[0m
Chunk length: 982 characters
Chunk preview: A Bloomberg Professional Services Offering
A Bloomberg Professional Services Offering
 Getting started on the 
Bloomberg Terminal.
Contents
02	 Bloomb...
[36m--- Calling model: ollama/llama3.1:8b ---[39m


[96m{
[39m[96m [39m[96m "[39m[96mgenerated[39m[96m":[39m[96m [
[39m[96m   [39m[96m {
[39m[96m     [39m[96m "[39m[96mquestion[39m[96m":[39m[96m "[39m[96mWhat[39m[96m is[39m[96m the[39m[96m primary[39m[96m purpose[39m[96m of[39m[96m the[39m[96m Bloomberg[39m[96m for[39m[96m Education[39m[96m program[39m[96m?",
[39m[96m     [39m[96m "[39m[96manswer[39m[96m":[39m[96m "[39m[96mB[39m[96mloomberg[39m[96m for[39m[96m Education[39m[96m has[39m[96m been[39m[96m committed[39m[96m to[39m[96m helping[39m[96m universities[39m[96m and[39m[96m colleges[39m[96m incorporate[39m[96m the[39m[96m Bloomberg[39m[96m Terminal[39m[96m into[39m[96m their[39m[96m academic[39m[96m programs[39m[96m to[39m[96m better[39m[96m prepare[39m[96m students[39m[96m for[39m[96m the[39m[96m global[39m[96m job[39m[96m market[39m[96m."
[39m[96m   [39m[96m },
[39m[96m   [39m[96m {
[39m[96m     [39m[

In [50]:
# Show final statistics
successful_chunks = len([k for k, v in dataset.items() if v.get("generated")])
total_qa_pairs = sum(len(v.get("generated", [])) for v in dataset.values())


print(f"\n{Fore.GREEN}Generation complete!{Style.RESET_ALL}")
print(f"Successfully processed: {successful_chunks}/{len(chunks)} chunks")
print(f"Total Q&A pairs generated: {total_qa_pairs}")


[32mGeneration complete![0m
Successfully processed: 20/42 chunks
Total Q&A pairs generated: 85
