In [14]:
import fitz  # PyMuPDF
import re
import json
import random
import nltk
from openai import OpenAI
from markdownify import markdownify as md
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from autogen_ext.models.openai import AzureOpenAIChatCompletionClient
import os
from dotenv import load_dotenv
from pathlib import Path
from autogen_core.models import ModelInfo
from autogen_ext.models.openai import OpenAIChatCompletionClient
load_dotenv()
from autogen_agentchat.agents import AssistantAgent
from autogen_core import CancellationToken
from autogen_core.models import UserMessage
from autogen_agentchat.messages import MultiModalMessage
from pydantic import BaseModel
from autogen_agentchat.messages import TextMessage
from typing import List
gemini_api_key = os.environ["GEMINI_API_KEY"]



In [15]:
def extract_text_from_folder(folder_path):
    
    full_text =""
    for file_name in os.listdir(folder_path):
        if file_name.lower().endswith(".pdf"):
            pdf_path = os.path.join(folder_path, file_name)
            print(f"📄 Processing: {file_name}")
            
            doc = fitz.open(pdf_path)
            full_text += "\n".join(page.get_text() for page in doc)
            print(f"✅ Extracted from {file_name}: {len(full_text)} characters.")
            
            
    
    return full_text

def clean_text(text):
    # Merge broken lines
    text = re.sub(r'\n(?=[a-z])', ' ', text)  # join words split by line breaks
    # Remove multiple newlines
    text = re.sub(r'\n+', '\n', text)
    return text.strip()
    
# Step 2: Chunk text into sections
def chunk_text(text, min_words=200, max_words=400):
    paragraphs = re.split(r"\n\s*\n", text)
    chunks = []
    for p in paragraphs:
        words = p.split()
        if min_words <= len(words) <= max_words:
            chunks.append(p.strip())
    return chunks

In [16]:
async def generate_qa_from_chunk(chunk, agent):
    text_message = TextMessage(content=chunk, source="User")
    result = await agent.on_messages(
            [text_message], 
            cancellation_token=CancellationToken()
        )
    response = result.chat_message.content
    response = response.model_dump()
    return response



In [17]:
import asyncio


async def build_dataset(chunks, client):
    dataset = []
    for chunk in chunks:
        #try:
            qas = await generate_qa_from_chunk(chunk, client)
            qas = qas["output"]
            
            
            for qa in qas:
                if isinstance(qa, str):
                    qas_parsed = json.loads(qa)
                else:
                    qas_parsed = qa
                dataset.append({
                    "context": chunk,
                    "question": qas_parsed["question"],
                    "answer": qas_parsed["answer"]
                })
                await asyncio.sleep(10)
        # except Exception as e:
        #     print("Error processing chunk:", e)
    return dataset

In [18]:
def split_dataset(data, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15):
    assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-6, "Ratios must sum to 1"
    
    random.shuffle(data)
    
    n_total = len(data)
    n_train = int(n_total * train_ratio)
    n_val = int(n_total * val_ratio)
    
    train_data = data[:n_train]
    val_data = data[n_train:n_train + n_val]
    test_data = data[n_train + n_val:]
    
    return train_data, val_data, test_data

# Step 6: Save to JSONL
def save_dataset(train, val, test,prefix="egypt_pdf_qa"):
    with open(f"{prefix}_train.jsonl", "a", encoding="utf-8") as f:
        for item in train:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")
    with open(f"{prefix}_val.jsonl", "a", encoding="utf-8") as f:
        for item in val:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")
    with open(f"{prefix}_test.jsonl", "a", encoding="utf-8") as f:
        for item in test:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")
    print(f"✅ Saved {len(train)} train and {len(val)} validation samples.")


In [19]:
gemini_client = OpenAIChatCompletionClient(
        model="gemini-2.5-flash",
        api_key=gemini_api_key,
        model_info=ModelInfo(vision=True, function_calling=True, json_output=True, family="unknown", structured_output=True)
        )
    

In [20]:
class Qustion(BaseModel):
    question: str
    answer : str

In [21]:

class output(BaseModel):
    output : List[Qustion]

In [22]:
history_Specialist = AssistantAgent(
    name="history_Speciallist_agent",
    model_client=gemini_client,
    system_message="""You are a history expert assistant. From the paragraph , generate 2–3 factual question–answer pairs.
    -only make qustions relate to history.
    """
    ,
    output_content_type=output
    )

In [23]:

pdf_path = "pdf_data"  # Path to your local PDF

text = extract_text_from_folder(pdf_path)
text = clean_text(text)




📄 Processing: 05. A Short History of Egypt – to About 1970 author Aerospace Computing Lab.pdf
✅ Extracted from 05. A Short History of Egypt – to About 1970 author Aerospace Computing Lab.pdf: 149972 characters.


In [24]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size=2500,  # characters or tokens approx
    chunk_overlap=500
)
chunks = splitter.split_text(text)
print(f"📄 Extracted {len(chunks)} chunks from PDF")

📄 Extracted 78 chunks from PDF


In [12]:
chunks[100]

'Women  had rights  that many of their  ancient counterparts  didn’t, including the  right to own property  and demand  divorce\nScientists were unable to confirm  they had found Hatshepsut’s  mummy until they identified one  of her teeth from a separate find\nThe Mortuary Temple of Hatshepsut  is located on the west bank of the \nNile, in the Valley of the Kings\n67\nHer stepson Thutmose III went on to rule  for a further 30 years, proving to be a similarly  ambitious builder and a mighty warrior. He led 17  campaigns in enemy-held territory, and conquered  land as far north as Syria and as far south as the \nFourth Cataract of the Nile. Meanwhile, the relics  of Hatshepsut’s reign continued to stand proud on  the Egyptian skyline, her towering obelisks and  imposing statues casting a shadow in her memory  upon the land she once called hers. \nHowever, towards the end of Thutmose’s regency,  he ordered that his stepmother’s cartouches and  images be chiselled away, and her statues tor

In [13]:
async def process_in_batches(chunks, client, batch_size=50):
    for start in range(160, len(chunks), batch_size):
        end = start + batch_size
        batch_chunks = chunks[start:end]
        
        print(f"\nProcessing chunks {start} to {end}...")
        dataset = await build_dataset(batch_chunks, client)
        
        train, val, test = split_dataset(dataset)
        save_dataset(train, val, test)
        
        print(f"✅ Saved dataset for chunks {start} to {end}")

# Example run
await process_in_batches(chunks, history_Specialist,10)



Processing chunks 160 to 170...
✅ Saved 21 train and 4 validation samples.
✅ Saved dataset for chunks 160 to 170

Processing chunks 170 to 180...
✅ Saved 21 train and 4 validation samples.
✅ Saved dataset for chunks 170 to 180

Processing chunks 180 to 190...
✅ Saved 21 train and 4 validation samples.
✅ Saved dataset for chunks 180 to 190

Processing chunks 190 to 200...
✅ Saved 21 train and 4 validation samples.
✅ Saved dataset for chunks 190 to 200

Processing chunks 200 to 210...
✅ Saved 21 train and 4 validation samples.
✅ Saved dataset for chunks 200 to 210

Processing chunks 210 to 220...
✅ Saved 21 train and 4 validation samples.
✅ Saved dataset for chunks 210 to 220

Processing chunks 220 to 230...
✅ Saved 21 train and 4 validation samples.
✅ Saved dataset for chunks 220 to 230

Processing chunks 230 to 240...
✅ Saved 21 train and 4 validation samples.
✅ Saved dataset for chunks 230 to 240

Processing chunks 240 to 250...
✅ Saved 8 train and 1 validation samples.
✅ Saved datas