# Turn any PDF into a Q&A dataset for fine-tuning an LLM

- I used a locally run Llama-3-8B-Instruct model, served through LM Studio, instead of the OpenAI API.

## Installing the libraries


In [None]:
%%capture
!pip install PyMuPDF
!pip install pytesseract
!pip install PIL
!pip install openai

## PDF to text

In [None]:
import io
import json
import re

import fitz  # PyMuPDF
import pytesseract
from PIL import Image


def convert_pdf_to_text(pdf_path):
    """Extract the text of a PDF, using OCR for pages without a text layer.

    Each page is first read with PyMuPDF's ``page.get_text()``. When that
    yields only whitespace, every image embedded in the page is run through
    ``pytesseract.image_to_string`` instead.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A single string containing the text of all pages, with every run of
        whitespace (including newlines) collapsed to one space and no
        leading or trailing whitespace.
    """
    parts = []  # collected per page/image; joined once at the end

    # The context manager closes the document even if extraction or OCR raises.
    with fitz.open(pdf_path) as document:
        for page in document:
            page_text = page.get_text()

            if page_text.strip():
                parts.append(page_text)
                continue

            # No text layer on this page: it may be a scanned/image-based
            # page, so OCR each embedded image instead.
            for img in page.get_images(full=True):
                xref = img[0]  # first field of the image entry is its xref
                image_bytes = document.extract_image(xref)["image"]

                with Image.open(io.BytesIO(image_bytes)) as image:
                    parts.append(pytesseract.image_to_string(image))

    # str.split() with no arguments also discards leading/trailing whitespace,
    # so no separate strip() is needed.
    return " ".join("".join(parts).split())

In [None]:
# Usage
# Path of the PDF to convert. The "/content/" prefix suggests a hosted
# notebook workspace -- adjust it to wherever your PDF lives.
pdf_path = "/content/Constitution-of-Nepal.pdf"

In [None]:
# Extract the PDF's text once; later cells read it from the global `text`.
text = convert_pdf_to_text(pdf_path )

In [None]:
# Sanity check: show the first 100 characters of the extracted text.
print(text[:100])

# Main body: text to Q&A response.json

In [None]:
# Set up the OpenAI-compatible client and the shared conversation history
# that run() reads and extends.
from openai import OpenAI

# Point to the local server
# NOTE(review): the api_key value is presumably a placeholder because the
# server is local -- confirm against your LM Studio setup.
client = OpenAI(base_url="http://localhost:1234/v1", api_key="lm-studio")

# Conversation log. It starts with only the system prompt, which instructs
# the model to answer with a single question/answer pair as JSON and nothing
# else; run() appends every user message and assistant reply to it.
history = [
    {"role": "system", "content": "You are an API that converts bodies of text into a single question and answer into a JSON format from the text provided by user. Each JSON contains a single question with a single answer. Only respond with the JSON and no additional text."},
]

def run(user_input):
    """Send ``user_input`` to the local model and return its streamed reply.

    The user message and the assistant's reply are both appended to the
    module-level ``history``. Reply text is echoed to stdout as it arrives.

    Args:
        user_input: Text to send as the next user message.

    Returns:
        The full assistant reply as one string (empty if the stream carried
        no content).
    """
    history.append({"role": "user", "content": user_input})

    # Cap the prompt at four messages: the first two history entries plus
    # the two most recent ones (the last of which is the message just added).
    # NOTE(review): history[:2] is the system prompt and the *first* user
    # message, without that message's reply -- confirm this window is the
    # intended context and not, e.g., history[:1] or history[:3].
    if len(history) > 4:
        query = history[:2] + history[-2:]
    else:
        query = history

    completion = client.chat.completions.create(
        model="lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF",
        messages=query,
        temperature=0.7,
        stream=True,
    )

    pieces = []
    for chunk in completion:
        # A streamed chunk can arrive with an empty `choices` list (the
        # OpenAI API documents this for usage-only chunks); indexing [0]
        # on it would raise IndexError.
        if not chunk.choices:
            continue

        delta = chunk.choices[0].delta.content
        if delta:
            print(delta, end="", flush=True)
            pieces.append(delta)

    new_message = {"role": "assistant", "content": "".join(pieces)}
    history.append(new_message)

    return new_message["content"]


In [None]:
def chunks(lst, n):
    """Lazily yield consecutive slices of ``lst`` that are ``n`` items long.

    Works on anything that supports ``len()`` and slicing (lists, strings,
    ...). The final slice is shorter when ``len(lst)`` is not a multiple of
    ``n``.
    """
    yield from (lst[start:start + n] for start in range(0, len(lst), n))

def is_json(data):
    """Report whether ``data`` can be parsed by ``json.loads``.

    Returns True when parsing succeeds and False when it raises
    ``ValueError``; any other exception propagates to the caller.
    """
    try:
        json.loads(data)
    except ValueError:
        return False
    else:
        return True

def _extract_json(response):
    """Return the JSON value found in ``response``, or None if there is none.

    Candidates are tried in order: the whole reply, the body of a
    triple-backtick fenced block (with an optional ``json`` tag), and the
    text between the first pair of single back-ticks.
    """
    candidates = [response]

    # Fenced code block. The single-backtick pattern below cannot handle
    # this form: its non-greedy match stops between the first two
    # back-ticks of the fence and captures an empty string.
    fenced = re.search(r"```(?:json)?\s*(.*?)```", response, re.S | re.I)
    if fenced:
        candidates.append(fenced.group(1))

    inline = re.search(r'`(.*?)`', response, re.S)
    if inline:
        candidates.append(inline.group(1))

    for candidate in candidates:
        try:
            return json.loads(candidate)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
    return None


def submit_to_api(chunk, retries=3):
    """Ask the model for a Q&A JSON value for ``chunk``, retrying on failure.

    Each attempt calls ``run(chunk)`` and tries to pull a JSON value out of
    the reply. An attempt fails when the request raises an OpenAI client
    error or when no JSON can be extracted from the reply.

    Args:
        chunk: Text to send to the model.
        retries: Maximum number of attempts.

    Returns:
        The parsed JSON value from the first successful attempt, or None
        when every attempt failed (a message is printed in that case).
    """
    # Imported here so the function does not depend on another cell having
    # imported the exception class. OpenAIError is the base class of the
    # errors raised by the openai client; `requests` exceptions are never
    # raised by it.
    from openai import OpenAIError

    for attempt in range(1, retries + 1):
        try:
            response = run(chunk)
        except OpenAIError as exc:
            print(f"Request failed: {exc}")
            continue

        # run() already echoes the reply to stdout, so it is not printed again.
        parsed = _extract_json(response)
        if parsed is not None:
            return parsed

        print(f"Attempt {attempt} failed. Retrying...")

    print("Max retries exceeded. Skipping this chunk.")
    return None


# Size of each slice of `text` sent to the model. `text` is a string, so
# this counts characters, not words or tokens.
CHUNK_SIZE = 50

all_chunks = list(chunks(text, CHUNK_SIZE))

# Collect the parsed reply for every chunk; chunks for which
# submit_to_api() gave up (returned None) are skipped.
responses = []

for chunk in all_chunks:
    response = submit_to_api(chunk)
    if response is not None:
        responses.append(response)

# Write responses to a JSON file. UTF-8 is set explicitly so the result does
# not depend on the platform default, and ensure_ascii=False keeps
# non-ASCII text readable instead of \u-escaped.
with open('response.json', 'w', encoding='utf-8') as f:
    json.dump(responses, f, ensure_ascii=False)