# In [None]:
import pdfplumber
from openpyxl import Workbook

def _split_objection_sections(full_text):
    """Return (question, answer) pairs parsed from the combined PDF text.

    The text is split on the literal marker 'Objection '. The chunk before the
    first marker is discarded. In every remaining chunk, the text before the
    first 'Response' is the question and the text from 'Response' onwards is
    the answer; chunks without a 'Response' marker are skipped.
    """
    pairs = []
    for section in full_text.split("Objection ")[1:]:
        question, marker, rest = section.partition("Response")
        if not marker:
            # No 'Response' in this chunk, so there is no answer to pair up
            continue
        pairs.append((question.strip(), (marker + rest).strip()))
    return pairs


def extract_qa_to_excel_with_headers(pdf_path, excel_path):
    """
    Extract objection/response pairs from a PDF and write them to an Excel file.

    The text of all pages is concatenated and divided into sections on the
    literal marker 'Objection '. Each section that contains 'Response' becomes
    one row: the text before 'Response' is the question, the text from
    'Response' onwards is the answer. The first line of the first page is
    stored as the header on every row, together with the PDF's file name.

    Args:
        pdf_path (str): Path to the source PDF file.
        excel_path (str): Path for the output Excel file.
    """
    # Take the last path component; accept both '/' and '\\' separators so
    # Windows-style paths yield the file name rather than the whole path
    filename = pdf_path.replace('\\', '/').rsplit('/', 1)[-1]

    # Read every page once; `or ''` guards against a None result for pages
    # that have no extractable text
    with pdfplumber.open(pdf_path) as pdf:
        page_texts = [page.extract_text() or '' for page in pdf.pages]

    # The first line of the first page is used as the header for every row;
    # a PDF without pages simply gets an empty header
    header = page_texts[0].split('\n')[0] if page_texts else ''

    # Combine text from all pages, one trailing newline per page
    full_text = ''.join(text + '\n' for text in page_texts)

    # Create a new Excel workbook and select the active sheet
    workbook = Workbook()
    sheet = workbook.active

    # Set column headers in the Excel file
    sheet.append(["File Name", "Header", "Question", "Answer"])

    # Include filename, header, question, and answer in each row
    for question, answer in _split_objection_sections(full_text):
        sheet.append([filename, header, question, answer])

    # Save the workbook
    workbook.save(excel_path)

# Source PDF to parse and the workbook that will receive the extracted rows
original_pdf_path = r"D:\Scholarship\Objection database code\LA GL SERF 1234_v4.pdf"
excel_path = r"D:\Scholarship\Objection database code\test1.xlsx"

# Run the extraction, naming each argument explicitly
extract_qa_to_excel_with_headers(
    pdf_path=original_pdf_path,
    excel_path=excel_path,
)