# In [None]:

import os  # Importing os for portable path handling

import pdfplumber  # Importing the pdfplumber library for PDF processing
import openpyxl  # Importing the openpyxl library for Excel processing
from openpyxl import Workbook  # Importing Workbook from openpyxl for creating Excel workbooks

def extract_qa_to_excel_with_headers(pdf_path, excel_path):
    """
    Extract question/answer pairs from a PDF and write them to an Excel file.

    The text of every page is concatenated and split on the literal marker
    'Question'; each resulting section is then split once on the literal
    marker 'Answer'. One row is written per section, with the PDF file name
    and the first text line of the first page repeated on every row.

    A section that contains no 'Answer' marker is written with an empty
    answer cell. A PDF with no pages, or whose pages yield no text, produces
    a workbook that contains only the header row.

    Args:
    pdf_path (str): Path to the source PDF file.
    excel_path (str): Path for the output Excel file.
    """
    # basename follows the platform's path separator rules instead of
    # assuming '/', and also accepts path-like objects.
    filename = os.path.basename(pdf_path)

    # Pull all page text out first so the PDF handle is released before the
    # workbook is built and saved.
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may return None for a page with no extractable text
        # (depends on the pdfplumber version); normalise that to ''.
        page_texts = [page.extract_text() or '' for page in pdf.pages]

    # First text line of the first page; empty when the PDF has no pages.
    first_line = page_texts[0].split('\n')[0] if page_texts else ''

    # Every page's text is followed by a newline, as a page separator.
    full_text = ''.join(text + '\n' for text in page_texts)

    # Create a new Excel workbook and write the column headers
    workbook = Workbook()
    sheet = workbook.active
    sheet.append(["File Name", "Header", "Question", "Answer"])

    # Everything before the first 'Question' marker is preamble, so skip it.
    for section in full_text.split('Question')[1:]:
        # partition() splits on the first 'Answer' only and, unlike
        # unpacking split(), does not raise when the marker is missing.
        question, _, answer = section.partition('Answer')
        # NOTE(review): strip() also removes the whitespace right after the
        # 'Question' marker, so "Question 1" is stored as "Question1".
        # Kept as-is to preserve existing output; confirm whether intended.
        question = 'Question' + question.strip()
        answer = answer.strip()
        # Include filename, first line, question, and answer in each row
        sheet.append([filename, first_line, question, answer])

    # Save the workbook
    workbook.save(excel_path)

# Source PDF to read and destination workbook to write
original_pdf_path = '/mnt/data/LA GL SERF 1234_v4.pdf'
excel_path = '/mnt/data/extracted_qa_with_headers.xlsx'

# Run the extraction for the paths configured above
extract_qa_to_excel_with_headers(
    pdf_path=original_pdf_path,
    excel_path=excel_path,
)
