In [None]:

import pdfplumber  # Importing the pdfplumber library for PDF processing
import openpyxl  # Importing the openpyxl library for Excel processing
from openpyxl import Workbook  # Importing Workbook from openpyxl for creating Excel workbooks

def extract_qa_to_excel_with_headers(pdf_path, excel_path):
    """
    Extracts 'Objection' / 'Response' pairs from a PDF and writes them to an Excel file.

    The text of all pages is concatenated and split on the literal keyword
    'Objection'; each section is then split on its first 'Response'. One row is
    written per pair, together with the PDF file name and the first line of the
    first page. Sections that contain no 'Response' are skipped.

    Args:
    pdf_path (str): Path to the source PDF file (forward or backward slashes).
    excel_path (str): Path for the output Excel file.
    """
    # Local import keeps this cell self-contained without touching the import lines.
    from pathlib import PureWindowsPath

    # PureWindowsPath accepts both slash styles, so the bare file name is
    # recovered even from a raw Windows path (splitting on '/' alone is not enough).
    filename = PureWindowsPath(pdf_path).name

    # Read every page once; guard against extract_text() returning None
    # (reported for text-less pages in older pdfplumber releases).
    with pdfplumber.open(pdf_path) as pdf:
        page_texts = [page.extract_text() or '' for page in pdf.pages]

    # First line of the first page becomes the "Header" column (empty for an empty PDF)
    first_line = page_texts[0].split('\n')[0] if page_texts else ''
    full_text = ''.join(text + '\n' for text in page_texts)

    # Create a new Excel workbook and write the column headers
    workbook = Workbook()
    sheet = workbook.active
    sheet.append(["File Name", "Header", "Question", "Answer"])

    # Everything before the first 'Objection' is preamble, hence the [1:]
    for section in full_text.split('Objection')[1:]:
        question, marker, answer = section.partition('Response')
        if not marker:
            # No 'Response' in this section: nothing to pair the objection with
            continue
        # NOTE: strip() also removes the whitespace that followed the keyword,
        # so the remaining text is glued directly onto 'Objection'.
        sheet.append([filename, first_line, 'Objection' + question.strip(), answer.strip()])

    # Save the workbook
    workbook.save(excel_path)

# Input PDF to parse and the spreadsheet that receives the extracted rows
original_pdf_path = r"D:\Scholarship\Objection database code\LA GL SERF 1234_v5.pdf"
excel_path = r"D:\Scholarship\Objection database code\test1.xlsx"

# Run the extraction for the paths configured above
extract_qa_to_excel_with_headers(pdf_path=original_pdf_path, excel_path=excel_path)


In [1]:
import pdfplumber
from openpyxl import Workbook

def extract_qa_to_excel_with_headers(pdf_path, excel_path):
    """
    Extracts 'Objection' / 'Response' pairs from a PDF and writes them to an Excel file.

    The text of all pages is concatenated and split on the literal keyword
    'Objection'. Within each section, the text before the first 'Response' is the
    question and the text after it is the answer (neither keyword is kept).
    Sections without a 'Response' are skipped. Every row also carries the PDF
    file name and the first line of the first page.

    Args:
    pdf_path (str): Path to the source PDF file (forward or backward slashes).
    excel_path (str): Path for the output Excel file.
    """
    # Local import keeps this cell self-contained without touching the import lines.
    from pathlib import PureWindowsPath

    # PureWindowsPath accepts both slash styles, so the bare file name is
    # recovered even from a raw Windows path (splitting on '/' alone is not enough).
    filename = PureWindowsPath(pdf_path).name

    # Read every page once; guard against extract_text() returning None
    # (reported for text-less pages in older pdfplumber releases).
    with pdfplumber.open(pdf_path) as pdf:
        page_texts = [page.extract_text() or '' for page in pdf.pages]

    # First line of the first page becomes the "Header" column (empty for an empty PDF)
    first_line = page_texts[0].split('\n')[0] if page_texts else ''
    full_text = ''.join(text + '\n' for text in page_texts)

    # Create a new Excel workbook and write the column headers
    workbook = Workbook()
    sheet = workbook.active
    sheet.append(["File Name", "Header", "Question", "Answer"])

    # Everything before the first 'Objection' is preamble, hence the [1:]
    for section in full_text.split("Objection")[1:]:
        question, marker, answer = section.partition("Response")
        if marker:
            # Include filename, first line, question, and answer in each row
            sheet.append([filename, first_line, question.strip(), answer.strip()])

    # Save the workbook
    workbook.save(excel_path)

# Input PDF to parse and the spreadsheet that receives the extracted rows
original_pdf_path = r"D:\Scholarship\Objection database code\LA GL SERF 1234_v4.pdf"
excel_path = r"D:\Scholarship\Objection database code\test1.xlsx"

# Run the extraction for the paths configured above
extract_qa_to_excel_with_headers(pdf_path=original_pdf_path, excel_path=excel_path)


In [2]:
import pdfplumber
from openpyxl import Workbook

def extract_qa_to_excel_with_headers(pdf_path, excel_path):
    """
    Extracts 'Objection' / 'Response' pairs from a PDF and writes them to an Excel file.

    The text of all pages is concatenated and split on the literal 'Objection '
    (keyword plus one space). Within each section, the text before the first
    'Response' is the question; the answer starts at 'Response' and therefore
    keeps that keyword. Sections without a 'Response' are skipped. Every row also
    carries the PDF file name and the first line of the first page as header.

    Args:
    pdf_path (str): Path to the source PDF file (forward or backward slashes).
    excel_path (str): Path for the output Excel file.
    """
    # Local import keeps this cell self-contained without touching the import lines.
    from pathlib import PureWindowsPath

    # PureWindowsPath accepts both slash styles, so the bare file name is
    # recovered even from a raw Windows path (splitting on '/' alone is not enough).
    filename = PureWindowsPath(pdf_path).name

    # Read every page once; guard against extract_text() returning None
    # (reported for text-less pages in older pdfplumber releases).
    with pdfplumber.open(pdf_path) as pdf:
        page_texts = [page.extract_text() or '' for page in pdf.pages]

    # First line of the first page becomes the "Header" column (empty for an empty PDF)
    header = page_texts[0].split('\n')[0] if page_texts else ''
    full_text = ''.join(text + '\n' for text in page_texts)

    # Create a new Excel workbook and write the column headers
    workbook = Workbook()
    sheet = workbook.active
    sheet.append(["File Name", "Header", "Question", "Answer"])

    # Everything before the first 'Objection ' is preamble, hence the [1:]
    for section in full_text.split("Objection ")[1:]:
        response_index = section.find("Response")
        if response_index == -1:
            # No 'Response' in this section: nothing to pair the objection with
            continue
        question = section[:response_index].strip()
        # Slice from the keyword itself so the answer keeps its 'Response' label
        answer = section[response_index:].strip()
        sheet.append([filename, header, question, answer])

    # Save the workbook
    workbook.save(excel_path)

# Input PDF to parse and the spreadsheet that receives the extracted rows
original_pdf_path = r"D:\Scholarship\Objection database code\LA GL SERF 1234_v4.pdf"
excel_path = r"D:\Scholarship\Objection database code\test1.xlsx"

# Run the extraction for the paths configured above
extract_qa_to_excel_with_headers(pdf_path=original_pdf_path, excel_path=excel_path)


In [4]:
# Working version: splits the text on the literal markers 'Objection:' and 'Response:'

def extract_qa_to_excel_with_headers(pdf_path, excel_path):
    """
    Extracts 'Objection:' / 'Response:' pairs from a PDF and writes them to an Excel file.

    The text of all pages is concatenated and split on the literal 'Objection:';
    each section is then split on its first 'Response:'. One row is written per
    pair, together with the PDF file name and the first line of the first page.
    Sections that contain no 'Response:' are skipped.

    Args:
    pdf_path (str): Path to the source PDF file (forward or backward slashes).
    excel_path (str): Path for the output Excel file.
    """
    # Local import keeps this cell self-contained without touching the import lines.
    from pathlib import PureWindowsPath

    # PureWindowsPath accepts both slash styles, so the bare file name is
    # recovered even from a raw Windows path (splitting on '/' alone is not enough).
    filename = PureWindowsPath(pdf_path).name

    # Read every page once; guard against extract_text() returning None
    # (reported for text-less pages in older pdfplumber releases).
    with pdfplumber.open(pdf_path) as pdf:
        page_texts = [page.extract_text() or '' for page in pdf.pages]

    # First line of the first page becomes the "Header" column (empty for an empty PDF)
    first_line = page_texts[0].split('\n')[0] if page_texts else ''
    full_text = ''.join(text + '\n' for text in page_texts)

    # Create a new Excel workbook and write the column headers
    workbook = Workbook()
    sheet = workbook.active
    sheet.append(["File Name", "Header", "Question", "Answer"])

    # Everything before the first 'Objection:' is preamble, hence the [1:]
    for section in full_text.split('Objection:')[1:]:
        question, marker, answer = section.partition('Response:')
        if not marker:
            # No 'Response:' in this section: nothing to pair the objection with
            continue
        # NOTE: strip() also removes the whitespace that followed the marker,
        # so the remaining text is glued directly onto 'Objection:'.
        sheet.append([filename, first_line, 'Objection:' + question.strip(), answer.strip()])

    # Save the workbook
    workbook.save(excel_path)

# Input PDF to parse and the spreadsheet that receives the extracted rows
original_pdf_path = r"D:\Scholarship\Objection database code\LA GL SERF 1234_v6.pdf"
excel_path = r"D:\Scholarship\Objection database code\test1.xlsx"

# Run the extraction for the paths configured above
extract_qa_to_excel_with_headers(pdf_path=original_pdf_path, excel_path=excel_path)


In [16]:
# Regex version: recognises numbered markers such as 'Objection 1:' and 'Response 1:'


import re  # Importing the regular expression module

def extract_qa_to_excel_with_headers(pdf_path, excel_path):
    """
    Extracts numbered 'Objection N:' / 'Response N:' pairs from a PDF and writes them to an Excel file.

    The text of all pages is concatenated and split wherever 'Objection',
    whitespace, one or more digits and a colon occur. Within each section, the
    text before the first 'Response <number>:' marker is the question and the
    text after it is the answer (the markers themselves are not kept). Sections
    without such a marker are skipped. Every row also carries the PDF file name
    and the first line of the first page.

    Args:
    pdf_path (str): Path to the source PDF file (forward or backward slashes).
    excel_path (str): Path for the output Excel file.
    """
    # Local import keeps this cell self-contained without touching the import lines.
    from pathlib import PureWindowsPath

    # PureWindowsPath accepts both slash styles, so the bare file name is
    # recovered even from a raw Windows path (splitting on '/' alone is not enough).
    filename = PureWindowsPath(pdf_path).name

    # Read every page once; guard against extract_text() returning None
    # (reported for text-less pages in older pdfplumber releases).
    with pdfplumber.open(pdf_path) as pdf:
        page_texts = [page.extract_text() or '' for page in pdf.pages]

    # First line of the first page becomes the "Header" column (empty for an empty PDF)
    first_line = page_texts[0].split('\n')[0] if page_texts else ''
    full_text = ''.join(text + '\n' for text in page_texts)

    # Create a new Excel workbook and write the column headers
    workbook = Workbook()
    sheet = workbook.active
    sheet.append(["File Name", "Header", "Question", "Answer"])

    # The first split element is the text before the first objection marker,
    # so it can never be a question/answer pair and is skipped.
    sections = re.split(r'Objection\s+\d+:', full_text)[1:]
    for section in sections:
        # '\d+' (not '\d') so that multi-digit markers such as 'Response 10:' match too
        match = re.search(r'Response\s+\d+:', section)
        if match:
            question = section[:match.start()].strip()
            answer = section[match.end():].strip()
            # Include filename, first line, question, and answer in each row
            sheet.append([filename, first_line, question, answer])

    # Save the workbook
    workbook.save(excel_path)

# Input PDF to parse and the spreadsheet that receives the extracted rows
original_pdf_path = r"D:\Scholarship\Objection database code\LA GL SERF 1234_v5.pdf"
excel_path = r"D:\Scholarship\Objection database code\test1.xlsx"

# Run the extraction for the paths configured above
extract_qa_to_excel_with_headers(pdf_path=original_pdf_path, excel_path=excel_path)


In [17]:
import sys
import subprocess

# Function to get Python version
def get_python_version():
    """Return the running interpreter's version string, exactly as given by sys.version."""
    version_string = sys.version
    return version_string

# Function to get installed packages
def get_installed_packages():
    """
    Return the output of `pip freeze` for the current interpreter as a list of lines.

    pip is invoked through sys.executable so that the packages reported belong
    to the interpreter running this code. Line endings are removed and blank
    lines are dropped, so every entry is one non-empty requirement line.

    Returns:
    list[str]: One entry per installed package; an empty list if pip could not
    be run (the error is printed rather than raised).
    """
    try:
        raw_output = subprocess.check_output([sys.executable, '-m', 'pip', 'freeze']).decode()
    except Exception as e:
        # Best effort: report the problem and let the caller carry on with no packages
        print("Error:", e)
        return []
    # splitlines() handles both '\n' and '\r\n', unlike split('\n'), which leaves a
    # stray '\r' on Windows output and an empty string after the final newline.
    return [line for line in raw_output.splitlines() if line.strip()]

# Function to generate requirements.txt
def generate_requirements_txt(file_path='requirements.txt'):
    """
    Write the installed packages reported by get_installed_packages() to a file.

    Each package goes on its own line. Entries are stripped of surrounding
    whitespace (including a stray carriage return) and empty entries are
    skipped, so the file contains no blank lines.

    Args:
    file_path (str): Destination file; defaults to 'requirements.txt' in the
    current working directory. An existing file is overwritten.
    """
    installed_packages = get_installed_packages()
    requirement_lines = [package.strip() for package in installed_packages if package.strip()]
    # Explicit encoding so the result does not depend on the platform's locale default
    with open(file_path, 'w', encoding='utf-8') as f:
        f.writelines(line + '\n' for line in requirement_lines)
    # Report the file actually written (identical to the old message for the default path)
    print(f"{file_path} generated successfully.")

# Report which interpreter this notebook is running on
print(f"Python version: {get_python_version()}")

# List the installed packages, one per line, under a heading
print("Installed packages:")
installed_packages = get_installed_packages()
for package in installed_packages:
    print(package)

# Write the same package list to requirements.txt in the working directory
generate_requirements_txt()


Python version: 3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]
Installed packages:
aiofiles==23.1.0
aiohttp==3.8.4
aiosignal==1.3.1
alabaster @ file:///home/ktietz/src/ci/alabaster_1611921544520/work
altair==4.2.2
anaconda-client==1.7.2
anaconda-navigator==2.0.3
anaconda-project @ file:///tmp/build/80754af9/anaconda-project_1610472525955/work
anyio==3.6.2
appdirs==1.4.4
argcomplete==1.10.3
argh==0.26.2
argon2-cffi @ file:///C:/ci/argon2-cffi_1613037959010/work
asn1crypto @ file:///tmp/build/80754af9/asn1crypto_1596577642040/work
astroid @ file:///C:/ci/astroid_1613501047216/work
astropy @ file:///C:/ci/astropy_1617745647203/work
async-generator @ file:///home/ktietz/src/ci/async_generator_1611927993394/work
async-timeout==4.0.2
atomicwrites==1.4.0
attrs @ file:///tmp/build/80754af9/attrs_1604765588209/work
autopep8 @ file:///tmp/build/80754af9/autopep8_1615918855173/work
Babel @ file:///tmp/build/80754af9/babel_1607110387436/work
backcall @ file:///home/ktietz/src/c