In [1]:
# Import libraries for PDF form field extraction
import PyPDF2
import pdfplumber
import pandas as pd

# Location of the PDF (with form fields) that every extraction cell below reads.
# NOTE(review): this is an absolute, machine-specific path — point it at your own
# copy of the file before running the notebook elsewhere.
path = r"C:\Users\enfxm\Desktop\Python Testing\Test Report Template\Test-Report-Generator\fourth_i_guess\pdf_form_test1_filled.pdf"

# Confirm the setup cell ran and echo which file will be processed.
print("Libraries imported successfully!", f"PDF path: {path}", sep="\n")



Libraries imported successfully!
PDF path: C:\Users\enfxm\Desktop\Python Testing\Test Report Template\Test-Report-Generator\fourth_i_guess\pdf_form_test1_filled.pdf


In [2]:
# Method 1: Using PyPDF2 to extract form field data
def extract_form_fields_pypdf2(pdf_path):
    """Extract top-level AcroForm field names and values using PyPDF2.

    Opens the PDF, looks up ``/AcroForm`` in the document catalog
    (``trailer['/Root']``) and records the ``/T`` (name) and ``/V`` (value)
    entries of every field listed directly in ``/Fields``. Fields without a
    ``/T`` entry are skipped, and child fields under ``/Kids`` are not
    descended into. Every recorded field is also printed.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        dict mapping field name to field value; fields that have no ``/V``
        entry map to the string ``'No value'``. The dict is empty when the
        PDF has no form fields or when reading fails (the error is printed,
        not raised). Returns ``None`` when the PDF is encrypted.
    """
    form_data = {}

    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)

            # Encrypted documents are not decrypted here; bail out early.
            if pdf_reader.is_encrypted:
                print("PDF is encrypted. Cannot extract form fields.")
                return None

            catalog = pdf_reader.trailer['/Root']
            acro_form = catalog['/AcroForm'] if '/AcroForm' in catalog else {}
            fields = acro_form['/Fields'] if '/Fields' in acro_form else []

            # Covers a missing /AcroForm as well as an AcroForm whose /Fields
            # entry is absent or empty, so the caller always gets feedback.
            if not fields:
                print("No form fields found in this PDF.")

            for field in fields:
                field_obj = field.get_object()
                if '/T' not in field_obj:  # no field name -> nothing to key on
                    continue

                field_name = field_obj['/T']
                field_value = field_obj.get('/V', 'No value')
                # A dict-style .get() may hand back an unresolved indirect
                # reference rather than the value itself; dereference it.
                if hasattr(field_value, 'get_object'):
                    field_value = field_value.get_object()

                form_data[field_name] = field_value
                print(f"Field: {field_name} = {field_value}")

    except Exception as e:
        # Best-effort helper: report the problem and return what was collected.
        print(f"Error extracting form fields: {e}")

    return form_data

# Run Method 1 against the configured PDF and show the collected mapping
form_data = extract_form_fields_pypdf2(path)
print("\nForm field data extracted:", form_data, sep="\n")

Field: Dropdown1 = Nominal High
Field: Text2 = Text!
Field: Signature3_es_:signer:signature = Frankie
Field: Group4 = /1
Field: Check Box5 = /Yes
Field: Check Box6 = /Yes
Field: Check Box7 = /Off
Field: Check Box8 = /Yes
Field: Check Box9 = No value
Field: Date10_es_:signer:date = date?

Form field data extracted:
{'Dropdown1': 'Nominal High', 'Text2': 'Text!', 'Signature3_es_:signer:signature': 'Frankie', 'Group4': '/1', 'Check Box5': '/Yes', 'Check Box6': '/Yes', 'Check Box7': '/Off', 'Check Box8': '/Yes', 'Check Box9': 'No value', 'Date10_es_:signer:date': 'date?'}


In [3]:
# Method 2: Using pdfplumber (alternative approach)
def extract_form_fields_pdfplumber(pdf_path):
    """Collect annotation contents from each page using pdfplumber.

    For every page this prints the page number and, when the page has
    extractable text, a preview of it (truncated to 500 characters followed
    by ``...``). It then records the ``'contents'`` value of each page
    annotation that actually has one.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        dict keyed ``'annotation_0'``, ``'annotation_1'``, ... in the order the
        contents were found across pages. Empty when nothing was found or when
        reading fails (the error is printed, not raised).
    """
    form_data = {}

    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page_num, page in enumerate(pdf.pages, start=1):
                print(f"\nPage {page_num}:")

                # Show a preview of whatever text the page exposes.
                text = page.extract_text()
                if text:
                    print("Extracted text:")
                    print(text[:500] + "..." if len(text) > 500 else text)

                # NOTE(review): pdfplumber appears to include a 'contents' key
                # on every annotation, set to None when the annotation carries
                # no contents — so test the value, not just key membership,
                # to avoid storing meaningless None entries.
                for annot in getattr(page, 'annots', None) or []:
                    contents = annot.get('contents')
                    if contents is not None:
                        form_data[f'annotation_{len(form_data)}'] = contents

    except Exception as e:
        # Best-effort helper: report the problem and return what was collected.
        print(f"Error with pdfplumber: {e}")

    return form_data

# Announce Method 2 with a banner, then run it on the same PDF
print("\n" + "="*50, "Method 2: Using pdfplumber", "="*50, sep="\n")
pdfplumber_data = extract_form_fields_pdfplumber(path)


Method 2: Using pdfplumber

Page 1:


In [4]:
# Method 3: Comprehensive form field extraction
def extract_all_form_data(pdf_path):
    """Extract top-level AcroForm fields and group them by field kind.

    Reads ``/AcroForm`` -> ``/Fields`` from the document catalog with PyPDF2
    and, for each field listed there, prints its name (``/T``), type (``/FT``)
    and value (``/V``) and stores the value under one of five categories.
    Child fields under ``/Kids`` are not descended into.

    Categorisation:
        * ``/Tx``  -> ``'text_fields'``
        * ``/Ch``  -> ``'dropdowns'``
        * ``/Btn`` -> ``'radio_buttons'``, ``'checkboxes'`` or (push buttons)
          ``'other_fields'``, decided by the ``/Ff`` field flags; when ``/Ff``
          is absent the presence of ``/Opt`` is used as a radio-button hint.
        * anything else -> ``'other_fields'``

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        dict with the keys ``'text_fields'``, ``'checkboxes'``,
        ``'radio_buttons'``, ``'dropdowns'`` and ``'other_fields'``, each
        mapping field name to value (``None`` when the field has no ``/V``).
        All categories are empty when the PDF is encrypted, has no AcroForm,
        or cannot be read (problems are printed, not raised).
    """

    print("="*60)
    print("COMPREHENSIVE PDF FORM FIELD EXTRACTION")
    print("="*60)

    form_data = {
        'text_fields': {},
        'checkboxes': {},
        'radio_buttons': {},
        'dropdowns': {},
        'other_fields': {}
    }

    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)

            print(f"PDF has {len(pdf_reader.pages)} pages")

            # Encrypted documents are not decrypted here; bail out early.
            if pdf_reader.is_encrypted:
                print("❌ PDF is encrypted")
                return form_data

            catalog = pdf_reader.trailer['/Root']
            if '/AcroForm' not in catalog:
                print("❌ No AcroForm found in PDF")
                return form_data

            acro_form = catalog['/AcroForm']
            print("✅ AcroForm found")

            if '/Fields' not in acro_form:
                print("❌ AcroForm has no /Fields entry")
                return form_data

            fields = acro_form['/Fields']
            print(f"📋 Found {len(fields)} form fields")

            for i, field in enumerate(fields):
                # One malformed field must not abort the whole extraction.
                try:
                    field_obj = field.get_object()

                    # A dict-style .get() may return an unresolved indirect
                    # reference, so dereference every looked-up entry.
                    field_name = _resolve_pdf_object(field_obj.get('/T', f'Field_{i}'))
                    field_value = _resolve_pdf_object(field_obj.get('/V', None))
                    field_type = _resolve_pdf_object(field_obj.get('/FT', 'Unknown'))

                    print(f"\n📝 Field {i+1}:")
                    print(f"   Name: {field_name}")
                    print(f"   Type: {field_type}")
                    print(f"   Value: {field_value}")

                    category = _form_field_category(field_obj, field_type)
                    form_data[category][field_name] = field_value

                except Exception as e:
                    print(f"Error processing field {i}: {e}")

    except Exception as e:
        # Best-effort helper: report the problem and return what was collected.
        print(f"❌ Error reading PDF: {e}")

    return form_data


# Bit masks for the /Ff entry of button fields (PDF 32000-1:2008, Table 226).
_BTN_FLAG_RADIO = 1 << 15       # bit position 16: field is a set of radio buttons
_BTN_FLAG_PUSHBUTTON = 1 << 16  # bit position 17: push button, retains no value


def _resolve_pdf_object(value):
    """Return the object behind an indirect reference; pass other values through."""
    return value.get_object() if hasattr(value, 'get_object') else value


def _form_field_category(field_obj, field_type):
    """Pick the form_data category for a field from its /FT type and /Ff flags."""
    if field_type == '/Tx':
        return 'text_fields'
    if field_type == '/Ch':
        return 'dropdowns'
    if field_type != '/Btn':
        return 'other_fields'

    if '/Ff' not in field_obj:
        # No flags to consult: fall back to the /Opt heuristic.
        return 'radio_buttons' if '/Opt' in field_obj else 'checkboxes'

    flags = int(_resolve_pdf_object(field_obj['/Ff']))
    if flags & _BTN_FLAG_PUSHBUTTON:
        return 'other_fields'
    if flags & _BTN_FLAG_RADIO:
        return 'radio_buttons'
    return 'checkboxes'

# Run the categorised extraction (Method 3) on the configured PDF
all_form_data = extract_all_form_data(path)

# Print a banner followed by one section per category
print("\n" + "="*60, "EXTRACTION RESULTS", "="*60, sep="\n")

for category, fields in all_form_data.items():
    # Empty categories get a single "None found" line and nothing else
    if not fields:
        print(f"\n{category.upper().replace('_', ' ')}: None found")
        continue

    print(f"\n{category.upper().replace('_', ' ')}:")
    for name, value in fields.items():
        print(f"  • {name}: {value}")

COMPREHENSIVE PDF FORM FIELD EXTRACTION
PDF has 1 pages
✅ AcroForm found
📋 Found 10 form fields

📝 Field 1:
   Name: Dropdown1
   Type: /Ch
   Value: Nominal High

📝 Field 2:
   Name: Text2
   Type: /Tx
   Value: Text!

📝 Field 3:
   Name: Signature3_es_:signer:signature
   Type: /Tx
   Value: Frankie

📝 Field 4:
   Name: Group4
   Type: /Btn
   Value: /1

📝 Field 5:
   Name: Check Box5
   Type: /Btn
   Value: /Yes

📝 Field 6:
   Name: Check Box6
   Type: /Btn
   Value: /Yes

📝 Field 7:
   Name: Check Box7
   Type: /Btn
   Value: /Off

📝 Field 8:
   Name: Check Box8
   Type: /Btn
   Value: /Yes

📝 Field 9:
   Name: Check Box9
   Type: /Btn
   Value: None

📝 Field 10:
   Name: Date10_es_:signer:date
   Type: /Tx
   Value: date?

EXTRACTION RESULTS

TEXT FIELDS:
  • Text2: Text!
  • Signature3_es_:signer:signature: Frankie
  • Date10_es_:signer:date: date?

CHECKBOXES:
  • Check Box5: /Yes
  • Check Box6: /Yes
  • Check Box7: /Off
  • Check Box8: /Yes
  • Check Box9: None

RADIO BUTTONS:

In [5]:
# Convert extracted data to DataFrame for easier analysis
def form_data_to_dataframe(form_data):
    """Flatten categorised form data into a three-column pandas DataFrame.

    Args:
        form_data: Mapping of category name -> {field name: field value}.

    Returns:
        DataFrame with one row per field and the columns ``Field_Name``,
        ``Field_Type`` (the category name) and ``Field_Value``. When there are
        no fields at all, an empty DataFrame with those columns is returned.
    """
    # One record per field, in category order then field order
    rows = [
        {
            'Field_Name': field_name,
            'Field_Type': category,
            'Field_Value': field_value
        }
        for category, fields in form_data.items()
        for field_name, field_value in fields.items()
    ]

    if not rows:
        return pd.DataFrame(columns=['Field_Name', 'Field_Type', 'Field_Value'])

    return pd.DataFrame(rows)

# Flatten the categorised results from Method 3 into a table
df = form_data_to_dataframe(all_form_data)

print("\n" + "="*60, "DATA AS DATAFRAME", "="*60, sep="\n")

if df.empty:
    print("No form field data found to display.")
else:
    # Plain-text table without the index column
    print(df.to_string(index=False))

    # Summary plus copy-paste hints for saving the table (nothing is written here)
    print(f"\n📊 Found {len(df)} form fields")
    print("\n💾 Export options:")
    print("   df.to_csv('extracted_form_data.csv', index=False)")
    print("   df.to_excel('extracted_form_data.xlsx', index=False)")
    print("   df.to_json('extracted_form_data.json', orient='records')")


DATA AS DATAFRAME
                     Field_Name    Field_Type  Field_Value
                          Text2   text_fields        Text!
Signature3_es_:signer:signature   text_fields      Frankie
         Date10_es_:signer:date   text_fields        date?
                     Check Box5    checkboxes         /Yes
                     Check Box6    checkboxes         /Yes
                     Check Box7    checkboxes         /Off
                     Check Box8    checkboxes         /Yes
                     Check Box9    checkboxes         None
                         Group4 radio_buttons           /1
                      Dropdown1     dropdowns Nominal High

📊 Found 10 form fields

💾 Export options:
   df.to_csv('extracted_form_data.csv', index=False)
   df.to_excel('extracted_form_data.xlsx', index=False)
   df.to_json('extracted_form_data.json', orient='records')


# 📚 PDF Form Field Extraction Guide

## ✅ What We've Implemented

1. **PyPDF2 Method**: Extracts form field data directly from PDF AcroForms
2. **pdfplumber Method**: Alternative approach that extracts each page's text and the contents of its annotations
3. **Comprehensive Extraction**: Categorizes different field types (text, checkboxes, dropdowns, etc.)
4. **DataFrame Export**: Converts extracted data to pandas DataFrame for analysis

## 🔧 Alternative Tools and Libraries

If the above methods don't work with your specific PDF, try these alternatives:

### Python Libraries:
- **`pdfplumber`** (already used in Method 2 above): Better for text extraction and some form fields
- **`pymupdf` (fitz)**: More advanced PDF processing
- **`pdfminer3k`**: Low-level PDF parsing
- **`fillpdf`**: Specifically for PDF forms

### Installation commands:
```python
# Install alternative libraries
import subprocess
import sys

def install_package(package):
    """Install `package` with pip, run through the current Python interpreter.

    Raises whatever `subprocess.check_call` raises when the pip command fails.
    """
    pip_command = [sys.executable, "-m", "pip", "install", package]
    subprocess.check_call(pip_command)

# Uncomment to install additional packages
# install_package("pymupdf")
# install_package("pdfminer3k")
# install_package("fillpdf")
```

## 🐛 Troubleshooting

**If no form fields are found:**
1. The PDF might not have interactive form fields
2. The PDF might be a scanned document (requires OCR)
3. The form fields might be image-based rather than text-based

**For scanned PDFs or image-based forms:**
- Use OCR libraries like `pytesseract` + `opencv-python`
- Consider cloud-based solutions like Google Cloud Vision API

## 📝 Next Steps

1. Run the cells above to extract data from your PDF
2. If successful, export the data using the DataFrame methods
3. If unsuccessful, try the alternative libraries mentioned above