In [1]:
!pip install pdfplumber

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com



[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pdfplumber
import re
from dataclasses import dataclass
from typing import Optional, List, Dict, Any
from datetime import datetime

@dataclass
class PropertyDetails:
    """Data class for stolen/involved property information"""
    sl_no: int
    property_type: Optional[str] = None
    item_description: Dict[str, Any] = None  # For structured item details
    estimated_value: Optional[float] = None

@dataclass
class VictimDetails:
    """Data class for victim information"""
    sl_no: int
    name: Optional[str] = None
    address: Optional[str] = None
    injury_type: Optional[str] = None
    sex: Optional[str] = None
    age: Optional[int] = None
    occupation: Optional[str] = None

@dataclass
class FIRData:
    """Data class to store extracted FIR information"""
    district: Optional[str] = None
    circle_subdivision: Optional[str] = None
    police_station: Optional[str] = None
    crime_no: Optional[str] = None
    fir_date: Optional[datetime] = None
    act_section: Optional[str] = None
    offense_date: Optional[datetime] = None
    offense_time_from: Optional[str] = None
    offense_time_to: Optional[str] = None
    location: Optional[str] = None
    distance_ps: Optional[str] = None
    complainant_name: Optional[str] = None
    complainant_age: Optional[int] = None
    complainant_religion: Optional[str] = None
    complainant_caste: Optional[str] = None
    complainant_occupation: Optional[str] = None
    phone_number: Optional[str] = None
    nationality: Optional[str] = None
    sex: Optional[str] = None
    complainant_address: Optional[str] = None
    victim_details: List[VictimDetails] = None
    property_details: List[PropertyDetails] = None
    total_property_value: Optional[float] = None

    def __post_init__(self):
        if self.victim_details is None:
            self.victim_details = []
        if self.property_details is None:
            self.property_details = []

    def print_details(self):
        """Print all FIR details in a formatted manner"""
        print("\n=== FIR Details ===")
        print(f"District: {self.district}")
        print(f"Circle/Sub-Division: {self.circle_subdivision}")
        print(f"Police Station: {self.police_station}")
        print(f"Crime Number: {self.crime_no}")
        print(f"FIR Date: {self.fir_date.strftime('%d/%m/%Y') if self.fir_date else None}")
        print(f"Act & Section: {self.act_section}")
        
        print("\nOffense Details:")
        print(f"Date: {self.offense_date.strftime('%d/%m/%Y') if self.offense_date else None}")
        print(f"Time: {self.offense_time_from} to {self.offense_time_to}")
        print(f"Location: {self.location}")
        print(f"Distance from Police Station: {self.distance_ps}")

        print("\nComplainant Details:")
        print(f"Name: {self.complainant_name}")
        print(f"Age: {self.complainant_age}")
        print(f"Religion: {self.complainant_religion}")
        print(f"Caste: {self.complainant_caste}")
        print(f"Occupation: {self.complainant_occupation}")
        print(f"Phone: {self.phone_number}")
        print(f"Nationality: {self.nationality}")
        print(f"Sex: {self.sex}")
        print(f"Address: {self.complainant_address}")
        
        print("\nProperty Details:")
        for prop in self.property_details:
            print(f"- Type: {prop.property_type}")
            if prop.item_description:
                print("  Item Description:")
                for key, value in prop.item_description.items():
                    print(f"    {key}: {value}")
            print(f"  Estimated Value: {f'Rs. {prop.estimated_value:,.2f}' if prop.estimated_value else None}")
        print(f"Total Property Value: {f'Rs. {self.total_property_value:,.2f}' if self.total_property_value else None}")
        print("==================\n")

def safe_search(pattern: str, text: str, group_num: int = 1) -> Optional[str]:
    """Helper function to safely perform regex search and return the matched group or None."""
    try:
        match = re.search(pattern, text)
        return match.group(group_num).strip() if match else None
    except (AttributeError, IndexError):
        return None

def safe_parse_date(date_str: Optional[str], format_str: str = "%d/%m/%Y") -> Optional[datetime]:
    """Safely parse date string to datetime object."""
    if not date_str:
        return None
    try:
        return datetime.strptime(date_str, format_str)
    except ValueError:
        return None

def safe_parse_int(value: Optional[str]) -> Optional[int]:
    """Safely parse string to integer."""
    if not value:
        return None
    try:
        return int(re.sub(r'[^\d]', '', value))
    except ValueError:
        return None

def safe_parse_float(amount_str: Optional[str]) -> Optional[float]:
    """Safely parse string to float, handling commas in numbers."""
    if not amount_str:
        return None
    try:
        return float(amount_str.replace(',', ''))
    except ValueError:
        return None

def extract_fir_data(pdf_path: str) -> FIRData:
    """Extract relevant information from FIR PDF document with error handling"""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            text = " ".join([page.extract_text() for page in pdf.pages])
            
            # Basic FIR details
            district = safe_search(r"District\s*:\s*([^:\n]+?)(?=\s+Circle/Sub Division|$)", text)
            circle = safe_search(r"Circle/Sub Division\s*:\s*([^:\n]+?)(?=\s+PS|$)", text)
            ps = safe_search(r"PS\s*:\s*([^:\n]+?)(?=\s*$|\s*\n)", text)
            crime_no = safe_search(r"Crime No\s*:\s*([^:\n]+?)(?=\s+FIR Date|$)", text)
            
            # Dates and times
            fir_date_str = safe_search(r"FIR Date\s*:\s*(\d{2}/\d{1,2}/\d{4})", text)
            fir_date = safe_parse_date(fir_date_str)
            
            offense_date_str = safe_search(r"From Date\s*:\s*(\d{2}/\d{2}/\d{4})", text)
            offense_date = safe_parse_date(offense_date_str)
            
            time_from = safe_search(r"From Time\s*:\s*(\d{2}:\d{2}:\d{2})", text)
            time_to = safe_search(r"To Time\s*:\s*(\d{2}:\d{2}:\d{2})", text)
            
            # Location and distance
            location = re.search(r"Place of occurence with full address\s*(.*?)(?=\(b\))", text, re.DOTALL).group(1).strip()
            distance_match = re.search(r"(?:(Tow(?:ards|ords)\s+\w+)\s*([\d\.]+)\s*(K\s*M|KM|Km|KMs|Meters|M)?)|(?:([\d\.]+)\s*(K\s*M|KM|Km|KMs|Meters|M)?\s*(Tow(?:ards|ords)\s+\w+))", text, re.IGNORECASE)
            
            distance_ps = None
            if distance_match:
                direction = (distance_match.group(1) or distance_match.group(6) or "").strip()
                distance_value = (distance_match.group(2) or distance_match.group(4) or "").strip()
                distance_unit = (distance_match.group(3) or distance_match.group(5) or "").strip()
                if direction and distance_value:
                    distance_ps = f"{direction}, {distance_value} {distance_unit}".strip()
            
            # Complainant details
            complainant_name = safe_search(r"Name\s*:\s*([^:\n]+?)(?=\s+Father)", text)
            complainant_age_str = safe_search(r"Age\s*:\s*(\d+)", text)
            complainant_age = safe_parse_int(complainant_age_str)
            complainant_religion = safe_search(r"Religion\s*:\s*([^:\n]+?)(?=\s*(?:\([e-z]\)|$))", text)
            complainant_caste = safe_search(r"Caste\s*:\s*([^:\n]+?)(?=\s*(?:\([f-z]\)|$))", text)
            complainant_occupation = safe_search(r"Occupation\s*:\s*([^\n]*)", text)
            phone_number = safe_search(r"Phone No\.\s*:\s*(\d+)", text)
            nationality = safe_search(r"Nationality\s*:\s*([^:\n]+?)(?=\s|$)", text)
            sex = safe_search(r"Sex:\s*([^:\n]+?)(?=\s|$)", text)
            
            # Address extraction
            address_match = safe_search(r"\(k\)\s*Address\s*:\s*([\s\S]*?)(?=\([m-z]\)|Whether complainant|$)", text)
            complainant_address = None
            if address_match:
                address_text = address_match
                sex_pattern = r"\(l\)\s*Sex:\s*[A-Za-z]+\s*"
                address_text = re.sub(sex_pattern, "", address_text)
                complainant_address = " ".join(line.strip() for line in address_text.split('\n') if line.strip())
            
            # Property details
            property_details = []
            if "Automobile" in text:
                item_desc = {
                    "reg_no": safe_search(r"Reg No:?\s*([A-Za-z0-9]+)(?=\s*(?:\d+|Make|$))", text),
                    "make": safe_search(r"Make:?\s*([^\n]+?)(?=\s*Model|$)", text),
                    "model": safe_search(r"Model:?\s*(\d+)", text),
                    "engine_no": safe_search(r"Engine No:?\s*([^\n]+?)(?=\s*Chassis|$)", text),
                    "chassis_no": safe_search(r"Chassis No:?\s*([^\n]+?)(?=\n|$)", text)
                }
                
                value_str = safe_search(r"Estimated Value[^:]*:?\s*(?:Rs\.?)?\s*(\d+,?\d*)", text)
                estimated_value = safe_parse_float(value_str)
                
                property_details.append(PropertyDetails(1, "Automobile", item_desc, estimated_value))
            
            # Total value
            total_value_str = safe_search(r"Total Value[^:]*:?\s*(?:Rs\.?)?\s*(\d+,?\d*)", text)
            total_property_value = safe_parse_float(total_value_str)
            
            act_section = safe_search(r"Act & Section\s*:\s*([^:\n]+)", text)
            
            return FIRData(
                district=district,
                circle_subdivision=circle,
                police_station=ps,
                crime_no=crime_no,
                fir_date=fir_date,
                act_section=act_section,
                offense_date=offense_date,
                offense_time_from=time_from,
                offense_time_to=time_to,
                location=location,
                distance_ps=distance_ps,
                complainant_name=complainant_name,
                complainant_age=complainant_age,
                complainant_religion=complainant_religion,
                complainant_caste=complainant_caste,
                complainant_occupation=complainant_occupation,
                phone_number=phone_number,
                nationality=nationality,
                sex=sex,
                complainant_address=complainant_address,
                victim_details=[],
                property_details=property_details,
                total_property_value=total_property_value
            )
            
    except Exception as e:
        print(f"Error processing PDF: {str(e)}")
        return FIRData()  # Return empty FIRData object if processing fails

if __name__ == "__main__":
    # Process single FIR
    fir = extract_fir_data("E:/sample FIRs/output/output_1761_fir_0340.pdf")
    fir.print_details()


=== FIR Details ===
District: Bengaluru City
Circle/Sub-Division: Central Traffic Sub Division
Police Station: Ashoknagar Traffic PS
Crime Number: 0340/2023
FIR Date: 29/10/2023
Act & Section: IPC 1860 (U/s-279); INDIAN MOTOR VEHICLES ACT, 1988

Offense Details:
Date: 28/10/2023
Time: 20:00:00 to 20:05:00
Location: NEAR APERA JUNCTION, Residency Road, Ashoknagar,Bengaluru City ,
Karnataka, 560025
Distance from Police Station: towards west, 1

Complainant Details:
Name: Arathi
Age: 51
Religion: Hindu
Caste: CHUNAR
Occupation: Housewife
Phone: 9980589350
Nationality: India
Sex: Female
Address: A-404, Royal Residency, Bruntow Road,Opp M.G Road , Bengaluru City , Karnataka-56002 5

Property Details:
- Type: Automobile
  Item Description:
    reg_no: KA05AF1820
    make: AUTO RICKSHAW
    model: None
    engine_no: -
    chassis_no: - 2 Automobile Reg No: KA04MZ9653 200000
  Estimated Value: Rs. 1.00
Total Property Value: Rs. 250,000.00



In [3]:
import pdfplumber
import re
from dataclasses import dataclass
from typing import Optional, List, Dict, Any
from datetime import datetime
import pandas as pd
import os
from pathlib import Path


def create_flat_dict(fir: FIRData) -> Dict[str, Any]:
    """Convert FIR data to a flat dictionary suitable for CSV export"""
    flat_dict = {
        'district': fir.district,
        'circle_subdivision': fir.circle_subdivision,
        'police_station': fir.police_station,
        'crime_no': fir.crime_no,
        'fir_date': fir.fir_date.strftime('%d/%m/%Y') if fir.fir_date else None,
        'act_section': fir.act_section,
        'offense_date': fir.offense_date.strftime('%d/%m/%Y') if fir.offense_date else None,
        'offense_time_from': fir.offense_time_from,
        'offense_time_to': fir.offense_time_to,
        'location': fir.location,
        'distance_ps': fir.distance_ps,
        'complainant_name': fir.complainant_name,
        'complainant_age': fir.complainant_age,
        'complainant_religion': fir.complainant_religion,
        'complainant_caste': fir.complainant_caste,
        'complainant_occupation': fir.complainant_occupation,
        'phone_number': fir.phone_number,
        'nationality': fir.nationality,
        'sex': fir.sex,
        'complainant_address': fir.complainant_address,
        'total_property_value': fir.total_property_value
    }

    # Add property details
    if fir.property_details:
        for i, prop in enumerate(fir.property_details, 1):
            prefix = f'property_{i}_'
            flat_dict[f'{prefix}type'] = prop.property_type
            flat_dict[f'{prefix}value'] = prop.estimated_value
            if prop.item_description:
                for key, value in prop.item_description.items():
                    flat_dict[f'{prefix}{key}'] = value

    if fir.victim_details:
        for i, victim in enumerate(fir.victim_details, 1):
            prefix = f'victim_{i}_'
            flat_dict[f'{prefix}name'] = victim.name
            flat_dict[f'{prefix}address'] = victim.address
            flat_dict[f'{prefix}injury_type'] = victim.injury_type
            flat_dict[f'{prefix}sex'] = victim.sex
            flat_dict[f'{prefix}age'] = victim.age
            flat_dict[f'{prefix}occupation'] = victim.occupation

    return flat_dict

def process_fir_files(input_folder: str, output_file: str):
    """
    Process all FIR PDF files in a folder and create a CSV dataset
    
    Args:
        input_folder (str): Path to folder containing FIR PDFs
        output_file (str): Path where CSV file should be saved
    """
    pdf_files = list(Path(input_folder).glob("*.pdf"))
    
    if not pdf_files:
        print(f"No PDF files found in {input_folder}")
        return
    
    print(f"Found {len(pdf_files)} PDF files to process")
    

    all_data = []
    failed_files = []
    
    for i, pdf_file in enumerate(pdf_files, 1):
        print(f"Processing file {i}/{len(pdf_files)}: {pdf_file.name}")
        try:
            fir = extract_fir_data(str(pdf_file))
            flat_data = create_flat_dict(fir)
            flat_data['file_name'] = pdf_file.name 
            all_data.append(flat_data)
        except Exception as e:
            print(f"Failed to process {pdf_file.name}: {str(e)}")
            failed_files.append(pdf_file.name)
    
    if not all_data:
        print("No data was successfully extracted")
        return

    df = pd.DataFrame(all_data)
    
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    
 
    df.to_csv(output_file, index=False, encoding='utf-8')
    

    print("\nProcessing Summary:")
    print(f"Total files processed: {len(pdf_files)}")
    print(f"Successfully processed: {len(all_data)}")
    print(f"Failed to process: {len(failed_files)}")
    
    if failed_files:
        print("\nFailed files:")
        for file in failed_files:
            print(f"- {file}")
    

    print("\nDataset Statistics:")
    print(f"Total rows: {len(df)}")
    print(f"Total columns: {len(df.columns)}")
    
    key_columns = ['district', 'police_station', 'crime_no']
    print("\nUnique values in key columns:")
    for col in key_columns:
        if col in df.columns:
            unique_count = df[col].nunique()
            print(f"{col}: {unique_count} unique values")

if __name__ == "__main__":
    input_folder = "E:/sample FIRs/output"  
    output_file = "E:/sample FIRs/output/fir_dataset1.csv"  
    
    process_fir_files(input_folder, output_file)

Found 809 PDF files to process
Processing file 1/809: output_1367_fir_0153.pdf
Processing file 2/809: output_1761_fir_0022.pdf
Processing file 3/809: output_1761_fir_0232.pdf
Processing file 4/809: output_1761_fir_0334.pdf
Processing file 5/809: output_1761_fir_0335.pdf
Processing file 6/809: output_1761_fir_0340.pdf
Processing file 7/809: output_1761_fir_0351.pdf
Processing file 8/809: output_1761_fir_0354.pdf
Error processing PDF: No /Root object! - Is this really a PDF?
Processing file 9/809: output_1761_fir_0358.pdf
Processing file 10/809: output_1761_fir_0359.pdf
Processing file 11/809: output_1761_fir_0361.pdf
Processing file 12/809: output_1763_fir_0067.pdf
Error processing PDF: No /Root object! - Is this really a PDF?
Processing file 13/809: output_1763_fir_0073.pdf
Processing file 14/809: output_1763_fir_0178.pdf
Processing file 15/809: output_1763_fir_0329.pdf
Processing file 16/809: output_1763_fir_0331.pdf
Processing file 17/809: output_1763_fir_0341.pdf
Processing file 18/

In [26]:
import pdfplumber
import re
from dataclasses import dataclass
from typing import Optional, List, Dict, Any
from datetime import datetime

@dataclass
class PropertyDetails:
    """Data class for stolen/involved property information"""
    sl_no: int
    property_type: Optional[str] = None
    item_description: Dict[str, Any] = None  
    estimated_value: Optional[float] = None

@dataclass
class VictimDetails:
    """Data class for victim information"""
    sl_no: int
    name: Optional[str] = None
    address: Optional[str] = None
    injury_type: Optional[str] = None
    sex: Optional[str] = None
    age: Optional[int] = None
    occupation: Optional[str] = None

@dataclass
class AccusedDetails:
    """Data class for accused person information"""
    sl_no: int
    name: Optional[str] = None
    father_name: Optional[str] = None  # Details after the '/'
    person_type: Optional[str] = None
    sex: Optional[str] = None
    age: Optional[int] = None
    occupation: Optional[str] = None

@dataclass
class FIRData:
    """Data class to store extracted FIR information"""
    district: Optional[str] = None
    circle_subdivision: Optional[str] = None
    police_station: Optional[str] = None
    crime_no: Optional[str] = None
    fir_date: Optional[datetime] = None
    act_section: Optional[str] = None
    offense_date: Optional[datetime] = None
    offense_time_from: Optional[str] = None
    offense_time_to: Optional[str] = None
    location: Optional[str] = None
    distance_ps: Optional[str] = None
    complainant_name: Optional[str] = None
    complainant_age: Optional[int] = None
    complainant_religion: Optional[str] = None
    complainant_caste: Optional[str] = None
    complainant_occupation: Optional[str] = None
    phone_number: Optional[str] = None
    nationality: Optional[str] = None
    sex: Optional[str] = None
    complainant_address: Optional[str] = None
    victim_details: List[VictimDetails] = None
    property_details: List[PropertyDetails] = None
    total_property_value: Optional[float] = None
    accused_details: List[AccusedDetails] = None

    def __post_init__(self):
        if self.victim_details is None:
            self.victim_details = []
        if self.property_details is None:
            self.property_details = []
        if self.accused_details is None:
            self.accused_details = []

    def print_details(self):
        """Print all FIR details in a formatted manner"""
        print("\n=== FIR Details ===")
        print(f"District: {self.district}")
        print(f"Circle/Sub-Division: {self.circle_subdivision}")
        print(f"Police Station: {self.police_station}")
        print(f"Crime Number: {self.crime_no}")
        print(f"FIR Date: {self.fir_date.strftime('%d/%m/%Y') if self.fir_date else None}")
        print(f"Act & Section: {self.act_section}")
        
        print("\nOffense Details:")
        print(f"Date: {self.offense_date.strftime('%d/%m/%Y') if self.offense_date else None}")
        print(f"Time: {self.offense_time_from} to {self.offense_time_to}")
        print(f"Location: {self.location}")
        print(f"Distance from Police Station: {self.distance_ps}")

        print("\nComplainant Details:")
        print(f"Name: {self.complainant_name}")
        print(f"Age: {self.complainant_age}")
        print(f"Religion: {self.complainant_religion}")
        print(f"Caste: {self.complainant_caste}")
        print(f"Occupation: {self.complainant_occupation}")
        print(f"Phone: {self.phone_number}")
        print(f"Nationality: {self.nationality}")
        print(f"Sex: {self.sex}")
        print(f"Address: {self.complainant_address}")
        
        print("\nProperty Details:")
        for prop in self.property_details:
            print(f"- Type: {prop.property_type}")
            if prop.item_description:
                print("  Item Description:")
                for key, value in prop.item_description.items():
                    print(f"    {key}: {value}")
            print(f"  Estimated Value: {f'Rs. {prop.estimated_value:,.2f}' if prop.estimated_value else None}")
        print(f"Total Property Value: {f'Rs. {self.total_property_value:,.2f}' if self.total_property_value else None}")

        print("\nAccused Details:")
        for accused in self.accused_details:
            print(f"- Accused {accused.sl_no}:")
            print(f"  Name: {accused.name}")
            print(f"  Father/Details: {accused.father_name}")
            print(f"  Person Type: {accused.person_type}")
            print(f"  Sex: {accused.sex}")
            print(f"  Age: {accused.age}")
            print(f"  Occupation: {accused.occupation}")
        print("==================\n")

def safe_search(pattern: str, text: str, group_num: int = 1) -> Optional[str]:
    """Helper function to safely perform regex search and return the matched group or None."""
    try:
        match = re.search(pattern, text)
        return match.group(group_num).strip() if match else None
    except (AttributeError, IndexError):
        return None

def safe_parse_date(date_str: Optional[str], format_str: str = "%d/%m/%Y") -> Optional[datetime]:
    """Safely parse date string to datetime object."""
    if not date_str:
        return None
    try:
        return datetime.strptime(date_str, format_str)
    except ValueError:
        return None

def safe_parse_int(value: Optional[str]) -> Optional[int]:
    """Safely parse string to integer."""
    if not value:
        return None
    try:
        return int(re.sub(r'[^\d]', '', value))
    except ValueError:
        return None

def safe_parse_float(amount_str: Optional[str]) -> Optional[float]:
    """Safely parse string to float, handling commas in numbers."""
    if not amount_str:
        return None
    try:
        return float(amount_str.replace(',', ''))
    except ValueError:
        return None

def extract_property_details(text: str) -> List[PropertyDetails]:
    properties = []
    
    # Find all property entries using registration numbers as anchor points
    reg_numbers = re.finditer(r"Reg No:\s*(KA\d+[A-Z]+\d+)", text)
    reg_positions = [(m.group(1), m.start()) for m in reg_numbers]
    
    if not reg_positions:
        return properties
    
    for idx, (reg_no, start_pos) in enumerate(reg_positions):
        end_pos = reg_positions[idx + 1][1] if idx < len(reg_positions) - 1 else len(text)
        property_text = text[start_pos:end_pos]
        
        item_desc = {
            "reg_no": reg_no,
            "make": safe_search(r"Make:?\s*([^\n]+?)(?=\s*Model|$)", property_text),
            "model": safe_search(r"Model:?\s*([^\n]+?)(?=\s*Engine|$)", property_text),
            "engine_no": safe_search(r"Engine No:?\s*([^\n]+?)(?=\s*Chassis|$)", property_text),
            "chassis_no": safe_search(r"Chassis No:?\s*([^\n]+?)(?=\n|$)", property_text)
        }
        
        value_str = safe_search(r"Estimated Value\s*\([^)]*\)\s*\n*\s*([\d,]+)", text)
        estimated_value = safe_parse_float(value_str) or 0
        
        properties.append(PropertyDetails(
            sl_no=idx + 1,
            property_type="Automobile",
            item_description=item_desc,
            estimated_value=estimated_value
        ))
    
    return properties

def extract_accused_details(text: str) -> List[AccusedDetails]:
    """Extract accused details from the FIR text"""
    accused_list = []
    
    # First find all accused entries
    accused_matches = list(re.finditer(r'(\w+)\s*\(A(\d+)\)\s*/\s*([^\n]+)', text))
    
    for idx, match in enumerate(accused_matches):
        name = match.group(1).strip()
        sl_no = int(match.group(2))
        first_line = match.group(3).strip()
        
        next_pos = accused_matches[idx + 1].start() if idx + 1 < len(accused_matches) else len(text)
        full_text = text[match.start():next_pos].strip()
        
        lines = full_text.split('\n')
        
        main_details = first_line
        father_details = main_details.split("Accused")[0].strip()
        
        person_type = safe_search(r"Accused\s+(\w+)", main_details)
        sex = safe_search(r"(?:Adult|Child|Common|Common man)\s+(\w+)", main_details)
        age_str = safe_search(r"(?:Male|Female)\s+(\d+)", main_details)
        age = safe_parse_int(age_str)
        occupation = safe_search(r"(?:Male|Female)\s+\d+\s+(.+?)$", main_details)
        
        additional_lines = []
        for line in lines[1:]: 
            line = line.strip()
            if line and not re.search(r'\(A\d+\)', line):  
                additional_lines.append(line)
     
        full_details = father_details
        if additional_lines:
            full_details += ", " + " ".join(additional_lines)

        if "7. Details of Victims" in full_details:
            full_details = full_details.split("7. Details of Victims")[0].strip()
        
        accused_list.append(AccusedDetails(
            sl_no=sl_no,
            name=name,
            father_name=full_details,
            person_type=person_type,
            sex=sex,
            age=age,
            occupation=occupation
        ))
    
    return accused_list
        

def extract_fir_data(pdf_path: str) -> FIRData:
    """Extract relevant information from FIR PDF document with error handling"""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            text = " ".join([page.extract_text() for page in pdf.pages])
            
            # Basic FIR details
            district = safe_search(r"District\s*:\s*([^:\n]+?)(?=\s+Circle/Sub Division|$)", text)
            circle = safe_search(r"Circle/Sub Division\s*:\s*([^:\n]+?)(?=\s+PS|$)", text)
            ps = safe_search(r"PS\s*:\s*([^:\n]+?)(?=\s*$|\s*\n)", text)
            crime_no = safe_search(r"Crime No\s*:\s*([^:\n]+?)(?=\s+FIR Date|$)", text)
            
            # Dates and times
            fir_date_str = safe_search(r"FIR Date\s*:\s*(\d{2}/\d{1,2}/\d{4})", text)
            fir_date = safe_parse_date(fir_date_str)
            
            offense_date_str = safe_search(r"From Date\s*:\s*(\d{2}/\d{2}/\d{4})", text)
            offense_date = safe_parse_date(offense_date_str)
            
            time_from = safe_search(r"From Time\s*:\s*(\d{2}:\d{2}:\d{2})", text)
            time_to = safe_search(r"To Time\s*:\s*(\d{2}:\d{2}:\d{2})", text)
            
            # Location and distance
            location = re.search(r"Place of occurence with full address\s*(.*?)(?=\(b\))", text, re.DOTALL).group(1).strip()
            distance_match = re.search(r"(?:(Tow(?:ards|ords)\s+\w+)\s*([\d\.]+)\s*(K\s*M|KM|Km|KMs|Meters|M)?)|(?:([\d\.]+)\s*(K\s*M|KM|Km|KMs|Meters|M)?\s*(Tow(?:ards|ords)\s+\w+))", text, re.IGNORECASE)
            
            distance_ps = None
            if distance_match:
                direction = (distance_match.group(1) or distance_match.group(6) or "").strip()
                distance_value = (distance_match.group(2) or distance_match.group(4) or "").strip()
                distance_unit = (distance_match.group(3) or distance_match.group(5) or "").strip()
                if direction and distance_value:
                    distance_ps = f"{direction}, {distance_value} {distance_unit}".strip()
            
            # Complainant details
            complainant_name = safe_search(r"Name\s*:\s*([^:\n]+?)(?=\s+Father)", text)
            complainant_age_str = safe_search(r"Age\s*:\s*(\d+)", text)
            complainant_age = safe_parse_int(complainant_age_str)
            complainant_religion = safe_search(r"Religion\s*:\s*([^:\n]+?)(?=\s*(?:\([e-z]\)|$))", text)
            complainant_caste = safe_search(r"Caste\s*:\s*([^:\n]+?)(?=\s*(?:\([f-z]\)|$))", text)
            complainant_occupation = safe_search(r"Occupation\s*:\s*([^\n]*)", text)
            phone_number = safe_search(r"Phone No\.\s*:\s*(\d+)", text)
            nationality = safe_search(r"Nationality\s*:\s*([^:\n]+?)(?=\s|$)", text)
            sex = safe_search(r"Sex:\s*([^:\n]+?)(?=\s|$)", text)
            
            # Address extraction
            address_match = safe_search(r"\(k\)\s*Address\s*:\s*([\s\S]*?)(?=\([m-z]\)|Whether complainant|$)", text)
            complainant_address = None
            if address_match:
                address_text = address_match
                sex_pattern = r"\(l\)\s*Sex:\s*[A-Za-z]+\s*"
                address_text = re.sub(sex_pattern, "", address_text)
                complainant_address = " ".join(line.strip() for line in address_text.split('\n') if line.strip())
            
            # Property details
            property_details = extract_property_details(text)
            
            # Total value
            total_value_str = safe_search(r"Total Value[^:]*:?\s*(?:Rs\.?)?\s*(\d+,?\d*)", text)
            total_property_value = safe_parse_float(total_value_str)
            
            act_section = safe_search(r"Act & Section\s*:\s*([^:\n]+)", text)

            accused_details = extract_accused_details(text)
            
            return FIRData(
                district=district,
                circle_subdivision=circle,
                police_station=ps,
                crime_no=crime_no,
                fir_date=fir_date,
                act_section=act_section,
                offense_date=offense_date,
                offense_time_from=time_from,
                offense_time_to=time_to,
                location=location,
                distance_ps=distance_ps,
                complainant_name=complainant_name,
                complainant_age=complainant_age,
                complainant_religion=complainant_religion,
                complainant_caste=complainant_caste,
                complainant_occupation=complainant_occupation,
                phone_number=phone_number,
                nationality=nationality,
                sex=sex,
                complainant_address=complainant_address,
                victim_details=[],
                property_details=property_details,
                total_property_value=total_property_value,
                accused_details=accused_details
            )
            
    except Exception as e:
        print(f"Error processing PDF: {str(e)}")
        return FIRData()  # Return empty FIRData object if processing fails

if __name__ == "__main__":
    # Process single FIR
    fir = extract_fir_data("G:/iisc/combined/fir_0009.pdf")
    fir.print_details()


=== FIR Details ===
District: Bengaluru City
Circle/Sub-Division: South East Traffic Sub Divisio
Police Station: Adugodi Traffic PS
Crime Number: 0004/2024
FIR Date: 09/02/2024
Act & Section: IPC 1860 (U/s-283)

Offense Details:
Date: 09/02/2024
Time: 15:55:00 to 16:05:00
Location: 60 feet Road,, Koramangala,Bengaluru City , Karnataka, 560095
Distance from Police Station: towards east, 1.5 KM

Complainant Details:
Name: Annappa Kattimani
Age: 34
Religion: Hindu
Caste: BUDAGA JANGAMA
Occupation: Police officer
Phone: None
Nationality: India
Sex: Male
Address: CPC12207,Adug odi Traffic Police station,Adugodi , Bengaluru City , Karnataka-56003 0

Property Details:
- Type: Automobile
  Item Description:
    reg_no: KA05JM5723
    make: SCOOTER
    model: -
    engine_no: -
    chassis_no: -
  Estimated Value: Rs. 1.00
Total Property Value: None

Accused Details:
- Accused 1:
  Name: vikas
  Father/Details: Janakaraj, ,#525, 3rd main, nanjappa reddy layout,koramnagalaBengaluru City, Karnat