In [45]:
import pdfplumber
import re
from dataclasses import dataclass
from typing import Optional, List
from datetime import datetime

@dataclass
class PropertyDetails:
    """Data class for stolen/involved property information"""
    sl_no: int
    property_type: str
    item_description: dict  # For structured item details like reg_no, make, model, etc.
    estimated_value: float

@dataclass
class AccusedDetails:
    """Data class for accused person's information"""
    sl_no: int
    type: str
    person_type: str
    sex: str
    age: Optional[int]
    occupation: str

@dataclass
class VictimDetails:
    """Data class for victim information"""
    sl_no: int
    name: str
    address: str
    injury_type: str
    sex: str
    age: Optional[int]
    occupation: str

@dataclass
class FIRData:
    """Data class to store extracted FIR information"""
    district: str
    circle_subdivision: str
    police_station: str
    crime_no: str
    fir_date: datetime
    act_section: str
    offense_date: datetime
    offense_time_from: str
    offense_time_to: str
    location: str
    distance_ps: str
    complainant_name: str
    complainant_age: int
    complainant_religion : str
    complainant_caste : str
    complainant_occupation: str
    phone_number: str
    nationality: str
    sex: str
    complainant_address: str
    # accused_details: List[AccusedDetails]
    victim_details: List[VictimDetails]
    property_details: List[PropertyDetails]
    total_property_value: float

    def print_details(self):
        """Print all FIR details in a formatted manner"""
        print("\n=== FIR Details ===")
        print(f"District: {self.district}")
        print(f"Circle/Sub-Division: {self.circle_subdivision}")
        print(f"Police Station: {self.police_station}")
        print(f"Crime Number: {self.crime_no}")
        print(f"FIR Date: {self.fir_date.strftime('%d/%m/%Y')}")
        print(f"Act & Section: {self.act_section}")
        
        print("\nOffense Details:")
        print(f"Date: {self.offense_date.strftime('%d/%m/%Y')}")
        print(f"Time: {self.offense_time_from} to {self.offense_time_to}")
        print(f"Location: {self.location}")
        print(f"Distance from Police Station: {self.distance_ps}")

        print("\nComplainant Details:")
        print(f"Name: {self.complainant_name}")
        print(f"Age: {self.complainant_age}")
        print(f"Religion: {self.complainant_religion}")
        print(f"Caste: {self.complainant_caste}")
        print(f"Occupation: {self.complainant_occupation}")
        print(f"Phone: {self.phone_number}")
        print(f"Nationality: {self.nationality}")
        print(f"Sex: {self.sex}")
        print(f"Address: {self.complainant_address}")
        
        # print("\nAccused Details:")
        # for accused in self.accused_details:
        #     print(f"- Type: {accused.type}, Person Type: {accused.person_type}, "
        #           f"Sex: {accused.sex}, Age: {accused.age}, Occupation: {accused.occupation}")
        
        print("\nProperty Details:")
        for prop in self.property_details:
            print(f"- Type: {prop.property_type}")
            print("  Item Description:")
            for key, value in prop.item_description.items():
                print(f"    {key}: {value}")
            print(f"  Estimated Value: Rs. {prop.estimated_value:,.2f}")
        print(f"Total Property Value: Rs. {self.total_property_value:,.2f}")
        print("==================\n")
def safe_search(pattern, text):
    """Helper function to safely perform regex search and return the matched group or None."""
    match = re.search(pattern, text)
    if match:
        return match.group(1).strip()
    return None

def extract_fir_data(pdf_path: str) -> FIRData:
    """Extract relevant information from FIR PDF document"""
    with pdfplumber.open(pdf_path) as pdf:
        text = " ".join([page.extract_text() for page in pdf.pages])
        
        # Basic FIR details
        district = re.search(r"District\s*:\s*([^:\n]+?)(?=\s+Circle/Sub Division|$)", text).group(1).strip()
        circle = re.search(r"Circle/Sub Division\s*:\s*([^:\n]+?)(?=\s+PS|$)", text).group(1).strip()
        ps = re.search(r"PS\s*:\s*([^:\n]+?)(?=\s*$|\s*\n)", text).group(1).strip()
        crime_no = re.search(r"Crime No\s*:\s*([^:\n]+?)(?=\s+FIR Date|$)", text).group(1).strip()
        
        # Dates and times
        fir_date_str = re.search(r"FIR Date\s*:\s*(\d{2}/\d{1,2}/\d{4})", text).group(1)
        fir_date = datetime.strptime(fir_date_str, "%d/%m/%Y")
        offense_date_str = re.search(r"From Date\s*:\s*(\d{2}/\d{2}/\d{4})", text).group(1)
        offense_date = datetime.strptime(offense_date_str, "%d/%m/%Y")
        time_from = re.search(r"From Time\s*:\s*(\d{2}:\d{2}:\d{2})", text).group(1)
        time_to = re.search(r"To Time\s*:\s*(\d{2}:\d{2}:\d{2})", text).group(1)
        
        # Location and basic complainant details
        location = re.search(r"Place of occurence with full address\s*(.*?)(?=\(b\))", text, re.DOTALL).group(1).strip()
        distance_ps = re.search(r"(?:(Towards\s+\w+)\s*([\d\.]+)\s*(K\s*M|KM|Km|KMs|Meters|M)?)|(?:([\d\.]+)\s*(K\s*M|KM|Km|KMs|Meters|M)?\s*(Towards\s+\w+))", text, re.IGNORECASE)

        if distance_ps:
        # Extracting direction, distance, and unit
            direction = (distance_ps.group(1) or distance_ps.group(6)).strip()  # Match "Towards West"
            distance_value = (distance_ps.group(2) or distance_ps.group(4)).strip()  # Match distance like "1.5"
            distance_unit = (distance_ps.group(3) or distance_ps.group(5) or "").replace(" ", "").strip()  # Match unit like "KM"

            distance_ps = f"{direction}, {distance_value} {distance_unit}".strip()

        complainant_name = re.search(r"Name\s*:\s*([^:\n]+?)(?=\s+Father)", text).group(1).strip()
        complainant_age = int(re.search(r"Age\s*:\s*(\d+)", text).group(1))
        complainant_religion = re.search(r"Religion\s*:\s*([^:\n]+?)(?=\s|$)", text).group(1).strip()
        complainant_caste =  re.search(r"Caste\s*:\s*([^:\n]+)", text).group(1).strip()
        complainant_occupation = re.search(r"Occupation\s*:\s*([^\n]*)", text).group(1).strip()
        
        # Additional complainant details
        phone_number = re.search(r"Phone No\.\s*:\s*(\d+)", text).group(1).strip()
        nationality = re.search(r"Nationality\s*:\s*([^:\n]+?)(?=\s|$)", text).group(1).strip()
        sex = re.search(r"Sex:\s*([^:\n]+?)(?=\s|$)", text).group(1).strip()
        
        # address extraction
        address_pattern = r"\(k\)\s*Address\s*:\s*([\s\S]*?)(?=\([m-z]\)|Whether complainant|$)"
        address_match = re.search(address_pattern, text, re.DOTALL)
        complainant_address = address_match.group(1).strip() if address_match else ""

        sex_pattern = r"\(l\)\s*Sex:\s*[A-Za-z]+\s*" 
        complainant_address = re.sub(sex_pattern, "", complainant_address)
        complainant_address = " ".join(line.strip() for line in complainant_address.split('\n') if line.strip())
        
        # Helper function for parsing amounts
        def parse_amount(amount_str):
            try:
                return float(amount_str.replace(',', ''))
            except (ValueError, AttributeError):
                return 0.0
        
        # Property details extraction
        property_details = []
        if "Automobile" in text:
            prop_type = "Automobile"
            
            # Improved regex patterns for vehicle details
            reg_no = safe_search(r"Reg No:?\s*([A-Za-z0-9]+)(?=\s*(?:\d+|Make|$))", text)
            make = safe_search(r"Make:?\s*([^\n]+?)(?=\s*Model|$)", text)
            model = safe_search(r"Model:?\s*(\d+)", text)
            engine_no = safe_search(r"Engine No:?\s*([^\n]+?)(?=\s*Chassis|$)", text)
            chassis_no = safe_search(r"Chassis No:?\s*([^\n]+?)(?=\n|$)", text)
            
            item_desc = {
                "reg_no": reg_no,
                "make": make,
                "model": model,
                "engine_no": engine_no,
                "chassis_no": chassis_no
            }
            
            # Improved value extraction
            value_pattern = r"Estimated Value[^:]*:?\s*(?:Rs\.?)?\s*(\d+,?\d*)"
            value_match = re.search(value_pattern, text)
            estimated_value = parse_amount(value_match.group(1)) if value_match else 0.0
            
            property_details.append(PropertyDetails(1, prop_type, item_desc, estimated_value))
        
        # Improved total value extraction
        total_value_pattern = r"Total Value[^:]*:?\s*(?:Rs\.?)?\s*(\d+,?\d*)"
        total_value_match = re.search(total_value_pattern, text)
        total_property_value = parse_amount(total_value_match.group(1)) if total_value_match else 0.0
        
        # # Extract accused details (simplified for unknown accused)
        # accused_details = [AccusedDetails(1, "Unknown", "", "", None, "")]
        
        # Extract victim details (empty in this case)
        victim_details = []
        
        act_section = re.search(r"Act & Section\s*:\s*([^:\n]+)", text).group(1).strip()
        
        return FIRData(
            district=district,
            circle_subdivision=circle,
            police_station=ps,
            crime_no=crime_no,
            fir_date=fir_date,
            act_section=act_section,
            offense_date=offense_date,
            offense_time_from=time_from,
            offense_time_to=time_to,
            location=location,
            distance_ps=distance_ps,
            complainant_name=complainant_name,
            complainant_age=complainant_age,
            complainant_religion=complainant_religion,
            complainant_caste=complainant_caste,
            complainant_occupation=complainant_occupation,
            phone_number=phone_number,
            nationality=nationality,
            sex=sex,
            complainant_address=complainant_address,
            # accused_details=accused_details,
            victim_details=victim_details,
            property_details=property_details,
            total_property_value=total_property_value
        )

if __name__ == "__main__":
    # Process single FIR
    fir = extract_fir_data("G:/iisc/1382/fir_0003.pdf")
    fir.print_details()


=== FIR Details ===
District: Bengaluru City
Circle/Sub-Division: Madivala Sub-Division
Police Station: Adugodi PS
Crime Number: 0002/2024
FIR Date: 04/01/2024
Act & Section: IPC 1860 (U/s-379)

Offense Details:
Date: 03/01/2024
Time: 01:30:00 to 02:20:00
Location: Beside Of Doddamma Temple, Adugodi,Bengaluru City , Karnataka, 560030
Distance from Police Station: Towards West, 1 Km

Complainant Details:
Name: Obalesh
Age: 32
Religion: Hindu
Caste: ADI KARNATAKA
Occupation: Others PI Specify
Phone: 9731616644
Nationality: India
Sex: Male
Address: No-09, 4th Cross, 4th Main,B Chandrappa Nagara, BG Road,Adugodi , Bengaluru City , Karnataka-56003 0

Property Details:
- Type: Automobile
  Item Description:
    reg_no: KA01HW5137
    make: SCOOTER
    model: 2017
    engine_no: JF39EU2120111
    chassis_no: ME4JF39DHHU078058
  Estimated Value: Rs. 1.00
Total Property Value: Rs. 25,000.00



In [6]:
import pdfplumber
import re
from dataclasses import dataclass
from typing import Optional, List, Dict, Any
from datetime import datetime

@dataclass
class PropertyDetails:
    """Data class for stolen/involved property information"""
    sl_no: int
    property_type: Optional[str] = None
    item_description: Dict[str, Any] = None  # For structured item details
    estimated_value: Optional[float] = None

@dataclass
class VictimDetails:
    """Data class for victim information"""
    sl_no: int
    name: Optional[str] = None
    address: Optional[str] = None
    injury_type: Optional[str] = None
    sex: Optional[str] = None
    age: Optional[int] = None
    occupation: Optional[str] = None

@dataclass
class FIRData:
    """Data class to store extracted FIR information"""
    district: Optional[str] = None
    circle_subdivision: Optional[str] = None
    police_station: Optional[str] = None
    crime_no: Optional[str] = None
    fir_date: Optional[datetime] = None
    act_section: Optional[str] = None
    offense_date: Optional[datetime] = None
    offense_time_from: Optional[str] = None
    offense_time_to: Optional[str] = None
    location: Optional[str] = None
    distance_ps: Optional[str] = None
    complainant_name: Optional[str] = None
    complainant_age: Optional[int] = None
    complainant_religion: Optional[str] = None
    complainant_caste: Optional[str] = None
    complainant_occupation: Optional[str] = None
    phone_number: Optional[str] = None
    nationality: Optional[str] = None
    sex: Optional[str] = None
    complainant_address: Optional[str] = None
    victim_details: List[VictimDetails] = None
    property_details: List[PropertyDetails] = None
    total_property_value: Optional[float] = None

    def __post_init__(self):
        if self.victim_details is None:
            self.victim_details = []
        if self.property_details is None:
            self.property_details = []

    def print_details(self):
        """Print all FIR details in a formatted manner"""
        print("\n=== FIR Details ===")
        print(f"District: {self.district}")
        print(f"Circle/Sub-Division: {self.circle_subdivision}")
        print(f"Police Station: {self.police_station}")
        print(f"Crime Number: {self.crime_no}")
        print(f"FIR Date: {self.fir_date.strftime('%d/%m/%Y') if self.fir_date else None}")
        print(f"Act & Section: {self.act_section}")
        
        print("\nOffense Details:")
        print(f"Date: {self.offense_date.strftime('%d/%m/%Y') if self.offense_date else None}")
        print(f"Time: {self.offense_time_from} to {self.offense_time_to}")
        print(f"Location: {self.location}")
        print(f"Distance from Police Station: {self.distance_ps}")

        print("\nComplainant Details:")
        print(f"Name: {self.complainant_name}")
        print(f"Age: {self.complainant_age}")
        print(f"Religion: {self.complainant_religion}")
        print(f"Caste: {self.complainant_caste}")
        print(f"Occupation: {self.complainant_occupation}")
        print(f"Phone: {self.phone_number}")
        print(f"Nationality: {self.nationality}")
        print(f"Sex: {self.sex}")
        print(f"Address: {self.complainant_address}")
        
        print("\nProperty Details:")
        for prop in self.property_details:
            print(f"- Type: {prop.property_type}")
            if prop.item_description:
                print("  Item Description:")
                for key, value in prop.item_description.items():
                    print(f"    {key}: {value}")
            print(f"  Estimated Value: {f'Rs. {prop.estimated_value:,.2f}' if prop.estimated_value else None}")
        print(f"Total Property Value: {f'Rs. {self.total_property_value:,.2f}' if self.total_property_value else None}")
        print("==================\n")

def safe_search(pattern: str, text: str, group_num: int = 1) -> Optional[str]:
    """Helper function to safely perform regex search and return the matched group or None."""
    try:
        match = re.search(pattern, text)
        return match.group(group_num).strip() if match else None
    except (AttributeError, IndexError):
        return None

def safe_parse_date(date_str: Optional[str], format_str: str = "%d/%m/%Y") -> Optional[datetime]:
    """Safely parse date string to datetime object."""
    if not date_str:
        return None
    try:
        return datetime.strptime(date_str, format_str)
    except ValueError:
        return None

def safe_parse_int(value: Optional[str]) -> Optional[int]:
    """Safely parse string to integer."""
    if not value:
        return None
    try:
        return int(re.sub(r'[^\d]', '', value))
    except ValueError:
        return None

def safe_parse_float(amount_str: Optional[str]) -> Optional[float]:
    """Safely parse string to float, handling commas in numbers."""
    if not amount_str:
        return None
    try:
        return float(amount_str.replace(',', ''))
    except ValueError:
        return None

def extract_fir_data(pdf_path: str) -> FIRData:
    """Extract relevant information from FIR PDF document with error handling"""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            text = " ".join([page.extract_text() for page in pdf.pages])
            
            # Basic FIR details
            district = safe_search(r"District\s*:\s*([^:\n]+?)(?=\s+Circle/Sub Division|$)", text)
            circle = safe_search(r"Circle/Sub Division\s*:\s*([^:\n]+?)(?=\s+PS|$)", text)
            ps = safe_search(r"PS\s*:\s*([^:\n]+?)(?=\s*$|\s*\n)", text)
            crime_no = safe_search(r"Crime No\s*:\s*([^:\n]+?)(?=\s+FIR Date|$)", text)
            
            # Dates and times
            fir_date_str = safe_search(r"FIR Date\s*:\s*(\d{2}/\d{1,2}/\d{4})", text)
            fir_date = safe_parse_date(fir_date_str)
            
            offense_date_str = safe_search(r"From Date\s*:\s*(\d{2}/\d{2}/\d{4})", text)
            offense_date = safe_parse_date(offense_date_str)
            
            time_from = safe_search(r"From Time\s*:\s*(\d{2}:\d{2}:\d{2})", text)
            time_to = safe_search(r"To Time\s*:\s*(\d{2}:\d{2}:\d{2})", text)
            
            # Location and distance
            location = re.search(r"Place of occurence with full address\s*(.*?)(?=\(b\))", text, re.DOTALL).group(1).strip()
            distance_match = re.search(r"(?:(Towards\s+\w+)\s*([\d\.]+)\s*(K\s*M|KM|Km|KMs|Meters|M)?)|(?:([\d\.]+)\s*(K\s*M|KM|Km|KMs|Meters|M)?\s*(Towards\s+\w+))", text, re.IGNORECASE)
            
            distance_ps = None
            if distance_match:
                direction = (distance_match.group(1) or distance_match.group(6) or "").strip()
                distance_value = (distance_match.group(2) or distance_match.group(4) or "").strip()
                distance_unit = (distance_match.group(3) or distance_match.group(5) or "").strip()
                if direction and distance_value:
                    distance_ps = f"{direction}, {distance_value} {distance_unit}".strip()
            
            # Complainant details
            complainant_name = safe_search(r"Name\s*:\s*([^:\n]+?)(?=\s+Father)", text)
            complainant_age_str = safe_search(r"Age\s*:\s*(\d+)", text)
            complainant_age = safe_parse_int(complainant_age_str)
            complainant_religion = safe_search(r"Religion\s*:\s*([^:\n]+?)(?=\s*(?:\([e-z]\)|$))", text)
            complainant_caste = safe_search(r"Caste\s*:\s*([^:\n]+?)(?=\s*(?:\([f-z]\)|$))", text)
            complainant_occupation = safe_search(r"Occupation\s*:\s*([^\n]*)", text)
            phone_number = safe_search(r"Phone No\.\s*:\s*(\d+)", text)
            nationality = safe_search(r"Nationality\s*:\s*([^:\n]+?)(?=\s|$)", text)
            sex = safe_search(r"Sex:\s*([^:\n]+?)(?=\s|$)", text)
            
            # Address extraction
            address_match = safe_search(r"\(k\)\s*Address\s*:\s*([\s\S]*?)(?=\([m-z]\)|Whether complainant|$)", text)
            complainant_address = None
            if address_match:
                address_text = address_match
                sex_pattern = r"\(l\)\s*Sex:\s*[A-Za-z]+\s*"
                address_text = re.sub(sex_pattern, "", address_text)
                complainant_address = " ".join(line.strip() for line in address_text.split('\n') if line.strip())
            
            # Property details
            property_details = []
            if "Automobile" in text:
                item_desc = {
                    "reg_no": safe_search(r"Reg No:?\s*([A-Za-z0-9]+)(?=\s*(?:\d+|Make|$))", text),
                    "make": safe_search(r"Make:?\s*([^\n]+?)(?=\s*Model|$)", text),
                    "model": safe_search(r"Model:?\s*(\d+)", text),
                    "engine_no": safe_search(r"Engine No:?\s*([^\n]+?)(?=\s*Chassis|$)", text),
                    "chassis_no": safe_search(r"Chassis No:?\s*([^\n]+?)(?=\n|$)", text)
                }
                
                value_str = safe_search(r"Estimated Value[^:]*:?\s*(?:Rs\.?)?\s*(\d+,?\d*)", text)
                estimated_value = safe_parse_float(value_str)
                
                property_details.append(PropertyDetails(1, "Automobile", item_desc, estimated_value))
            
            # Total value
            total_value_str = safe_search(r"Total Value[^:]*:?\s*(?:Rs\.?)?\s*(\d+,?\d*)", text)
            total_property_value = safe_parse_float(total_value_str)
            
            act_section = safe_search(r"Act & Section\s*:\s*([^:\n]+)", text)
            
            return FIRData(
                district=district,
                circle_subdivision=circle,
                police_station=ps,
                crime_no=crime_no,
                fir_date=fir_date,
                act_section=act_section,
                offense_date=offense_date,
                offense_time_from=time_from,
                offense_time_to=time_to,
                location=location,
                distance_ps=distance_ps,
                complainant_name=complainant_name,
                complainant_age=complainant_age,
                complainant_religion=complainant_religion,
                complainant_caste=complainant_caste,
                complainant_occupation=complainant_occupation,
                phone_number=phone_number,
                nationality=nationality,
                sex=sex,
                complainant_address=complainant_address,
                victim_details=[],
                property_details=property_details,
                total_property_value=total_property_value
            )
            
    except Exception as e:
        print(f"Error processing PDF: {str(e)}")
        return FIRData()  # Return empty FIRData object if processing fails

if __name__ == "__main__":
    # Process single FIR
    fir = extract_fir_data("G:/iisc/2188/fir_0002.pdf")
    fir.print_details()


=== FIR Details ===
District: Bengaluru City
Circle/Sub-Division: Kengeri Sub Division
Police Station: Annapoorneshwari Nagar PS
Crime Number: 0002/2024
FIR Date: 04/01/2024
Act & Section: KARNATAKA POLICE ACT, 1963 (U/s-96(B))

Offense Details:
Date: 04/01/2024
Time: 03:20:00 to 03:50:00
Location: Near BDA Complex, Nagarabhavi 2nd stage,Bengaluru City , Karnataka,
560072
Distance from Police Station: towards North, 3 kms

Complainant Details:
Name: B Prakash
Age: 46
Religion: 
Caste: None
Occupation: Police officer
Phone: None
Nationality: India
Sex: ,
Address: ASI,A P Nagar PS (l) Sex: , Bengaluru City , Karnataka

Property Details:
Total Property Value: Rs. 9.00



In [49]:
import pdfplumber
import re
from dataclasses import dataclass
from typing import Optional, List
from datetime import datetime
import csv
import os

@dataclass
class PropertyDetails:
    """Data class for stolen/involved property information"""
    sl_no: int
    property_type: str
    item_description: dict
    estimated_value: float

@dataclass
class VictimDetails:
    """Data class for victim information"""
    sl_no: int
    name: str
    address: str
    injury_type: str
    sex: str
    age: Optional[int]
    occupation: str

@dataclass
class FIRData:
    """Data class to store extracted FIR information"""
    district: str
    circle_subdivision: str
    police_station: str
    crime_no: str
    fir_date: datetime
    act_section: str
    offense_date: datetime
    offense_time_from: str
    offense_time_to: str
    location: str
    distance_ps: str
    complainant_name: str
    complainant_age: int
    complainant_religion: str
    complainant_caste: str
    complainant_occupation: str
    phone_number: str
    nationality: str
    sex: str
    complainant_address: str
    victim_details: List[VictimDetails]
    property_details: List[PropertyDetails]
    total_property_value: float

    def get_property_type(self):
        """Get the main type of property involved in the case"""
        if not self.property_details:
            return "No Property"
        return self.property_details[0].property_type if self.property_details else "Unknown"

    def to_dict(self):
        """Convert FIR data to a flat dictionary suitable for CSV export"""
        # Format the distance_ps tuple if it exists
        distance_str = ""
        if isinstance(self.distance_ps, tuple) and len(self.distance_ps) >= 2:
            direction = self.distance_ps[0] if self.distance_ps[0] else ""
            distance = self.distance_ps[1] if self.distance_ps[1] else ""
            unit = self.distance_ps[2] if len(self.distance_ps) > 2 and self.distance_ps[2] else "KM"
            distance_str = f"{direction} {distance} {unit}".strip()

        base_dict = {
            'district': self.district,
            'circle_subdivision': self.circle_subdivision,
            'police_station': self.police_station,
            'crime_no': self.crime_no,
            'fir_date': self.fir_date.strftime('%d/%m/%Y'),
            'act_section': self.act_section,
            'offense_date': self.offense_date.strftime('%d/%m/%Y'),
            'offense_time_from': self.offense_time_from,
            'offense_time_to': self.offense_time_to,
            'location': self.location,
            'distance_from_ps': distance_str,
            'complainant_name': self.complainant_name,
            'complainant_age': self.complainant_age,
            'complainant_religion': self.complainant_religion,
            'complainant_caste': self.complainant_caste,
            'complainant_occupation': self.complainant_occupation,
            'phone_number': self.phone_number,
            'nationality': self.nationality,
            'sex': self.sex,
            'complainant_address': self.complainant_address,
            'property_type_category': self.get_property_type(),  # New field for property type
            'total_property_value': f"{self.total_property_value:,.2f}"
        }

        # Add victim details
        for i, victim in enumerate(self.victim_details, 1):
            prefix = f'victim_{i}_'
            base_dict.update({
                f'{prefix}name': victim.name,
                f'{prefix}address': victim.address,
                f'{prefix}injury_type': victim.injury_type,
                f'{prefix}sex': victim.sex,
                f'{prefix}age': victim.age if victim.age else '',
                f'{prefix}occupation': victim.occupation
            })

        # Add property details
        for i, prop in enumerate(self.property_details, 1):
            prefix = f'property_{i}_'
            base_dict.update({
                f'{prefix}type': prop.property_type,
                f'{prefix}estimated_value': f"{prop.estimated_value:,.2f}"
            })
            # Add item description details
            for key, value in prop.item_description.items():
                base_dict[f'{prefix}{key}'] = value if value else ''

        return base_dict

def save_firs_to_csv(firs: List[FIRData], output_file: str):
    """Save a list of FIR data to a CSV file"""
    if not firs:
        return

    # Get all possible fields from all FIRs
    fieldnames = set()
    for fir in firs:
        fieldnames.update(fir.to_dict().keys())
    
    # Sort fields for consistent column ordering
    # Put property_type_category near the beginning for better visibility
    fieldnames = sorted(list(fieldnames))
    if 'property_type_category' in fieldnames:
        fieldnames.remove('property_type_category')
        fieldnames.insert(10, 'property_type_category')  # Insert after basic FIR details

    # Write to CSV
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for fir in firs:
            writer.writerow(fir.to_dict())

def process_fir_documents(input_dir: str, output_file: str):
    """Process all PDF files in a directory and save to CSV"""
    firs = []
    for filename in os.listdir(input_dir):
        if filename.endswith('.pdf'):
            pdf_path = os.path.join(input_dir, filename)
            try:
                fir = extract_fir_data(pdf_path)
                firs.append(fir)
            except Exception as e:
                print(f"Error processing {filename}: {str(e)}")
    
    save_firs_to_csv(firs, output_file)

# Modified main execution
if __name__ == "__main__":
    # For processing a single file
    # fir = extract_fir_data("fir_0005.pdf")
    # save_firs_to_csv([fir], "fir_data1.csv")
    process_fir_documents('G:/iisc/1382', 'G:/iisc/1382/fir_dataset.csv')


In [None]:
import pdfplumber
import re
from dataclasses import dataclass
from typing import Optional, List, Dict, Any
from datetime import datetime
import pandas as pd
import os
from pathlib import Path

# [Previous class definitions remain the same...]

def create_flat_dict(fir: FIRData) -> Dict[str, Any]:
    """Convert FIR data to a flat dictionary suitable for CSV export"""
    flat_dict = {
        'district': fir.district,
        'circle_subdivision': fir.circle_subdivision,
        'police_station': fir.police_station,
        'crime_no': fir.crime_no,
        'fir_date': fir.fir_date.strftime('%d/%m/%Y') if fir.fir_date else None,
        'act_section': fir.act_section,
        'offense_date': fir.offense_date.strftime('%d/%m/%Y') if fir.offense_date else None,
        'offense_time_from': fir.offense_time_from,
        'offense_time_to': fir.offense_time_to,
        'location': fir.location,
        'distance_ps': fir.distance_ps,
        'complainant_name': fir.complainant_name,
        'complainant_age': fir.complainant_age,
        'complainant_religion': fir.complainant_religion,
        'complainant_caste': fir.complainant_caste,
        'complainant_occupation': fir.complainant_occupation,
        'phone_number': fir.phone_number,
        'nationality': fir.nationality,
        'sex': fir.sex,
        'complainant_address': fir.complainant_address,
        'total_property_value': fir.total_property_value
    }

    # Add property details
    if fir.property_details:
        for i, prop in enumerate(fir.property_details, 1):
            prefix = f'property_{i}_'
            flat_dict[f'{prefix}type'] = prop.property_type
            flat_dict[f'{prefix}value'] = prop.estimated_value
            if prop.item_description:
                for key, value in prop.item_description.items():
                    flat_dict[f'{prefix}{key}'] = value

    # Add victim details
    if fir.victim_details:
        for i, victim in enumerate(fir.victim_details, 1):
            prefix = f'victim_{i}_'
            flat_dict[f'{prefix}name'] = victim.name
            flat_dict[f'{prefix}address'] = victim.address
            flat_dict[f'{prefix}injury_type'] = victim.injury_type
            flat_dict[f'{prefix}sex'] = victim.sex
            flat_dict[f'{prefix}age'] = victim.age
            flat_dict[f'{prefix}occupation'] = victim.occupation

    return flat_dict

def process_fir_files(input_folder: str, output_file: str):
    # Get list of all PDF files in the input folder
    pdf_files = list(Path(input_folder).glob("*.pdf"))
    
    if not pdf_files:
        print(f"No PDF files found in {input_folder}")
        return
    
    print(f"Found {len(pdf_files)} PDF files to process")
    
    # Process each FIR
    all_data = []
    failed_files = []
    
    for i, pdf_file in enumerate(pdf_files, 1):
        print(f"Processing file {i}/{len(pdf_files)}: {pdf_file.name}")
        try:
            fir = extract_fir_data(str(pdf_file))
            flat_data = create_flat_dict(fir)
            flat_data['file_name'] = pdf_file.name  # Add filename for reference
            all_data.append(flat_data)
        except Exception as e:
            print(f"Failed to process {pdf_file.name}: {str(e)}")
            failed_files.append(pdf_file.name)
    
    if not all_data:
        print("No data was successfully extracted")
        return
    
    # Create DataFrame
    df = pd.DataFrame(all_data)
    
    # Ensure output directory exists
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    
    # Save to CSV
    df.to_csv(output_file, index=False, encoding='utf-8')
    
    # Print summary
    print("\nProcessing Summary:")
    print(f"Total files processed: {len(pdf_files)}")
    print(f"Successfully processed: {len(all_data)}")
    print(f"Failed to process: {len(failed_files)}")
    
    if failed_files:
        print("\nFailed files:")
        for file in failed_files:
            print(f"- {file}")
    
    # Print dataset statistics
    print("\nDataset Statistics:")
    print(f"Total rows: {len(df)}")
    print(f"Total columns: {len(df.columns)}")
    
    # Print sample of key columns
    key_columns = ['district', 'police_station', 'crime_no']
    print("\nUnique values in key columns:")
    for col in key_columns:
        if col in df.columns:
            unique_count = df[col].nunique()
            print(f"{col}: {unique_count} unique values")

if __name__ == "__main__":
    # Example usage
    input_folder = "G:/iisc/2188"  # Folder containing FIR PDFs
    output_file = "G:/iisc/2188/fir_dataset1.csv"  # Where to save the CSV
    
    process_fir_files(input_folder, output_file)

Found 5 PDF files to process
Processing file 1/5: fir_0001.pdf
Processing file 2/5: fir_0002.pdf
Processing file 3/5: fir_0003.pdf
Processing file 4/5: fir_0004.pdf
Processing file 5/5: fir_0005.pdf

Processing Summary:
Total files processed: 5
Successfully processed: 5
Failed to process: 0

Dataset Statistics:
Total rows: 5
Total columns: 22

Unique values in key columns:
district: 1 unique values
police_station: 1 unique values
crime_no: 5 unique values


In [14]:
import pdfplumber
import re
from dataclasses import dataclass
from typing import Optional, List, Dict, Any
from datetime import datetime

@dataclass
class PropertyDetails:
    """Data class for stolen/involved property information"""
    sl_no: int
    property_type: Optional[str] = None
    item_description: Dict[str, Any] = None  # For structured item details
    estimated_value: Optional[float] = None

@dataclass
class VictimDetails:
    """Data class for victim information"""
    sl_no: int
    name: Optional[str] = None
    address: Optional[str] = None
    injury_type: Optional[str] = None
    sex: Optional[str] = None
    age: Optional[int] = None
    occupation: Optional[str] = None

@dataclass
class AccusedDetails:
    """Data class for accused person information"""
    sl_no: int
    name: Optional[str] = None
    accused_type: Optional[str] = None  
    person_type: Optional[str] = None   
    sex: Optional[str] = None
    age: Optional[int] = None
    occupation: Optional[str] = None

@dataclass
class FIRData:
    """Data class to store extracted FIR information"""
    district: Optional[str] = None
    circle_subdivision: Optional[str] = None
    police_station: Optional[str] = None
    crime_no: Optional[str] = None
    fir_date: Optional[datetime] = None
    act_section: Optional[str] = None
    offense_date: Optional[datetime] = None
    offense_time_from: Optional[str] = None
    offense_time_to: Optional[str] = None
    location: Optional[str] = None
    distance_ps: Optional[str] = None
    complainant_name: Optional[str] = None
    complainant_age: Optional[int] = None
    complainant_religion: Optional[str] = None
    complainant_caste: Optional[str] = None
    complainant_occupation: Optional[str] = None
    phone_number: Optional[str] = None
    nationality: Optional[str] = None
    sex: Optional[str] = None
    complainant_address: Optional[str] = None
    victim_details: List[VictimDetails] = None
    property_details: List[PropertyDetails] = None
    total_property_value: Optional[float] = None
    accused_details: List[AccusedDetails] = None

    def __post_init__(self):
        if self.victim_details is None:
            self.victim_details = []
        if self.property_details is None:
            self.property_details = []
        if self.accused_details is None:
            self.accused_details = []

    def print_details(self):
        """Print all FIR details in a formatted manner"""
        print("\n=== FIR Details ===")
        print(f"District: {self.district}")
        print(f"Circle/Sub-Division: {self.circle_subdivision}")
        print(f"Police Station: {self.police_station}")
        print(f"Crime Number: {self.crime_no}")
        print(f"FIR Date: {self.fir_date.strftime('%d/%m/%Y') if self.fir_date else None}")
        print(f"Act & Section: {self.act_section}")
        
        print("\nOffense Details:")
        print(f"Date: {self.offense_date.strftime('%d/%m/%Y') if self.offense_date else None}")
        print(f"Time: {self.offense_time_from} to {self.offense_time_to}")
        print(f"Location: {self.location}")
        print(f"Distance from Police Station: {self.distance_ps}")

        print("\nComplainant Details:")
        print(f"Name: {self.complainant_name}")
        print(f"Age: {self.complainant_age}")
        print(f"Religion: {self.complainant_religion}")
        print(f"Caste: {self.complainant_caste}")
        print(f"Occupation: {self.complainant_occupation}")
        print(f"Phone: {self.phone_number}")
        print(f"Nationality: {self.nationality}")
        print(f"Sex: {self.sex}")
        print(f"Address: {self.complainant_address}")
        
        print("\nProperty Details:")
        for prop in self.property_details:
            print(f"- Type: {prop.property_type}")
            if prop.item_description:
                print("  Item Description:")
                for key, value in prop.item_description.items():
                    print(f"    {key}: {value}")
            print(f"  Estimated Value: {f'Rs. {prop.estimated_value:,.2f}' if prop.estimated_value else None}")
        print(f"Total Property Value: {f'Rs. {self.total_property_value:,.2f}' if self.total_property_value else None}")

        print("\nAccused Details:")
        for accused in self.accused_details:
            print(f"\nAccused #{accused.sl_no}:")
            print(f"Name: {accused.name}")
            print(f"Father's Name: {accused.father_name}")
            print(f"Address: {accused.address}")
            print(f"Person Type: {accused.person_type}")
            print(f"Accused Type: {accused.accused_type}")
            print(f"Sex: {accused.sex}")
            print(f"Age: {accused.age}")
            print(f"Occupation: {accused.occupation}")
            print(f"Caste: {accused.caste}")
        print("==================\n")

def safe_search(pattern: str, text: str, group_num: int = 1) -> Optional[str]:
    """Helper function to safely perform regex search and return the matched group or None."""
    try:
        match = re.search(pattern, text)
        return match.group(group_num).strip() if match else None
    except (AttributeError, IndexError):
        return None

def safe_parse_date(date_str: Optional[str], format_str: str = "%d/%m/%Y") -> Optional[datetime]:
    """Safely parse date string to datetime object."""
    if not date_str:
        return None
    try:
        return datetime.strptime(date_str, format_str)
    except ValueError:
        return None

def safe_parse_int(value: Optional[str]) -> Optional[int]:
    """Safely parse string to integer."""
    if not value:
        return None
    try:
        return int(re.sub(r'[^\d]', '', value))
    except ValueError:
        return None

def safe_parse_float(amount_str: Optional[str]) -> Optional[float]:
    """Safely parse string to float, handling commas in numbers."""
    if not amount_str:
        return None
    try:
        return float(amount_str.replace(',', ''))
    except ValueError:
        return None
    
def extract_accused_details(text: str) -> List[AccusedDetails]:
    """Extract accused details from the FIR text"""
    accused_list = []
    
    # Find the section containing accused details
    accused_section = safe_search(
        r"Details of known/suspected/unknown accused[\s\S]*?(?=Details of Victims|$)",
        text
    )
    
    if not accused_section:
        return accused_list
    
    # Simplified pattern to only capture required fields
    # Captures: name before (A1)/(A2), type, person type, sex, age, occupation
    accused_pattern = r"(\d+)\s*([^(\n]+?)\s*(?:\([Aa][12]\))?\s*/.*?\s*(Accused|Unknown|Suspect)\s*(Common\s*man|[^\n,]*?)\s*(Male|Female|Unknown)\s*(\d+)?\s*([^\n]*?)(?=\d+|$|Details)"
    
    # Find all matches in the section
    matches = re.finditer(accused_pattern, accused_section)
    
    for match in matches:
        try:
            sl_no = safe_parse_int(match.group(1))
            if not sl_no:  # Skip if no valid serial number
                continue
            
            # Create AccusedDetails object with only required fields
            accused = AccusedDetails(
                sl_no=sl_no,
                name=match.group(2).strip() if match.group(2) else None,
                accused_type=match.group(3).strip() if match.group(3) else None,
                person_type=match.group(4).strip() if match.group(4) else None,
                sex=match.group(5).strip() if match.group(5) else None,
                age=safe_parse_int(match.group(6)) if match.group(6) else None,
                occupation=match.group(7).strip() if match.group(7) else None
            )
            
            accused_list.append(accused)
            
        except Exception as e:
            print(f"Error processing accused entry: {str(e)}")
            continue
    
    return accused_list

def extract_fir_data(pdf_path: str) -> FIRData:
    """Extract relevant information from FIR PDF document with error handling"""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            text = " ".join([page.extract_text() for page in pdf.pages])
            
            # Basic FIR details
            district = safe_search(r"District\s*:\s*([^:\n]+?)(?=\s+Circle/Sub Division|$)", text)
            circle = safe_search(r"Circle/Sub Division\s*:\s*([^:\n]+?)(?=\s+PS|$)", text)
            ps = safe_search(r"PS\s*:\s*([^:\n]+?)(?=\s*$|\s*\n)", text)
            crime_no = safe_search(r"Crime No\s*:\s*([^:\n]+?)(?=\s+FIR Date|$)", text)
            
            # Dates and times
            fir_date_str = safe_search(r"FIR Date\s*:\s*(\d{2}/\d{1,2}/\d{4})", text)
            fir_date = safe_parse_date(fir_date_str)
            
            offense_date_str = safe_search(r"From Date\s*:\s*(\d{2}/\d{2}/\d{4})", text)
            offense_date = safe_parse_date(offense_date_str)
            
            time_from = safe_search(r"From Time\s*:\s*(\d{2}:\d{2}:\d{2})", text)
            time_to = safe_search(r"To Time\s*:\s*(\d{2}:\d{2}:\d{2})", text)
            
            # Location and distance
            location = re.search(r"Place of occurence with full address\s*(.*?)(?=\(b\))", text, re.DOTALL).group(1).strip()
            distance_match = re.search(r"(?:(Tow(?:ards|ords)\s+\w+)\s*([\d\.]+)\s*(K\s*M|KM|Km|KMs|Meters|M)?)|(?:([\d\.]+)\s*(K\s*M|KM|Km|KMs|Meters|M)?\s*(Tow(?:ards|ords)\s+\w+))", text, re.IGNORECASE)
            
            distance_ps = None
            if distance_match:
                direction = (distance_match.group(1) or distance_match.group(6) or "").strip()
                distance_value = (distance_match.group(2) or distance_match.group(4) or "").strip()
                distance_unit = (distance_match.group(3) or distance_match.group(5) or "").strip()
                if direction and distance_value:
                    distance_ps = f"{direction}, {distance_value} {distance_unit}".strip()
            
            # Complainant details
            complainant_name = safe_search(r"Name\s*:\s*([^:\n]+?)(?=\s+Father)", text)
            complainant_age_str = safe_search(r"Age\s*:\s*(\d+)", text)
            complainant_age = safe_parse_int(complainant_age_str)
            complainant_religion = safe_search(r"Religion\s*:\s*([^:\n]+?)(?=\s*(?:\([e-z]\)|$))", text)
            complainant_caste = safe_search(r"Caste\s*:\s*([^:\n]+?)(?=\s*(?:\([f-z]\)|$))", text)
            complainant_occupation = safe_search(r"Occupation\s*:\s*([^\n]*)", text)
            phone_number = safe_search(r"Phone No\.\s*:\s*(\d+)", text)
            nationality = safe_search(r"Nationality\s*:\s*([^:\n]+?)(?=\s|$)", text)
            sex = safe_search(r"Sex:\s*([^:\n]+?)(?=\s|$)", text)
            
            # Address extraction
            address_match = safe_search(r"\(k\)\s*Address\s*:\s*([\s\S]*?)(?=\([m-z]\)|Whether complainant|$)", text)
            complainant_address = None
            if address_match:
                address_text = address_match
                sex_pattern = r"\(l\)\s*Sex:\s*[A-Za-z]+\s*"
                address_text = re.sub(sex_pattern, "", address_text)
                complainant_address = " ".join(line.strip() for line in address_text.split('\n') if line.strip())
            
            # Property details
            property_details = []
            if "Automobile" in text:
                item_desc = {
                    "reg_no": safe_search(r"Reg No:?\s*([A-Za-z0-9]+)(?=\s*(?:\d+|Make|$))", text),
                    "make": safe_search(r"Make:?\s*([^\n]+?)(?=\s*Model|$)", text),
                    "model": safe_search(r"Model:?\s*(\d+)", text),
                    "engine_no": safe_search(r"Engine No:?\s*([^\n]+?)(?=\s*Chassis|$)", text),
                    "chassis_no": safe_search(r"Chassis No:?\s*([^\n]+?)(?=\n|$)", text)
                }
                
                value_str = safe_search(r"Estimated Value[^:]*:?\s*(?:Rs\.?)?\s*(\d+,?\d*)", text)
                estimated_value = safe_parse_float(value_str)
                
                property_details.append(PropertyDetails(1, "Automobile", item_desc, estimated_value))
            
            # Total value
            total_value_str = safe_search(r"Total Value[^:]*:?\s*(?:Rs\.?)?\s*(\d+,?\d*)", text)
            total_property_value = safe_parse_float(total_value_str)
            
            act_section = safe_search(r"Act & Section\s*:\s*([^:\n]+)", text)

            accused_details = extract_accused_details(text)
            
            return FIRData(
                district=district,
                circle_subdivision=circle,
                police_station=ps,
                crime_no=crime_no,
                fir_date=fir_date,
                act_section=act_section,
                offense_date=offense_date,
                offense_time_from=time_from,
                offense_time_to=time_to,
                location=location,
                distance_ps=distance_ps,
                complainant_name=complainant_name,
                complainant_age=complainant_age,
                complainant_religion=complainant_religion,
                complainant_caste=complainant_caste,
                complainant_occupation=complainant_occupation,
                phone_number=phone_number,
                nationality=nationality,
                sex=sex,
                complainant_address=complainant_address,
                victim_details=[],
                property_details=property_details,
                total_property_value=total_property_value,
                accused_details=accused_details
            )
            
    except Exception as e:
        print(f"Error processing PDF: {str(e)}")
        return FIRData()  # Return empty FIRData object if processing fails

if __name__ == "__main__":
    # Process single FIR
    fir = extract_fir_data("G:/iisc/combined/fir_0008.pdf")
    fir.print_details()


=== FIR Details ===
District: Chikkamagaluru
Circle/Sub-Division: Chickmagalur Sub-Division
Police Station: Chickmagalur Rural PS
Crime Number: 0010/2023
FIR Date: 10/01/2023
Act & Section: IPC 1860 (U/s-504,324,323,506,34)

Offense Details:
Date: 10/01/2023
Time: 09:30:00 to 09:31:00
Location: SARVE NO 17 AGRICULTURE LAND, ATTHIGIRI VILLAGE,Chikkamagaluru ,
Karnataka,
Distance from Police Station: TOWORDS EAST, 35 KM

Complainant Details:
Name: KRISHNAPPA
Age: 64
Religion: Hindu
Caste: ADI KARNATAKA
Occupation: Farmer
Phone: 948107558
Nationality: India
Sex: Male
Address: SHANTHAVERI LINGADHAHALLI HOBALI,THARIK ERE TQ , Chikkamagaluru , Karnataka

Property Details:
Total Property Value: Rs. 9.00

Accused Details:



In [3]:
import pdfplumber
import re
from dataclasses import dataclass
from typing import Optional, List, Dict, Any
from datetime import datetime

@dataclass
class PropertyDetails:
    """Data class for stolen/involved property information"""
    sl_no: int
    property_type: Optional[str] = None
    item_description: Dict[str, Any] = None  
    estimated_value: Optional[float] = None

@dataclass
class VictimDetails:
    """Data class for victim information"""
    sl_no: int
    name: Optional[str] = None
    address: Optional[str] = None
    injury_type: Optional[str] = None
    sex: Optional[str] = None
    age: Optional[int] = None
    occupation: Optional[str] = None

@dataclass
class FIRData:
    """Data class to store extracted FIR information"""
    district: Optional[str] = None
    circle_subdivision: Optional[str] = None
    police_station: Optional[str] = None
    crime_no: Optional[str] = None
    fir_date: Optional[datetime] = None
    act_section: Optional[str] = None
    offense_date: Optional[datetime] = None
    offense_time_from: Optional[str] = None
    offense_time_to: Optional[str] = None
    location: Optional[str] = None
    distance_ps: Optional[str] = None
    complainant_name: Optional[str] = None
    complainant_age: Optional[int] = None
    complainant_religion: Optional[str] = None
    complainant_caste: Optional[str] = None
    complainant_occupation: Optional[str] = None
    phone_number: Optional[str] = None
    nationality: Optional[str] = None
    sex: Optional[str] = None
    complainant_address: Optional[str] = None
    victim_details: List[VictimDetails] = None
    property_details: List[PropertyDetails] = None
    total_property_value: Optional[float] = None

    def __post_init__(self):
        if self.victim_details is None:
            self.victim_details = []
        if self.property_details is None:
            self.property_details = []

    def print_details(self):
        """Print all FIR details in a formatted manner"""
        print("\n=== FIR Details ===")
        print(f"District: {self.district}")
        print(f"Circle/Sub-Division: {self.circle_subdivision}")
        print(f"Police Station: {self.police_station}")
        print(f"Crime Number: {self.crime_no}")
        print(f"FIR Date: {self.fir_date.strftime('%d/%m/%Y') if self.fir_date else None}")
        print(f"Act & Section: {self.act_section}")
        
        print("\nOffense Details:")
        print(f"Date: {self.offense_date.strftime('%d/%m/%Y') if self.offense_date else None}")
        print(f"Time: {self.offense_time_from} to {self.offense_time_to}")
        print(f"Location: {self.location}")
        print(f"Distance from Police Station: {self.distance_ps}")

        print("\nComplainant Details:")
        print(f"Name: {self.complainant_name}")
        print(f"Age: {self.complainant_age}")
        print(f"Religion: {self.complainant_religion}")
        print(f"Caste: {self.complainant_caste}")
        print(f"Occupation: {self.complainant_occupation}")
        print(f"Phone: {self.phone_number}")
        print(f"Nationality: {self.nationality}")
        print(f"Sex: {self.sex}")
        print(f"Address: {self.complainant_address}")
        
        print("\nProperty Details:")
        for prop in self.property_details:
            print(f"- Type: {prop.property_type}")
            if prop.item_description:
                print("  Item Description:")
                for key, value in prop.item_description.items():
                    print(f"    {key}: {value}")
            print(f"  Estimated Value: {f'Rs. {prop.estimated_value:,.2f}' if prop.estimated_value else None}")
        print(f"Total Property Value: {f'Rs. {self.total_property_value:,.2f}' if self.total_property_value else None}")
        print("==================\n")

def safe_search(pattern: str, text: str, group_num: int = 1) -> Optional[str]:
    """Helper function to safely perform regex search and return the matched group or None."""
    try:
        match = re.search(pattern, text)
        return match.group(group_num).strip() if match else None
    except (AttributeError, IndexError):
        return None

def safe_parse_date(date_str: Optional[str], format_str: str = "%d/%m/%Y") -> Optional[datetime]:
    """Safely parse date string to datetime object."""
    if not date_str:
        return None
    try:
        return datetime.strptime(date_str, format_str)
    except ValueError:
        return None

def safe_parse_int(value: Optional[str]) -> Optional[int]:
    """Safely parse string to integer."""
    if not value:
        return None
    try:
        return int(re.sub(r'[^\d]', '', value))
    except ValueError:
        return None

def safe_parse_float(amount_str: Optional[str]) -> Optional[float]:
    """Safely parse string to float, handling commas in numbers."""
    if not amount_str:
        return None
    try:
        return float(amount_str.replace(',', ''))
    except ValueError:
        return None

def extract_property_details(text: str) -> List[PropertyDetails]:
    properties = []
    
    # Find all property entries using registration numbers as anchor points
    reg_numbers = re.finditer(r"Reg No:\s*(KA\d+[A-Z]+\d+)", text)
    reg_positions = [(m.group(1), m.start()) for m in reg_numbers]
    
    if not reg_positions:
        return properties
    
    for idx, (reg_no, start_pos) in enumerate(reg_positions):
        end_pos = reg_positions[idx + 1][1] if idx < len(reg_positions) - 1 else len(text)
        property_text = text[start_pos:end_pos]
        
        item_desc = {
            "reg_no": reg_no,
            "make": safe_search(r"Make:?\s*([^\n]+?)(?=\s*Model|$)", property_text),
            "model": safe_search(r"Model:?\s*([^\n]+?)(?=\s*Engine|$)", property_text),
            "engine_no": safe_search(r"Engine No:?\s*([^\n]+?)(?=\s*Chassis|$)", property_text),
            "chassis_no": safe_search(r"Chassis No:?\s*([^\n]+?)(?=\n|$)", property_text)
        }
        
        value_str = safe_search(r"Estimated Value\s*\([^)]*\)\s*\n*\s*([\d,]+)", text)
        estimated_value = safe_parse_float(value_str) or 0
        
        properties.append(PropertyDetails(
            sl_no=idx + 1,
            property_type="Automobile",
            item_description=item_desc,
            estimated_value=estimated_value
        ))
    
    return properties

def extract_fir_data(pdf_path: str) -> FIRData:
    """Extract relevant information from FIR PDF document with error handling"""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            text = " ".join([page.extract_text() for page in pdf.pages])
            
            # Basic FIR details
            district = safe_search(r"District\s*:\s*([^:\n]+?)(?=\s+Circle/Sub Division|$)", text)
            circle = safe_search(r"Circle/Sub Division\s*:\s*([^:\n]+?)(?=\s+PS|$)", text)
            ps = safe_search(r"PS\s*:\s*([^:\n]+?)(?=\s*$|\s*\n)", text)
            crime_no = safe_search(r"Crime No\s*:\s*([^:\n]+?)(?=\s+FIR Date|$)", text)
            
            # Dates and times
            fir_date_str = safe_search(r"FIR Date\s*:\s*(\d{2}/\d{1,2}/\d{4})", text)
            fir_date = safe_parse_date(fir_date_str)
            
            offense_date_str = safe_search(r"From Date\s*:\s*(\d{2}/\d{2}/\d{4})", text)
            offense_date = safe_parse_date(offense_date_str)
            
            time_from = safe_search(r"From Time\s*:\s*(\d{2}:\d{2}:\d{2})", text)
            time_to = safe_search(r"To Time\s*:\s*(\d{2}:\d{2}:\d{2})", text)
            
            # Location and distance
            location = re.search(r"Place of occurence with full address\s*(.*?)(?=\(b\))", text, re.DOTALL).group(1).strip()
            distance_match = re.search(r"(?:(Tow(?:ards|ords)\s+\w+)\s*([\d\.]+)\s*(K\s*M|KM|Km|KMs|Meters|M)?)|(?:([\d\.]+)\s*(K\s*M|KM|Km|KMs|Meters|M)?\s*(Tow(?:ards|ords)\s+\w+))", text, re.IGNORECASE)
            
            distance_ps = None
            if distance_match:
                direction = (distance_match.group(1) or distance_match.group(6) or "").strip()
                distance_value = (distance_match.group(2) or distance_match.group(4) or "").strip()
                distance_unit = (distance_match.group(3) or distance_match.group(5) or "").strip()
                if direction and distance_value:
                    distance_ps = f"{direction}, {distance_value} {distance_unit}".strip()
            
            # Complainant details
            complainant_name = safe_search(r"Name\s*:\s*([^:\n]+?)(?=\s+Father)", text)
            complainant_age_str = safe_search(r"Age\s*:\s*(\d+)", text)
            complainant_age = safe_parse_int(complainant_age_str)
            complainant_religion = safe_search(r"Religion\s*:\s*([^:\n]+?)(?=\s*(?:\([e-z]\)|$))", text)
            complainant_caste = safe_search(r"Caste\s*:\s*([^:\n]+?)(?=\s*(?:\([f-z]\)|$))", text)
            complainant_occupation = safe_search(r"Occupation\s*:\s*([^\n]*)", text)
            phone_number = safe_search(r"Phone No\.\s*:\s*(\d+)", text)
            nationality = safe_search(r"Nationality\s*:\s*([^:\n]+?)(?=\s|$)", text)
            sex = safe_search(r"Sex:\s*([^:\n]+?)(?=\s|$)", text)
            
            # Address extraction
            address_match = safe_search(r"\(k\)\s*Address\s*:\s*([\s\S]*?)(?=\([m-z]\)|Whether complainant|$)", text)
            complainant_address = None
            if address_match:
                address_text = address_match
                sex_pattern = r"\(l\)\s*Sex:\s*[A-Za-z]+\s*"
                address_text = re.sub(sex_pattern, "", address_text)
                complainant_address = " ".join(line.strip() for line in address_text.split('\n') if line.strip())
            
            # Property details
            property_details = extract_property_details(text)
            
            # Total value
            total_value_str = safe_search(r"Total Value[^:]*:?\s*(?:Rs\.?)?\s*(\d+,?\d*)", text)
            total_property_value = safe_parse_float(total_value_str)
            
            act_section = safe_search(r"Act & Section\s*:\s*([^:\n]+)", text)
            
            return FIRData(
                district=district,
                circle_subdivision=circle,
                police_station=ps,
                crime_no=crime_no,
                fir_date=fir_date,
                act_section=act_section,
                offense_date=offense_date,
                offense_time_from=time_from,
                offense_time_to=time_to,
                location=location,
                distance_ps=distance_ps,
                complainant_name=complainant_name,
                complainant_age=complainant_age,
                complainant_religion=complainant_religion,
                complainant_caste=complainant_caste,
                complainant_occupation=complainant_occupation,
                phone_number=phone_number,
                nationality=nationality,
                sex=sex,
                complainant_address=complainant_address,
                victim_details=[],
                property_details=property_details,
                total_property_value=total_property_value
            )
            
    except Exception as e:
        print(f"Error processing PDF: {str(e)}")
        return FIRData()  # Return empty FIRData object if processing fails

if __name__ == "__main__":
    # Process single FIR
    fir = extract_fir_data("G:/iisc/combined/fir_0006.pdf")
    fir.print_details()


=== FIR Details ===
District: Bengaluru City
Circle/Sub-Division: South East Traffic Sub Divisio
Police Station: Adugodi Traffic PS
Crime Number: 0001/2024
FIR Date: 02/01/2024
Act & Section: IPC 1860 (U/s-279,337)

Offense Details:
Date: 02/01/2024
Time: 00:55:00 to 01:05:00
Location: 7th cross-8th main Junction, 3rd Block,, Koramangala,Bengaluru City ,
Karnataka,
Distance from Police Station: towards south, 2 km

Complainant Details:
Name: Sri Karan Sharma
Age: 31
Religion: Hindu
Caste: BRAHMIN
Occupation: Employed in private firms
Phone: 7042234269
Nationality: India
Sex: Male
Address: #101, 1st Floor, RVR Heights, Kasavanahalli,Sar japura Main road, , Bengaluru City , Karnataka-56003 5

Property Details:
- Type: Automobile
  Item Description:
    reg_no: KA03NM3266
    make: CAR
    model: -
    engine_no: -
    chassis_no: -
  Estimated Value: Rs. 1.00
- Type: Automobile
  Item Description:
    reg_no: KA05AL3674
    make: SCOOTER
    model: -
    engine_no: -
    chassis_no: -
 

In [4]:
import pdfplumber
import re
from dataclasses import dataclass
from typing import Optional, List, Dict, Any
from datetime import datetime
import pandas as pd
import os
from pathlib import Path

def create_flat_dict(fir: FIRData) -> Dict[str, Any]:
    """Convert FIR data to a flat dictionary suitable for CSV export"""
    flat_dict = {
        'district': fir.district,
        'circle_subdivision': fir.circle_subdivision,
        'police_station': fir.police_station,
        'crime_no': fir.crime_no,
        'fir_date': fir.fir_date.strftime('%d/%m/%Y') if fir.fir_date else None,
        'act_section': fir.act_section,
        'offense_date': fir.offense_date.strftime('%d/%m/%Y') if fir.offense_date else None,
        'offense_time_from': fir.offense_time_from,
        'offense_time_to': fir.offense_time_to,
        'location': fir.location,
        'distance_ps': fir.distance_ps,
        'complainant_name': fir.complainant_name,
        'complainant_age': fir.complainant_age,
        'complainant_religion': fir.complainant_religion,
        'complainant_caste': fir.complainant_caste,
        'complainant_occupation': fir.complainant_occupation,
        'phone_number': fir.phone_number,
        'nationality': fir.nationality,
        'sex': fir.sex,
        'complainant_address': fir.complainant_address,
        'total_property_value': fir.total_property_value
    }

    # Add property details
    if fir.property_details:
        for i, prop in enumerate(fir.property_details, 1):
            prefix = f'property_{i}_'
            flat_dict[f'{prefix}type'] = prop.property_type
            flat_dict[f'{prefix}value'] = prop.estimated_value
            if prop.item_description:
                for key, value in prop.item_description.items():
                    flat_dict[f'{prefix}{key}'] = value

    if fir.victim_details:
        for i, victim in enumerate(fir.victim_details, 1):
            prefix = f'victim_{i}_'
            flat_dict[f'{prefix}name'] = victim.name
            flat_dict[f'{prefix}address'] = victim.address
            flat_dict[f'{prefix}injury_type'] = victim.injury_type
            flat_dict[f'{prefix}sex'] = victim.sex
            flat_dict[f'{prefix}age'] = victim.age
            flat_dict[f'{prefix}occupation'] = victim.occupation

    return flat_dict

def process_fir_files(input_folder: str, output_file: str):
    """
    Process all FIR PDF files in a folder and create a CSV dataset
    
    Args:
        input_folder (str): Path to folder containing FIR PDFs
        output_file (str): Path where CSV file should be saved
    """
    pdf_files = list(Path(input_folder).glob("*.pdf"))
    
    if not pdf_files:
        print(f"No PDF files found in {input_folder}")
        return
    
    print(f"Found {len(pdf_files)} PDF files to process")
    
    all_data = []
    failed_files = []
    
    for i, pdf_file in enumerate(pdf_files, 1):
        print(f"Processing file {i}/{len(pdf_files)}: {pdf_file.name}")
        try:
            fir = extract_fir_data(str(pdf_file))
            flat_data = create_flat_dict(fir)
            flat_data['file_name'] = pdf_file.name 
            all_data.append(flat_data)
        except Exception as e:
            print(f"Failed to process {pdf_file.name}: {str(e)}")
            failed_files.append(pdf_file.name)
    
    if not all_data:
        print("No data was successfully extracted")
        return
    
    df = pd.DataFrame(all_data)
    
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    
    df.to_csv(output_file, index=False, encoding='utf-8')
    
    print("\nProcessing Summary:")
    print(f"Total files processed: {len(pdf_files)}")
    print(f"Successfully processed: {len(all_data)}")
    print(f"Failed to process: {len(failed_files)}")
    
    if failed_files:
        print("\nFailed files:")
        for file in failed_files:
            print(f"- {file}")
    
    print("\nDataset Statistics:")
    print(f"Total rows: {len(df)}")
    print(f"Total columns: {len(df.columns)}")
    
    key_columns = ['district', 'police_station', 'crime_no']
    print("\nUnique values in key columns:")
    for col in key_columns:
        if col in df.columns:
            unique_count = df[col].nunique()
            print(f"{col}: {unique_count} unique values")

if __name__ == "__main__":
    input_folder = "G:/iisc/combined" 
    output_file = "G:/iisc/combined/fir_dataset1.csv"  
    
    process_fir_files(input_folder, output_file)

Found 10 PDF files to process
Processing file 1/10: fir_0001.pdf
Processing file 2/10: fir_0002.pdf
Processing file 3/10: fir_0003.pdf
Processing file 4/10: fir_0004.pdf
Processing file 5/10: fir_0005.pdf
Processing file 6/10: fir_0006.pdf
Processing file 7/10: fir_0007.pdf
Processing file 8/10: fir_0008.pdf
Processing file 9/10: fir_0009.pdf
Processing file 10/10: fir_0010.pdf

Processing Summary:
Total files processed: 10
Successfully processed: 10
Failed to process: 0

Dataset Statistics:
Total rows: 10
Total columns: 36

Unique values in key columns:
district: 3 unique values
police_station: 4 unique values
crime_no: 6 unique values
