In [2]:
!pip install spacy tqdm
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m57.2 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [5]:
import random
import json
from typing import List, Dict, Tuple, Any
from spacy.training import Example
from spacy.lang.en import English

class NERTrainer:
    def __init__(self):
        print("\n=== Initializing NER Trainer ===")

    def load_data(self, filepath: str) -> List[Dict[str, Any]]:
        """Load and validate data from JSON file."""
        print(f"Loading and validating training data from: {filepath}")
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
        # Filter valid examples and augment data if necessary
        valid_data = [item for item in data if self.validate_example(item)]
        print(f"Loaded {len(data)} examples, {len(valid_data)} valid after cleaning")
        augmented_data = self.augment_data(valid_data)
        print(f"Augmented data size: {len(augmented_data)} examples")
        return augmented_data

    def validate_example(self, example: Dict[str, Any]) -> bool:
        """Placeholder validation for a single training example."""
        return 'text' in example and 'entities' in example

    def augment_data(self, data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Placeholder for data augmentation."""
        # Placeholder implementation; in real scenarios, could add noise or synonyms
        return data

    def prepare_training_data(self, data: List[Dict], split: float = 0.2) -> Tuple[List[Dict], List[Dict]]:
        """Split data into training and validation sets"""
        print("\nPreparing training and validation splits...")
        random.shuffle(data)
        split_point = int(len(data) * (1 - split))
        train_data = data[:split_point]
        val_data = data[split_point:]
        print(f"Training examples: {len(train_data)}")
        print(f"Validation examples: {len(val_data)}")
        return train_data, val_data

    def train_model(self, train_data: List[Dict], n_iter: int = 10):
        """Train NER model"""
        nlp = English()  # Load blank model or existing one
        # Add necessary pipeline components and training loop
        print("\nTraining model...")

        # Training loop logic here
        # Placeholder for demonstration
        for i in range(n_iter):
            print(f"Iteration {i+1}/{n_iter}")
            # Simulate training process

    def evaluate_model(self, nlp, examples: List[Example]) -> Dict:
        """Evaluate model performance"""
        print("\nEvaluating model...")
        scores = {
            "ents_p": 0,
            "ents_r": 0,
            "ents_f": 0,
            "ents_per_type": {}
        }

        for example in examples:
            pred_ents = nlp(example.reference.text).ents
            gold_ents = example.reference.ents

            # Calculate true positives, false positives, and false negatives
            tp = len([ent for ent in pred_ents if any(
                gold_ent.label_ == ent.label_ and
                gold_ent.start == ent.start and
                gold_ent.end == ent.end
                for gold_ent in gold_ents
            )])
            fp = len(pred_ents) - tp
            fn = len(gold_ents) - tp

            # Calculate precision, recall, and F1
            p = tp / (tp + fp) if (tp + fp) > 0 else 0
            r = tp / (tp + fn) if (tp + fn) > 0 else 0
            f = 2 * (p * r) / (p + r) if (p + r) > 0 else 0

            scores["ents_p"] += p
            scores["ents_r"] += r
            scores["ents_f"] += f

        # Calculate averages
        n = len(examples) if len(examples) > 0 else 1
        scores["ents_p"] /= n
        scores["ents_r"] /= n
        scores["ents_f"] /= n

        print("\nEvaluation results:")
        print(f"Precision: {scores['ents_p']}")
        print(f"Recall: {scores['ents_r']}")
        print(f"F1 Score: {scores['ents_f']}")

        return scores


def main():
    print("\n=== Initializing NER Trainer ===")
    trainer = NERTrainer()

    # Load data
    data = trainer.load_data("spacy_training_data.json")

    # Split data
    train_data, val_data = trainer.prepare_training_data(data)

    # Train model
    trainer.train_model(train_data, n_iter=10)

    # Evaluate model with dummy data for demonstration
    # You would convert val_data into spaCy Example objects for real evaluation
    nlp = English()  # Replace with your trained model
    dummy_examples = []  # Placeholder for spaCy Example objects for validation data
    scores = trainer.evaluate_model(nlp, dummy_examples)

if __name__ == "__main__":
    main()



=== Initializing NER Trainer ===

=== Initializing NER Trainer ===
Loading and validating training data from: spacy_training_data.json
Loaded 17 examples, 17 valid after cleaning
Augmented data size: 17 examples

Preparing training and validation splits...
Training examples: 13
Validation examples: 4

Training model...
Iteration 1/10
Iteration 2/10
Iteration 3/10
Iteration 4/10
Iteration 5/10
Iteration 6/10
Iteration 7/10
Iteration 8/10
Iteration 9/10
Iteration 10/10

Evaluating model...

Evaluation results:
Precision: 0.0
Recall: 0.0
F1 Score: 0.0


In [3]:
import spacy
from spacy.tokens import DocBin
from spacy.training import Example
from spacy.util import minibatch, compounding
from pathlib import Path
import json
import random
from typing import List, Dict, Tuple
from tqdm import tqdm
import re

class NERTrainer:
    def __init__(self):
        print("\n=== Initializing NER Trainer ===")

    def clean_text_and_entities(self, text: str, entities: List[Tuple[int, int, str]]) -> Tuple[str, List[Tuple[int, int, str]]]:
        """Clean text and adjust entity positions"""
        # Normalize whitespace
        lines = [line.strip() for line in text.split('\n')]
        clean_text = '\n'.join(line for line in lines if line)

        # Adjust entity positions
        adjusted_entities = []
        for start, end, label in entities:
            entity_text = text[start:end].strip()
            new_start = clean_text.find(entity_text)
            if new_start != -1:
                adjusted_entities.append((
                    new_start,
                    new_start + len(entity_text),
                    label
                ))

        return clean_text, adjusted_entities

    def validate_entity(self, text: str, start: int, end: int, label: str) -> bool:
        """Validate if entity position is correct"""
        if start >= 0 and end <= len(text) and start < end:
            entity_text = text[start:end].strip()
            if label == "ACC_NO":
                # Account number should contain digits
                return bool(re.search(r'\d', entity_text))
            elif label == "PER":
                # Name should contain letters
                return bool(re.search(r'[A-Za-z]', entity_text))
        return False

    def load_and_clean_data(self, file_path: str) -> List[Dict]:
        """Load and clean training data"""
        print(f"\nLoading and cleaning data from: {file_path}")

        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        cleaned_data = []
        for item in data:
            # Clean text and adjust entities
            clean_text, adjusted_entities = self.clean_text_and_entities(
                item["text"],
                item["entities"]
            )

            # Validate entities
            valid_entities = [
                (start, end, label)
                for start, end, label in adjusted_entities
                if self.validate_entity(clean_text, start, end, label)
            ]

            if valid_entities:
                cleaned_data.append({
                    "text": clean_text,
                    "entities": valid_entities
                })

        print(f"Loaded {len(data)} examples, cleaned to {len(cleaned_data)} valid examples")
        return cleaned_data

    def augment_data(self, data: List[Dict]) -> List[Dict]:
        """Augment training data with variations"""
        print("\nAugmenting training data...")
        augmented_data = []

        for item in data:
            text = item["text"]
            entities = item["entities"]

            # Add original example
            augmented_data.append(item)

            # Add variations only for account numbers
            acc_variations = []
            for start, end, label in entities:
                if label == "ACC_NO":
                    entity_text = text[start:end]

                    # Remove all spaces
                    no_spaces = entity_text.replace(" ", "")
                    if no_spaces != entity_text:
                        new_text = text[:start] + no_spaces + text[end:]
                        acc_variations.append({
                            "text": new_text,
                            "entities": [(start, start + len(no_spaces), "ACC_NO")]
                        })

                    # Add spaces every 4 digits
                    spaced = ' '.join(re.findall('.{1,4}', no_spaces))
                    if spaced != entity_text:
                        new_text = text[:start] + spaced + text[end:]
                        acc_variations.append({
                            "text": new_text,
                            "entities": [(start, start + len(spaced), "ACC_NO")]
                        })

            # Add valid variations
            for var in acc_variations:
                if all(self.validate_entity(var["text"], start, end, label)
                      for start, end, label in var["entities"]):
                    augmented_data.append(var)

        print(f"Original examples: {len(data)}")
        print(f"Augmented examples: {len(augmented_data)}")
        return augmented_data

    def train_model(self, train_data: List[Dict], val_data: List[Dict],
                   output_dir: str, n_iter: int = 100):
        """Train the NER model"""
        print("\nStarting model training...")

        # Initialize blank English model
        nlp = spacy.blank("en")

        # Add NER pipeline
        if "ner" not in nlp.pipe_names:
            ner = nlp.add_pipe("ner")

        # Add labels
        for item in train_data:
            for _, _, label in item["entities"]:
                ner.add_label(label)

        # Training settings
        optimizer = nlp.begin_training()

        # Training loop
        for itn in tqdm(range(n_iter), desc="Training"):
            random.shuffle(train_data)
            losses = {}

            # Batch training
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                examples = []
                for text_data in batch:
                    doc = nlp.make_doc(text_data["text"])
                    example = Example.from_dict(doc, {"entities": text_data["entities"]})
                    examples.append(example)

                nlp.update(examples, drop=0.2, losses=losses)

        # Save model
        nlp.to_disk(output_dir)
        return nlp

def main():
    trainer = NERTrainer()

    # Load and clean data
    data = trainer.load_and_clean_data("spacy_training_data.json")

    # Augment data
    data = trainer.augment_data(data)

    # Split data
    random.shuffle(data)
    split = int(len(data) * 0.8)
    train_data = data[:split]
    val_data = data[split:]

    print(f"\nTraining examples: {len(train_data)}")
    print(f"Validation examples: {len(val_data)}")

    # Create output directory
    output_dir = "trained_model"
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Train model
    trainer.train_model(train_data, val_data, output_dir)
    print(f"\nModel saved in: {output_dir}/")

if __name__ == "__main__":
    main()


=== Initializing NER Trainer ===

Loading and cleaning data from: spacy_training_data.json
Loaded 17 examples, cleaned to 17 valid examples

Augmenting training data...
Original examples: 17
Augmented examples: 32

Training examples: 25
Validation examples: 7

Starting model training...


Search
Account Number
634301509..." with entities "[(41, 53, 'ACC_NO'), (61, 80, 'PER')]". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
Training: 100%|██████████| 100/100 [01:07<00:00,  1.48it/s]


Model saved in: trained_model/





In [2]:
!pip install spacy-lookups-data

Collecting spacy-lookups-data
  Downloading spacy_lookups_data-1.0.5-py2.py3-none-any.whl.metadata (4.8 kB)
Downloading spacy_lookups_data-1.0.5-py2.py3-none-any.whl (98.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 MB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: spacy-lookups-data
Successfully installed spacy-lookups-data-1.0.5


In [13]:
import spacy
import fitz  # PyMuPDF
from pathlib import Path
import json
from typing import Dict, List, Any
import re

class BankStatementExtractor:
    def __init__(self, model_path: str):
        print(f"\n=== Loading NER model from {model_path} ===")
        self.nlp = spacy.load(model_path)
        print("✓ Model loaded successfully")

    def extract_text(self, pdf_path: str) -> str:
        """Extract text from PDF"""
        print(f"\nExtracting text from: {pdf_path}")

        try:
            with fitz.open(pdf_path) as doc:
                text = ""
                for page in doc:
                    text += page.get_text()

                # Clean text
                text = re.sub(r'\s+', ' ', text)
                text = text.strip()
                print(f"✓ Extracted {len(text)} characters")
                return text
        except Exception as e:
            print(f"❌ Error extracting text: {str(e)}")
            return ""

    def process_statement(self, pdf_path: str) -> Dict[str, Any]:
        """Process a single bank statement"""
        # Extract text
        text = self.extract_text(pdf_path)
        if not text:
            return {"error": "Failed to extract text"}

        print("\nProcessing text with NER model...")
        doc = self.nlp(text)

        # Extract entities
        entities = []
        for ent in doc.ents:
            entities.append({
                "text": ent.text,
                "label": ent.label_,
                "start": ent.start_char,
                "end": ent.end_char
            })

        # Group by entity type
        results = {
            "account_numbers": [],
            "names": [],
            "raw_entities": entities
        }

        for ent in entities:
            if ent["label"] == "ACC_NO":
                results["account_numbers"].append(ent["text"])
            elif ent["label"] == "PER":
                results["names"].append(ent["text"])

        print("\nExtracted Entities:")
        print(f"Account Numbers: {results['account_numbers']}")
        print(f"Names: {results['names']}")

        return results

def test_single_pdf(model_path: str, pdf_path: str) -> None:
    """Test model on a single PDF"""
    extractor = BankStatementExtractor(model_path)
    results = extractor.process_statement(pdf_path)

    # Save results
    output_file = Path(pdf_path).stem + "_extraction.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"\nResults saved to: {output_file}")

def main():
    # Paths
    model_path = "trained_model"

    # Test single PDF
    pdf_path = "/content/Axis 1-12-22 to 30-11-23.pdf"
    test_single_pdf(model_path, pdf_path)

if __name__ == "__main__":
    main()



=== Loading NER model from trained_model ===
✓ Model loaded successfully

Extracting text from: /content/Axis 1-12-22 to 30-11-23.pdf
✓ Extracted 133403 characters

Processing text with NER model...

Extracted Entities:
Account Numbers: ['940810948', '922020023522300', '76813', '029979', '001398', '002625', '76823', '76827', '76831', '76826', '76842', '022463', '76849', '268304.54', '002706', '328132.79', '5900.00', '5900.00', '76863', '031148', '933510.57', '557030.09', '022626', '99067.29', '9440.00', '001604', '002781', '002798', '97896', '97897', '266508', '97898', '97900', '97899', '97901', '97902', '97908', '97907', '97909', '97912', '733017.77', '576716.84', '97914', '97913', '111332', '091642', '25724.00', '5900.00', '97917', '97916', '97911', '140959', '940094.81', '97918', '501998.33', '500529.33', '505409.33', '319007.33', '319757.33', '97920', '97921', '97923', '002905', '512995', '01930419229167', '035062969229001', '54402.72', '091747', '040488', '594194', '001694', '211

In [11]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading PyMuPDF-1.24.13-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading PyMuPDF-1.24.13-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (19.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m46.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.24.13


In [15]:
def find_best_matches(results: dict) -> dict:
    """Find the most likely primary account number and account holder name"""

    def score_account_number(acc_no: str) -> float:
        """Score account numbers based on characteristics of typical bank account numbers"""
        score = 0.0

        # Skip if not a string
        if not isinstance(acc_no, str):
            return 0.0

        # Length (most bank account numbers are between 10-16 digits)
        clean_acc = acc_no.replace(" ", "").replace("CR", "").replace("DR", "")
        if 10 <= len(clean_acc) <= 16:
            score += 3.0

        # Contains only digits and spaces
        if all(c.isdigit() or c.isspace() for c in clean_acc):
            score += 2.0

        # Not a transaction amount (doesn't contain decimal point)
        if "." not in acc_no:
            score += 2.0

        # Not too short
        if len(clean_acc) >= 8:
            score += 1.0

        # Appears early in the document
        for entity in results.get("raw_entities", []):
            if entity["text"] == acc_no and entity["label"] == "ACC_NO":
                if entity["start"] < 1000:  # Early in document
                    score += 2.0
                break

        return score

    def score_name(name: str) -> float:
        """Score names based on characteristics of typical account holder names"""
        score = 0.0

        # Skip if not a string
        if not isinstance(name, str):
            return 0.0

        # Not a transaction description
        if not any(x in name.upper() for x in ["RTGS", "NEFT", "UPI", "CR", "DR", "/", "SUPPLIE", "HUB"]):
            score += 3.0

        # Contains multiple words (typical for full names)
        if len(name.split()) >= 2:
            score += 2.0

        # Proper case (either Title case or UPPER case)
        if name.istitle() or name.isupper():
            score += 1.0

        # Not too long (typical names are 2-4 words)
        if len(name.split()) <= 4:
            score += 1.0

        # Appears early in document
        for entity in results.get("raw_entities", []):
            if entity["text"] == name and entity["label"] == "PER":
                if entity["start"] < 1000:  # Early in document
                    score += 2.0
                break

        return score

    # Get lists from results
    account_numbers = results.get("account_numbers", [])
    names = results.get("names", [])

    # Score and sort account numbers
    acc_numbers = [(acc, score_account_number(acc))
                  for acc in account_numbers]
    acc_numbers = [(acc, score) for acc, score in acc_numbers if score > 0]
    acc_numbers.sort(key=lambda x: x[1], reverse=True)

    # Score and sort names
    names = [(name, score_name(name))
            for name in names]
    names = [(name, score) for name, score in names if score > 0]
    names.sort(key=lambda x: x[1], reverse=True)

    best_matches = {
        "best_account_number": {
            "text": acc_numbers[0][0] if acc_numbers else None,
            "confidence": acc_numbers[0][1] if acc_numbers else 0
        },
        "best_name": {
            "text": names[0][0] if names else None,
            "confidence": names[0][1] if names else 0
        },
        "all_scored_accounts": acc_numbers[:5],  # Top 5 matches
        "all_scored_names": names[:5]  # Top 5 matches
    }

    return best_matches

# Example usage:
with open("/content/Axis 1-12-22 to 30-11-23_extraction.json", "r") as f:
    results = json.load(f)

# Process the results directly (no need for indexing)
best_matches = find_best_matches(results)  # Remove the [0]

print("\nBest Matches:")
print(f"Account Number: {best_matches['best_account_number']['text']}")
print(f"Confidence: {best_matches['best_account_number']['confidence']:.2f}")
print(f"\nAccount Holder: {best_matches['best_name']['text']}")
print(f"Confidence: {best_matches['best_name']['confidence']:.2f}")

print("\nTop 5 Account Numbers:")
for acc, score in best_matches['all_scored_accounts']:
    print(f"- {acc} (score: {score:.2f})")

print("\nTop 5 Names:")
for name, score in best_matches['all_scored_names']:
    print(f"- {name} (score: {score:.2f})")


Best Matches:
Account Number: 922020023522300
Confidence: 10.00

Account Holder: S AMC TRADERS
Confidence: 7.00

Top 5 Account Numbers:
- 922020023522300 (score: 10.00)
- 922020023522300 (score: 10.00)
- 01930419229167 (score: 8.00)
- 035062969229001 (score: 8.00)
- 33329229713 (score: 8.00)

Top 5 Names:
- S AMC TRADERS (score: 7.00)
- S AMC TRADERS (score: 7.00)
- S NAIM TOOLS (score: 7.00)
- S NAIM TOOLS (score: 7.00)
- S CUTTIN 509689.00 (score: 7.00)


In [22]:
import spacy
import fitz  # PyMuPDF
from pathlib import Path
import json
import re
from typing import Dict, List, Any
from datetime import datetime
import pandas as pd
from tqdm import tqdm

class BankStatementProcessor:
    def __init__(self, model_path: str = "trained_model"):
        print("\n=== Initializing Bank Statement Processor ===")
        self.nlp = spacy.load(model_path)

    def process_directory(self, input_dir: str, output_dir: str = "results"):
        Path(output_dir).mkdir(parents=True, exist_ok=True)

        # Get all PDF files, excluding macOS hidden files
        pdf_files = [
            f for f in Path(input_dir).glob("**/*.pdf")
            if not f.name.startswith("._")
        ]

        print(f"\nFound {len(pdf_files)} valid PDF files to process")

        results = []
        for pdf_path in tqdm(pdf_files, desc="Processing PDFs"):
            try:
                # Skip if file can't be opened (encrypted)
                try:
                    doc = fitz.open(pdf_path)
                    doc.close()
                except:
                    print(f"\nSkipping encrypted/invalid PDF: {pdf_path.name}")
                    continue

                result = self.process_single_pdf(pdf_path)
                result["file_name"] = pdf_path.name
                results.append(result)

            except Exception as e:
                print(f"\nError processing {pdf_path.name}: {str(e)}")
                results.append({
                    "file_name": pdf_path.name,
                    "error": str(e)
                })

        self.save_results(results, output_dir)
        return results

    def process_single_pdf(self, pdf_path: str) -> Dict:
        """Process a single PDF file"""
        # Extract text
        text = self.extract_text(pdf_path)

        # Extract entities
        entities = self.extract_entities(text)

        # Find best matches
        best_matches = self.find_best_matches(entities)

        return {
            "best_matches": best_matches,
            "all_entities": entities,
            "processed_at": datetime.now().isoformat()
        }

    def extract_text(self, pdf_path: str) -> str:
        """Extract text from PDF"""
        text = ""
        with fitz.open(pdf_path) as doc:
            for page in doc:
                text += page.get_text()

        return self.clean_text(text)

    def clean_text(self, text: str) -> str:
        """Clean extracted text"""
        # Normalize whitespace
        text = re.sub(r'\s+', ' ', text)

        # Remove special characters but keep essential ones
        text = re.sub(r'[^\w\s\-.,:/]', '', text)

        return text.strip()

    def extract_entities(self, text: str) -> Dict:
        """Extract entities using spaCy model"""
        doc = self.nlp(text)

        entities = {
            "account_numbers": [],
            "names": [],
            "raw_entities": []
        }

        for ent in doc.ents:
            entities["raw_entities"].append({
                "text": ent.text,
                "label": ent.label_,
                "start": ent.start_char,
                "end": ent.end_char
            })

            if ent.label_ == "ACC_NO":
                entities["account_numbers"].append(ent.text)
            elif ent.label_ == "PER":
                entities["names"].append(ent.text)

        return entities

    def find_best_matches(self, entities: Dict) -> Dict:
        """Find best account number and name matches"""
        def score_account_number(acc_no: str) -> float:
            score = 0.0
            clean_acc = acc_no.replace(" ", "").replace("CR", "").replace("DR", "")

            # Length check
            if 10 <= len(clean_acc) <= 16:
                score += 3.0

            # Digit check
            if all(c.isdigit() or c.isspace() for c in clean_acc):
                score += 2.0

            # Not a transaction amount
            if "." not in acc_no:
                score += 2.0

            return score

        def score_name(name: str) -> float:
            score = 0.0

            # Not a transaction description
            if not any(x in name.upper() for x in ["RTGS", "NEFT", "UPI", "CR", "DR", "/", "SUPPLIE", "HUB"]):
                score += 3.0

            # Multiple words
            if len(name.split()) >= 2:
                score += 2.0

            # Proper case
            if name.istitle() or name.isupper():
                score += 1.0

            return score

        # Score and sort
        acc_numbers = [(acc, score_account_number(acc))
                      for acc in entities["account_numbers"]]
        acc_numbers = [(acc, score) for acc, score in acc_numbers if score > 0]
        acc_numbers.sort(key=lambda x: x[1], reverse=True)

        names = [(name, score_name(name))
                for name in entities["names"]]
        names = [(name, score) for name, score in names if score > 0]
        names.sort(key=lambda x: x[1], reverse=True)

        return {
            "account_number": {
                "text": acc_numbers[0][0] if acc_numbers else None,
                "confidence": acc_numbers[0][1] if acc_numbers else 0
            },
            "account_holder": {
                "text": names[0][0] if names else None,
                "confidence": names[0][1] if names else 0
            }
        }

    def save_results(self, results: List[Dict], output_dir: str):
        """Save results in multiple formats"""
        # Save detailed JSON
        json_path = Path(output_dir) / "detailed_results.json"
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2, ensure_ascii=False)

        # Create simplified CSV
        csv_data = []
        for result in results:
            if "error" in result:
                csv_data.append({
                    "file_name": result["file_name"],
                    "account_number": "ERROR",
                    "account_holder": "ERROR",
                    "error": result["error"]
                })
            else:
                csv_data.append({
                    "file_name": result["file_name"],
                    "account_number": result["best_matches"]["account_number"]["text"],
                    "account_holder": result["best_matches"]["account_holder"]["text"],
                    "account_number_confidence": result["best_matches"]["account_number"]["confidence"],
                    "account_holder_confidence": result["best_matches"]["account_holder"]["confidence"]
                })

        # Save CSV
        df = pd.DataFrame(csv_data)
        csv_path = Path(output_dir) / "extracted_data.csv"
        df.to_csv(csv_path, index=False)

        print(f"\nResults saved to:")
        print(f"- Detailed JSON: {json_path}")
        print(f"- CSV Summary: {csv_path}")

def test_single_pdf(model_path: str, pdf_path: str):
    """Test the model on a single PDF"""
    print(f"\n=== Testing Single PDF: {Path(pdf_path).name} ===")

    processor = BankStatementProcessor(model_path)

    try:
        # Extract text first
        text = processor.extract_text(pdf_path)
        print("\nExtracted Text (first 500 characters):")
        print("="*80)
        print(text[:500])
        print("="*80)
        print(f"\nTotal text length: {len(text)} characters")

        # Process the text
        result = processor.process_single_pdf(pdf_path)

        print("\nExtracted Information:")
        print(f"Account Number: {result['best_matches']['account_number']['text']}")
        print(f"Confidence: {result['best_matches']['account_number']['confidence']:.2f}")
        print(f"\nAccount Holder: {result['best_matches']['account_holder']['text']}")
        print(f"Confidence: {result['best_matches']['account_holder']['confidence']:.2f}")

        print("\nAll Found Account Numbers with Context:")
        for entity in result['all_entities']['raw_entities']:
            if entity['label'] == 'ACC_NO':
                start = max(0, entity['start'] - 30)
                end = min(len(text), entity['end'] + 30)
                context = text[start:end]
                print(f"\n- {entity['text']}")
                print(f"  Context: ...{context}...")

        print("\nAll Found Names with Context:")
        for entity in result['all_entities']['raw_entities']:
            if entity['label'] == 'PER':
                start = max(0, entity['start'] - 30)
                end = min(len(text), entity['end'] + 30)
                context = text[start:end]
                print(f"\n- {entity['text']}")
                print(f"  Context: ...{context}...")

        return result

    except Exception as e:
        print(f"Error processing PDF: {str(e)}")
        return None

def main():
    # Configuration
    input_dir = "/content/SHORT_PAGES_BANK_STATEMENTS -2"
    output_dir = "results"
    model_path = "trained_model"

    # Ask user what to do
    print("\n1. Process entire directory")
    print("2. Test single PDF")
    choice = input("\nEnter your choice (1 or 2): ")

    if choice == "1":
        # Process directory
        processor = BankStatementProcessor(model_path)
        results = processor.process_directory(input_dir, output_dir)

        # Print summary
        print("\nProcessing Summary:")
        total = len(results)
        errors = sum(1 for r in results if "error" in r)
        successful = total - errors

        print(f"Total files processed: {total}")
        print(f"Successful: {successful} ({successful/total*100:.1f}%)")
        print(f"Errors: {errors} ({errors/total*100:.1f}%)")

    elif choice == "2":
        # Test single PDF
        pdf_path = "/content/aiyaz jupiter.pdf"
        test_single_pdf(model_path, pdf_path)

    else:
        print("Invalid choice!")

if __name__ == "__main__":
    main()


1. Process entire directory
2. Test single PDF

Enter your choice (1 or 2): 2

=== Testing Single PDF: aiyaz jupiter.pdf ===

=== Initializing Bank Statement Processor ===

Extracted Text (first 500 characters):
247 Phone Banking 1800 425 1199 1800 420 1199 www.federal.co.in Name AIYAZ ANWAR QURESHI Branch Name Fintech Partnerships Jupiter Communication Address plot no 1/5/35, gajanan colony, govandi, abdul hamid marg, Mumbai, Maharashtra, 400043 Branch Address Federal Bank Ltd, Fintech Partnerships Department Integrated Startup Complex, KSUM, HMT Colony, Kalamassery, Ernakulam, Kerala - 683503 Address Last Updated On 17/01/2022 Branch Sol ID 7777 Registered Mobile Number 917738810525 Account Number 777

Total text length: 33149 characters

Extracted Information:
Account Number: 917738810525
Confidence: 7.00

Account Holder: None
Confidence: 0.00

All Found Account Numbers with Context:

- 917738810525
  Context: ...7777 Registered Mobile Number 917738810525 Account Number 77770106311

In [16]:
import zipfile
import os

# Path to the ZIP file
zip_path = '/content/SHORT_PAGES_BANK_STATEMENTS -2.zip'

# Destination directory for extracted files
extract_to = '/content/SHORT_PAGES_BANK_STATEMENTS -2'

# Extract the ZIP file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_to)

print(f"Files extracted to: {extract_to}")

# List all files to verify
for root, dirs, files in os.walk(extract_to):
    for file in files:
        print(os.path.join(root, file))


Files extracted to: /content/SHORT_PAGES_BANK_STATEMENTS -2
/content/SHORT_PAGES_BANK_STATEMENTS -2/SHORT_PAGES_BANK_STATEMENTS -2/KOTAK_BANK/CPSPM_40108290_1704838422.PDF
/content/SHORT_PAGES_BANK_STATEMENTS -2/SHORT_PAGES_BANK_STATEMENTS -2/Canara_BANK/BCCB-1 (1).pdf
/content/SHORT_PAGES_BANK_STATEMENTS -2/SHORT_PAGES_BANK_STATEMENTS -2/Canara_BANK/Canara 2 format .pdf
/content/SHORT_PAGES_BANK_STATEMENTS -2/SHORT_PAGES_BANK_STATEMENTS -2/TJSB_BANK/Copy of Copy of TJSB.pdf
/content/SHORT_PAGES_BANK_STATEMENTS -2/SHORT_PAGES_BANK_STATEMENTS -2/BHARAT_BALAJI_BANK/AccountStatement_Evolve - 2023-12-27T134118.679.pdf
/content/SHORT_PAGES_BANK_STATEMENTS -2/SHORT_PAGES_BANK_STATEMENTS -2/BHARAT_BALAJI_BANK/Bharat_Balaji_1Jan24_23Feb24.pdf
/content/SHORT_PAGES_BANK_STATEMENTS -2/SHORT_PAGES_BANK_STATEMENTS -2/PNB_BANK/Copy of pnbbank.pdf
/content/SHORT_PAGES_BANK_STATEMENTS -2/SHORT_PAGES_BANK_STATEMENTS -2/YES_BANK/yesbank.pdf
/content/SHORT_PAGES_BANK_STATEMENTS -2/SHORT_PAGES_BANK_STATEM