<a href="https://colab.research.google.com/github/RUTUPARNk/Practised/blob/main/Trading_OCr.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install kagglehub




🔍 Exploring dataset structure...
📁 test: 16 files


In [3]:
!df -h

Filesystem                                                                                                                    Size  Used Avail Use% Mounted on
overlay                                                                                                                       108G   47G   62G  44% /
tmpfs                                                                                                                          64M     0   64M   0% /dev
shm                                                                                                                           5.8G     0  5.8G   0% /dev/shm
/dev/root                                                                                                                     2.0G  1.2G  750M  62% /usr/sbin/docker-init
/dev/sda1                                                                                                                      73G   52G   22G  71% /kaggle/input
tmpfs                                            

In [4]:
!git clone https://github.com/PaddlePaddle/PaddleOCR.git

fatal: destination path 'PaddleOCR' already exists and is not an empty directory.


In [22]:
!cd PaddleOCR && pip install -r requirements.txt




In [23]:
import os
import json
from PIL import Image
from transformers import pipeline
import requests

In [6]:
# EasyOCR Pipeline - Process only 10 TIFF images
!pip install easyocr

import easyocr
import os
import re
import cv2
import numpy as np
from PIL import Image
from transformers import pipeline

class EasyOCRInvoiceProcessor:
    def __init__(self):
        print("🔄 Initializing EasyOCR...")
        self.reader = easyocr.Reader(['en'])
        self.parser = pipeline("text2text-generation", model="google/flan-t5-base")
        print("✅ EasyOCR ready!")

    def extract_text(self, image_path):
        try:
            # Run OCR with optimized settings for invoices
            results = self.reader.readtext(
                image_path,
                detail=1,
                paragraph=False,
                min_size=10,
                contrast_ths=0.1,
                adjust_contrast=0.3
            )

            if not results:
                return None

            # Extract only high-confidence text
            high_confidence_text = ""
            for bbox, text, confidence in results:
                if confidence > 0.4:  # Only keep decent confidence detections
                    high_confidence_text += text + " "

            return high_confidence_text.strip() if high_confidence_text else None

        except Exception as e:
            print(f"  ❌ OCR Error: {e}")
            return None

    def extract_total(self, text):
        if not text:
            return None

        # Improved regex patterns for invoice totals
        patterns = [
            r'total[\s:]*[\$]?[\s]*([\d,]+\.?\d{2})',
            r'grand[\s]*total[\s:]*[\$]?[\s]*([\d,]+\.?\d{2})',
            r'amount[\s:]*due[\s:]*[\$]?[\s]*([\d,]+\.?\d{2})',
            r'balance[\s:]*due[\s:]*[\$]?[\s]*([\d,]+\.?\d{2})',
            r'\$[\s]*([\d,]+\.?\d{2})',
            r'total.*?(\d{1,3}(?:,\d{3})*\.\d{2})',
        ]

        all_matches = []
        for pattern in patterns:
            matches = re.findall(pattern, text.lower())
            for match in matches:
                try:
                    amount = float(match.replace(',', ''))
                    all_matches.append(amount)
                except:
                    continue

        if all_matches:
            # Take the largest amount (usually the total)
            return max(all_matches)

        return None

# ========== MAIN PROCESSING - ONLY 10 FILES ==========

print("🚀 Starting EasyOCR Invoice Processing (10 files only)")
print("=" * 60)

# Find directory
invoice_dir = None
possible_paths = [
    "/kaggle/input/the-rvlcdip-dataset-test/test/invoice",
    "/kaggle/input/the-rvlcdip-dataset-test/invoice",
    "/kaggle/input/the-rvlcdip-dataset-test/test",
    "/kaggle/input/the-rvlcdip-dataset-test",
]

for path in possible_paths:
    if os.path.exists(path):
        invoice_dir = path
        print(f"✅ Found directory: {path}")
        break

if not invoice_dir:
    print("❌ Directory not found")
    exit()

# Get ONLY .tif files and take FIRST 10
tif_files = [f for f in os.listdir(invoice_dir) if f.lower().endswith('.tif')]
print(f"📁 Found {len(tif_files)} TIFF files total")
print(f"🔬 Processing FIRST 10 files only")

# Take only first 10 files
files_to_process = tif_files[:10]
print(f"📋 Files to process: {files_to_process}")

# ========== PROCESS THE 10 FILES ==========

processor = EasyOCRInvoiceProcessor()

print(f"\n🎯 PROCESSING {len(files_to_process)} FILES:")
print("=" * 60)

results = []

for i, filename in enumerate(files_to_process, 1):
    image_path = os.path.join(invoice_dir, filename)

    print(f"\n{i}. 📄 {filename}")
    print("-" * 40)

    try:
        # Quick image info
        with Image.open(image_path) as img:
            print(f"   Size: {img.size}px, Mode: {img.mode}")

        # Extract text
        text = processor.extract_text(image_path)

        if text:
            # Show first 100 chars of extracted text
            print(f"   📝 Text: {text[:100]}...")

            # Extract total
            total = processor.extract_total(text)

            if total:
                print(f"   💰 TOTAL: ${total:.2f}")
                results.append((filename, total, "SUCCESS"))
            else:
                print(f"   ❌ No total found in text")
                results.append((filename, None, "NO TOTAL"))
        else:
            print(f"   🚫 No text extracted")
            results.append((filename, None, "NO TEXT"))

    except Exception as e:
        print(f"   💥 ERROR: {e}")
        results.append((filename, None, f"ERROR: {e}"))

# ========== FINAL SUMMARY ==========

print(f"\n" + "=" * 60)
print("📊 FINAL RESULTS (10 files)")
print("=" * 60)

successful = len([r for r in results if r[2] == "SUCCESS"])
no_total = len([r for r in results if r[2] == "NO TOTAL"])
no_text = len([r for r in results if r[2] == "NO TEXT"])
errors = len([r for r in results if "ERROR" in r[2]])

print(f"✅ Successful: {successful}/10")
print(f"❌ No total found: {no_total}/10")
print(f"🚫 No text: {no_text}/10")
print(f"💥 Errors: {errors}/10")

if successful > 0:
    success_rate = (successful / 10) * 100
    print(f"🎯 Success rate: {success_rate:.0f}%")

    print(f"\n💰 EXTRACTED TOTALS:")
    for filename, total, status in results:
        if status == "SUCCESS":
            print(f"   ✅ {filename}: ${total:.2f}")

print("=" * 60)

# Save simple results
with open('invoice_totals_10_files.txt', 'w') as f:
    f.write("Invoice Totals - 10 Files\n")
    f.write("=" * 30 + "\n")
    for filename, total, status in results:
        if status == "SUCCESS":
            f.write(f"✅ {filename}: ${total:.2f}\n")
        else:
            f.write(f"❌ {filename}: {status}\n")

print("💾 Results saved to 'invoice_totals_10_files.txt'")
print("✅ Processing complete!")





🚀 Starting EasyOCR Invoice Processing (10 files only)
✅ Found directory: /kaggle/input/the-rvlcdip-dataset-test/test/invoice
📁 Found 2477 TIFF files total
🔬 Processing FIRST 10 files only
📋 Files to process: ['518254491+-4510.tif', '00555621.tif', '518494145+-4146.tif', '2028724139.tif', '83554925.tif', '2024526337.tif', '2028697451.tif', '03724982.tif', '2063178328.tif', '2028748016.tif']
🔄 Initializing EasyOCR...


Device set to use cpu


✅ EasyOCR ready!

🎯 PROCESSING 10 FILES:

1. 📄 518254491+-4510.tif
----------------------------------------
   Size: (754, 1000)px, Mode: L




   📝 Text: vly RANSHEIER SPELLHAN Professional Corporation One Capitol Street P 0 Box 600 Concord _ NH 03302-06...
   💰 TOTAL: $775.45

2. 📄 00555621.tif
----------------------------------------
   Size: (772, 1000)px, Mode: L




   📝 Text: VA_ 22046 Invoice No.21992 893-5400 70 Dec 04 INVOICE Date: To: The Tobacco Institute Atto: Mrs Kitt...
   💰 TOTAL: $848364.47

3. 📄 518494145+-4146.tif
----------------------------------------
   Size: (754, 1000)px, Mode: L




   📝 Text: 2270400 10/03/96 277-0009 40-01 Box 2959, Winctan-Salom; NC 27102 INVOICE Pooge (8001 862-4338 Roler...
   ❌ No total found in text

4. 📄 2028724139.tif
----------------------------------------
   Size: (777, 1000)px, Mode: L




   📝 Text: 3 3 3 1 3 3 ; 9 1 h 1 8 8 V 6 1 3 8 0 li 1 1 8 Fa 2 4 8 1 1 0 1 8 4 #H 4 2 3 J # 1 aeqister 1...
   ❌ No total found in text

5. 📄 83554925.tif
----------------------------------------
   Size: (754, 1000)px, Mode: L




   📝 Text: IEMPO I Remittance copy Invoice ForM With Paymen DATE INVOICE NO NEWSMONITORING NETWORE 5/5/97 5709 ...
   ❌ No total found in text

6. 📄 2024526337.tif
----------------------------------------
   Size: (777, 1000)px, Mode: L




   📝 Text: i201 PENNSYLVANIC AVENUE; #ashington 20 EMPLOYER identification No 10394 8841 STATEMENT PHILIP MQBRI...
   ❌ No total found in text

7. 📄 2028697451.tif
----------------------------------------
   Size: (777, 1000)px, Mode: L




   📝 Text: USINE (FRANCE) UCEE420 558 F PP PLZ 781 3302 RRANCE 570 24/01/91 24/01/91 39021OOOOOOON 10 1330 N BE...
   💰 TOTAL: $108.00

8. 📄 03724982.tif
----------------------------------------
   Size: (766, 1000)px, Mode: L




   📝 Text: CovINGTON BURLING 20006 (202] 5 7 ACCOUNT No, 11,486 DATE July 9 , 1980 AMERICAN BRANDS IGCORPORATED...
   ❌ No total found in text

9. 📄 2063178328.tif
----------------------------------------
   Size: (754, 1000)px, Mode: L




   📝 Text: FoR ACCOUNTING USE ONLY PHILIP MORRIS INCORPORATED V0 U CHER (omit if No SPECIFIC DATE REQUIRED DATE...
   ❌ No total found in text

10. 📄 2028748016.tif
----------------------------------------
   Size: (777, 1000)px, Mode: L




   📝 Text: FAKTUUR FSANDVIK 940117 132363 26355000 Buhande d door FE Franc ECCERHONT Heceir , MAIL registe" Fak...
   ❌ No total found in text

📊 FINAL RESULTS (10 files)
✅ Successful: 3/10
❌ No total found: 7/10
🚫 No text: 0/10
💥 Errors: 0/10
🎯 Success rate: 30%

💰 EXTRACTED TOTALS:
   ✅ 518254491+-4510.tif: $775.45
   ✅ 00555621.tif: $848364.47
   ✅ 2028697451.tif: $108.00
💾 Results saved to 'invoice_totals_10_files.txt'
✅ Processing complete!
