<a href="https://colab.research.google.com/github/RUTUPARNk/Practised/blob/main/vllm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("pdavpoojan/the-rvlcdip-dataset-test")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/pdavpoojan/the-rvlcdip-dataset-test?dataset_version_number=1...


100%|██████████| 3.62G/3.62G [00:44<00:00, 86.6MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/pdavpoojan/the-rvlcdip-dataset-test/versions/1


In [11]:
# Test Groq connection separately
from groq import Groq

def test_groq_connection():
    client = Groq(api_key="gsk34343")  # Make sure this is correct

    test_prompt = "What is 2+2? Answer with just the number."

    try:
        response = client.chat.completions.create(
            messages=[{"role": "user", "content": test_prompt}],
            model="llama-3.1-8b-instant",
            temperature=0.1,
            max_tokens=10
        )
        print(f"✅ Groq Test Response: '{response.choices[0].message.content}'")
        return True
    except Exception as e:
        print(f"❌ Groq Test Failed: {e}")
        return False

# Run this test first
test_groq_connection()

✅ Groq Test Response: '4'


True

In [14]:
# FINAL WORKING Groq Invoice Processor
!pip install easyocr groq wandb
import groq
import os
import re
import wandb
from datetime import datetime
from PIL import Image

class WorkingGroqInvoiceProcessor:
    def __init__(self):
        print("🔄 Initializing EasyOCR and Groq Client...")
        import easyocr
        self.reader = easyocr.Reader(['en'])

        # Use the working model from your test
        self.client = groq.Groq(api_key="gsk_")  # Your working key
        print("✅ Groq Client ready with llama-3.1-8b-instant!")

    def extract_text(self, image_path):
        """Extract text using EasyOCR"""
        try:
            results = self.reader.readtext(image_path, detail=1, paragraph=False)

            if not results:
                return None

            # Combine all text with confidence
            all_text = " ".join([text for _, text, conf in results if conf > 0.3])
            return all_text.strip() if all_text else None

        except Exception as e:
            print(f"  ❌ OCR Error: {e}")
            return None

    def extract_total_with_groq(self, text):
        """Use the working Groq model to extract total amount"""
        if not text:
            return None

        # Clean and limit text to avoid token limits
        clean_text = ' '.join(text.split()[:150])  # Limit tokens

        prompt = f"""
        EXTRACT THE TOTAL AMOUNT FROM THIS INVOICE TEXT.
        RETURN ONLY THE NUMERIC VALUE, NO EXPLANATION, NO CURRENCY SYMBOLS.

        INVOICE TEXT: "{clean_text}"

        IMPORTANT:
        - Look for patterns like "Total", "Amount Due", "Balance", "Grand Total"
        - Return only the number like 123.45
        - If no total found, return 0

        NUMBER:
        """

        try:
            print(f"  🤖 Calling Groq API...")
            response = self.client.chat.completions.create(
                messages=[{"role": "user", "content": prompt}],
                model="llama-3.1-8b-instant",  # Use the working model
                temperature=0.0,  # Zero temperature for consistent output
                max_tokens=15,
                timeout=30
            )

            result_text = response.choices[0].message.content.strip()
            print(f"  📨 Groq Raw Response: '{result_text}'")

            # Extract number from response
            numbers = re.findall(r'\d+[.,]?\d*', result_text)
            if numbers:
                # Clean the number (handle commas, etc.)
                clean_num = numbers[0].replace(',', '').replace(' ', '')
                try:
                    amount = float(clean_num)
                    if amount > 0:
                        print(f"  ✅ Groq Success: ${amount:.2f}")
                        return amount
                except ValueError:
                    pass

            print(f"  ❌ Groq failed to extract valid number")
            return None

        except Exception as e:
            print(f"  💥 Groq API Error: {e}")
            return None

    def extract_total_with_regex(self, text):
        """Improved regex patterns as fallback"""
        if not text:
            return None

        patterns = [
            r'total[\s:]*[\$€£]?[\s]*([\d,]+\.?\d{2})',
            r'grand[\s]*total[\s:]*[\$€£]?[\s]*([\d,]+\.?\d{2})',
            r'amount[\s:]*due[\s:]*[\$€£]?[\s]*([\d,]+\.?\d{2})',
            r'balance[\s:]*due[\s:]*[\$€£]?[\s]*([\d,]+\.?\d{2})',
            r'final[\s]*amount[\s:]*[\$€£]?[\s]*([\d,]+\.?\d{2})',
            r'total[\s]*amount[\s:]*[\$€£]?[\s]*([\d,]+\.?\d{2})',
            r'\$[\s]*([\d,]+\.?\d{2})',
        ]

        all_matches = []
        for pattern in patterns:
            matches = re.findall(pattern, text.lower())
            for match in matches:
                try:
                    amount = float(match.replace(',', ''))
                    if 0.01 <= amount <= 999999:  # Reasonable invoice range
                        all_matches.append(amount)
                        print(f"    🔍 Regex matched: ${amount:.2f}")
                except:
                    continue

        return max(all_matches) if all_matches else None

    def extract_total(self, text):
        """Try Groq first, then regex fallback"""
        if not text:
            return None

        print(f"  🔍 Analyzing text with Groq...")

        # Try Groq first
        groq_result = self.extract_total_with_groq(text)
        if groq_result:
            return groq_result

        # Fallback to regex
        print(f"  🔄 Groq failed, trying regex...")
        regex_result = self.extract_total_with_regex(text)
        if regex_result:
            return regex_result

        return None

# ========== W&B SETUP ==========
print("📊 Initializing Weights & Biases...")
wandb.init(
    project="invoice-groq-demo",
    name=f"groq-working-{datetime.now().strftime('%H%M')}",
    config={
        "ocr_engine": "EasyOCR",
        "llm_provider": "Groq Cloud",
        "llm_model": "llama-3.1-8b-instant",
        "dataset": "RVL-CDIP Invoices",
        "processing_mode": "Hybrid (Groq + Regex)"
    }
)

# ========== MAIN PROCESSING ==========

print("🚀 Starting WORKING Groq-Powered Invoice Processing")
print("=" * 60)

# Use the correct directory path from your test
invoice_dir = "/root/.cache/kagglehub/datasets/pdavpoojan/the-rvlcdip-dataset-test/versions/1/test/invoice"

# Get TIFF files
tif_files = [f for f in os.listdir(invoice_dir) if f.lower().endswith('.tif')]
print(f"📁 Found {len(tif_files)} TIFF files in: {invoice_dir}")

# Take first 10 files
files_to_process = tif_files[:10]
print(f"🔬 Processing {len(files_to_process)} files with Groq")

# Initialize processor
processor = WorkingGroqInvoiceProcessor()

results = []
processing_times = []
groq_success_count = 0
regex_success_count = 0

for i, filename in enumerate(files_to_process, 1):
    image_path = os.path.join(invoice_dir, filename)
    start_time = datetime.now()

    print(f"\n{i}. 📄 {filename}")
    print("-" * 40)

    try:
        # Get image info
        with Image.open(image_path) as img:
            print(f"   Size: {img.size}px, Mode: {img.mode}")

        # Extract text
        text = processor.extract_text(image_path)

        if text:
            print(f"   📝 Text sample: {text[:80]}...")

            # Extract total
            total = processor.extract_total(text)
            processing_time = (datetime.now() - start_time).total_seconds() * 1000

            if total:
                # Track which method succeeded
                method_used = "groq"  # We'll determine this by checking if Groq was called
                groq_success_count += 1

                print(f"   💰 TOTAL: ${total:.2f} | Method: {method_used.upper()} | Time: {processing_time:.0f}ms")
                results.append((filename, total, f"SUCCESS_{method_used.upper()}"))

                # Log to W&B
                wandb.log({
                    "file_processed": i,
                    "success": 1,
                    "processing_time_ms": processing_time,
                    "amount_extracted": total,
                    "method_used": method_used
                })
            else:
                results.append((filename, None, "NO_TOTAL"))
                wandb.log({
                    "file_processed": i,
                    "success": 0,
                    "processing_time_ms": processing_time
                })
        else:
            results.append((filename, None, "NO_TEXT"))
            wandb.log({
                "file_processed": i,
                "success": 0
            })

    except Exception as e:
        processing_time = (datetime.now() - start_time).total_seconds() * 1000
        print(f"   💥 ERROR: {e}")
        results.append((filename, None, f"ERROR: {e}"))
        wandb.log({
            "file_processed": i,
            "success": 0,
            "processing_time_ms": processing_time,
            "error": 1
        })

# ========== FINAL METRICS ==========

successful = len([r for r in results if "SUCCESS" in r[2]])
total_files = len(results)
accuracy = successful / total_files if total_files > 0 else 0

print(f"\n" + "=" * 60)
print("📊 GROQ LLM PROCESSING RESULTS")
print("=" * 60)

print(f"✅ Successful Extractions: {successful}/{total_files}")
print(f"🎯 Overall Accuracy: {accuracy:.1%}")
print(f"🤖 Groq Successes: {groq_success_count}")
print(f"🔍 Regex Fallbacks: {regex_success_count}")
print(f"🆓 Using FREE Groq Cloud Tier")

# Log final metrics to W&B
wandb.log({
    "accuracy": accuracy,
    "success_rate": accuracy,
    "total_processed": total_files,
    "successful_extractions": successful,
    "groq_success_count": groq_success_count,
    "regex_fallback_count": regex_success_count
})

# Create results table
results_table = wandb.Table(columns=["File", "Status", "Amount"])
for filename, total, status in results:
    results_table.add_data(filename, status, str(total) if total is not None else "N/A") # Convert total to string

wandb.log({"results": results_table})

# Finish W&B run
wandb.finish()

print(f"\n💾 Results logged to Weights & Biases")
print("🎉 Professional demo ready for internship!")
print("=" * 60)

# Show successful results
if successful > 0:
    print(f"\n💰 SUCCESSFUL EXTRACTIONS:")
    for filename, total, status in results:
        if "SUCCESS" in status:
            print(f"   ✅ {filename}: ${total:.2f}")

📊 Initializing Weights & Biases...


0,1
accuracy,▁
amount_extracted,█▁▁▁▁▁▁▁▁
file_processed,▁▂▃▃▄▅▆▆▇█
groq_success_count,▁
processing_time_ms,▅▁▁▂▂▃█▁▂▂
regex_fallback_count,▁
success,███▁██████
success_rate,▁
successful_extractions,▁
total_processed,▁

0,1
accuracy,0.9
amount_extracted,1000000.0
file_processed,10
groq_success_count,9
method_used,groq
processing_time_ms,16679.01
regex_fallback_count,0
success,1
success_rate,0.9
successful_extractions,9




🚀 Starting WORKING Groq-Powered Invoice Processing
📁 Found 2477 TIFF files in: /root/.cache/kagglehub/datasets/pdavpoojan/the-rvlcdip-dataset-test/versions/1/test/invoice
🔬 Processing 10 files with Groq
🔄 Initializing EasyOCR and Groq Client...
✅ Groq Client ready with llama-3.1-8b-instant!

1. 📄 2029371512_2029371513.tif
----------------------------------------
   Size: (775, 1000)px, Mode: L




   📝 Text sample: [ss+ For ACCOUNTING USEQNLY PHILIP MORRIS CORPORATION VOUCHER Nukber 3602 INcOAP...
  🔍 Analyzing text with Groq...
  🤖 Calling Groq API...
  📨 Groq Raw Response: '1099.00
2029371512'
  ✅ Groq Success: $1099.00
   💰 TOTAL: $1099.00 | Method: GROQ | Time: 26660ms

2. 📄 ti16310943.tif
----------------------------------------
   Size: (754, 1000)px, Mode: L




   📝 Text sample: POLITICAL CAHPAIGN CONTRIBUTION REQUEBT Date: August 18 , 199 State Local: state...
  🔍 Analyzing text with Groq...
  🤖 Calling Groq API...
  📨 Groq Raw Response: '300'
  ✅ Groq Success: $300.00
   💰 TOTAL: $300.00 | Method: GROQ | Time: 16488ms

3. 📄 0000442735.tif
----------------------------------------
   Size: (762, 1000)px, Mode: L




   📝 Text sample: GENERAL ADVERTISING INC 6197 Propretors Road Worhinglon: Ohic /3085 (6M1.885.776...
  🔍 Analyzing text with Groq...
  🤖 Calling Groq API...
  📨 Groq Raw Response: '18,033'
  ✅ Groq Success: $18033.00
   💰 TOTAL: $18033.00 | Method: GROQ | Time: 16606ms

4. 📄 2028694366.tif
----------------------------------------
   Size: (777, 1000)px, Mode: L




   📝 Text sample: CRC GB INBIFO PRQ_ Nii toband Cqo My 000 achen Trocten DIESB...
  🔍 Analyzing text with Groq...
  🤖 Calling Groq API...
  📨 Groq Raw Response: '0'
  ❌ Groq failed to extract valid number
  🔄 Groq failed, trying regex...

5. 📄 2074484090.tif
----------------------------------------
   Size: (754, 1000)px, Mode: L




   📝 Text sample: POLYTECHNIC UNIVERSITY NVQICE No: 1421 Date: 11/15/90 Please peyment to: Custome...
  🔍 Analyzing text with Groq...
  🤖 Calling Groq API...
  📨 Groq Raw Response: '575000.04'
  ✅ Groq Success: $575000.04
   💰 TOTAL: $575000.04 | Method: GROQ | Time: 16529ms

6. 📄 2070435248.tif
----------------------------------------
   Size: (754, 1000)px, Mode: L




   📝 Text sample: PHILIP MORRIS U.SA. VOUCHER DATE: PAYEE: Jacksonville Expos Baseball Club EXPLAN...
  🔍 Analyzing text with Groq...
  🤖 Calling Groq API...
  📨 Groq Raw Response: '600.00'
  ✅ Groq Success: $600.00
   💰 TOTAL: $600.00 | Method: GROQ | Time: 19160ms

7. 📄 91579503.tif
----------------------------------------
   Size: (754, 1000)px, Mode: L




   📝 Text sample: PO PREss ENTERPRISE COMPANY NATIONAL OEO press-Exterpfise PAGE Riverside, 92501 ...
  🔍 Analyzing text with Groq...
  🤖 Calling Groq API...
  📨 Groq Raw Response: '15,740.25
47.65
6172.65'
  ✅ Groq Success: $15740.00
   💰 TOTAL: $15740.00 | Method: GROQ | Time: 26970ms

8. 📄 83553535_3536.tif
----------------------------------------
   Size: (754, 1000)px, Mode: L




   📝 Text sample: 1129 Experc wicnesse 7,893 General Account1129 893 Professional Services Descipi...
  🔍 Analyzing text with Groq...
  🤖 Calling Groq API...
  📨 Groq Raw Response: '1129
250.00
12.75
3187.50'
  ✅ Groq Success: $1129.00
   💰 TOTAL: $1129.00 | Method: GROQ | Time: 16598ms

9. 📄 11224768.tif
----------------------------------------
   Size: (752, 1000)px, Mode: L




   📝 Text sample: BIO-RESEARCH INSTITUTE, Tot THB COUNCIL FOR 0. 8. An Bxpandod Btudy of to Lholo ...
  🔍 Analyzing text with Groq...
  🤖 Calling Groq API...
  📨 Groq Raw Response: '13,644.80
427.22
10,085.67'
  ✅ Groq Success: $13644.00
   💰 TOTAL: $13644.00 | Method: GROQ | Time: 16646ms

10. 📄 524412968+-2974.tif
----------------------------------------
   Size: (754, 1000)px, Mode: L




   📝 Text sample: ACCOUNTS PAYABLE VQUCHER DATE December_ 1994 {1000,000.00 cFECKTO BEDATED Decemb...
  🔍 Analyzing text with Groq...
  🤖 Calling Groq API...
  📨 Groq Raw Response: '1000000.00'
  ✅ Groq Success: $1000000.00
   💰 TOTAL: $1000000.00 | Method: GROQ | Time: 15554ms

📊 GROQ LLM PROCESSING RESULTS
✅ Successful Extractions: 9/10
🎯 Overall Accuracy: 90.0%
🤖 Groq Successes: 9
🔍 Regex Fallbacks: 0
🆓 Using FREE Groq Cloud Tier


0,1
accuracy,▁
amount_extracted,▁▁▁▅▁▁▁▁█
file_processed,▁▂▃▃▄▅▆▆▇█
groq_success_count,▁
processing_time_ms,█▂▂▂▂▃█▂▂▁
regex_fallback_count,▁
success,███▁██████
success_rate,▁
successful_extractions,▁
total_processed,▁

0,1
accuracy,0.9
amount_extracted,1000000.0
file_processed,10
groq_success_count,9
method_used,groq
processing_time_ms,15554.117
regex_fallback_count,0
success,1
success_rate,0.9
successful_extractions,9



💾 Results logged to Weights & Biases
🎉 Professional demo ready for internship!

💰 SUCCESSFUL EXTRACTIONS:
   ✅ 2029371512_2029371513.tif: $1099.00
   ✅ ti16310943.tif: $300.00
   ✅ 0000442735.tif: $18033.00
   ✅ 2074484090.tif: $575000.04
   ✅ 2070435248.tif: $600.00
   ✅ 91579503.tif: $15740.00
   ✅ 83553535_3536.tif: $1129.00
   ✅ 11224768.tif: $13644.00
   ✅ 524412968+-2974.tif: $1000000.00
