In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import re
import json
import torch

def test():
    """Convert a hard-coded sample invoice into JSON with Mistral-7B.

    Loads ``mistralai/Mistral-7B-v0.1``, asks it to turn the sample invoice
    text into a JSON object, prints the parsed result (or the raw model
    response when parsing fails) and, when JSON text was extracted, writes
    that text to ``receipt_output.txt`` in the current working directory.

    Returns:
        None. Results are reported on stdout and in the output file.
    """

    def extract_json_from_text(text):
        """Return the JSON object text embedded in ``text``, or ``None``.

        The contents of a fenced code block are preferred when one is
        present. The first ``{`` is then parsed with
        ``json.JSONDecoder.raw_decode`` so anything the model keeps generating
        after the object is ignored. If that parse fails, the widest
        ``{...}`` span is returned unchanged so the caller can report the
        decode error.
        """
        # Prefer the contents of a fenced code block when the model emitted one
        fenced = re.search(r'```(?:json)?\s*(\{.*\})\s*```', text, re.DOTALL)
        candidate = fenced.group(1) if fenced else text

        start = candidate.find("{")
        if start == -1:
            return None

        try:
            # raw_decode stops at the end of the first complete JSON value,
            # so trailing text or repeated fragments do not spoil the result
            _, end = json.JSONDecoder().raw_decode(candidate, start)
        except json.JSONDecodeError:
            # Fall back to the widest brace-delimited span
            widest = re.search(r'\{.*\}', candidate, re.DOTALL)
            return widest.group(0) if widest else None
        return candidate[start:end]

    # Initialize the model and tokenizer
    model_name = "mistralai/Mistral-7B-v0.1"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto"
    )

    # Add padding token if it doesn't exist
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Example invoice text
    invoice_text = """
      Invoice No: INV-12345
      Date: 2023-10-05
      Due Date: 2023-11-05
      Bill To: John Doe, 123 Main St, Cityville
      Item: Consulting Services, Quantity: 5, Rate: $100.00, Total: $500.00
      Tax: $50.00, Grand Total: $550.00
      """

    # Create a more specific prompt
    prompt = f"""
    Convert the following invoice text into a valid JSON object with exactly these keys: 
    invoice_number, date, due_date, bill_to, items, tax, total.
    Items should be a list of dictionaries with keys: description, quantity, unit_price, amount.

    Return ONLY the JSON object without any additional text or explanation.

    Invoice Text:
    {invoice_text}
    """

    # Tokenize input and move it to wherever device_map="auto" placed the
    # model, instead of assuming a CUDA device is present
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(model.device)
    prompt_token_count = inputs["input_ids"].shape[1]

    # Generate output
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=500,
            temperature=0.1,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.1
        )

    # Decode only the newly generated tokens. Slicing the decoded string by
    # len(prompt) is unreliable because decoding need not reproduce the
    # prompt character-for-character.
    response = tokenizer.decode(
        outputs[0][prompt_token_count:], skip_special_tokens=True
    ).strip()

    # Try to extract JSON from the response
    json_str = extract_json_from_text(response)

    if json_str:
        try:
            # Parse the JSON
            invoice_data = json.loads(json_str)
            print("Successfully extracted JSON:")
            print(json.dumps(invoice_data, indent=2))
        except json.JSONDecodeError as e:
            print(f"Failed to parse JSON: {e}")
            print("Model output was:")
            print(response)
    else:
        print("No JSON found in response:")
        print(response)

    # Define the output file name for the text file
    output_file = "receipt_output.txt"

    # Nothing to save when no JSON text was found; writing None would raise
    # a TypeError
    if json_str is None:
        print(f"Nothing saved to {output_file}: no JSON was extracted")
        return

    # Write the extracted JSON text to the file
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(json_str)

    print(f"Successfully saved output to {output_file}")





if __name__ == "__main__":
    # test() reports through stdout and the output file and returns nothing,
    # so printing its return value would only emit "None".
    test()


output:

```
  {
  "invoice_number": "INV-12345",
  "date": "2023-10-05",
  "due_date": "2023-11-05",
  "bill_to": {
    "name": "John Doe",
    "address": "123 Main St, Cityville"
  },
  "items": [
    {
      "description": "Consulting Services",
      "quantity": 5,
      "unit_price": 100.0,
      "amount": 500.0
    }
  ],
  "tax": 50.0,
  "total": 550.0
}
```

In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import re
import json
import torch
import urllib.request 
import numpy as np
from PIL import Image
from paddleocr import PaddleOCR
from ast import  literal_eval


def test():
    """OCR a receipt image and convert the recognized text to JSON.

    Downloads a sample receipt image, runs PaddleOCR on it, asks
    ``mistralai/Mistral-7B-v0.1`` to turn the recognized text into a JSON
    object, prints the parsed result (or the raw model response when parsing
    fails) and writes the full decoded model output to
    ``receipt_output.txt`` in the current working directory.

    Returns:
        None. Results are reported on stdout and in the output file.
    """

    def extract_json_from_text(text):
        """Return the JSON object text embedded in ``text``, or ``None``.

        The contents of a fenced code block are preferred when one is
        present. The first ``{`` is then parsed with
        ``json.JSONDecoder.raw_decode`` so anything the model keeps generating
        after the object is ignored. If that parse fails, the widest
        ``{...}`` span is returned unchanged so the caller can report the
        decode error.
        """
        # Prefer the contents of a fenced code block when the model emitted one
        fenced = re.search(r'```(?:json)?\s*(\{.*\})\s*```', text, re.DOTALL)
        candidate = fenced.group(1) if fenced else text

        start = candidate.find("{")
        if start == -1:
            return None

        try:
            # raw_decode stops at the end of the first complete JSON value,
            # so trailing text or repeated fragments do not spoil the result
            _, end = json.JSONDecoder().raw_decode(candidate, start)
        except json.JSONDecodeError:
            # Fall back to the widest brace-delimited span
            widest = re.search(r'\{.*\}', candidate, re.DOTALL)
            return widest.group(0) if widest else None
        return candidate[start:end]

    def paddle_scan(paddleocr,
                    img_path_or_nparray):
        """Run ``paddleocr`` on an image and return ``(texts, first_result)``.

        ``texts`` is the ``'rec_texts'`` entry of the first prediction result;
        ``first_result`` is that whole result object.
        """
        # Use the engine passed in rather than reaching for an outer variable
        result = paddleocr.predict(img_path_or_nparray)[0]
        return result['rec_texts'], result

    # Initialize the model and tokenizer
    model_name = "mistralai/Mistral-7B-v0.1"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto"
    )

    # Add padding token if it doesn't exist
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Pull a sample receipt image from the internet
    image_url='https://groups.google.com/group/jzebra-users/attach/d16dbba8a612edfa/Bill%20Image_Receipt.png?part=0.1'
    local_image_id='bill_image_receipt.png'
    urllib.request.urlretrieve(image_url,local_image_id)

    # Close the image file handle once the pixel data has been copied out
    with Image.open(local_image_id) as receipt_image:
        receipt_image_array = np.array(receipt_image.convert('RGB'))

    ocr = PaddleOCR(lang="en",
                ocr_version="PP-OCRv4")

    # Perform the OCR scan; only the recognized text lines are needed here
    receipt_texts, _ = paddle_scan(ocr, receipt_image_array)

    # Present the OCR lines as plain text, one per line, rather than as a
    # Python list repr with brackets and quotes
    receipt_text_block = "\n".join(str(line) for line in receipt_texts)

    # Create a more specific prompt
    prompt = f"""
    Convert the following invoice text into a valid JSON object with exactly these keys: 
    invoice_number, date, due_date, bill_to, items, tax, total.
    Items should be a list of dictionaries with keys: description, quantity, unit_price, amount.

    Return ONLY the JSON object without any additional text or explanation.

    Invoice Text:
    {receipt_text_block}
    """

    # Tokenize input and move it to wherever device_map="auto" placed the
    # model, instead of assuming a CUDA device is present
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(model.device)
    prompt_token_count = inputs["input_ids"].shape[1]

    # Generate output
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=500,
            temperature=0.1,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.1
        )

    # Full decoded sequence (prompt + completion); this is what gets saved
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Decode only the newly generated tokens. Slicing the decoded string by
    # len(prompt) is unreliable because decoding need not reproduce the
    # prompt character-for-character.
    response = tokenizer.decode(
        outputs[0][prompt_token_count:], skip_special_tokens=True
    ).strip()

    # Try to extract JSON from the response
    json_str = extract_json_from_text(response)

    if json_str:
        try:
            # Parse the JSON
            invoice_data = json.loads(json_str)
            print("Successfully extracted JSON:")
            print(json.dumps(invoice_data, indent=2))
        except json.JSONDecodeError as e:
            print(f"Failed to parse JSON: {e}")
            print("Model output was:")
            print(response)
    else:
        print("No JSON found in response:")
        print(response)

    # Define the output file name for the text file
    output_file = "receipt_output.txt"

    # Write the raw decoded model output to the file
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(generated_text)

    print(f"Successfully saved output to {output_file}")


if __name__ == "__main__":
    # test() reports through stdout and the output file and returns nothing,
    # so printing its return value would only emit "None".
    test()


output: 
```
    Successfully extracted JSON:
{

    {  "description": "Heineken Draft Standard",
      "quantity": "3",
      "unit_price": "24.60",
      "amount": "73.80"
    },
    {
      "description": "Heineken Draft Half Liter",
      "quantity": "1",
      "unit_price": "15.20",
      "amount": "15.20"
    },
    {
      "description": "Carlsberg Bucket (5 bottles)",
      "quantity": "2",
      "unit_price": "80.00",
      "amount": "160.00"
    },
    {
      "description": "Grilled Chicken Breast",
      "quantity": "4",
      "unit_price": "74.00",
      "amount": "296.00"
    },
    {
      "description": "Sirloin Steak",
      "quantity": "3",
      "unit_price": "96.00",
      "amount": "288.00"
    },
    {
      "description": "Coke",
      "quantity": "1",
      "unit_price": "3.50",
      "amount": "3.50"
    },
    {
      "description": "Ice Cream",
      "quantity": "5",
      "unit_price": "18.00",
      "amount": "90.00"
    }
  ],
  "tax": "16.36",
  "total"}
      "unit_price": "24.60",
      "amount": "73.80"
    },
    {
      "description": "Heineken Draft Half Liter",
      "quantity": "1",
      "unit_price": "15.20",
      "amount": "15.20"
    },
    {
      "description": "Carlsberg Bucket (5 bottles)",
      "quantity": "2",
      "unit_price": "80.00",
      "amount": "160.00"
    },
    {
      "description": "Grilled Chicken Breast",
      "quantity": "4",
      "unit_price": "74.00",
      "amount": "296.00"
    },
    {
      "description": "Sirloin Steak",
      "quantity": "3",
      "unit_price": "96.00",
      "amount": "288.00"
    },
    {
      "description": "Coke",
      "quantity": "1",
      "unit_price": "3.50",
      "amount": "3.50"
    },
    {
      "description": "Ice Cream",
      "quantity": "5",
      "unit_price": "18.00",
      "amount": "90.00"
    }
  ],
  "tax": "16.36",
  "total": "376.40"
}
```