In [3]:
from PIL import Image
import pytesseract
import os

# Location of the Tesseract binary for a default Windows installation.
# The override is applied only when that file actually exists: on macOS/Linux
# (or a non-default Windows install) Tesseract is expected to be on PATH, and
# forcing a Windows-only path there would point pytesseract at a missing binary.
WINDOWS_TESSERACT_CMD = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
if os.path.isfile(WINDOWS_TESSERACT_CMD):
    pytesseract.pytesseract.tesseract_cmd = WINDOWS_TESSERACT_CMD

# Image to run OCR on. `image_file` and `image` are notebook-level names that
# the redaction cell further down reads, so they are kept as globals.
image_file = 'image.png'

# Only attempt OCR when the image is actually present
if not os.path.isfile(image_file):
    print(f"File {image_file} not found.")
else:
    # Open the image file
    image = Image.open(image_file)

    # Use pytesseract to do OCR on the image
    text = pytesseract.image_to_string(image)

    # Print the extracted text
    print("Extracted Text:")
    print(text)

Extracted Text:
CONTOSO LTD.

Medical Report

Mateo Gomez, 28-year-old man, suffered a car accident driving near his home
‘on Hollywood Boulevard on August 17th, 2022, and was admitted to Contoso
General Hospital in Los Angeles California at 7:45 PM. The patient showed
signs of chest trauma indicating possible rib fracture and had difficulty
breathing. A chest CT scan and AP X-ray were performed to determine the
damage to

1. ribs
2. lungs

Results showed a pseudoaneurysm of the thoracic aorta with minor fractures
to the first and third right ribs. The patient was kept in the ICU where
treatment was initiated. A Stent was surgically placed to stabilize the
hemorrhage until the blood oxygen level reached 95 percent. The patient was
discharged on September 1st, 2022, under the supervision of his caretaker
Nickolaus Schulz, passport number: B12345678.

Report signed off by: Jack John, MD
Date: 8/30/2023



In [5]:
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification

# Load the tokenizer and model from a local directory.
# Set the PII_MODEL_PATH environment variable to point at your own copy of the
# fine-tuned model; the fallback is the path used on the original author's machine.
model_path = os.environ.get(
    'PII_MODEL_PATH',
    r'D:\Projects\SIH\RE-DACT\redact\app\services\deberta_finetuned_pii',
)
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)

# Create the token-classification pipeline with the local model.
# `gen` is called by classify_and_tag_text below.
gen = pipeline("token-classification", model=model, tokenizer=tokenizer, aggregation_strategy="first")

def tag_model_output(model_output):
    """Attach a numeric sensitivity level to every entity in ``model_output``.

    Each entity dict is updated in place with a ``'tag'`` key derived from its
    ``'entity_group'`` label: 1, 2 or 3 for the labels listed below, and 999
    for any label that is not listed. The same list object is returned.
    """
    labels_by_level = {
        1: (
            "SSN", "CREDITCARDNUMBER", "CREDITCARDCVV", "PASSWORD", "IP", "MAC",
            "BITCOINADDRESS", "ETHEREUMADDRESS", "LITECOINADDRESS", "ACCOUNTNUMBER",
            "IBAN", "BIC",
        ),
        2: (
            "FIRSTNAME", "LASTNAME", "FULLNAME", "NAME", "EMAIL", "PHONE_NUMBER",
            "STREETADDRESS", "CITY", "ZIPCODE", "STATE", "COUNTRY", "JOBTITLE",
            "COMPANY_NAME", "USERNAME",
        ),
        3: (
            "PREFIX", "MIDDLENAME", "SUFFIX", "JOBDESCRIPTOR", "JOBAREA",
            "SECONDARYADDRESS", "COUNTY", "CURRENCY", "CURRENCYSYMBOL",
            "CURRENCYCODE", "USERAGENT", "SEX", "GENDER", "NEARBYGPSCOORDINATE",
            "DISPLAYNAME", "SEXTYPE", "ORDINALDIRECTION",
        ),
    }

    # Flatten into a single label -> level table; the label groups are disjoint.
    level_of = {
        label: level
        for level, labels in labels_by_level.items()
        for label in labels
    }

    for item in model_output:
        # 999 marks labels that fall outside the three defined levels
        item['tag'] = level_of.get(item['entity_group'], 999)

    return model_output

def classify_and_tag_text(input_text):
    """Classify ``input_text`` and return a ``{word: tag}`` dictionary.

    The text is passed through the ``gen`` pipeline and ``tag_model_output``.
    Every detected entity contributes its stripped text as a key and, when the
    entity spans several whitespace-separated words, each individual word as a
    key too, so that callers looking up single words still find the tag.
    If the same key is produced more than once, the lowest (most sensitive)
    tag is kept.

    Args:
        input_text (str): Text to classify.

    Returns:
        dict: Mapping of entity text / individual words to their numeric tag.
        Empty when ``input_text`` is empty or only whitespace (the model is
        not invoked in that case).
    """
    if not input_text or not input_text.strip():
        return {}

    classified_entities = gen(input_text)
    tagged_entities = tag_model_output(classified_entities)

    word_tag_dict = {}
    for entity in tagged_entities:
        span = entity['word'].strip()
        level = entity['tag']
        # Register the whole span plus each of its words.
        for key in {span, *span.split()}:
            # Never let a later, less sensitive hit overwrite a more sensitive one.
            word_tag_dict[key] = min(level, word_tag_dict.get(key, level))
    return word_tag_dict

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
def extract_and_classify_text(image_path, min_confidence=60):
    """Run OCR on an image and tag each confidently recognised word.

    Args:
        image_path (str): The path to the image file.
        min_confidence (float): OCR confidence threshold. Only words whose
            confidence is strictly greater than this value are kept.
            Defaults to 60.

    Returns:
        list: One dictionary per kept word with the keys 'word', 'conf',
        'x', 'y', 'width', 'height' (bounding box as reported by the OCR
        engine) and 'tag' (classification level, 999 when the word was not
        classified). An empty list is returned, after printing a message,
        when ``image_path`` does not exist.
    """
    # Check if the file exists
    if not os.path.isfile(image_path):
        print(f"File {image_path} not found.")
        return []

    # Open the image only for the duration of the OCR call so the file handle
    # is released as soon as the word data has been extracted.
    with Image.open(image_path) as image:
        data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)

    # Keep non-blank words that clear the confidence threshold.
    # float() is used instead of int() because the confidence may be reported
    # as a float or as a decimal string, which int() would reject.
    filtered_words = [
        {
            'word': word,
            'conf': conf,
            'x': x,
            'y': y,
            'width': width,
            'height': height
        }
        for word, conf, x, y, width, height in zip(
            data['text'], data['conf'], data['left'],
            data['top'], data['width'], data['height']
        )
        if word.strip() and float(conf) > min_confidence
    ]

    # Combine the words into a single string for classification
    input_text = ' '.join(word_info['word'] for word_info in filtered_words)

    # Classify and tag the text
    word_tag_dict = classify_and_tag_text(input_text)

    # Add classification tags to the filtered words
    for word_info in filtered_words:
        word_info['tag'] = word_tag_dict.get(word_info['word'], 999)  # Default to 999 if not found

    return filtered_words

# Example usage: run OCR and PII tagging on the sample image and print the result.
# `classified_words` is kept as a notebook-level variable; the redaction cell
# further down passes it to redact_image.
image_path = 'image.png'
classified_words = extract_and_classify_text(image_path)
print(classified_words)

[{'word': 'CONTOSO', 'conf': 91, 'x': 38, 'y': 42, 'width': 77, 'height': 13, 'tag': 2}, {'word': 'LTD.', 'conf': 81, 'x': 120, 'y': 42, 'width': 31, 'height': 13, 'tag': 999}, {'word': 'Medical', 'conf': 96, 'x': 227, 'y': 88, 'width': 89, 'height': 20, 'tag': 999}, {'word': 'Report', 'conf': 96, 'x': 323, 'y': 88, 'width': 75, 'height': 25, 'tag': 999}, {'word': 'Mateo', 'conf': 96, 'x': 38, 'y': 127, 'width': 44, 'height': 12, 'tag': 2}, {'word': 'Gomez,', 'conf': 96, 'x': 86, 'y': 126, 'width': 54, 'height': 14, 'tag': 2}, {'word': '28-year-old', 'conf': 95, 'x': 144, 'y': 127, 'width': 82, 'height': 15, 'tag': 1}, {'word': 'man,', 'conf': 96, 'x': 230, 'y': 129, 'width': 35, 'height': 11, 'tag': 999}, {'word': 'suffered', 'conf': 93, 'x': 269, 'y': 126, 'width': 59, 'height': 13, 'tag': 999}, {'word': 'a', 'conf': 96, 'x': 332, 'y': 129, 'width': 8, 'height': 10, 'tag': 999}, {'word': 'car', 'conf': 95, 'x': 344, 'y': 129, 'width': 22, 'height': 10, 'tag': 999}, {'word': 'accident

In [12]:
from PIL import ImageDraw

def redact_image(classified_words, level, source_image=None, source_path=None):
    """Black out sensitive words on a copy of an image and save the result.

    Args:
        classified_words (list): Word dictionaries carrying 'x', 'y', 'width',
            'height' and 'tag' keys.
        level (int): Redaction level. Every word whose tag is less than or
            equal to this value is covered with a black rectangle.
        source_image: Image to redact. Defaults to the notebook-level
            ``image`` variable when omitted.
        source_path (str): Path of the original image, used only to derive the
            output file name and extension. Defaults to the notebook-level
            ``image_file`` variable when omitted.

    Returns:
        str: Path of the saved redacted image inside the 'outputs' folder.
        Note that this is a file path, not an image object.
    """
    # Fall back to the notebook globals so existing two-argument calls keep working.
    if source_image is None:
        source_image = image
    if source_path is None:
        source_path = image_file

    # Make a copy of the image to draw on
    redacted_image = source_image.copy()
    draw = ImageDraw.Draw(redacted_image)

    # Iterate over the classified words and draw black boxes on words with tag <= level
    for word_info in classified_words:
        if word_info['tag'] <= level:
            x = word_info['x']
            y = word_info['y']
            width = word_info['width']
            height = word_info['height']
            draw.rectangle([(x, y), (x + width, y + height)], fill="black")

    # Save the redacted image
    output_folder = 'outputs'
    os.makedirs(output_folder, exist_ok=True)
    # Use only the file name: keeping directory components of the source path
    # would target a sub-folder of 'outputs' that was never created.
    base_name, ext = os.path.splitext(os.path.basename(source_path))
    output_path = os.path.join(output_folder, f'{base_name}_redacted_{level}{ext}')
    redacted_image.save(output_path)

    return output_path

# Example usage: black out every word whose tag is <= 1 and save the result.
# NOTE: redact_image returns the output file path (a str), not a PIL image, so
# despite its name `redacted_image` cannot be used with image methods such as .show().
redacted_image = redact_image(classified_words, 1)



AttributeError: 'str' object has no attribute 'show'