In [1]:
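# One-time environment setup: uncomment and run these lines when the packages are missing.
# NOTE: the install log below reports that surya-ocr 0.5.0 requires torch>=2.3.0,<3.0.0,
# so check that the torch version pulled in alongside easyocr satisfies that range.
# !pip uninstall -y tensorflow
# !pip install tensorflow-cpu
# !pip install surya-ocr
# !pip install easyocr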


Collecting easyocr
  Downloading easyocr-1.7.1-py3-none-any.whl.metadata (11 kB)
Collecting opencv-python-headless (from easyocr)
  Using cached opencv_python_headless-4.10.0.84-cp37-abi3-win_amd64.whl.metadata (20 kB)
Collecting scikit-image (from easyocr)
  Downloading scikit_image-0.24.0-cp310-cp310-win_amd64.whl.metadata (14 kB)
Collecting python-bidi (from easyocr)
  Downloading python_bidi-0.6.0-cp310-none-win_amd64.whl.metadata (4.7 kB)
Collecting pyclipper (from easyocr)
  Downloading pyclipper-1.3.0.post5-cp310-cp310-win_amd64.whl.metadata (9.2 kB)
Collecting ninja (from easyocr)
  Downloading ninja-1.11.1.1-py2.py3-none-win_amd64.whl.metadata (5.4 kB)
Collecting torch (from easyocr)
  Downloading torch-2.0.0-cp310-cp310-win_amd64.whl.metadata (23 kB)
Collecting lazy-loader>=0.4 (from scikit-image->easyocr)
  Using cached lazy_loader-0.4-py3-none-any.whl.metadata (7.6 kB)
Downloading easyocr-1.7.1-py3-none-any.whl (2.9 MB)
   ---------------------------------------- 2.9/2.9 MB

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
surya-ocr 0.5.0 requires torch<3.0.0,>=2.3.0, but you have torch 2.0.0 which is incompatible.


In [2]:
import os
import pandas as pd
from PIL import Image
from surya.ocr import run_ocr
from surya.model.detection.model import load_model as load_det_model, load_processor as load_det_processor
from surya.model.recognition.model import load_model as load_rec_model
from surya.model.recognition.processor import load_processor as load_rec_processor
import re
import easyocr

In [3]:
# Surya text-detection stage: processor first, then model (same order as before).
det_processor = load_det_processor()
det_model = load_det_model()

# Surya text-recognition stage: model first, then processor.
rec_model = load_rec_model()
rec_processor = load_rec_processor()

# EasyOCR reader created with the language list ['en']; used as a second OCR engine.
reader = easyocr.Reader(['en'])

Loaded detection model vikp/surya_det3 on device cpu with dtype torch.float32
Loaded recognition model vikp/surya_rec2 on device cpu with dtype torch.float32


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


In [18]:
# Quick manual check of the EasyOCR reader on a local screenshot.
# With detail=0 the call returns a plain list of recognised strings (see the output of the next cell).
results = reader.readtext("Screenshot 2024-05-20 222756.png", detail=0)

In [19]:
# Display the strings EasyOCR recognised in the screenshot.
results

['Visit WWW talentserve org',
 'SignUp',
 'Register and Sign Up',
 'https: [ [WWw.talentserve org/candidate_signup',
 'Fill the Details',
 'Full Name',
 '2.Email Id ( Valid Email ID)',
 'Mlobile Number',
 'Create',
 'Password',
 '#Referral Code',
 'HRD*',
 'Create Account',
 'Login',
 'Go to Test',
 'Click on',
 'Aptitude Test) and Complete',
 '8. You will get marks',
 'completion',
 'it forvour records',
 'Note the Marks',
 'Go to top Right Side and Click on Profile and See the',
 'Aspirant ID',
 'Note that (It should Start with C6XXXX)',
 'post',
 'keep']

In [6]:
# Mount Google Drive at /content/drive.
# NOTE(review): google.colab is presumably only importable on a Colab runtime —
# confirm whether this cell is still needed when the notebook runs elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [20]:
# Lookup table: unit abbreviation -> full unit name.
unit_abbreviation_map = {
    # length
    'cm': 'centimetre',
    'mm': 'millimetre',
    'm': 'metre',
    'in': 'inch',
    'ft': 'foot',
    'yd': 'yard',
    # weight
    'g': 'gram',
    'kg': 'kilogram',
    'mg': 'milligram',
    'oz': 'ounce',
    'lb': 'pound',
    't': 'ton',
    # voltage
    'kV': 'kilovolt',
    'V': 'volt',
    'mV': 'millivolt',
    # wattage
    'kW': 'kilowatt',
    'W': 'watt',
    # volume
    'cl': 'centilitre',
    'ml': 'millilitre',
    'l': 'litre',
    'fl oz': 'fluid ounce',
    'gal': 'gallon',
    'pt': 'pint',
    'qt': 'quart',
    'cu ft': 'cubic foot',
    'cu in': 'cubic inch',
}

# Lookup table: entity name -> set of full unit names accepted for that entity.
# Note: microgram, cup, decilitre, imperial gallon and microlitre appear below
# but have no abbreviation entry in unit_abbreviation_map.
entity_unit_map = {
    'width': {
        'centimetre', 'foot', 'inch',
        'metre', 'millimetre', 'yard',
    },
    'depth': {
        'centimetre', 'foot', 'inch',
        'metre', 'millimetre', 'yard',
    },
    'height': {
        'centimetre', 'foot', 'inch',
        'metre', 'millimetre', 'yard',
    },
    'item_weight': {
        'gram', 'kilogram', 'microgram', 'milligram',
        'ounce', 'pound', 'ton',
    },
    'maximum_weight_recommendation': {
        'gram', 'kilogram', 'microgram', 'milligram',
        'ounce', 'pound', 'ton',
    },
    'voltage': {
        'kilovolt', 'millivolt', 'volt',
    },
    'wattage': {
        'kilowatt', 'watt',
    },
    'item_volume': {
        'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre',
        'fluid ounce', 'gallon', 'imperial gallon', 'litre',
        'microlitre', 'millilitre', 'pint', 'quart',
    },
}

In [21]:
# Collect every unit spelling (full names plus abbreviations) accepted for an entity
def get_entity_units(entity_name):
    """Return the full unit names and matching abbreviations for entity_name.

    Looks entity_name up in entity_unit_map and adds every key of
    unit_abbreviation_map whose full name belongs to that entity.
    An unknown entity yields an empty set.
    """
    full_names = entity_unit_map.get(entity_name)
    if full_names is None:
        return set()

    short_forms = set()
    for abbr, full in unit_abbreviation_map.items():
        if full in full_names:
            short_forms.add(abbr)

    # A new set is returned, so the entry in entity_unit_map is left untouched.
    return full_names | short_forms

# Function to extract entity values with abbreviation handling
def extract_entity_values(text, entity_name):
    """Find "<number> <unit>" pairs in text whose unit is valid for entity_name.

    Args:
        text: A string to scan (for example one OCR text line).
        entity_name: Entity whose units (from get_entity_units) are accepted.

    Returns:
        A list of (value, unit) tuples in order of appearance, where value is a
        float and unit is the full unit name (abbreviations are expanded via
        unit_abbreviation_map). Empty when the entity has no known units or
        nothing matches.
    """
    valid_units = get_entity_units(entity_name)
    if not valid_units:
        return []

    # Longest spellings first so that e.g. 'fluid ounce' or 'kg' is tried
    # before a shorter unit that is a prefix of it.
    unit_alternatives = '|'.join(
        re.escape(unit) for unit in sorted(valid_units, key=len, reverse=True)
    )

    # Number: either a thousands-grouped figure such as "1,000" or "12,345.6",
    # or a plain integer/decimal. Without the grouped alternative the match
    # would start after the comma and "1,000 g" would be read as 0.0 gram.
    # The lookbehind stops the grouped form from starting inside a longer
    # digit run (e.g. the "234,567" in "1234,567").
    number = r'((?<!\d)\d{1,3}(?:,\d{3})+(?:\.\d+)?|\d+(?:\.\d+)?)'
    pattern = number + r'\s*(' + unit_alternatives + r')\b'

    extracted = []
    for value, unit in re.findall(pattern, text):
        # Expand an abbreviation to its full unit name; full names pass through.
        full_unit = unit_abbreviation_map.get(unit, unit)
        # Drop thousands separators before converting.
        extracted.append((float(value.replace(',', '')), full_unit))

    return extracted

# Function to select the appropriate value based on entity rules
def select_entity_value(extracted_values, entity_name):
    """Pick one "<value> <unit>" string from the extracted candidates.

    Selection rules (by numeric value only; units are not converted):
        height, maximum_weight_recommendation -> largest value
        width  -> second-largest distinct value (largest if only one distinct)
        depth  -> smallest value
        anything else -> first extracted value

    Args:
        extracted_values: List of (value, unit) tuples.
        entity_name: Name of the entity the rules are chosen for.

    Returns:
        "<value> <unit>" for the first tuple carrying the selected value,
        or "" when extracted_values is empty.
    """
    if not extracted_values:
        return ""

    # Rank distinct values only. The same measurement is commonly extracted
    # more than once (several OCR passes over one image), and with duplicates
    # the "second largest" entry would just be a copy of the largest.
    distinct_desc = sorted({val for val, _unit in extracted_values}, reverse=True)

    if entity_name in ('height', 'maximum_weight_recommendation'):
        selected = distinct_desc[0]
    elif entity_name == 'width':
        selected = distinct_desc[1] if len(distinct_desc) >= 2 else distinct_desc[0]
    elif entity_name == 'depth':
        selected = distinct_desc[-1]
    else:
        selected = extracted_values[0][0]  # Default to first value

    # Report the unit of the first candidate that carries the selected value.
    for val, unit in extracted_values:
        if val == selected:
            return f"{val} {unit}"

    return ""  # Not expected: selected always comes from extracted_values

# Run extraction over every text line, then choose a single value for the entity
def process_texts(texts, entity_name):
    """Extract candidates from each text and return the selected value.

    Every string in texts is passed to extract_entity_values; the pooled
    (value, unit) tuples, in input order, are handed to select_entity_value,
    whose result is returned unchanged.
    """
    candidates = [
        pair
        for line in texts
        for pair in extract_entity_values(line, entity_name)
    ]
    return select_entity_value(candidates, entity_name)

In [22]:
# Predictor function integrating OCR and entity extraction
from io import BytesIO
import requests

def predictor(image_link, category_id, entity_name):
    """Download an image, OCR it with Surya and EasyOCR, and extract one entity value.

    Args:
        image_link: URL of the image to download.
        category_id: Accepted for interface compatibility; not used here.
        entity_name: Entity to extract; forwarded to process_texts.

    Returns:
        The string produced by process_texts for the combined OCR text lines
        (Surya lines first, then EasyOCR lines), or "" when the image cannot
        be downloaded or opened.
    """
    # Extract image name from the link (used only in the failure message)
    image_name = os.path.basename(image_link)

    try:
        # A timeout (30 s) keeps one stalled download from hanging a long batch
        # run; raise_for_status turns HTTP error responses into a handled failure
        # instead of trying to decode an error page as an image.
        response = requests.get(image_link, timeout=30)
        response.raise_for_status()
        image_bytes = response.content
        image = Image.open(BytesIO(image_bytes))
    except (requests.RequestException, OSError) as exc:
        # Download problems raise requests.RequestException; undecodable image
        # data raises an OSError subclass from PIL. Skip this image so the
        # remaining rows can still be processed.
        print(f"Image {image_name} could not be loaded: {exc}")
        return ""

    # Run Surya OCR and collect its text lines
    predictions = run_ocr([image], [["en"]], det_model, det_processor, rec_model, rec_processor)
    extracted_texts = [text_line.text for result in predictions for text_line in result.text_lines]

    # readtext is documented to accept a path/URL, numpy array or raw bytes,
    # so pass the downloaded bytes rather than relying on PIL-object support.
    extracted_texts += reader.readtext(image_bytes, detail=0)

    # Process texts to extract the entity value
    return process_texts(extracted_texts, entity_name)

In [25]:
# Main execution
if __name__ == "__main__":

    # Build paths with os.path.join: the previous 'dataset\sample_test.csv'
    # literal relied on the invalid escape sequence '\s' and on Windows separators.
    input_filename = os.path.join('dataset', 'sample_test.csv')
    output_filename = os.path.join('dataset', 't50k-60k.csv')

    # Load test CSV and keep rows 50000-59999 (positional slice)
    test = pd.read_csv(input_filename)
    test_subset = test.iloc[50000:60000]

    # Open the output once; flushing after every row keeps partial results on
    # disk if the long-running loop is interrupted.
    with open(output_filename, mode='w') as f:
        f.write("index,prediction\n")

        # Run predictions and write each result as soon as it is available
        for index, row in test_subset.iterrows():
            prediction = predictor(row['image_link'], row['group_id'], row['entity_name'])
            if prediction is None:
                # Never write the literal text "None" as a prediction.
                prediction = ""
            f.write(f"{row['index']},{prediction}\n")
            f.flush()