In [1]:
!pip install easyocr
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Collecting easyocr
  Downloading easyocr-1.7.1-py3-none-any.whl.metadata (11 kB)
Collecting python-bidi (from easyocr)
  Downloading python_bidi-0.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.6 kB)
Collecting pyclipper (from easyocr)
  Downloading pyclipper-1.3.0.post5-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (9.0 kB)
Collecting ninja (from easyocr)
  Downloading ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl.metadata (5.3 kB)
Downloading easyocr-1.7.1-py3-none-any.whl (2.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m38.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl (307 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.2/307.2 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyclipper-1.3.0.post5-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (

In [3]:
import re
import easyocr
import requests
from PIL import Image
from io import BytesIO
import cv2
import numpy as np

# Spellings of the volt unit that may follow a number in the OCR text
voltage_units = ['v', 'volt', 'volts', 'v.', 'vol', 'voltage', 'voltages']

# English-only EasyOCR reader; GPU use is requested
reader = easyocr.Reader(['en'], gpu=True)

# Function to download and open an image
def download_image(url):
    """Download an image over HTTP and return it as an RGB PIL image.

    Args:
        url: Address of the image to fetch.

    Returns:
        A ``PIL.Image.Image`` in ``RGB`` mode, or ``None`` when the request
        fails or the response body cannot be decoded as an image.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Turn HTTP 4xx/5xx responses into exceptions
        # Image.open() is lazy: convert() forces the decode to happen inside
        # this try block and normalises palette/grayscale/alpha images to three
        # channels, which the RGB->BGR conversion done by the caller expects.
        return Image.open(BytesIO(response.content)).convert('RGB')
    except requests.exceptions.RequestException as e:
        print(f"Failed to download image from {url}. Error: {e}")
        return None
    except OSError as e:
        # Pillow reports unreadable or truncated image data as OSError subclasses
        print(f"Failed to decode image from {url}. Error: {e}")
        return None

# Function to extract the first number and its corresponding voltage unit from OCR text
def extract_first_number_with_unit(ocr_result, units=None):
    """Find the first number in the OCR text that is followed by a voltage unit.

    Args:
        ocr_result: Iterable of text fragments (strings). They are joined with
            single spaces before searching.
        units: Optional iterable of accepted unit spellings. Defaults to the
            module-level ``voltage_units`` list. Matching is case-insensitive
            and multi-word spellings tolerate any run of whitespace.

    Returns:
        ``(number, unit)`` as strings when a number is directly followed by a
        known unit (``unit`` is lower-cased); ``(number, None)`` with the first
        number in the text when no unit matches; ``(None, None)`` when the text
        contains no digits.
    """
    if units is None:
        units = voltage_units

    ocr_text = ' '.join(ocr_result)

    # Remove thousands separators only ('1,500' -> '1500'). A comma followed by
    # anything other than exactly three digits is left alone so that lists such
    # as '12,24' are not merged into one number.
    ocr_text = re.sub(r'(?<=\d),(?=\d{3}(?!\d))', '', ocr_text)

    number_pattern = r'\d+(?:\.\d+)?'

    # Longest spellings first so that e.g. 'volts' is preferred over 'v'.
    spellings = sorted(
        {' '.join(u.lower().split()) for u in units} - {''},
        key=lambda s: (-len(s), s),
    )
    unit_fragments = []
    for spelling in spellings:
        fragment = r'\s+'.join(re.escape(word) for word in spelling.split(' '))
        if spelling[-1].isalnum():
            # The unit must end the word: '12 vases' must not match 'v'.
            fragment += r'(?!\w)'
        unit_fragments.append(fragment)

    if unit_fragments:
        pattern = '(' + number_pattern + r')\s*(' + '|'.join(unit_fragments) + ')'
        with_unit = re.search(pattern, ocr_text, re.IGNORECASE)
        if with_unit:
            unit_text = ' '.join(with_unit.group(2).lower().split())
            return with_unit.group(1), unit_text

    # No number carries a known unit: fall back to the first number found.
    first_number = re.search(number_pattern, ocr_text)
    if first_number:
        return first_number.group(0), None

    return None, None

# Image to analyse
image_url = 'https://m.media-amazon.com/images/I/51Nx8SJbQoL.jpg'

# Fetch the image (None signals a failed download)
image = download_image(image_url)

if not image:
    print("Failed to process the image.")
else:
    # EasyOCR is given an OpenCV-style BGR array built from the PIL image
    open_cv_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)

    # detail=0: the result is consumed downstream as a list of text strings
    ocr_result = reader.readtext(open_cv_image, detail=0)

    # Pick the first number that carries a voltage unit, if there is one
    number, unit = extract_first_number_with_unit(ocr_result)

    if unit:
        print(f"First number with matching voltage unit: {number} {unit}")
    elif number:
        print(f"First number without matching unit: {number}")
    else:
        print("No number found.")


  net.load_state_dict(copyStateDict(torch.load(trained_model, map_location=device)))
  state_dict = torch.load(model_path, map_location=device)


First number with matching voltage unit: 12 v


In [4]:
import re
import easyocr
import requests
from PIL import Image
from io import BytesIO
import cv2
import numpy as np

# Spellings of watt-based units that may follow a number in the OCR text
wattage_units = [
    'w', 'watt', 'watts', 'w.', 'wtt', 'wtts',
    'kw', 'kilowatt', 'kilowatts',
    'mw', 'megawatt', 'megawatts',
]

# English-only EasyOCR reader; GPU use is requested
reader = easyocr.Reader(['en'], gpu=True)

# Function to download and open an image
def download_image(url):
    """Download an image over HTTP and return it as an RGB PIL image.

    Args:
        url: Address of the image to fetch.

    Returns:
        A ``PIL.Image.Image`` in ``RGB`` mode, or ``None`` when the request
        fails or the response body cannot be decoded as an image.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Turn HTTP 4xx/5xx responses into exceptions
        # Image.open() is lazy: convert() forces the decode to happen inside
        # this try block and normalises palette/grayscale/alpha images to three
        # channels, which the RGB->BGR conversion done by the caller expects.
        return Image.open(BytesIO(response.content)).convert('RGB')
    except requests.exceptions.RequestException as e:
        print(f"Failed to download image from {url}. Error: {e}")
        return None
    except OSError as e:
        # Pillow reports unreadable or truncated image data as OSError subclasses
        print(f"Failed to decode image from {url}. Error: {e}")
        return None

# Function to extract the first number and its corresponding wattage unit from OCR text
def extract_first_number_with_wattage_unit(ocr_result, units=None):
    """Find the first number in the OCR text that is followed by a wattage unit.

    Args:
        ocr_result: Iterable of text fragments (strings). They are joined with
            single spaces before searching.
        units: Optional iterable of accepted unit spellings. Defaults to the
            module-level ``wattage_units`` list. Matching is case-insensitive
            and multi-word spellings tolerate any run of whitespace.

    Returns:
        ``(number, unit)`` as strings when a number is directly followed by a
        known unit (``unit`` is lower-cased); ``(number, None)`` with the first
        number in the text when no unit matches; ``(None, None)`` when the text
        contains no digits.
    """
    if units is None:
        units = wattage_units

    ocr_text = ' '.join(ocr_result)

    # Remove thousands separators only ('1,500' -> '1500'). A comma followed by
    # anything other than exactly three digits is left alone so that lists such
    # as '60,100' are not merged into one number.
    ocr_text = re.sub(r'(?<=\d),(?=\d{3}(?!\d))', '', ocr_text)

    number_pattern = r'\d+(?:\.\d+)?'

    # Longest spellings first so that e.g. 'watts' is preferred over 'w'.
    spellings = sorted(
        {' '.join(u.lower().split()) for u in units} - {''},
        key=lambda s: (-len(s), s),
    )
    unit_fragments = []
    for spelling in spellings:
        fragment = r'\s+'.join(re.escape(word) for word in spelling.split(' '))
        if spelling[-1].isalnum():
            # The unit must end the word: '3 wheels' must not match 'w'.
            fragment += r'(?!\w)'
        unit_fragments.append(fragment)

    if unit_fragments:
        pattern = '(' + number_pattern + r')\s*(' + '|'.join(unit_fragments) + ')'
        with_unit = re.search(pattern, ocr_text, re.IGNORECASE)
        if with_unit:
            unit_text = ' '.join(with_unit.group(2).lower().split())
            return with_unit.group(1), unit_text

    # No number carries a known unit: fall back to the first number found.
    first_number = re.search(number_pattern, ocr_text)
    if first_number:
        return first_number.group(0), None

    return None, None

# Image to analyse
image_url = 'https://m.media-amazon.com/images/I/61o2ntPNNgL.jpg'

# Fetch the image (None signals a failed download)
image = download_image(image_url)

if not image:
    print("Failed to process the image.")
else:
    # EasyOCR is given an OpenCV-style BGR array built from the PIL image
    open_cv_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)

    # detail=0: the result is consumed downstream as a list of text strings
    ocr_result = reader.readtext(open_cv_image, detail=0)

    # Pick the first number that carries a wattage unit, if there is one
    number, unit = extract_first_number_with_wattage_unit(ocr_result)

    if unit:
        print(f"First number with matching wattage unit: {number} {unit}")
    elif number:
        print(f"First number without matching unit: {number}")
    else:
        print("No number found.")




First number with matching wattage unit: 800 w


In [8]:
import re
import easyocr
import requests
from PIL import Image
from io import BytesIO
import cv2
import numpy as np

# Spellings of length units (height / width / depth) that may follow a number
dimension_units = [
    'cm', 'mm',
    'inch', 'inches',
    'm', 'meter', 'meters',
    'ft', 'foot', 'feet',
    '"', "'",
]

# English-only EasyOCR reader; GPU use is requested
reader = easyocr.Reader(['en'], gpu=True)

# Function to download and open an image
def download_image(url):
    """Download an image over HTTP and return it as an RGB PIL image.

    Args:
        url: Address of the image to fetch.

    Returns:
        A ``PIL.Image.Image`` in ``RGB`` mode, or ``None`` when the request
        fails or the response body cannot be decoded as an image.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Turn HTTP 4xx/5xx responses into exceptions
        # Image.open() is lazy: convert() forces the decode to happen inside
        # this try block and normalises palette/grayscale/alpha images to three
        # channels, which the RGB->BGR conversion done by the caller expects.
        return Image.open(BytesIO(response.content)).convert('RGB')
    except requests.exceptions.RequestException as e:
        print(f"Failed to download image from {url}. Error: {e}")
        return None
    except OSError as e:
        # Pillow reports unreadable or truncated image data as OSError subclasses
        print(f"Failed to decode image from {url}. Error: {e}")
        return None

# Function to extract the first number and its corresponding dimension unit from OCR text
def extract_first_number_with_dimension_unit(ocr_result, units=None):
    """Find the first number in the OCR text that is followed by a dimension unit.

    Args:
        ocr_result: Iterable of text fragments (strings). They are joined with
            single spaces before searching.
        units: Optional iterable of accepted unit spellings. Defaults to the
            module-level ``dimension_units`` list. Matching is case-insensitive,
            multi-word spellings tolerate any run of whitespace, and symbol
            units such as ``"`` and ``'`` are matched literally.

    Returns:
        ``(number, unit)`` as strings when a number is directly followed by a
        known unit (``unit`` is lower-cased); ``(number, None)`` with the first
        number in the text when no unit matches; ``(None, None)`` when the text
        contains no digits.
    """
    if units is None:
        units = dimension_units

    ocr_text = ' '.join(ocr_result)

    # Remove thousands separators only ('1,500' -> '1500'). A comma followed by
    # anything other than exactly three digits is left alone so that lists such
    # as '12,24' are not merged into one number.
    ocr_text = re.sub(r'(?<=\d),(?=\d{3}(?!\d))', '', ocr_text)

    number_pattern = r'\d+(?:\.\d+)?'

    # Longest spellings first so that e.g. 'mm' is preferred over 'm'.
    spellings = sorted(
        {' '.join(u.lower().split()) for u in units} - {''},
        key=lambda s: (-len(s), s),
    )
    unit_fragments = []
    for spelling in spellings:
        fragment = r'\s+'.join(re.escape(word) for word in spelling.split(' '))
        if spelling[-1].isalnum():
            # The unit must end the word: '7 metres' must not match 'm'.
            # Symbol units (quotes) carry no such restriction.
            fragment += r'(?!\w)'
        unit_fragments.append(fragment)

    if unit_fragments:
        pattern = '(' + number_pattern + r')\s*(' + '|'.join(unit_fragments) + ')'
        with_unit = re.search(pattern, ocr_text, re.IGNORECASE)
        if with_unit:
            unit_text = ' '.join(with_unit.group(2).lower().split())
            return with_unit.group(1), unit_text

    # No number carries a known unit: fall back to the first number found.
    first_number = re.search(number_pattern, ocr_text)
    if first_number:
        return first_number.group(0), None

    return None, None

# Image to analyse
image_url = 'https://m.media-amazon.com/images/I/61EfeeDuW2L.jpg'

# Fetch the image (None signals a failed download)
image = download_image(image_url)

if not image:
    print("Failed to process the image.")
else:
    # EasyOCR is given an OpenCV-style BGR array built from the PIL image
    open_cv_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)

    # detail=0: the result is consumed downstream as a list of text strings
    ocr_result = reader.readtext(open_cv_image, detail=0)

    # Pick the first number that carries a dimension unit, if there is one
    number, unit = extract_first_number_with_dimension_unit(ocr_result)

    if unit:
        print(f"First number with matching dimension unit: {number} {unit}")
    elif number:
        print(f"First number without matching unit: {number}")
    else:
        print("No number found.")




First number without matching unit: 1


In [10]:
import re
import easyocr
import requests
from PIL import Image
from io import BytesIO
import cv2
import numpy as np

# Spellings of volume units that may follow a number in the OCR text
volume_units = [
    'ml', 'milliliter', 'millilitre',
    'liters', 'litres', 'liter', 'litre',
    'gallon', 'gallons',
    'cu cm', 'cubic centimeter', 'cubic centimetre',
    'cu m', 'cubic meter', 'cubic metre',
    'cu inch', 'cubic inch',
    'cu ft', 'cubic foot', 'cubic feet',
    'fl oz', 'fluid ounce', 'fluid ounces',
]

# English-only EasyOCR reader; GPU use is requested
reader = easyocr.Reader(['en'], gpu=True)

# Function to download and open an image
def download_image(url):
    """Download an image over HTTP and return it as an RGB PIL image.

    Args:
        url: Address of the image to fetch.

    Returns:
        A ``PIL.Image.Image`` in ``RGB`` mode, or ``None`` when the request
        fails or the response body cannot be decoded as an image.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Turn HTTP 4xx/5xx responses into exceptions
        # Image.open() is lazy: convert() forces the decode to happen inside
        # this try block and normalises palette/grayscale/alpha images to three
        # channels, which the RGB->BGR conversion done by the caller expects.
        return Image.open(BytesIO(response.content)).convert('RGB')
    except requests.exceptions.RequestException as e:
        print(f"Failed to download image from {url}. Error: {e}")
        return None
    except OSError as e:
        # Pillow reports unreadable or truncated image data as OSError subclasses
        print(f"Failed to decode image from {url}. Error: {e}")
        return None

# Function to extract the first number and its corresponding volume unit from OCR text
def extract_first_number_with_volume_unit(ocr_result, units=None):
    """Find the first number in the OCR text that is followed by a volume unit.

    Args:
        ocr_result: Iterable of text fragments (strings). They are joined with
            single spaces before searching.
        units: Optional iterable of accepted unit spellings. Defaults to the
            module-level ``volume_units`` list. Matching is case-insensitive
            and multi-word spellings such as ``'fl oz'`` tolerate any run of
            whitespace between their words.

    Returns:
        ``(number, unit)`` as strings when a number is directly followed by a
        known unit (``unit`` is lower-cased with single spaces);
        ``(number, None)`` with the first number in the text when no unit
        matches; ``(None, None)`` when the text contains no digits.
    """
    if units is None:
        units = volume_units

    ocr_text = ' '.join(ocr_result)

    # Remove thousands separators only ('1,500' -> '1500'). A comma followed by
    # anything other than exactly three digits is left alone so that lists such
    # as '12,24' are not merged into one number.
    ocr_text = re.sub(r'(?<=\d),(?=\d{3}(?!\d))', '', ocr_text)

    number_pattern = r'\d+(?:\.\d+)?'

    # Longest spellings first so that e.g. 'liters' is preferred over 'liter'.
    spellings = sorted(
        {' '.join(u.lower().split()) for u in units} - {''},
        key=lambda s: (-len(s), s),
    )
    unit_fragments = []
    for spelling in spellings:
        fragment = r'\s+'.join(re.escape(word) for word in spelling.split(' '))
        if spelling[-1].isalnum():
            # The unit must end the word: '5 mls' must not match 'ml'.
            fragment += r'(?!\w)'
        unit_fragments.append(fragment)

    if unit_fragments:
        pattern = '(' + number_pattern + r')\s*(' + '|'.join(unit_fragments) + ')'
        with_unit = re.search(pattern, ocr_text, re.IGNORECASE)
        if with_unit:
            unit_text = ' '.join(with_unit.group(2).lower().split())
            return with_unit.group(1), unit_text

    # No number carries a known unit: fall back to the first number found.
    first_number = re.search(number_pattern, ocr_text)
    if first_number:
        return first_number.group(0), None

    return None, None

# Image to analyse
image_url = 'https://m.media-amazon.com/images/I/61nLdfZCTxL.jpg'

# Fetch the image (None signals a failed download)
image = download_image(image_url)

if not image:
    print("Failed to process the image.")
else:
    # EasyOCR is given an OpenCV-style BGR array built from the PIL image
    open_cv_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)

    # detail=0: the result is consumed downstream as a list of text strings
    ocr_result = reader.readtext(open_cv_image, detail=0)

    # Pick the first number that carries a volume unit, if there is one
    number, unit = extract_first_number_with_volume_unit(ocr_result)

    if unit:
        print(f"First number with matching volume unit: {number} {unit}")
    elif number:
        print(f"First number without matching unit: {number}")
    else:
        print("No number found.")




First number without matching unit: 24.12
