## Importing libraries

In [1]:
import os
from google.cloud import vision
import spacy
from IPython.display import display, HTML
from collections import defaultdict
import pandas as pd

## Extracting Text

In [2]:
# Point GOOGLE_APPLICATION_CREDENTIALS at the local key file (vision_key.json)
CREDENTIALS_FILE = 'vision_key.json'
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = CREDENTIALS_FILE

In [3]:
# Vision API client; detect_text() below sends its requests through it.
# NOTE(review): presumably authenticates via the GOOGLE_APPLICATION_CREDENTIALS
# variable set in the previous cell — confirm.
client = vision.ImageAnnotatorClient()

In [4]:
def detect_text(image_path):
    """Run Vision document text detection on a local image file.

    Args:
        image_path: Path of the image file; it is read in binary mode and its
            bytes are sent to the API through the module-level ``client``.

    Returns:
        The ``description`` of the first text annotation in the response, or
        an empty string when the response contains no text annotations.

    Raises:
        Exception: If the response carries a non-empty error message; that
            message is used as the exception text.
    """
    with open(image_path, 'rb') as image_file:
        image = vision.Image(content=image_file.read())

    response = client.document_text_detection(image=image)

    # Report an API-side failure before looking at the annotations.
    if response.error.message:
        raise Exception(response.error.message)

    annotations = response.text_annotations
    return annotations[0].description if annotations else ""

In [5]:
# OCR both product photos (presumably front and back of the pack) and collect
# the text in one string; the first image's text is followed by a blank line.
result = ""
for image_path, separator in (('bingo_f.jpg', "\n\n"), ('bingo_b.jpg', "")):
    result += detect_text(image_path) + separator

In [6]:
# print(result)

## Loading Custom Trained NER Model

In [7]:
# Load the spaCy pipeline stored under "model-best" (the custom-trained NER model).
nlp_ner = spacy.load("model-best")

In [8]:
# Run the NER pipeline over the combined OCR text; later cells read doc.ents.
doc = nlp_ner(result)

In [9]:
import re
# One single-key {label: text} dict per recognised entity, in the order
# doc.ents yields them.
entity_pairs = ((entity.label_, entity.text) for entity in doc.ents)
data = [{label_name: entity_text} for label_name, entity_text in entity_pairs]
# spacy.displacy.render(doc, style="ent", jupyter=True) # display in Jupyter

In [10]:
# Show the raw list of {label: text} entity dicts as the cell output
data

[{'BRAND_NAME': 'BINGO!\nMAD\nANGLES\nMmmmm\nMasala\nNAMKEEN\n5G+\n\n12:19\nM\nVOD\n50+\nR\nLTE\nPERFECT\nTRIANGULAR\nSHAPE\nLIP-SMACKING\nTASTE\nSCAN TO\nEXPERIENCE\nTHE MADNESS OF\nMAD ANGLES'},
 {'PRICE': '20.00 incl. of all taxes\nFOR FEEDBACK/COMPLAINT CONTACT:\nITC CARES AT P.O. BOX NO. 592,\nBENGALURU-560005.\nitccares@itc.in\n1800 425 444 444'},
 {'LIC_NUMBER': '10012012000154'},
 {'LIC_NUMBER': '10018042004042'},
 {'LIC_NUMBER': '10017031002186'},
 {'HELPLINE_NUMBER': 'SPICES AND CONDIMENTS'},
 {'INGREDIENTS': 'MILK SOLIDS'},
 {'NUTRITIONAL_INFO': 'Sodium (mg)'},
 {'LIC_NUMBER': '10012031000312'},
 {'WEIGHT': '66g'}]

## Display the output only (no Excel export)

In [11]:
# # HTML color mappings
# COLORS_HTML = {
#     'BRAND_NAME': 'red',
#     'NUTRITIONAL_INFO': 'green',
#     'PRODUCT_NAME': 'blue',
#     'WEIGHT': 'orange',
#     'INGREDIENTS': 'purple',
#     'FOOD_TYPE': 'gold',
#     'PRICE': 'silver',
#     'EXPIRY_DATE': 'brown',
#     'MANUFACTURING_DATE': 'teal',
#     'LIC_NUMBER': 'orchid',
#     'HELPLINE_NUMBER': 'pink',
#     'OTHER_INFO': 'chocolate',
# }

# # Initialize a dictionary with default values as sets to avoid duplicates
# grouped_data = defaultdict(set)

# # Iterate over the extracted entities and group them by their labels
# for ent in doc.ents:
#     grouped_data[ent.label_].add(ent.text)

# # Function to get the longest string in a set
# def get_longest_string(strings):
#     return max(strings, key=len)

# # For 'BRAND_NAME', 'PRODUCT_NAME', 'LIC_NUMBER' and 'HELPLINE_NUMBER', keep only the longest string
# for label in ['BRAND_NAME', 'PRODUCT_NAME', 'LIC_NUMBER', 'HELPLINE_NUMBER']:
#     if label in grouped_data:
#         # Keep the longest string and remove the rest
#         longest_string = get_longest_string(grouped_data[label])
#         grouped_data[label] = {longest_string}

# # Convert sets to lists for easy display
# grouped_data = {key: list(value) for key, value in grouped_data.items()}

# # Function to display entities with colored labels in HTML
# def display_grouped_data_html(grouped_data):
#     html_output = ""
#     for label, values in grouped_data.items():
#         color = COLORS_HTML.get(label, 'black')  # Default to black if no color defined
#         html_output += f"<b style='color:{color};'>{label}</b>: {values}<br><br>"
#     display(HTML(html_output))

# # Display the output in HTML format
# display_grouped_data_html(grouped_data)

## Append the output to the Excel file (`data.xlsx`) only, without displaying it

In [12]:
# # Convert sets to lists for easy display (if not already done)
# grouped_data = {key: list(value) for key, value in grouped_data.items()}

# # Convert grouped data into a DataFrame
# df = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in grouped_data.items()]))

# # Define the Excel file path
# excel_file = 'data.xlsx'

# # Try to read existing data, append the new data, and write back
# try:
#     # Read the existing data
#     existing_df = pd.read_excel(excel_file, sheet_name='Grouped_Data')
#     # Concatenate the new data with the existing data
#     combined_df = pd.concat([existing_df, df], ignore_index=True)
# except FileNotFoundError:
#     # If the file doesn't exist, use the new DataFrame directly
#     combined_df = df

# # Write the combined DataFrame back to the Excel file
# with pd.ExcelWriter(excel_file, mode='w') as writer:
#     combined_df.to_excel(writer, sheet_name='Grouped_Data', index=False)

# print(f"Data has been appended to {excel_file}")

## Display the output and append it to the Excel file (`data.xlsx`)

In [13]:
# CSS colour used for each entity label when rendering the HTML summary
COLORS_HTML = dict(
    BRAND_NAME='red',
    NUTRITIONAL_INFO='green',
    PRODUCT_NAME='blue',
    WEIGHT='orange',
    INGREDIENTS='purple',
    FOOD_TYPE='gold',
    PRICE='silver',
    EXPIRY_DATE='brown',
    MANUFACTURING_DATE='teal',
    LIC_NUMBER='orchid',
    HELPLINE_NUMBER='pink',
    OTHER_INFO='chocolate',
)

# Group the entity texts by label, dropping duplicates.
# The texts are stored as dict keys rather than in a set: duplicates are still
# removed, but first-seen (document) order is kept. Set iteration order for
# strings can change between kernel sessions, which made the longest-string
# tie-break and the order of the values written out below vary from run to run.
grouped_data = defaultdict(dict)

# Iterate over the extracted entities and record each text under its label
for ent in doc.ents:
    grouped_data[ent.label_][ent.text] = None

def get_longest_string(strings):
    """Return the longest string from an iterable of strings.

    When several strings share the maximum length, the one encountered first
    during iteration is returned. An empty iterable raises ``ValueError``.
    """
    longest = max(strings, key=lambda candidate: len(candidate))
    return longest

# For these four labels only the longest extracted string is kept; every other
# label keeps all of its values. All values end up as plain lists for display.
single_value_labels = ('BRAND_NAME', 'PRODUCT_NAME', 'LIC_NUMBER', 'HELPLINE_NUMBER')
grouped_data = {
    key: [get_longest_string(value)] if key in single_value_labels else list(value)
    for key, value in grouped_data.items()
}

def display_grouped_data_html(grouped_data):
    """Render the grouped entities as colour-coded HTML in the notebook.

    Each label is shown in bold, in the colour configured for it in
    ``COLORS_HTML`` (black when the label has no entry), followed by the
    string form of its values and two line breaks.

    Args:
        grouped_data: Mapping of entity label to its values.

    Returns:
        None. The markup is shown with ``display(HTML(...))``.
    """
    # Imported locally so the function does not rely on an extra top-level import.
    from html import escape

    rows = []
    for label, values in grouped_data.items():
        color = COLORS_HTML.get(label, 'black')  # Default to black if no color defined
        # Labels and values originate from OCR/model output, so escape them:
        # a stray '<' or '&' would otherwise be interpreted as markup.
        rows.append(
            f"<b style='color:{color};'>{escape(str(label))}</b>: "
            f"{escape(str(values))}<br><br>"
        )
    display(HTML("".join(rows)))

# Show the grouped entities in the notebook
display_grouped_data_html(grouped_data)

# One column per label. Wrapping each list in a Series lets labels with
# different numbers of values share one frame (shorter columns are NaN-padded).
df = pd.DataFrame({label: pd.Series(values) for label, values in grouped_data.items()})

# Workbook that accumulates the results
excel_file = 'data.xlsx'

# Load what is already stored; a missing workbook means there is nothing to
# append to, so the new frame is written on its own.
try:
    existing_df = pd.read_excel(excel_file, sheet_name='Grouped_Data')
except FileNotFoundError:
    combined_df = df
else:
    combined_df = pd.concat([existing_df, df], ignore_index=True)

# Rewrite the workbook with the existing rows followed by the new ones
with pd.ExcelWriter(excel_file, mode='w') as writer:
    combined_df.to_excel(writer, sheet_name='Grouped_Data', index=False)

print(f"Data has been appended to {excel_file}")

Data has been appended to data.xlsx
