In [None]:
import cv2
import pytesseract
import requests
from PIL import Image
from io import BytesIO
import re
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
import pickle

# Path to the Tesseract executable that pytesseract invokes for OCR.
# This is a hard-coded Windows install location; change it to match the
# machine the script runs on.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract.exe'

# Entity to unit map
# Unit families shared by several entities. Each entity below receives its own
# set built from these tuples, so no two entries share a set object.
_length_units = ('centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard')
_weight_units = ('gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton')
_volume_units = (
    'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre', 'fluid ounce',
    'gallon', 'litre', 'microlitre', 'millilitre', 'pint', 'quart',
)

# Maps each entity name to the set of unit names accepted for it.
entity_unit_map = {
    'width': set(_length_units),
    'depth': set(_length_units),
    'height': set(_length_units),
    'item_weight': set(_weight_units),
    'maximum_weight_recommendation': set(_weight_units),
    'voltage': {'kilovolt', 'millivolt', 'volt'},
    'wattage': {'kilowatt', 'watt'},
    'item_volume': set(_volume_units),
}

# Function to download the image
def download_image(image_url, timeout=10):
    """Download an image over HTTP and open it with PIL.

    Args:
        image_url: URL of the image to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a host that
            never answers, which would stall a whole batch run.

    Returns:
        The image object produced by ``Image.open`` on the response body.

    Raises:
        Exception: If the server responds with a status code other than 200.
        requests.exceptions.RequestException: Propagated from ``requests.get``
            on connection errors or when the timeout expires.
    """
    response = requests.get(image_url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to download image. Status code: {response.status_code}")
    return Image.open(BytesIO(response.content))

# Function to preprocess the image for OCR
def preprocess_image(image):
    """Turn an image into a binarised, upscaled array for OCR.

    Steps: convert to grayscale, smooth with a 5x5 Gaussian blur, binarise
    with Otsu's threshold, then enlarge by a factor of two in each dimension
    using cubic interpolation.

    Args:
        image: A PIL image, or anything ``np.array`` turns into an RGB array.

    Returns:
        The thresholded, resized single-channel array produced by OpenCV.
    """
    # Downloaded files are not always RGB: grayscale ("L"), palette ("P") or
    # 1-bit images yield arrays without three colour channels, which the
    # RGB->BGR conversion below rejects. Normalise PIL inputs up front.
    if isinstance(image, Image.Image):
        image = image.convert('RGB')

    image_np = np.array(image)
    image_bgr = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)
    gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)
    gray = cv2.GaussianBlur(gray, (5, 5), 0)
    # The threshold value 0 is ignored: THRESH_OTSU picks it automatically.
    _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    img_resized = cv2.resize(thresh, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)
    return img_resized

# Function to extract text from the image
def extract_text_from_image(image):
    """Run OCR on an image array and return the recognised text.

    The array is wrapped in a PIL image via ``Image.fromarray`` and passed to
    ``pytesseract.image_to_string``; its result is returned unchanged.
    """
    return pytesseract.image_to_string(Image.fromarray(image))

# Function to extract numeric values without units from text
def extract_constants(text):
    """Return every numeric token found in ``text``, in order of appearance.

    A token is one or more digits, optionally followed by a dot and further
    digits. Tokens are returned as strings exactly as they appear in the text.
    """
    number_pattern = re.compile(r'\d+\.?\d*')
    return [hit.group() for hit in number_pattern.finditer(text)]

# Function to extract entity value based on units from text
def extract_entity_value(entity_name, text):
    """Extract a "<number> <unit>" value for ``entity_name`` from OCR text.

    The first number in ``text`` that is directly followed by one of the units
    allowed for ``entity_name`` (see ``entity_unit_map``) is returned together
    with that unit. Matching is case-insensitive, accepts a trailing plural
    "s"/"es", and supports multi-word units such as "cubic foot". The unit is
    always reported in its canonical spelling from ``entity_unit_map``.

    Abbreviations (e.g. "cm", "kg") are not recognised.

    Args:
        entity_name: Key into ``entity_unit_map`` selecting the allowed units.
        text: Text to search, typically OCR output.

    Returns:
        ``"<value> <unit>"`` when a number with an allowed unit is found;
        otherwise the first bare number in the text as a string; ``None`` when
        the text is empty or contains no number at all.
    """
    if not text:
        return None

    allowed_units = entity_unit_map.get(entity_name, set())
    if allowed_units:
        # Build one alternation over the allowed unit names instead of
        # capturing an arbitrary word: a generic ``\w+`` also swallows a
        # following number ("12 34 gram") and can never span two words.
        # Longest names go first; the secondary sort key keeps the pattern
        # deterministic despite set ordering.
        unit_alternatives = '|'.join(
            r'\s+'.join(re.escape(word) for word in unit.split())
            for unit in sorted(allowed_units, key=lambda name: (-len(name), name))
        )
        unit_pattern = re.compile(
            rf'(\d+\.?\d*)\s*({unit_alternatives})(?:es|s)?\b',
            re.IGNORECASE,
        )
        match = unit_pattern.search(text)
        if match:
            value, unit = match.groups()
            # Collapse case and inner whitespace back to the canonical name.
            canonical_unit = ' '.join(unit.lower().split())
            return f"{value} {canonical_unit}"

    # If no match with units, fall back to the first standalone number.
    constants = extract_constants(text)
    if constants:
        return constants[0]

    return None

# Function to process a single image and extract the entity value
def process_image(image_url, entity_name):
    """Download one image, OCR it, and extract the value for ``entity_name``.

    Chains ``download_image`` -> ``preprocess_image`` ->
    ``extract_text_from_image`` and hands the resulting text to
    ``extract_entity_value``, whose result is returned as-is.
    """
    ocr_ready = preprocess_image(download_image(image_url))
    ocr_text = extract_text_from_image(ocr_ready)
    return extract_entity_value(entity_name, ocr_text)

# Function to predict the unit if no valid entity value is found
def predict_unit(group_id_value, entity_name_value, ct, classifier, unit_mapping):
    """Predict a unit for a single (group_id, entity_name) pair.

    A one-row frame is built with the given values (the remaining columns are
    filled with ``None``), transformed with ``ct``, and the leading encoded
    columns are combined with ``group_id`` to form the classifier input.

    Args:
        group_id_value: Value for the ``group_id`` feature.
        entity_name_value: Value for the ``entity_name`` feature.
        ct: Fitted transformer exposing ``transform`` and ``transformers_``.
            NOTE(review): presumably a ColumnTransformer whose first
            transformer one-hot encodes ``entity_name`` - confirm against the
            training code.
        classifier: Fitted model exposing ``predict``.
        unit_mapping: Lookup from predicted label to unit.

    Returns:
        ``unit_mapping`` entry for the first predicted label.
    """
    row_frame = pd.DataFrame({
        'group_id': [group_id_value],
        'entity_name': [entity_name_value],
        'entity_value': [None],
        'image_link': [None],
        'product_id': [None],
    })

    transformed = ct.transform(row_frame)
    onehot_names = ct.transformers_[0][1].get_feature_names_out(['entity_name'])

    # Only the first len(onehot_names) transformed columns are kept; they are
    # labelled with the encoder's feature names.
    onehot_frame = pd.DataFrame(transformed[:, :len(onehot_names)], columns=onehot_names)
    group_frame = pd.DataFrame(row_frame[['group_id']].values, columns=['group_id'])

    # Feature order: group_id first, then the encoded entity_name columns.
    features = pd.concat([group_frame, onehot_frame], axis=1)
    first_label = classifier.predict(features)[0]
    return unit_mapping[first_label]

# Main function to process the dataset
def process_dataset(csv_file, ct, classifier, unit_mapping,
                    output_file='output_entity_values.csv'):
    """Extract an entity value for every row of a CSV and write the results.

    For each row the image at ``image_link`` is downloaded and OCR'd. If no
    value can be extracted, or processing the image fails, the value falls
    back to the unit predicted by ``predict_unit``.

    Args:
        csv_file: Path to a CSV with ``group_id``, ``entity_name`` and
            ``image_link`` columns.
        ct: Fitted transformer passed through to ``predict_unit``.
        classifier: Fitted model passed through to ``predict_unit``.
        unit_mapping: Label-to-unit lookup passed through to ``predict_unit``.
        output_file: Destination CSV path. Defaults to the previously
            hard-coded ``'output_entity_values.csv'``.

    The output CSV has the columns ``group_id`` and ``entity_value``.
    """
    dataset = pd.read_csv(csv_file)
    results = []

    rows = zip(dataset['group_id'], dataset['entity_name'], dataset['image_link'])
    for group_id, entity_name, image_url in rows:
        try:
            entity_value = process_image(image_url, entity_name)
        except Exception as error:
            # download_image raises a plain Exception and the OCR stack has
            # its own error types, so catch broadly here: one bad image must
            # not abort the batch and discard every result gathered so far.
            print(f"Could not process image {image_url}: {error}")
            entity_value = None

        if entity_value is None:
            entity_value = predict_unit(group_id, entity_name, ct, classifier, unit_mapping)

        results.append({'group_id': group_id, 'entity_value': entity_value})

    # Explicit columns keep the header in place even when there are no rows.
    result_df = pd.DataFrame(results, columns=['group_id', 'entity_value'])
    result_df.to_csv(output_file, index=False)

# Load the pickled artifacts that process_dataset needs: the encoder (ct),
# the classifier and the label-to-unit mapping.
# NOTE(security): pickle.load can execute arbitrary code while unpickling, so
# these files must come from a trusted source.
with open('encoder.pkl', 'rb') as file:
    ct = pickle.load(file)

# NOTE(review): the file name says "svm" while this file imports
# DecisionTreeClassifier - confirm which estimator is actually stored.
with open('svm_model.pkl', 'rb') as file:
    classifier = pickle.load(file)

with open('unit_mapping.pkl', 'rb') as file:
    unit_mapping = pickle.load(file)

# Run the pipeline on input_file.csv. This executes at module level, i.e. as
# soon as the cell/script is run.
process_dataset('input_file.csv', ct, classifier, unit_mapping)
