In [4]:
import pandas as pd
import pytesseract
import re
from PIL import Image

# Function to extract the value for the given entity from the image text
def extract_value_from_text(text, entity_name):
    """Return the first ``"<number> <unit>"`` measurement for *entity_name*.

    The text is lower-cased and searched for a number (integer or decimal)
    followed, after optional whitespace, by one of the units accepted for the
    requested entity.

    Args:
        text: Text to search (e.g. OCR output). ``None`` or an empty string
            yields no match.
        entity_name: One of ``width``, ``height``, ``depth``, ``voltage``,
            ``wattage``, ``item_weight`` or ``maximum_weight_recommendation``.

    Returns:
        ``"<value> <unit>"`` for the first match, where ``unit`` is one of the
        accepted unit spellings in lower case, or ``""`` when the entity is
        unknown or nothing matches.
    """
    if not text:
        return ""

    # Unit spellings accepted for each entity.
    entity_units = {
        'width': ('cm', 'mm', 'inches', 'inch'),
        'height': ('cm', 'mm', 'inches', 'inch'),
        'depth': ('cm', 'mm', 'inches', 'inch'),
        'voltage': ('volt', 'v'),
        'wattage': ('watt', 'kw', 'w'),
        'item_weight': ('kg', 'g', 'gram', 'ton'),
        'maximum_weight_recommendation': ('kg', 'g', 'gram', 'ton'),
    }

    units = entity_units.get(entity_name)
    if units is None:
        return ""

    # Longest spelling first so that "gram" is tried before "g"; multi-letter
    # units may carry a plural "s" ("grams", "volts", "kgs").
    alternation = '|'.join(
        re.escape(unit) + ('s?' if len(unit) > 1 else '')
        for unit in sorted(units, key=len, reverse=True)
    )
    # The negative lookahead rejects units that are merely the start of a
    # longer word, e.g. the "g" in "5 great" or the "w" in "12 women".
    pattern = rf'(\d+(?:\.\d+)?)\s*({alternation})(?![a-z])'

    match = re.search(pattern, text.lower())
    if match is None:
        return ""

    value, unit = match.groups()
    if unit not in units:
        # Only the optional plural "s" can make the capture differ from an
        # accepted spelling; strip it to report the singular unit.
        unit = unit[:-1]
    return f"{value} {unit}"

# Process a single image to extract the entity value
def process_image(image_path, entity_name):
    """OCR the image at *image_path* and extract the value for *entity_name*.

    Args:
        image_path: Path of the image file to open.
        entity_name: Entity name forwarded to ``extract_value_from_text``.

    Returns:
        Whatever ``extract_value_from_text`` returns for the OCR text, or
        ``""`` if opening, OCR or extraction raises. Failures are printed and
        swallowed so one bad image does not abort a whole batch.
    """
    try:
        # The context manager closes the underlying file handle as soon as OCR
        # is done; without it a handle stays open for every processed image.
        with Image.open(image_path) as img:
            extracted_text = pytesseract.image_to_string(img)
        return extract_value_from_text(extracted_text, entity_name)
    except Exception as e:
        print(f"Error processing image {image_path}: {e}")
        return ""

# Main function to process the entire DataFrame and predict values
def predict_entity_values(df, image_folder, output_path='predictions.csv'):
    """Predict an entity value for every row of *df* and save the results.

    For each row, the image ``<image_folder>/<index>.jpg`` is passed to
    ``process_image`` together with the row's ``entity_name``.

    Args:
        df: DataFrame with an ``index`` column (used to build the image file
            name) and an ``entity_name`` column.
        image_folder: Folder that contains the ``<index>.jpg`` images.
        output_path: CSV file to write the predictions to. Defaults to
            ``'predictions.csv'``; pass ``None`` to skip writing.

    Returns:
        DataFrame with columns ``index`` and ``prediction`` (``""`` when no
        value was found), one row per input row.
    """
    # Iterate only the two needed columns; this keeps each value's own dtype
    # instead of building a mixed-type Series per row.
    predictions = [
        {
            'index': image_id,
            'prediction': process_image(f"{image_folder}/{image_id}.jpg", entity_name),
        }
        for image_id, entity_name in zip(df['index'], df['entity_name'])
    ]

    # Pin the columns so an empty input still yields a well-formed frame (and
    # a CSV with a header) rather than a column-less one.
    prediction_df = pd.DataFrame(predictions, columns=['index', 'prediction'])

    if output_path is not None:
        prediction_df.to_csv(output_path, index=False)

    return prediction_df

# Example usage: run the pipeline on the sample test set and preview the result
image_folder = '/Users/kaustubh/Downloads/student_resource 3/images'  # Location of the downloaded images
df = pd.read_csv(
    '/Users/kaustubh/Downloads/student_resource 3/dataset/sample_test.csv'
)  # Sample test dataset
result_df = predict_entity_values(df=df, image_folder=image_folder)
print(result_df.head())


Error processing image /Users/kaustubh/Downloads/student_resource 3/images/0.jpg: [Errno 2] No such file or directory: '/Users/kaustubh/Downloads/student_resource 3/images/0.jpg'
Error processing image /Users/kaustubh/Downloads/student_resource 3/images/1.jpg: [Errno 2] No such file or directory: '/Users/kaustubh/Downloads/student_resource 3/images/1.jpg'
Error processing image /Users/kaustubh/Downloads/student_resource 3/images/2.jpg: [Errno 2] No such file or directory: '/Users/kaustubh/Downloads/student_resource 3/images/2.jpg'
Error processing image /Users/kaustubh/Downloads/student_resource 3/images/3.jpg: [Errno 2] No such file or directory: '/Users/kaustubh/Downloads/student_resource 3/images/3.jpg'
Error processing image /Users/kaustubh/Downloads/student_resource 3/images/4.jpg: [Errno 2] No such file or directory: '/Users/kaustubh/Downloads/student_resource 3/images/4.jpg'
Error processing image /Users/kaustubh/Downloads/student_resource 3/images/5.jpg: [Errno 2] No such file 